From 96ec6856f91f7f9031cfce4273c714d72cfe59ae Mon Sep 17 00:00:00 2001 From: pschwan Date: Thu, 12 Jun 2003 07:12:50 +0000 Subject: [PATCH] - merge 0.7rc1 from b_devel to HEAD (20030612 merge point) - remove extN/ files --- {lustre/lib => lnet}/.cvsignore | 10 +- lnet/AUTHORS | 0 lnet/ChangeLog | 0 lnet/Kernelenv.in | 1 + lnet/Kernelenv.mk | 1 + lnet/Makefile.am | 12 + lnet/Makefile.mk | 6 + lnet/NEWS | 0 lnet/README | 0 lnet/Rules.linux | 25 + lnet/archdep.m4 | 317 + lnet/autogen.sh | 5 + lnet/build.m4 | 95 + lnet/configure.in | 34 + lnet/doc/.cvsignore | 4 + lnet/doc/Data-structures | 65 + lnet/doc/Makefile.am | 46 + lnet/doc/Message-life-cycle | 118 + lnet/doc/NAL-HOWTO | 293 + lnet/doc/file.fig | 111 + lnet/doc/flow_new.fig | 213 + lnet/doc/get.fig | 33 + lnet/doc/ieee.bst | 1112 + lnet/doc/mpi.fig | 117 + lnet/doc/portals.fig | 68 + lnet/doc/portals3.bib | 124 + lnet/doc/portals3.lyx | 15944 ++++++++ lnet/doc/put.fig | 32 + lnet/include/.cvsignore | 4 + lnet/include/Makefile.am | 8 + lnet/include/config.h.in | 11 + lnet/include/linux/Makefile.am | 10 + lnet/include/linux/kp30.h | 943 + lnet/include/linux/portals_compat25.h | 13 + lnet/include/linux/portals_lib.h | 188 + lnet/include/lnet/Makefile.am | 10 + lnet/include/lnet/api-support.h | 27 + lnet/include/lnet/api.h | 159 + lnet/include/lnet/arg-blocks.h | 265 + lnet/include/lnet/defines.h | 116 + lnet/include/lnet/errno.h | 61 + lnet/include/lnet/internal.h | 45 + lnet/include/lnet/lib-dispatch.h | 45 + lnet/include/lnet/lib-lnet.h | 385 + lnet/include/lnet/lib-nal.h | 102 + lnet/include/lnet/lib-p30.h | 385 + lnet/include/lnet/lib-types.h | 282 + lnet/include/lnet/list.h | 245 + lnet/include/lnet/lltrace.h | 175 + lnet/include/lnet/lnet.h | 72 + lnet/include/lnet/lnetctl.h | 75 + lnet/include/lnet/myrnal.h | 26 + lnet/include/lnet/nal.h | 49 + lnet/include/lnet/nalids.h | 4 + lnet/include/lnet/p30.h | 72 + lnet/include/lnet/ppid.h | 52 + lnet/include/lnet/ptlctl.h | 75 + lnet/include/lnet/stringtab.h | 5 + 
lnet/include/lnet/types.h | 157 + lnet/klnds/.cvsignore | 2 + lnet/klnds/Makefile.am | 7 + lnet/klnds/Makefile.mk | 4 + lnet/klnds/gmlnd/.cvsignore | 3 + lnet/klnds/gmlnd/Makefile.am | 13 + lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch | 43 + lnet/klnds/gmlnd/gmlnd.h | 101 + lnet/klnds/gmlnd/gmlnd_cb.c | 517 + lnet/klnds/gmlnd/gmnal.c | 284 + lnet/klnds/qswlnd/.cvsignore | 3 + lnet/klnds/qswlnd/Makefile.am | 17 + lnet/klnds/qswlnd/qswlnd.c | 608 + lnet/klnds/qswlnd/qswlnd.h | 270 + lnet/klnds/qswlnd/qswlnd_cb.c | 1239 + lnet/klnds/scimaclnd/.cvsignore | 3 + lnet/klnds/scimaclnd/Makefile.am | 11 + lnet/klnds/scimaclnd/README.scimacnal | 14 + lnet/klnds/scimaclnd/scimac.conf | 35 + lnet/klnds/scimaclnd/scimacnal.c | 219 + lnet/klnds/scimaclnd/scimacnal.h | 85 + lnet/klnds/scimaclnd/scimacnal_cb.c | 468 + lnet/klnds/socklnd/.cvsignore | 3 + lnet/klnds/socklnd/Makefile.am | 13 + lnet/klnds/socklnd/Makefile.mk | 10 + lnet/klnds/socklnd/socklnd.c | 860 + lnet/klnds/socklnd/socklnd.h | 292 + lnet/klnds/socklnd/socklnd_cb.c | 1613 + lnet/klnds/toelnd/.cvsignore | 3 + lnet/klnds/toelnd/Makefile.am | 13 + lnet/klnds/toelnd/toenal.c | 629 + lnet/klnds/toelnd/toenal.h | 236 + lnet/klnds/toelnd/toenal_cb.c | 1219 + lnet/libcfs/.cvsignore | 4 + lnet/libcfs/Makefile.am | 29 + lnet/libcfs/Makefile.mk | 9 + lnet/libcfs/debug.c | 830 + lnet/libcfs/module.c | 574 + lnet/libcfs/proc.c | 290 + lnet/lnet/.cvsignore | 3 + lnet/lnet/Makefile.am | 10 + lnet/lnet/Makefile.mk | 9 + lnet/lnet/api-eq.c | 158 + lnet/lnet/api-errno.c | 55 + lnet/lnet/api-init.c | 71 + lnet/lnet/api-me.c | 42 + lnet/lnet/api-ni.c | 197 + lnet/lnet/api-wrap.c | 599 + lnet/lnet/lib-dispatch.c | 80 + lnet/lnet/lib-eq.c | 128 + lnet/lnet/lib-init.c | 474 + lnet/lnet/lib-md.c | 412 + lnet/lnet/lib-me.c | 227 + lnet/lnet/lib-move.c | 1379 + lnet/lnet/lib-msg.c | 163 + lnet/lnet/lib-ni.c | 128 + lnet/lnet/lib-pid.c | 58 + lnet/packaging/.cvsignore | 8 + lnet/packaging/Makefile.am | 6 + lnet/packaging/portals.spec.in | 116 + 
lnet/router/.cvsignore | 3 + lnet/router/Makefile.am | 16 + lnet/router/Makefile.mk | 9 + lnet/router/proc.c | 78 + lnet/router/router.c | 449 + lnet/router/router.h | 81 + lnet/tests/.cvsignore | 3 + lnet/tests/Makefile.am | 23 + lnet/tests/ping.h | 80 + lnet/tests/ping_cli.c | 300 + lnet/tests/ping_srv.c | 308 + lnet/tests/sping_cli.c | 276 + lnet/tests/sping_srv.c | 295 + lnet/tests/startclient.sh | 37 + lnet/tests/startserver.sh | 38 + lnet/tests/stopclient.sh | 14 + lnet/tests/stopserver.sh | 16 + lnet/ulnds/.cvsignore | 3 + lnet/ulnds/Makefile.am | 5 + lnet/ulnds/README | 53 + lnet/ulnds/address.c | 146 + lnet/ulnds/bridge.h | 29 + lnet/ulnds/connection.c | 294 + lnet/ulnds/connection.h | 32 + lnet/ulnds/debug.c | 119 + lnet/ulnds/dispatch.h | 39 + lnet/ulnds/ipmap.h | 38 + lnet/ulnds/pqtimer.c | 226 + lnet/ulnds/pqtimer.h | 25 + lnet/ulnds/procapi.c | 283 + lnet/ulnds/procbridge.h | 40 + lnet/ulnds/proclib.c | 270 + lnet/ulnds/select.c | 165 + lnet/ulnds/socklnd/Makefile.am | 5 + lnet/ulnds/socklnd/README | 53 + lnet/ulnds/socklnd/address.c | 146 + lnet/ulnds/socklnd/bridge.h | 29 + lnet/ulnds/socklnd/connection.c | 294 + lnet/ulnds/socklnd/connection.h | 32 + lnet/ulnds/socklnd/debug.c | 119 + lnet/ulnds/socklnd/dispatch.h | 39 + lnet/ulnds/socklnd/ipmap.h | 38 + lnet/ulnds/socklnd/pqtimer.c | 226 + lnet/ulnds/socklnd/pqtimer.h | 25 + lnet/ulnds/socklnd/procapi.c | 283 + lnet/ulnds/socklnd/procbridge.h | 40 + lnet/ulnds/socklnd/proclib.c | 270 + lnet/ulnds/socklnd/select.c | 165 + lnet/ulnds/socklnd/table.c | 264 + lnet/ulnds/socklnd/table.h | 39 + lnet/ulnds/socklnd/tcplnd.c | 198 + lnet/ulnds/socklnd/timer.h | 30 + lnet/ulnds/socklnd/utypes.h | 12 + lnet/ulnds/table.c | 264 + lnet/ulnds/table.h | 39 + lnet/ulnds/tcplnd.c | 198 + lnet/ulnds/timer.h | 30 + lnet/ulnds/utypes.h | 12 + lnet/utils/.cvsignore | 8 + lnet/utils/Makefile.am | 27 + lnet/utils/acceptor.c | 466 + lnet/utils/debug.c | 618 + lnet/utils/debugctl.c | 66 + lnet/utils/l_ioctl.c | 281 + 
lnet/utils/parser.c | 703 + lnet/utils/parser.h | 73 + lnet/utils/portals.c | 985 + lnet/utils/ptlctl.c | 65 + lnet/utils/routerstat.c | 99 + lnet/utils/wirecheck.c | 141 + lustre/.cvsignore | 3 + lustre/ChangeLog | 42 + lustre/Makefile.am | 12 +- lustre/Makefile.mk | 4 + lustre/README | 2 +- lustre/Rules | 14 +- lustre/archdep.m4 | 127 - lustre/autogen.sh | 3 +- lustre/cobd/cache_obd.c | 104 +- lustre/cobd/lproc_cache.c | 14 +- lustre/conf/lustre.dtd | 29 +- lustre/conf/lustre2ldif.xsl | 76 +- lustre/conf/slapd-lustre.conf | 1 - lustre/configure.in | 232 +- lustre/doc/lconf.lyx | 156 +- lustre/doc/lctl.lyx | 90 +- lustre/doc/lmc.lyx | 137 +- lustre/extN/Makefile.am | 144 - lustre/extN/ext3-largefile.diff | 23 - lustre/extN/ext3-unmount_sync.diff | 59 - lustre/extN/extN-2.4.18-exports.diff | 11 - lustre/extN/extN-2.4.18-ino_sb_fixup.diff | 33 - lustre/extN/extN-san.diff | 88 - lustre/extN/extN-wantedi.diff | 163 - lustre/include/.cvsignore | 1 + lustre/include/ioctl.h | 64 + lustre/include/liblustre.h | 260 +- lustre/include/linux/lprocfs_status.h | 168 +- lustre/include/linux/lustre_compat25.h | 76 + lustre/include/linux/lustre_dlm.h | 46 +- lustre/include/linux/lustre_export.h | 24 +- lustre/include/linux/lustre_fsfilt.h | 17 +- lustre/include/linux/lustre_ha.h | 57 +- lustre/include/linux/lustre_idl.h | 181 +- lustre/include/linux/lustre_import.h | 48 +- lustre/include/linux/lustre_lib.h | 175 +- lustre/include/linux/lustre_lite.h | 128 +- lustre/include/linux/lustre_mds.h | 185 +- lustre/include/linux/lustre_net.h | 316 +- lustre/include/linux/obd.h | 165 +- lustre/include/linux/obd_class.h | 523 +- lustre/include/linux/obd_echo.h | 51 +- lustre/include/linux/obd_filter.h | 12 +- lustre/include/linux/obd_lov.h | 21 +- lustre/include/linux/obd_ost.h | 16 +- lustre/include/linux/obd_ptlbd.h | 9 +- lustre/include/linux/obd_support.h | 88 +- lustre/kernel_patches/README | 11 +- .../kernel_configs/config-linux-2.4.18-i386 | 1834 + 
.../config-linux-2.4.18-p4smp-61chaos | 1035 + .../kernel_configs/config-linux-2.4.18-uml | 458 + .../kernel_configs/config-linux-2.4.20-i386-rh | 1849 + .../kernel_configs/config-linux-2.4.20-uml | 297 + .../kernel_configs/jdike-2.5.69-uml.config | 321 + .../patches/dev_read_only_2.4.20-rh.patch | 77 + ...ad_only_hp.patch => dev_read_only_2.4.20.patch} | 32 +- .../patches/dev_read_only_hp_2.4.20.patch | 77 + lustre/kernel_patches/patches/dsp.patch | 130 + .../patches/export-truncate-2.5.63.patch | 37 + .../kernel_patches/patches/export-truncate.patch | 35 + lustre/kernel_patches/patches/exports.patch | 28 +- ...exports_hp.patch => exports_2.4.20-rh-hp.patch} | 27 +- lustre/kernel_patches/patches/exports_2.4.20.patch | 57 + .../patches/ext-2.4-patch-1-chaos.patch | 2527 ++ .../kernel_patches/patches/ext-2.4-patch-1.patch | 2527 ++ .../kernel_patches/patches/ext-2.4-patch-2.patch | 34 + .../kernel_patches/patches/ext-2.4-patch-3.patch | 96 + .../kernel_patches/patches/ext-2.4-patch-4.patch | 48 + .../patches/ext3-2.4-ino_t.patch} | 50 +- .../patches/ext3-2.4.18-fixes.patch} | 0 .../patches/ext3-2.4.18-ino_sb_macro.patch} | 20 +- .../patches/ext3-2.4.20-fixes.patch} | 83 +- .../patches/ext3-2.5-noread.patch} | 2 +- lustre/kernel_patches/patches/ext3-2.5.63.patch | 150 + .../patches/ext3-delete_thread-2.4.18.patch | 302 + .../patches/ext3-delete_thread-2.4.20.patch | 300 + lustre/kernel_patches/patches/ext3-largefile.patch | 16 + .../patches/ext3-noread-2.4.20.patch | 218 + .../kernel_patches/patches/ext3-orphan_lock.patch | 79 + .../kernel_patches/patches/ext3-san-2.4.20.patch | 117 + .../patches/ext3-truncate_blocks-chaos.patch.patch | 92 + .../patches/ext3-truncate_blocks.patch | 92 + .../kernel_patches/patches/ext3-unmount_sync.patch | 21 + .../patches/ext3-use-after-free.patch} | 30 +- lustre/kernel_patches/patches/ext3-xattr-2.5.patch | 2690 -- .../patches/ext3_orphan_lock-2.4.20-rh.patch | 82 + .../patches/extN-2.4.18-ino_sb_fixup.patch | 33 + 
.../patches/extN-delete_thread.patch | 278 + .../patches/extN-iget-debug.patch} | 12 +- .../patches/extN-misc-fixup.patch} | 12 +- .../patches/extN-noread.patch} | 132 +- lustre/kernel_patches/patches/extN-san.patch | 106 + lustre/kernel_patches/patches/extN-wantedi.patch | 171 + .../patches/htree-ext3-2.4.18.patch} | 2 +- .../patches/invalidate_show-2.4.20-rh.patch | 114 + .../kernel_patches/patches/invalidate_show.patch | 43 +- .../patches/iod-rmap-exports-2.4.20.patch | 86 + .../kernel_patches/patches/iod-rmap-exports.patch | 43 +- .../patches/iod-stock-24-exports.patch | 48 + .../patches/iod-stock-24-exports_hp.patch | 27 +- lustre/kernel_patches/patches/iopen-2.4.18.patch | 414 + lustre/kernel_patches/patches/iopen-2.4.20.patch | 423 + .../patches/kmem_cache_validate_2.4.20-rh.patch | 124 + .../patches/kmem_cache_validate_2.4.20.patch | 116 + .../patches/kmem_cache_validate_hp.patch | 40 +- .../patches/linux-2.4.18ea-0.8.26.patch} | 144 +- .../patches/linux-2.4.20-xattr-0.8.54-chaos.patch | 5538 +++ .../patches/linux-2.4.20-xattr-0.8.54-hp.patch | 5536 +++ .../patches/linux-2.4.20-xattr-0.8.54.patch | 5595 +++ lustre/kernel_patches/patches/lustre-2.5.63.patch | 862 + lustre/kernel_patches/patches/lustre-2.5.patch | 507 - lustre/kernel_patches/patches/lustre_version.patch | 2 +- lustre/kernel_patches/patches/mcore-2.4.20-8.patch | 2738 ++ .../patches/patch-2.4.18-hp1_pnnl18.2.8qsnet.patch | 1673 - lustre/kernel_patches/patches/tcp-zero-copy.patch | 455 + .../patches/uml-patch-2.4.20-4.patch | 39358 +++++++++++++++++++ .../patches/uml_check_get_page.patch | 9 +- lustre/kernel_patches/patches/uml_no_panic.patch | 11 +- lustre/kernel_patches/patches/vanilla-2.4.18.patch | 1672 - lustre/kernel_patches/patches/vanilla-2.4.19.patch | 1576 - .../patches/vfs_intent-2.4.18-18.patch | 282 +- ...vfs_intent.patch => vfs_intent-2.4.20-rh.patch} | 1013 +- ...nt_hp.patch => vfs_intent-2.4.20-vanilla.patch} | 567 +- ..._read_only_hp.pc => dev_read_only_2.4.20-rh.pc} | 0 
lustre/kernel_patches/pc/dev_read_only_2.4.20.pc | 3 + .../kernel_patches/pc/dev_read_only_hp_2.4.20.pc | 3 + lustre/kernel_patches/pc/dsp.pc | 6 + lustre/kernel_patches/pc/export-truncate-2.5.63.pc | 2 + lustre/kernel_patches/pc/export-truncate.pc | 2 + .../pc/{exports_hp.pc => exports_2.4.20-rh-hp.pc} | 0 lustre/kernel_patches/pc/exports_2.4.20.pc | 4 + lustre/kernel_patches/pc/exports_hp_2.4.20.pc | 4 + lustre/kernel_patches/pc/ext-2.4-patch-1-chaos.pc | 11 + lustre/kernel_patches/pc/ext-2.4-patch-1.pc | 11 + lustre/kernel_patches/pc/ext-2.4-patch-2.pc | 1 + lustre/kernel_patches/pc/ext-2.4-patch-3.pc | 3 + lustre/kernel_patches/pc/ext-2.4-patch-4.pc | 1 + lustre/kernel_patches/pc/ext3-2.4-ino_t.pc | 3 + lustre/kernel_patches/pc/ext3-2.4.18-fixes.pc | 7 + .../kernel_patches/pc/ext3-2.4.18-ino_sb_macro.pc | 10 + lustre/kernel_patches/pc/ext3-2.4.20-fixes.pc | 1 + lustre/kernel_patches/pc/ext3-2.5-noread.pc | 3 + lustre/kernel_patches/pc/ext3-2.5.63.pc | 4 + .../kernel_patches/pc/ext3-delete_thread-2.4.18.pc | 3 + .../kernel_patches/pc/ext3-delete_thread-2.4.20.pc | 3 + lustre/kernel_patches/pc/ext3-largefile.pc | 1 + lustre/kernel_patches/pc/ext3-noread-2.4.20.pc | 3 + lustre/kernel_patches/pc/ext3-orphan_lock.pc | 3 + lustre/kernel_patches/pc/ext3-san-2.4.20.pc | 2 + .../pc/ext3-truncate_blocks-chaos.patch.pc | 1 + lustre/kernel_patches/pc/ext3-truncate_blocks.pc | 1 + lustre/kernel_patches/pc/ext3-unmount_sync.pc | 1 + lustre/kernel_patches/pc/ext3-use-after-free.pc | 1 + .../pc/ext3_orphan_lock-2.4.20-rh.pc | 3 + .../kernel_patches/pc/extN-2.4.18-ino_sb_fixup.pc | 1 + lustre/kernel_patches/pc/extN-delete_thread.pc | 3 + lustre/kernel_patches/pc/extN-iget-debug.pc | 2 + lustre/kernel_patches/pc/extN-misc-fixup.pc | 1 + lustre/kernel_patches/pc/extN-noread.pc | 3 + lustre/kernel_patches/pc/extN-san.pc | 2 + lustre/kernel_patches/pc/extN-wantedi.pc | 4 + lustre/kernel_patches/pc/htree-ext3-2.4.18.pc | 4 + .../kernel_patches/pc/invalidate_show-2.4.20-rh.pc | 4 + 
lustre/kernel_patches/pc/invalidate_show.pc | 3 +- .../kernel_patches/pc/iod-rmap-exports-2.4.20.pc | 5 + lustre/kernel_patches/pc/iod-rmap-exports.pc | 1 - lustre/kernel_patches/pc/iod-stock-24-exports.pc | 3 + .../kernel_patches/pc/iod-stock-24-exports_hp.pc | 3 + lustre/kernel_patches/pc/iopen-2.4.18.pc | 8 + lustre/kernel_patches/pc/iopen-2.4.20.pc | 8 + .../pc/kmem_cache_validate_2.4.20-rh.pc | 5 + .../pc/kmem_cache_validate_2.4.20.pc | 5 + lustre/kernel_patches/pc/kmem_cache_validate_hp.pc | 2 +- lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26.pc | 10 + .../pc/linux-2.4.20-xattr-0.8.54-chaos.pc | 62 + .../pc/linux-2.4.20-xattr-0.8.54-hp.pc | 62 + .../kernel_patches/pc/linux-2.4.20-xattr-0.8.54.pc | 64 + lustre/kernel_patches/pc/lustre-2.5.63.pc | 12 + lustre/kernel_patches/pc/mcore-2.4.20-8.pc | 34 + .../pc/patch-2.4.18-hp1_pnnl18.2.8qsnet.pc | 23 - lustre/kernel_patches/pc/tcp-zero-copy.pc | 5 + lustre/kernel_patches/pc/uml-patch-2.4.20-4.pc | 394 + lustre/kernel_patches/pc/uml_check_get_page.pc | 1 - lustre/kernel_patches/pc/uml_compile_fixes.pc | 1 - lustre/kernel_patches/pc/uml_no_panic.pc | 1 - lustre/kernel_patches/pc/vanilla-2.4.18.pc | 23 - lustre/kernel_patches/pc/vanilla-2.4.19.pc | 19 - lustre/kernel_patches/pc/vfs_intent-2.4.18-18.pc | 1 + .../pc/{vfs_intent.pc => vfs_intent-2.4.20-rh.pc} | 2 + ...s_intent_hp.pc => vfs_intent-2.4.20-vanilla.pc} | 2 + lustre/kernel_patches/prepare_tree.sh | 2 +- lustre/kernel_patches/scripts/apatch | 5 +- lustre/kernel_patches/scripts/cat-series | 17 + lustre/kernel_patches/scripts/combine-applied | 26 +- lustre/kernel_patches/scripts/forkpatch | 76 + lustre/kernel_patches/scripts/join-patch | 28 + lustre/kernel_patches/scripts/patchfns | 27 +- lustre/kernel_patches/scripts/poppatch | 2 + lustre/kernel_patches/scripts/pushpatch | 2 + lustre/kernel_patches/scripts/refpatch | 1 + lustre/kernel_patches/scripts/rpatch | 49 +- lustre/kernel_patches/scripts/sum-series | 41 + lustre/kernel_patches/scripts/trypatch | 72 + 
lustre/kernel_patches/scripts/unused-patches | 39 + lustre/kernel_patches/series/chaos | 13 + lustre/kernel_patches/series/hp-pnnl | 8 - lustre/kernel_patches/series/hp-pnnl-2.4.20 | 25 + lustre/kernel_patches/series/rh-2.4.18-18 | 14 + lustre/kernel_patches/series/rh-2.4.20 | 23 + lustre/kernel_patches/series/vanilla-2.4.18 | 2 - lustre/kernel_patches/series/vanilla-2.4.19 | 3 - lustre/kernel_patches/series/vanilla-2.4.20 | 29 + lustre/kernel_patches/series/vanilla-2.5 | 2 +- lustre/kernel_patches/txt/ext3-2.4.20-fixes.txt | 3 + lustre/kernel_patches/txt/vfs_intent.txt | 3 - lustre/kernel_patches/which_patch | 23 +- lustre/ldlm/Makefile.am | 4 +- lustre/ldlm/ldlm_extent.c | 54 +- lustre/ldlm/ldlm_internal.h | 1 + lustre/ldlm/ldlm_lib.c | 883 + lustre/ldlm/ldlm_lock.c | 92 +- lustre/ldlm/ldlm_lockd.c | 405 +- lustre/ldlm/ldlm_request.c | 202 +- lustre/ldlm/ldlm_resource.c | 14 +- lustre/lib/Makefile.am | 4 - lustre/lib/client.c | 406 - lustre/lib/mds_updates.c | 604 - lustre/lib/obd_pack.c | 64 - lustre/lib/target.c | 524 - lustre/liblustre/Makefile.am | 26 +- lustre/liblustre/file.c | 553 + lustre/liblustre/libtest.c | 28 +- lustre/liblustre/llite_lib.c | 226 + lustre/liblustre/llite_lib.h | 135 + lustre/liblustre/lltest.c | 159 + lustre/liblustre/rw.c | 519 + lustre/liblustre/super.c | 781 + lustre/llite/Makefile.am | 2 +- lustre/llite/commit_callback.c | 22 +- lustre/llite/dcache.c | 93 +- lustre/llite/dir.c | 79 +- lustre/llite/file.c | 593 +- lustre/llite/iod.c | 536 +- lustre/llite/llite_internal.h | 2 + lustre/llite/lproc_llite.c | 8 + lustre/llite/namei.c | 489 +- lustre/llite/recover.c | 56 - lustre/llite/rw.c | 245 +- lustre/llite/super.c | 316 +- lustre/llite/super25.c | 273 +- lustre/llite/symlink.c | 42 +- lustre/lov/Makefile.am | 10 +- lustre/lov/lov_obd.c | 1309 +- lustre/lov/lov_pack.c | 110 +- lustre/mdc/Makefile.am | 13 +- lustre/mdc/mdc_internal.h | 24 + lustre/mdc/mdc_lib.c | 282 + lustre/mdc/mdc_reint.c | 73 +- lustre/mdc/mdc_request.c | 654 +- 
lustre/mds/Makefile.am | 15 +- lustre/mds/Makefile.mk | 10 + lustre/mds/handler.c | 886 +- lustre/mds/lproc_mds.c | 21 +- lustre/mds/mds_fs.c | 118 +- lustre/mds/mds_internal.h | 15 + lustre/mds/mds_lib.c | 310 + lustre/mds/mds_lov.c | 71 +- lustre/mds/mds_open.c | 216 +- lustre/mds/mds_reint.c | 232 +- lustre/obdclass/Makefile.am | 14 +- lustre/obdclass/class_obd.c | 409 +- lustre/obdclass/debug.c | 54 +- lustre/obdclass/fsfilt.c | 4 +- lustre/obdclass/fsfilt_ext3.c | 336 +- lustre/obdclass/fsfilt_extN.c | 23 +- lustre/obdclass/fsfilt_reiserfs.c | 3 +- lustre/obdclass/genops.c | 418 +- lustre/obdclass/lprocfs_status.c | 262 +- lustre/obdclass/lustre_handles.c | 4 +- lustre/{lib => obdclass}/simple.c | 58 +- lustre/obdclass/statfs_pack.c | 39 +- lustre/obdclass/sysctl.c | 2 +- lustre/obdclass/uuid.c | 35 +- lustre/obdecho/Makefile.am | 1 - lustre/obdecho/echo.c | 90 +- lustre/obdecho/echo_client.c | 161 +- lustre/obdfilter/Makefile.am | 12 +- lustre/obdfilter/filter.c | 1272 +- lustre/obdfilter/lproc_obdfilter.c | 13 + lustre/osc/Makefile.am | 15 +- lustre/osc/osc_lib.c | 76 + lustre/osc/osc_request.c | 1508 +- lustre/ost/Makefile.am | 13 +- lustre/ost/ost_handler.c | 884 +- lustre/portals/.cvsignore | 8 + lustre/portals/AUTHORS | 0 lustre/portals/ChangeLog | 0 lustre/portals/Kernelenv.in | 1 + lustre/portals/Kernelenv.mk | 1 + lustre/portals/Makefile.am | 12 + lustre/portals/Makefile.mk | 6 + lustre/portals/NEWS | 0 lustre/portals/README | 0 lustre/portals/Rules.linux | 25 + lustre/portals/archdep.m4 | 317 + lustre/portals/autogen.sh | 5 + lustre/portals/build.m4 | 95 + lustre/portals/configure.in | 34 + lustre/portals/doc/.cvsignore | 4 + lustre/portals/doc/Data-structures | 65 + lustre/portals/doc/Makefile.am | 46 + lustre/portals/doc/Message-life-cycle | 118 + lustre/portals/doc/NAL-HOWTO | 293 + lustre/portals/doc/file.fig | 111 + lustre/portals/doc/flow_new.fig | 213 + lustre/portals/doc/get.fig | 33 + lustre/portals/doc/ieee.bst | 1112 + 
lustre/portals/doc/mpi.fig | 117 + lustre/portals/doc/portals.fig | 68 + lustre/portals/doc/portals3.bib | 124 + lustre/portals/doc/portals3.lyx | 15944 ++++++++ lustre/portals/doc/put.fig | 32 + lustre/portals/include/.cvsignore | 4 + lustre/portals/include/Makefile.am | 8 + lustre/portals/include/config.h.in | 11 + lustre/portals/include/linux/Makefile.am | 10 + lustre/portals/include/linux/kp30.h | 943 + lustre/portals/include/linux/portals_compat25.h | 13 + lustre/portals/include/linux/portals_lib.h | 188 + lustre/portals/include/portals/Makefile.am | 10 + lustre/portals/include/portals/api-support.h | 27 + lustre/portals/include/portals/api.h | 159 + lustre/portals/include/portals/arg-blocks.h | 265 + lustre/portals/include/portals/defines.h | 116 + lustre/portals/include/portals/errno.h | 61 + lustre/portals/include/portals/internal.h | 0 lustre/portals/include/portals/lib-dispatch.h | 45 + lustre/portals/include/portals/lib-nal.h | 102 + lustre/portals/include/portals/lib-p30.h | 385 + lustre/portals/include/portals/lib-types.h | 282 + lustre/portals/include/portals/list.h | 245 + lustre/portals/include/portals/lltrace.h | 175 + lustre/portals/include/portals/myrnal.h | 26 + lustre/portals/include/portals/nal.h | 49 + lustre/portals/include/portals/nalids.h | 4 + lustre/portals/include/portals/p30.h | 72 + lustre/portals/include/portals/ppid.h | 52 + lustre/portals/include/portals/ptlctl.h | 75 + lustre/portals/include/portals/stringtab.h | 5 + lustre/portals/include/portals/types.h | 157 + lustre/portals/knals/.cvsignore | 2 + lustre/portals/knals/Makefile.am | 7 + lustre/portals/knals/Makefile.mk | 4 + lustre/portals/knals/gmnal/.cvsignore | 3 + lustre/portals/knals/gmnal/Makefile.am | 13 + .../portals/knals/gmnal/gm-1.5.2.1-exports.patch | 43 + lustre/portals/knals/gmnal/gmnal.c | 284 + lustre/portals/knals/gmnal/gmnal.h | 101 + lustre/portals/knals/gmnal/gmnal_cb.c | 517 + lustre/portals/knals/qswnal/.cvsignore | 3 + 
lustre/portals/knals/qswnal/Makefile.am | 17 + lustre/portals/knals/qswnal/qswnal.c | 608 + lustre/portals/knals/qswnal/qswnal.h | 270 + lustre/portals/knals/qswnal/qswnal_cb.c | 1239 + lustre/portals/knals/scimacnal/.cvsignore | 3 + lustre/portals/knals/scimacnal/Makefile.am | 11 + lustre/portals/knals/scimacnal/README.scimacnal | 14 + lustre/portals/knals/scimacnal/scimac.conf | 35 + lustre/portals/knals/scimacnal/scimacnal.c | 219 + lustre/portals/knals/scimacnal/scimacnal.h | 85 + lustre/portals/knals/scimacnal/scimacnal_cb.c | 468 + lustre/portals/knals/socknal/.cvsignore | 3 + lustre/portals/knals/socknal/Makefile.am | 13 + lustre/portals/knals/socknal/Makefile.mk | 10 + lustre/portals/knals/socknal/socknal.c | 860 + lustre/portals/knals/socknal/socknal.h | 292 + lustre/portals/knals/socknal/socknal_cb.c | 1613 + lustre/portals/knals/toenal/.cvsignore | 3 + lustre/portals/knals/toenal/Makefile.am | 13 + lustre/portals/knals/toenal/toenal.c | 629 + lustre/portals/knals/toenal/toenal.h | 236 + lustre/portals/knals/toenal/toenal_cb.c | 1219 + lustre/portals/libcfs/.cvsignore | 4 + lustre/portals/libcfs/Makefile.am | 29 + lustre/portals/libcfs/Makefile.mk | 9 + lustre/portals/libcfs/debug.c | 830 + lustre/portals/libcfs/module.c | 574 + lustre/portals/libcfs/proc.c | 290 + lustre/portals/packaging/.cvsignore | 8 + lustre/portals/packaging/Makefile.am | 6 + lustre/portals/packaging/portals.spec.in | 116 + lustre/portals/portals/.cvsignore | 3 + lustre/portals/portals/Makefile.am | 10 + lustre/portals/portals/Makefile.mk | 9 + lustre/portals/portals/api-eq.c | 158 + lustre/portals/portals/api-errno.c | 55 + lustre/portals/portals/api-init.c | 71 + lustre/portals/portals/api-me.c | 42 + lustre/portals/portals/api-ni.c | 197 + lustre/portals/portals/api-wrap.c | 599 + lustre/portals/portals/lib-dispatch.c | 80 + lustre/portals/portals/lib-eq.c | 128 + lustre/portals/portals/lib-init.c | 474 + lustre/portals/portals/lib-md.c | 412 + lustre/portals/portals/lib-me.c | 
227 + lustre/portals/portals/lib-move.c | 1379 + lustre/portals/portals/lib-msg.c | 163 + lustre/portals/portals/lib-ni.c | 128 + lustre/portals/portals/lib-pid.c | 58 + lustre/portals/router/.cvsignore | 3 + lustre/portals/router/Makefile.am | 16 + lustre/portals/router/Makefile.mk | 9 + lustre/portals/router/proc.c | 78 + lustre/portals/router/router.c | 449 + lustre/portals/router/router.h | 81 + lustre/portals/tests/.cvsignore | 3 + lustre/portals/tests/Makefile.am | 23 + lustre/portals/tests/ping.h | 80 + lustre/portals/tests/ping_cli.c | 300 + lustre/portals/tests/ping_srv.c | 308 + lustre/portals/tests/sping_cli.c | 276 + lustre/portals/tests/sping_srv.c | 295 + lustre/portals/tests/startclient.sh | 37 + lustre/portals/tests/startserver.sh | 38 + lustre/portals/tests/stopclient.sh | 14 + lustre/portals/tests/stopserver.sh | 16 + lustre/portals/unals/.cvsignore | 3 + lustre/portals/unals/Makefile.am | 5 + lustre/portals/unals/README | 53 + lustre/portals/unals/address.c | 146 + lustre/portals/unals/bridge.h | 29 + lustre/portals/unals/connection.c | 294 + lustre/portals/unals/connection.h | 32 + lustre/portals/unals/debug.c | 119 + lustre/portals/unals/dispatch.h | 39 + lustre/portals/unals/ipmap.h | 38 + lustre/portals/unals/pqtimer.c | 226 + lustre/portals/unals/pqtimer.h | 25 + lustre/portals/unals/procapi.c | 283 + lustre/portals/unals/procbridge.h | 40 + lustre/portals/unals/proclib.c | 270 + lustre/portals/unals/select.c | 165 + lustre/portals/unals/table.c | 264 + lustre/portals/unals/table.h | 39 + lustre/portals/unals/tcpnal.c | 198 + lustre/portals/unals/timer.h | 30 + lustre/portals/unals/utypes.h | 12 + lustre/portals/utils/.cvsignore | 8 + lustre/portals/utils/Makefile.am | 27 + lustre/portals/utils/acceptor.c | 466 + lustre/portals/utils/debug.c | 618 + lustre/portals/utils/debugctl.c | 66 + lustre/portals/utils/l_ioctl.c | 281 + lustre/portals/utils/parser.c | 703 + lustre/portals/utils/parser.h | 73 + lustre/portals/utils/portals.c | 985 + 
lustre/portals/utils/ptlctl.c | 65 + lustre/portals/utils/routerstat.c | 99 + lustre/portals/utils/wirecheck.c | 141 + lustre/ptlbd/blk.c | 71 +- lustre/ptlbd/client.c | 166 +- lustre/ptlbd/rpc.c | 349 +- lustre/ptlbd/server.c | 6 +- lustre/ptlrpc/Makefile.am | 7 +- lustre/ptlrpc/client.c | 1512 +- lustre/ptlrpc/connection.c | 51 +- lustre/ptlrpc/events.c | 358 +- lustre/ptlrpc/lproc_ptlrpc.c | 139 +- lustre/ptlrpc/niobuf.c | 666 +- lustre/ptlrpc/pack_generic.c | 1018 +- lustre/ptlrpc/pinger.c | 174 + lustre/ptlrpc/ptlrpc_internal.h | 93 + lustre/ptlrpc/ptlrpc_lib.c | 119 + lustre/ptlrpc/ptlrpc_module.c | 237 + lustre/ptlrpc/recovd.c | 372 - lustre/ptlrpc/recover.c | 561 +- lustre/ptlrpc/rpc.c | 312 - lustre/ptlrpc/service.c | 164 +- lustre/scripts/llite-group.sh | 67 + lustre/scripts/lustre.spec.in | 58 +- lustre/scripts/version_tag.pl | 5 +- lustre/tests/.cvsignore | 6 + lustre/tests/Makefile.am | 34 +- lustre/tests/acceptance-small.sh | 10 +- lustre/tests/ba-echo.sh | 2 +- lustre/tests/checkstat.c | 9 +- lustre/tests/cobd.sh | 10 - lustre/tests/createtest.c | 4 +- lustre/tests/directio.c | 24 +- lustre/tests/echo.sh | 7 +- lustre/tests/fchdir_test.c | 41 + lustre/tests/llecho.sh | 8 +- lustre/tests/llmount.sh | 14 +- lustre/tests/llmountcleanup.sh | 20 +- lustre/tests/llrmount.sh | 15 +- lustre/tests/local.sh | 18 +- lustre/tests/mcr-routed-config.sh | 2 +- lustre/tests/mkdirdeep.c | 275 + lustre/tests/opendevunlink.c | 111 + lustre/tests/opendirunlink.c | 122 + lustre/tests/openfile.c | 162 + lustre/tests/recovery-cleanup.sh | 27 +- lustre/tests/recovery-small-upcall.sh | 3 + lustre/tests/recovery-small.sh | 55 +- lustre/tests/runas.c | 55 +- lustre/tests/runobdstat | 7 + lustre/tests/runvmstat | 5 +- lustre/tests/sanity-ldlm.sh | 61 + lustre/tests/sanity.sh | 300 +- lustre/tests/sanityN.sh | 47 +- lustre/tests/test_brw.c | 13 +- lustre/tests/uml.sh | 23 +- lustre/tests/unlinkmany.c | 74 + lustre/tests/writeme.c | 18 +- lustre/utils/.cvsignore | 2 +- 
lustre/utils/Lustre/.cvsignore | 4 + lustre/utils/Lustre/Makefile.am | 2 + lustre/utils/Lustre/__init__.py | 7 + lustre/utils/Lustre/cmdline.py | 178 + lustre/utils/Lustre/error.py | 10 + lustre/utils/Lustre/lustredb.py | 413 + lustre/utils/Makefile.am | 6 +- lustre/utils/lactive | 85 + lustre/utils/{lconf.in => lconf} | 1747 +- lustre/utils/lctl.c | 55 +- lustre/utils/llparser.pm | 399 - lustre/utils/llstat.pl | 122 + lustre/utils/lmc | 507 +- lustre/utils/load_ldap.sh | 41 + lustre/utils/lstripe.c | 7 +- lustre/utils/obd.c | 223 +- lustre/utils/obdctl.c | 3 +- lustre/utils/obdctl.h | 9 +- lustre/utils/obdiolib.c | 6 +- lustre/utils/obdiolib.h | 29 +- lustre/utils/obdstat.c | 36 +- lustre/utils/parser.c | 5 +- lustre/utils/wirecheck.c | 588 + 726 files changed, 193738 insertions(+), 21568 deletions(-) rename {lustre/lib => lnet}/.cvsignore (60%) create mode 100644 lnet/AUTHORS create mode 100644 lnet/ChangeLog create mode 100644 lnet/Kernelenv.in create mode 100644 lnet/Kernelenv.mk create mode 100644 lnet/Makefile.am create mode 100644 lnet/Makefile.mk create mode 100644 lnet/NEWS create mode 100644 lnet/README create mode 100644 lnet/Rules.linux create mode 100644 lnet/archdep.m4 create mode 100644 lnet/autogen.sh create mode 100644 lnet/build.m4 create mode 100644 lnet/configure.in create mode 100644 lnet/doc/.cvsignore create mode 100644 lnet/doc/Data-structures create mode 100644 lnet/doc/Makefile.am create mode 100644 lnet/doc/Message-life-cycle create mode 100644 lnet/doc/NAL-HOWTO create mode 100644 lnet/doc/file.fig create mode 100644 lnet/doc/flow_new.fig create mode 100644 lnet/doc/get.fig create mode 100644 lnet/doc/ieee.bst create mode 100644 lnet/doc/mpi.fig create mode 100644 lnet/doc/portals.fig create mode 100644 lnet/doc/portals3.bib create mode 100644 lnet/doc/portals3.lyx create mode 100644 lnet/doc/put.fig create mode 100644 lnet/include/.cvsignore create mode 100644 lnet/include/Makefile.am create mode 100644 lnet/include/config.h.in create 
mode 100644 lnet/include/linux/Makefile.am create mode 100644 lnet/include/linux/kp30.h create mode 100644 lnet/include/linux/portals_compat25.h create mode 100644 lnet/include/linux/portals_lib.h create mode 100644 lnet/include/lnet/Makefile.am create mode 100644 lnet/include/lnet/api-support.h create mode 100644 lnet/include/lnet/api.h create mode 100644 lnet/include/lnet/arg-blocks.h create mode 100644 lnet/include/lnet/defines.h create mode 100644 lnet/include/lnet/errno.h create mode 100644 lnet/include/lnet/internal.h create mode 100644 lnet/include/lnet/lib-dispatch.h create mode 100644 lnet/include/lnet/lib-lnet.h create mode 100644 lnet/include/lnet/lib-nal.h create mode 100644 lnet/include/lnet/lib-p30.h create mode 100644 lnet/include/lnet/lib-types.h create mode 100644 lnet/include/lnet/list.h create mode 100644 lnet/include/lnet/lltrace.h create mode 100644 lnet/include/lnet/lnet.h create mode 100644 lnet/include/lnet/lnetctl.h create mode 100644 lnet/include/lnet/myrnal.h create mode 100644 lnet/include/lnet/nal.h create mode 100644 lnet/include/lnet/nalids.h create mode 100644 lnet/include/lnet/p30.h create mode 100644 lnet/include/lnet/ppid.h create mode 100644 lnet/include/lnet/ptlctl.h create mode 100644 lnet/include/lnet/stringtab.h create mode 100644 lnet/include/lnet/types.h create mode 100644 lnet/klnds/.cvsignore create mode 100644 lnet/klnds/Makefile.am create mode 100644 lnet/klnds/Makefile.mk create mode 100644 lnet/klnds/gmlnd/.cvsignore create mode 100644 lnet/klnds/gmlnd/Makefile.am create mode 100644 lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch create mode 100644 lnet/klnds/gmlnd/gmlnd.h create mode 100644 lnet/klnds/gmlnd/gmlnd_cb.c create mode 100644 lnet/klnds/gmlnd/gmnal.c create mode 100644 lnet/klnds/qswlnd/.cvsignore create mode 100644 lnet/klnds/qswlnd/Makefile.am create mode 100644 lnet/klnds/qswlnd/qswlnd.c create mode 100644 lnet/klnds/qswlnd/qswlnd.h create mode 100644 lnet/klnds/qswlnd/qswlnd_cb.c create mode 100644 
lnet/klnds/scimaclnd/.cvsignore create mode 100644 lnet/klnds/scimaclnd/Makefile.am create mode 100644 lnet/klnds/scimaclnd/README.scimacnal create mode 100644 lnet/klnds/scimaclnd/scimac.conf create mode 100644 lnet/klnds/scimaclnd/scimacnal.c create mode 100644 lnet/klnds/scimaclnd/scimacnal.h create mode 100644 lnet/klnds/scimaclnd/scimacnal_cb.c create mode 100644 lnet/klnds/socklnd/.cvsignore create mode 100644 lnet/klnds/socklnd/Makefile.am create mode 100644 lnet/klnds/socklnd/Makefile.mk create mode 100644 lnet/klnds/socklnd/socklnd.c create mode 100644 lnet/klnds/socklnd/socklnd.h create mode 100644 lnet/klnds/socklnd/socklnd_cb.c create mode 100644 lnet/klnds/toelnd/.cvsignore create mode 100644 lnet/klnds/toelnd/Makefile.am create mode 100644 lnet/klnds/toelnd/toenal.c create mode 100644 lnet/klnds/toelnd/toenal.h create mode 100644 lnet/klnds/toelnd/toenal_cb.c create mode 100644 lnet/libcfs/.cvsignore create mode 100644 lnet/libcfs/Makefile.am create mode 100644 lnet/libcfs/Makefile.mk create mode 100644 lnet/libcfs/debug.c create mode 100644 lnet/libcfs/module.c create mode 100644 lnet/libcfs/proc.c create mode 100644 lnet/lnet/.cvsignore create mode 100644 lnet/lnet/Makefile.am create mode 100644 lnet/lnet/Makefile.mk create mode 100644 lnet/lnet/api-eq.c create mode 100644 lnet/lnet/api-errno.c create mode 100644 lnet/lnet/api-init.c create mode 100644 lnet/lnet/api-me.c create mode 100644 lnet/lnet/api-ni.c create mode 100644 lnet/lnet/api-wrap.c create mode 100644 lnet/lnet/lib-dispatch.c create mode 100644 lnet/lnet/lib-eq.c create mode 100644 lnet/lnet/lib-init.c create mode 100644 lnet/lnet/lib-md.c create mode 100644 lnet/lnet/lib-me.c create mode 100644 lnet/lnet/lib-move.c create mode 100644 lnet/lnet/lib-msg.c create mode 100644 lnet/lnet/lib-ni.c create mode 100644 lnet/lnet/lib-pid.c create mode 100644 lnet/packaging/.cvsignore create mode 100644 lnet/packaging/Makefile.am create mode 100644 lnet/packaging/portals.spec.in create mode 
100644 lnet/router/.cvsignore create mode 100644 lnet/router/Makefile.am create mode 100644 lnet/router/Makefile.mk create mode 100644 lnet/router/proc.c create mode 100644 lnet/router/router.c create mode 100644 lnet/router/router.h create mode 100644 lnet/tests/.cvsignore create mode 100644 lnet/tests/Makefile.am create mode 100644 lnet/tests/ping.h create mode 100644 lnet/tests/ping_cli.c create mode 100644 lnet/tests/ping_srv.c create mode 100644 lnet/tests/sping_cli.c create mode 100644 lnet/tests/sping_srv.c create mode 100644 lnet/tests/startclient.sh create mode 100644 lnet/tests/startserver.sh create mode 100644 lnet/tests/stopclient.sh create mode 100644 lnet/tests/stopserver.sh create mode 100644 lnet/ulnds/.cvsignore create mode 100644 lnet/ulnds/Makefile.am create mode 100644 lnet/ulnds/README create mode 100644 lnet/ulnds/address.c create mode 100644 lnet/ulnds/bridge.h create mode 100644 lnet/ulnds/connection.c create mode 100644 lnet/ulnds/connection.h create mode 100644 lnet/ulnds/debug.c create mode 100644 lnet/ulnds/dispatch.h create mode 100644 lnet/ulnds/ipmap.h create mode 100644 lnet/ulnds/pqtimer.c create mode 100644 lnet/ulnds/pqtimer.h create mode 100644 lnet/ulnds/procapi.c create mode 100644 lnet/ulnds/procbridge.h create mode 100644 lnet/ulnds/proclib.c create mode 100644 lnet/ulnds/select.c create mode 100644 lnet/ulnds/socklnd/Makefile.am create mode 100644 lnet/ulnds/socklnd/README create mode 100644 lnet/ulnds/socklnd/address.c create mode 100644 lnet/ulnds/socklnd/bridge.h create mode 100644 lnet/ulnds/socklnd/connection.c create mode 100644 lnet/ulnds/socklnd/connection.h create mode 100644 lnet/ulnds/socklnd/debug.c create mode 100644 lnet/ulnds/socklnd/dispatch.h create mode 100644 lnet/ulnds/socklnd/ipmap.h create mode 100644 lnet/ulnds/socklnd/pqtimer.c create mode 100644 lnet/ulnds/socklnd/pqtimer.h create mode 100644 lnet/ulnds/socklnd/procapi.c create mode 100644 lnet/ulnds/socklnd/procbridge.h create mode 100644 
lnet/ulnds/socklnd/proclib.c create mode 100644 lnet/ulnds/socklnd/select.c create mode 100644 lnet/ulnds/socklnd/table.c create mode 100644 lnet/ulnds/socklnd/table.h create mode 100644 lnet/ulnds/socklnd/tcplnd.c create mode 100644 lnet/ulnds/socklnd/timer.h create mode 100644 lnet/ulnds/socklnd/utypes.h create mode 100644 lnet/ulnds/table.c create mode 100644 lnet/ulnds/table.h create mode 100644 lnet/ulnds/tcplnd.c create mode 100644 lnet/ulnds/timer.h create mode 100644 lnet/ulnds/utypes.h create mode 100644 lnet/utils/.cvsignore create mode 100644 lnet/utils/Makefile.am create mode 100644 lnet/utils/acceptor.c create mode 100644 lnet/utils/debug.c create mode 100644 lnet/utils/debugctl.c create mode 100644 lnet/utils/l_ioctl.c create mode 100644 lnet/utils/parser.c create mode 100644 lnet/utils/parser.h create mode 100644 lnet/utils/portals.c create mode 100644 lnet/utils/ptlctl.c create mode 100644 lnet/utils/routerstat.c create mode 100644 lnet/utils/wirecheck.c create mode 100644 lustre/Makefile.mk delete mode 100644 lustre/archdep.m4 delete mode 100644 lustre/extN/Makefile.am delete mode 100644 lustre/extN/ext3-largefile.diff delete mode 100644 lustre/extN/ext3-unmount_sync.diff delete mode 100644 lustre/extN/extN-2.4.18-exports.diff delete mode 100644 lustre/extN/extN-2.4.18-ino_sb_fixup.diff delete mode 100644 lustre/extN/extN-san.diff delete mode 100644 lustre/extN/extN-wantedi.diff create mode 100644 lustre/include/ioctl.h create mode 100644 lustre/include/linux/lustre_compat25.h create mode 100644 lustre/kernel_patches/kernel_configs/config-linux-2.4.18-i386 create mode 100644 lustre/kernel_patches/kernel_configs/config-linux-2.4.18-p4smp-61chaos create mode 100644 lustre/kernel_patches/kernel_configs/config-linux-2.4.18-uml create mode 100644 lustre/kernel_patches/kernel_configs/config-linux-2.4.20-i386-rh create mode 100644 lustre/kernel_patches/kernel_configs/config-linux-2.4.20-uml create mode 100644 
lustre/kernel_patches/kernel_configs/jdike-2.5.69-uml.config create mode 100644 lustre/kernel_patches/patches/dev_read_only_2.4.20-rh.patch rename lustre/kernel_patches/patches/{dev_read_only_hp.patch => dev_read_only_2.4.20.patch} (62%) create mode 100644 lustre/kernel_patches/patches/dev_read_only_hp_2.4.20.patch create mode 100644 lustre/kernel_patches/patches/dsp.patch create mode 100644 lustre/kernel_patches/patches/export-truncate-2.5.63.patch create mode 100644 lustre/kernel_patches/patches/export-truncate.patch rename lustre/kernel_patches/patches/{exports_hp.patch => exports_2.4.20-rh-hp.patch} (61%) create mode 100644 lustre/kernel_patches/patches/exports_2.4.20.patch create mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch create mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-1.patch create mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-2.patch create mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-3.patch create mode 100644 lustre/kernel_patches/patches/ext-2.4-patch-4.patch rename lustre/{extN/ext3-2.4-ino_t.diff => kernel_patches/patches/ext3-2.4-ino_t.patch} (73%) rename lustre/{extN/ext3-2.4.18-fixes.diff => kernel_patches/patches/ext3-2.4.18-fixes.patch} (100%) rename lustre/{extN/ext3-2.4.18-ino_sb_macro.diff => kernel_patches/patches/ext3-2.4.18-ino_sb_macro.patch} (99%) rename lustre/{extN/patch-2.4.18-chaos22 => kernel_patches/patches/ext3-2.4.20-fixes.patch} (60%) rename lustre/{extN/ext3-2.5-noread.diff => kernel_patches/patches/ext3-2.5-noread.patch} (99%) create mode 100644 lustre/kernel_patches/patches/ext3-2.5.63.patch create mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.18.patch create mode 100644 lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/ext3-largefile.patch create mode 100644 lustre/kernel_patches/patches/ext3-noread-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/ext3-orphan_lock.patch 
create mode 100644 lustre/kernel_patches/patches/ext3-san-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/ext3-truncate_blocks-chaos.patch.patch create mode 100644 lustre/kernel_patches/patches/ext3-truncate_blocks.patch create mode 100644 lustre/kernel_patches/patches/ext3-unmount_sync.patch rename lustre/{extN/ext3-use-after-free.diff => kernel_patches/patches/ext3-use-after-free.patch} (56%) delete mode 100644 lustre/kernel_patches/patches/ext3-xattr-2.5.patch create mode 100644 lustre/kernel_patches/patches/ext3_orphan_lock-2.4.20-rh.patch create mode 100644 lustre/kernel_patches/patches/extN-2.4.18-ino_sb_fixup.patch create mode 100644 lustre/kernel_patches/patches/extN-delete_thread.patch rename lustre/{extN/extN-iget-debug.diff => kernel_patches/patches/extN-iget-debug.patch} (78%) rename lustre/{extN/extN-misc-fixup.diff => kernel_patches/patches/extN-misc-fixup.patch} (58%) rename lustre/{extN/extN-noread.diff => kernel_patches/patches/extN-noread.patch} (54%) create mode 100644 lustre/kernel_patches/patches/extN-san.patch create mode 100644 lustre/kernel_patches/patches/extN-wantedi.patch rename lustre/{extN/htree-ext3-2.4.18.diff => kernel_patches/patches/htree-ext3-2.4.18.patch} (99%) create mode 100644 lustre/kernel_patches/patches/invalidate_show-2.4.20-rh.patch create mode 100644 lustre/kernel_patches/patches/iod-rmap-exports-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/iod-stock-24-exports.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.4.18.patch create mode 100644 lustre/kernel_patches/patches/iopen-2.4.20.patch create mode 100644 lustre/kernel_patches/patches/kmem_cache_validate_2.4.20-rh.patch create mode 100644 lustre/kernel_patches/patches/kmem_cache_validate_2.4.20.patch rename lustre/{extN/linux-2.4.18ea-0.8.26.diff => kernel_patches/patches/linux-2.4.18ea-0.8.26.patch} (93%) create mode 100644 lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-chaos.patch create mode 100644 
lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-hp.patch create mode 100644 lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54.patch create mode 100644 lustre/kernel_patches/patches/lustre-2.5.63.patch delete mode 100644 lustre/kernel_patches/patches/lustre-2.5.patch create mode 100644 lustre/kernel_patches/patches/mcore-2.4.20-8.patch delete mode 100644 lustre/kernel_patches/patches/patch-2.4.18-hp1_pnnl18.2.8qsnet.patch create mode 100644 lustre/kernel_patches/patches/tcp-zero-copy.patch create mode 100644 lustre/kernel_patches/patches/uml-patch-2.4.20-4.patch delete mode 100644 lustre/kernel_patches/patches/vanilla-2.4.18.patch delete mode 100644 lustre/kernel_patches/patches/vanilla-2.4.19.patch rename lustre/kernel_patches/patches/{vfs_intent.patch => vfs_intent-2.4.20-rh.patch} (50%) rename lustre/kernel_patches/patches/{vfs_intent_hp.patch => vfs_intent-2.4.20-vanilla.patch} (76%) rename lustre/kernel_patches/pc/{dev_read_only_hp.pc => dev_read_only_2.4.20-rh.pc} (100%) create mode 100644 lustre/kernel_patches/pc/dev_read_only_2.4.20.pc create mode 100644 lustre/kernel_patches/pc/dev_read_only_hp_2.4.20.pc create mode 100644 lustre/kernel_patches/pc/dsp.pc create mode 100644 lustre/kernel_patches/pc/export-truncate-2.5.63.pc create mode 100644 lustre/kernel_patches/pc/export-truncate.pc rename lustre/kernel_patches/pc/{exports_hp.pc => exports_2.4.20-rh-hp.pc} (100%) create mode 100644 lustre/kernel_patches/pc/exports_2.4.20.pc create mode 100644 lustre/kernel_patches/pc/exports_hp_2.4.20.pc create mode 100644 lustre/kernel_patches/pc/ext-2.4-patch-1-chaos.pc create mode 100644 lustre/kernel_patches/pc/ext-2.4-patch-1.pc create mode 100644 lustre/kernel_patches/pc/ext-2.4-patch-2.pc create mode 100644 lustre/kernel_patches/pc/ext-2.4-patch-3.pc create mode 100644 lustre/kernel_patches/pc/ext-2.4-patch-4.pc create mode 100644 lustre/kernel_patches/pc/ext3-2.4-ino_t.pc create mode 100644 lustre/kernel_patches/pc/ext3-2.4.18-fixes.pc create mode 
100644 lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro.pc create mode 100644 lustre/kernel_patches/pc/ext3-2.4.20-fixes.pc create mode 100644 lustre/kernel_patches/pc/ext3-2.5-noread.pc create mode 100644 lustre/kernel_patches/pc/ext3-2.5.63.pc create mode 100644 lustre/kernel_patches/pc/ext3-delete_thread-2.4.18.pc create mode 100644 lustre/kernel_patches/pc/ext3-delete_thread-2.4.20.pc create mode 100644 lustre/kernel_patches/pc/ext3-largefile.pc create mode 100644 lustre/kernel_patches/pc/ext3-noread-2.4.20.pc create mode 100644 lustre/kernel_patches/pc/ext3-orphan_lock.pc create mode 100644 lustre/kernel_patches/pc/ext3-san-2.4.20.pc create mode 100644 lustre/kernel_patches/pc/ext3-truncate_blocks-chaos.patch.pc create mode 100644 lustre/kernel_patches/pc/ext3-truncate_blocks.pc create mode 100644 lustre/kernel_patches/pc/ext3-unmount_sync.pc create mode 100644 lustre/kernel_patches/pc/ext3-use-after-free.pc create mode 100644 lustre/kernel_patches/pc/ext3_orphan_lock-2.4.20-rh.pc create mode 100644 lustre/kernel_patches/pc/extN-2.4.18-ino_sb_fixup.pc create mode 100644 lustre/kernel_patches/pc/extN-delete_thread.pc create mode 100644 lustre/kernel_patches/pc/extN-iget-debug.pc create mode 100644 lustre/kernel_patches/pc/extN-misc-fixup.pc create mode 100644 lustre/kernel_patches/pc/extN-noread.pc create mode 100644 lustre/kernel_patches/pc/extN-san.pc create mode 100644 lustre/kernel_patches/pc/extN-wantedi.pc create mode 100644 lustre/kernel_patches/pc/htree-ext3-2.4.18.pc create mode 100644 lustre/kernel_patches/pc/invalidate_show-2.4.20-rh.pc create mode 100644 lustre/kernel_patches/pc/iod-rmap-exports-2.4.20.pc create mode 100644 lustre/kernel_patches/pc/iod-stock-24-exports.pc create mode 100644 lustre/kernel_patches/pc/iod-stock-24-exports_hp.pc create mode 100644 lustre/kernel_patches/pc/iopen-2.4.18.pc create mode 100644 lustre/kernel_patches/pc/iopen-2.4.20.pc create mode 100644 lustre/kernel_patches/pc/kmem_cache_validate_2.4.20-rh.pc create mode 
100644 lustre/kernel_patches/pc/kmem_cache_validate_2.4.20.pc create mode 100644 lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26.pc create mode 100644 lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-chaos.pc create mode 100644 lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-hp.pc create mode 100644 lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54.pc create mode 100644 lustre/kernel_patches/pc/lustre-2.5.63.pc create mode 100644 lustre/kernel_patches/pc/mcore-2.4.20-8.pc delete mode 100644 lustre/kernel_patches/pc/patch-2.4.18-hp1_pnnl18.2.8qsnet.pc create mode 100644 lustre/kernel_patches/pc/tcp-zero-copy.pc create mode 100644 lustre/kernel_patches/pc/uml-patch-2.4.20-4.pc delete mode 100644 lustre/kernel_patches/pc/vanilla-2.4.18.pc delete mode 100644 lustre/kernel_patches/pc/vanilla-2.4.19.pc rename lustre/kernel_patches/pc/{vfs_intent.pc => vfs_intent-2.4.20-rh.pc} (82%) rename lustre/kernel_patches/pc/{vfs_intent_hp.pc => vfs_intent-2.4.20-vanilla.pc} (82%) create mode 100755 lustre/kernel_patches/scripts/cat-series create mode 100755 lustre/kernel_patches/scripts/forkpatch create mode 100755 lustre/kernel_patches/scripts/join-patch create mode 100755 lustre/kernel_patches/scripts/sum-series create mode 100755 lustre/kernel_patches/scripts/trypatch create mode 100755 lustre/kernel_patches/scripts/unused-patches delete mode 100644 lustre/kernel_patches/series/hp-pnnl create mode 100644 lustre/kernel_patches/series/hp-pnnl-2.4.20 create mode 100644 lustre/kernel_patches/series/rh-2.4.20 delete mode 100644 lustre/kernel_patches/series/vanilla-2.4.18 delete mode 100644 lustre/kernel_patches/series/vanilla-2.4.19 create mode 100644 lustre/kernel_patches/series/vanilla-2.4.20 create mode 100644 lustre/kernel_patches/txt/ext3-2.4.20-fixes.txt delete mode 100644 lustre/kernel_patches/txt/vfs_intent.txt create mode 100644 lustre/ldlm/ldlm_internal.h create mode 100644 lustre/ldlm/ldlm_lib.c delete mode 100644 lustre/lib/Makefile.am delete mode 100644 
lustre/lib/client.c delete mode 100644 lustre/lib/mds_updates.c delete mode 100644 lustre/lib/obd_pack.c delete mode 100644 lustre/lib/target.c create mode 100644 lustre/liblustre/file.c create mode 100644 lustre/liblustre/llite_lib.c create mode 100644 lustre/liblustre/llite_lib.h create mode 100644 lustre/liblustre/lltest.c create mode 100644 lustre/liblustre/rw.c create mode 100644 lustre/liblustre/super.c create mode 100644 lustre/llite/llite_internal.h delete mode 100644 lustre/llite/recover.c create mode 100644 lustre/mdc/mdc_internal.h create mode 100644 lustre/mdc/mdc_lib.c create mode 100644 lustre/mds/Makefile.mk create mode 100644 lustre/mds/mds_internal.h create mode 100644 lustre/mds/mds_lib.c rename lustre/{lib => obdclass}/simple.c (83%) create mode 100644 lustre/osc/osc_lib.c create mode 100644 lustre/portals/.cvsignore create mode 100644 lustre/portals/AUTHORS create mode 100644 lustre/portals/ChangeLog create mode 100644 lustre/portals/Kernelenv.in create mode 100644 lustre/portals/Kernelenv.mk create mode 100644 lustre/portals/Makefile.am create mode 100644 lustre/portals/Makefile.mk create mode 100644 lustre/portals/NEWS create mode 100644 lustre/portals/README create mode 100644 lustre/portals/Rules.linux create mode 100644 lustre/portals/archdep.m4 create mode 100755 lustre/portals/autogen.sh create mode 100644 lustre/portals/build.m4 create mode 100644 lustre/portals/configure.in create mode 100644 lustre/portals/doc/.cvsignore create mode 100644 lustre/portals/doc/Data-structures create mode 100644 lustre/portals/doc/Makefile.am create mode 100644 lustre/portals/doc/Message-life-cycle create mode 100644 lustre/portals/doc/NAL-HOWTO create mode 100644 lustre/portals/doc/file.fig create mode 100644 lustre/portals/doc/flow_new.fig create mode 100644 lustre/portals/doc/get.fig create mode 100644 lustre/portals/doc/ieee.bst create mode 100644 lustre/portals/doc/mpi.fig create mode 100644 lustre/portals/doc/portals.fig create mode 100644 
lustre/portals/doc/portals3.bib create mode 100644 lustre/portals/doc/portals3.lyx create mode 100644 lustre/portals/doc/put.fig create mode 100644 lustre/portals/include/.cvsignore create mode 100644 lustre/portals/include/Makefile.am create mode 100644 lustre/portals/include/config.h.in create mode 100644 lustre/portals/include/linux/Makefile.am create mode 100644 lustre/portals/include/linux/kp30.h create mode 100644 lustre/portals/include/linux/portals_compat25.h create mode 100644 lustre/portals/include/linux/portals_lib.h create mode 100644 lustre/portals/include/portals/Makefile.am create mode 100644 lustre/portals/include/portals/api-support.h create mode 100644 lustre/portals/include/portals/api.h create mode 100644 lustre/portals/include/portals/arg-blocks.h create mode 100644 lustre/portals/include/portals/defines.h create mode 100644 lustre/portals/include/portals/errno.h create mode 100644 lustre/portals/include/portals/internal.h create mode 100644 lustre/portals/include/portals/lib-dispatch.h create mode 100644 lustre/portals/include/portals/lib-nal.h create mode 100644 lustre/portals/include/portals/lib-p30.h create mode 100644 lustre/portals/include/portals/lib-types.h create mode 100644 lustre/portals/include/portals/list.h create mode 100644 lustre/portals/include/portals/lltrace.h create mode 100644 lustre/portals/include/portals/myrnal.h create mode 100644 lustre/portals/include/portals/nal.h create mode 100644 lustre/portals/include/portals/nalids.h create mode 100644 lustre/portals/include/portals/p30.h create mode 100644 lustre/portals/include/portals/ppid.h create mode 100644 lustre/portals/include/portals/ptlctl.h create mode 100644 lustre/portals/include/portals/stringtab.h create mode 100644 lustre/portals/include/portals/types.h create mode 100644 lustre/portals/knals/.cvsignore create mode 100644 lustre/portals/knals/Makefile.am create mode 100644 lustre/portals/knals/Makefile.mk create mode 100644 lustre/portals/knals/gmnal/.cvsignore 
create mode 100644 lustre/portals/knals/gmnal/Makefile.am create mode 100644 lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch create mode 100644 lustre/portals/knals/gmnal/gmnal.c create mode 100644 lustre/portals/knals/gmnal/gmnal.h create mode 100644 lustre/portals/knals/gmnal/gmnal_cb.c create mode 100644 lustre/portals/knals/qswnal/.cvsignore create mode 100644 lustre/portals/knals/qswnal/Makefile.am create mode 100644 lustre/portals/knals/qswnal/qswnal.c create mode 100644 lustre/portals/knals/qswnal/qswnal.h create mode 100644 lustre/portals/knals/qswnal/qswnal_cb.c create mode 100644 lustre/portals/knals/scimacnal/.cvsignore create mode 100644 lustre/portals/knals/scimacnal/Makefile.am create mode 100644 lustre/portals/knals/scimacnal/README.scimacnal create mode 100644 lustre/portals/knals/scimacnal/scimac.conf create mode 100644 lustre/portals/knals/scimacnal/scimacnal.c create mode 100644 lustre/portals/knals/scimacnal/scimacnal.h create mode 100644 lustre/portals/knals/scimacnal/scimacnal_cb.c create mode 100644 lustre/portals/knals/socknal/.cvsignore create mode 100644 lustre/portals/knals/socknal/Makefile.am create mode 100644 lustre/portals/knals/socknal/Makefile.mk create mode 100644 lustre/portals/knals/socknal/socknal.c create mode 100644 lustre/portals/knals/socknal/socknal.h create mode 100644 lustre/portals/knals/socknal/socknal_cb.c create mode 100644 lustre/portals/knals/toenal/.cvsignore create mode 100644 lustre/portals/knals/toenal/Makefile.am create mode 100644 lustre/portals/knals/toenal/toenal.c create mode 100644 lustre/portals/knals/toenal/toenal.h create mode 100644 lustre/portals/knals/toenal/toenal_cb.c create mode 100644 lustre/portals/libcfs/.cvsignore create mode 100644 lustre/portals/libcfs/Makefile.am create mode 100644 lustre/portals/libcfs/Makefile.mk create mode 100644 lustre/portals/libcfs/debug.c create mode 100644 lustre/portals/libcfs/module.c create mode 100644 lustre/portals/libcfs/proc.c create mode 100644 
lustre/portals/packaging/.cvsignore create mode 100644 lustre/portals/packaging/Makefile.am create mode 100644 lustre/portals/packaging/portals.spec.in create mode 100644 lustre/portals/portals/.cvsignore create mode 100644 lustre/portals/portals/Makefile.am create mode 100644 lustre/portals/portals/Makefile.mk create mode 100644 lustre/portals/portals/api-eq.c create mode 100644 lustre/portals/portals/api-errno.c create mode 100644 lustre/portals/portals/api-init.c create mode 100644 lustre/portals/portals/api-me.c create mode 100644 lustre/portals/portals/api-ni.c create mode 100644 lustre/portals/portals/api-wrap.c create mode 100644 lustre/portals/portals/lib-dispatch.c create mode 100644 lustre/portals/portals/lib-eq.c create mode 100644 lustre/portals/portals/lib-init.c create mode 100644 lustre/portals/portals/lib-md.c create mode 100644 lustre/portals/portals/lib-me.c create mode 100644 lustre/portals/portals/lib-move.c create mode 100644 lustre/portals/portals/lib-msg.c create mode 100644 lustre/portals/portals/lib-ni.c create mode 100644 lustre/portals/portals/lib-pid.c create mode 100644 lustre/portals/router/.cvsignore create mode 100644 lustre/portals/router/Makefile.am create mode 100644 lustre/portals/router/Makefile.mk create mode 100644 lustre/portals/router/proc.c create mode 100644 lustre/portals/router/router.c create mode 100644 lustre/portals/router/router.h create mode 100644 lustre/portals/tests/.cvsignore create mode 100644 lustre/portals/tests/Makefile.am create mode 100644 lustre/portals/tests/ping.h create mode 100644 lustre/portals/tests/ping_cli.c create mode 100644 lustre/portals/tests/ping_srv.c create mode 100644 lustre/portals/tests/sping_cli.c create mode 100644 lustre/portals/tests/sping_srv.c create mode 100755 lustre/portals/tests/startclient.sh create mode 100755 lustre/portals/tests/startserver.sh create mode 100755 lustre/portals/tests/stopclient.sh create mode 100644 lustre/portals/tests/stopserver.sh create mode 100644 
lustre/portals/unals/.cvsignore create mode 100644 lustre/portals/unals/Makefile.am create mode 100644 lustre/portals/unals/README create mode 100644 lustre/portals/unals/address.c create mode 100644 lustre/portals/unals/bridge.h create mode 100644 lustre/portals/unals/connection.c create mode 100644 lustre/portals/unals/connection.h create mode 100644 lustre/portals/unals/debug.c create mode 100644 lustre/portals/unals/dispatch.h create mode 100644 lustre/portals/unals/ipmap.h create mode 100644 lustre/portals/unals/pqtimer.c create mode 100644 lustre/portals/unals/pqtimer.h create mode 100644 lustre/portals/unals/procapi.c create mode 100644 lustre/portals/unals/procbridge.h create mode 100644 lustre/portals/unals/proclib.c create mode 100644 lustre/portals/unals/select.c create mode 100644 lustre/portals/unals/table.c create mode 100644 lustre/portals/unals/table.h create mode 100644 lustre/portals/unals/tcpnal.c create mode 100644 lustre/portals/unals/timer.h create mode 100644 lustre/portals/unals/utypes.h create mode 100644 lustre/portals/utils/.cvsignore create mode 100644 lustre/portals/utils/Makefile.am create mode 100644 lustre/portals/utils/acceptor.c create mode 100644 lustre/portals/utils/debug.c create mode 100644 lustre/portals/utils/debugctl.c create mode 100644 lustre/portals/utils/l_ioctl.c create mode 100644 lustre/portals/utils/parser.c create mode 100644 lustre/portals/utils/parser.h create mode 100644 lustre/portals/utils/portals.c create mode 100644 lustre/portals/utils/ptlctl.c create mode 100644 lustre/portals/utils/routerstat.c create mode 100644 lustre/portals/utils/wirecheck.c create mode 100644 lustre/ptlrpc/pinger.c create mode 100644 lustre/ptlrpc/ptlrpc_internal.h create mode 100644 lustre/ptlrpc/ptlrpc_lib.c create mode 100644 lustre/ptlrpc/ptlrpc_module.c delete mode 100644 lustre/ptlrpc/recovd.c delete mode 100644 lustre/ptlrpc/rpc.c create mode 100644 lustre/scripts/llite-group.sh create mode 100644 lustre/tests/fchdir_test.c 
create mode 100644 lustre/tests/mkdirdeep.c create mode 100644 lustre/tests/opendevunlink.c create mode 100644 lustre/tests/opendirunlink.c create mode 100644 lustre/tests/openfile.c create mode 100755 lustre/tests/recovery-small-upcall.sh create mode 100644 lustre/tests/runobdstat create mode 100644 lustre/tests/sanity-ldlm.sh create mode 100644 lustre/tests/unlinkmany.c create mode 100644 lustre/utils/Lustre/.cvsignore create mode 100644 lustre/utils/Lustre/Makefile.am create mode 100644 lustre/utils/Lustre/__init__.py create mode 100644 lustre/utils/Lustre/cmdline.py create mode 100644 lustre/utils/Lustre/error.py create mode 100644 lustre/utils/Lustre/lustredb.py create mode 100644 lustre/utils/lactive rename lustre/utils/{lconf.in => lconf} (60%) delete mode 100644 lustre/utils/llparser.pm create mode 100755 lustre/utils/llstat.pl create mode 100755 lustre/utils/load_ldap.sh create mode 100644 lustre/utils/wirecheck.c diff --git a/lustre/lib/.cvsignore b/lnet/.cvsignore similarity index 60% rename from lustre/lib/.cvsignore rename to lnet/.cvsignore index e530020..99ac885 100644 --- a/lustre/lib/.cvsignore +++ b/lnet/.cvsignore @@ -1,8 +1,8 @@ -.Xrefs +Kernelenv +Makefile +Makefile.in +aclocal.m4 +autom4te.cache config.log config.status configure -Makefile -Makefile.in -.deps -TAGS diff --git a/lnet/AUTHORS b/lnet/AUTHORS new file mode 100644 index 0000000..e69de29 diff --git a/lnet/ChangeLog b/lnet/ChangeLog new file mode 100644 index 0000000..e69de29 diff --git a/lnet/Kernelenv.in b/lnet/Kernelenv.in new file mode 100644 index 0000000..29a713f --- /dev/null +++ b/lnet/Kernelenv.in @@ -0,0 +1 @@ +EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include diff --git a/lnet/Kernelenv.mk b/lnet/Kernelenv.mk new file mode 100644 index 0000000..29a713f --- /dev/null +++ b/lnet/Kernelenv.mk @@ -0,0 +1 @@ +EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include diff --git a/lnet/Makefile.am b/lnet/Makefile.am new file mode 100644 index 0000000..1a223f2 --- 
/dev/null +++ b/lnet/Makefile.am @@ -0,0 +1,12 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +EXTRA_DIST = Rules.linux archdep.m4 include +DIST_SUBDIRS = libcfs portals knals unals utils tests doc router +if LIBLUSTRE +SUBDIRS = portals unals utils +else +SUBDIRS = libcfs portals knals unals utils tests doc router +endif diff --git a/lnet/Makefile.mk b/lnet/Makefile.mk new file mode 100644 index 0000000..be0e51a --- /dev/null +++ b/lnet/Makefile.mk @@ -0,0 +1,6 @@ +include fs/lustre/portals/Kernelenv + +obj-y += portals/ +obj-y += libcfs/ +obj-y += knals/ +obj-y += router/ diff --git a/lnet/NEWS b/lnet/NEWS new file mode 100644 index 0000000..e69de29 diff --git a/lnet/README b/lnet/README new file mode 100644 index 0000000..e69de29 diff --git a/lnet/Rules.linux b/lnet/Rules.linux new file mode 100644 index 0000000..93943b7 --- /dev/null +++ b/lnet/Rules.linux @@ -0,0 +1,25 @@ +# included in Linux kernel directories +# Rules for module building + +if LINUX25 + +basename=$(shell echo $< | sed -e 's/\.c//g' | sed -e 's/-//g' | sed -e 's/\.o//g') +AM_CPPFLAGS= -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -mpreferred-stack-boundary=2 -DKBUILD_MODNAME=$(MODULE) -DKBUILD_BASENAME=$(basename) + +$(MODULE).o: $($(MODULE)_OBJECTS) + $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS) + +else + +$(MODULE).o: $($(MODULE)_OBJECTS) + $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS) + +endif + +tags: + rm -f $(top_srcdir)/TAGS + rm -f $(top_srcdir)/tags + find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs etags -a + find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs etags -a + find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs ctags -a + find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs ctags -a diff --git a/lnet/archdep.m4 b/lnet/archdep.m4 new file mode 100644 index 0000000..7cb00cf --- 
/dev/null +++ b/lnet/archdep.m4 @@ -0,0 +1,317 @@ + +# -------- in kernel compilation? (2.5 only) ------------- +AC_ARG_ENABLE(inkernel, [ --enable-inkernel set up 2.5 kernel makefiles]) +AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes) +echo "Makefile for in kernel build: $INKERNEL" + +# -------- liblustre compilation -------------- +AC_ARG_WITH(lib, [ --with-lib compile lustre library], host_cpu="lib") + +# -------- set linuxdir ------------ + +AC_ARG_WITH(linux, [ --with-linux=[path] set path to Linux source (default=/usr/src/linux)],LINUX=$with_linux,LINUX=/usr/src/linux) +AC_SUBST(LINUX) + +# --------- UML? -------------------- +AC_MSG_CHECKING(if you are running user mode linux for $host_cpu ...) +if test $host_cpu = "lib" ; then + host_cpu="lib" + AC_MSG_RESULT(no building Lustre library) +else + if test -e $LINUX/include/asm-um ; then + if test X`ls -id $LINUX/include/asm/ | awk '{print $1}'` = X`ls -id $LINUX/include/asm-um | awk '{print $1}'` ; then + host_cpu="um"; + AC_MSG_RESULT(yes) + else + AC_MSG_RESULT(no (asm doesn't point at asm-um)) + fi + + else + AC_MSG_RESULT(no (asm-um missing)) + fi +fi + +# --------- Linux 25 ------------------ + +AC_MSG_CHECKING(if you are running linux 2.5) +if test -e $LINUX/include/linux/namei.h ; then + linux25="yes" + AC_MSG_RESULT(yes) +else + linux25="no" + AC_MSG_RESULT(no) +fi +AM_CONDITIONAL(LINUX25, test x$linux25 = xyes) +echo "Makefiles for in linux 2.5 build: $LINUX25" + +# ------- Makeflags ------------------ + +AC_MSG_CHECKING(setting make flags system architecture: ) +case ${host_cpu} in + lib ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -Wall ' + KCPPFLAGS='-D__arch_lib__ ' + libdir='${exec_prefix}/lib/lustre' + MOD_LINK=elf_i386 +;; + um ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -Wall -pipe -Wno-trigraphs -Wstrict-prototypes -fno-strict-aliasing -fno-common ' + case ${linux25} in + yes ) + KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 
-D_LARGEFILE64_SOURCE -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/kernel/skas/include -O2 -nostdinc -iwithprefix include -DKBUILD_BASENAME=$(MODULE) -DKBUILD_MODNAME=$(MODULE) ' + ;; + * ) + KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/include ' + ;; + esac + + MOD_LINK=elf_i386 +;; + i*86 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -pipe' + case ${linux25} in + yes ) + KCPPFLAGS='-D__KERNEL__ -DMODULE -march=i686 -I$(LINUX)/include/asm-i386/mach-default -nostdinc -iwithprefix include ' + ;; + * ) + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + ;; + esac + MOD_LINK=elf_i386 +;; + + alphaev6 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6' + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + MOD_LINK=elf64alpha +;; + + alphaev67 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6' + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + MOD_LINK=elf64alpha +;; + + alpha* ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev5' + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + MOD_LINK=elf64alpha +;; + + ia64 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-gstabs -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -ffixed-r13 -mfixed-range=f10-f15,f32-f127 -falign-functions=32 -mb-step' + KCPPFLAGS='-D__KERNEL__ -DMODULE' + MOD_LINK=elf64_ia64 +;; + + sparc64 ) + AC_MSG_RESULT($host_cpu) + 
KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -Wno-unused -m64 -pipe -mno-fpu -mcpu=ultrasparc -mcmodel=medlow -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare -Wa,--undeclared-regs' + KCPPFLAGS='-D__KERNEL__' + MOD_LINK=elf64_sparc + +;; + + powerpc ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float -pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring' + KCPPFLAGS='-D__KERNEL__' + MOD_LINK=elf32ppclinux +;; + + *) + AC_ERROR("Unknown Linux Platform: $host_cpu") +;; +esac + +# ----------- make dep run? ------------------ + +if test $host_cpu != "lib" ; then + AC_MSG_CHECKING(if make dep has been run in kernel source (host $host_cpu) ) + if test -f $LINUX/include/linux/config.h ; then + AC_MSG_RESULT(yes) + else + AC_MSG_ERROR(** cannot find $LINUX/include/linux/config.h. Run make dep in $LINUX.) + fi +fi + +# ------------ include paths ------------------ + +if test $host_cpu != "lib" ; then + KINCFLAGS="-I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include -I$LINUX/include" +else + KINCFLAGS='-I$(top_srcdir)/include -I$(top_srcdir)/portals/include' +fi +CPPFLAGS="$KINCFLAGS $ARCHCPPFLAGS" + +if test $host_cpu != "lib" ; then +# ------------ autoconf.h ------------------ + AC_MSG_CHECKING(if autoconf.h is in kernel source) + if test -f $LINUX/include/linux/autoconf.h ; then + AC_MSG_RESULT(yes) + else + AC_MSG_ERROR(** cannot find $LINUX/include/linux/autoconf.h. Run make config in $LINUX.) + fi + +# ------------ RELEASE and moduledir ------------------ + AC_MSG_CHECKING(for Linux release) + + dnl We need to rid ourselves of the nasty [ ] quotes. 
+ changequote(, ) + dnl Get release from version.h + RELEASE="`sed -ne 's/.*UTS_RELEASE[ \"]*\([0-9.a-zA-Z_-]*\).*/\1/p' $LINUX/include/linux/version.h`" + changequote([, ]) + + moduledir='$(libdir)/modules/'$RELEASE/kernel + AC_SUBST(moduledir) + + modulefsdir='$(moduledir)/fs/$(PACKAGE)' + AC_SUBST(modulefsdir) + + AC_MSG_RESULT($RELEASE) + AC_SUBST(RELEASE) + +# ---------- modversions? -------------------- + AC_MSG_CHECKING(for MODVERSIONS) + if egrep -e 'MODVERSIONS.*1' $LINUX/include/linux/autoconf.h >/dev/null 2>&1; + then + MFLAGS="-DMODULE -DMODVERSIONS -include $LINUX/include/linux/modversions.h -DEXPORT_SYMTAB" + AC_MSG_RESULT(yes) + else + MFLAGS= + AC_MSG_RESULT(no) + fi +fi + +# ---------- Portals flags -------------------- + +#AC_PREFIX_DEFAULT([]) +#if test "x$prefix" = xNONE || test "x$prefix" = x; then +# usrprefix=/usr +#else +# usrprefix='${prefix}' +#fi +#AC_SUBST(usrprefix) + +AC_MSG_CHECKING(if kernel has CPU affinity support) +if test "$target_cpu" != ia64 ; then + enable_affinity_temp="-DCPU_AFFINITY=1" + AC_MSG_RESULT(yes) +else + enable_affinity_temp="" + AC_MSG_RESULT(no) +fi + +AC_MSG_CHECKING(if kernel has zero-copy TCP support) +ZCCD="`grep -c zccd $LINUX/include/linux/skbuff.h`" +if test "$ZCCD" != 0 ; then + enable_zerocopy_temp="-DSOCKNAL_ZC=1" + AC_MSG_RESULT(yes) +else + enable_zerocopy_temp="" + AC_MSG_RESULT(no) +fi + +AC_ARG_ENABLE(zerocopy, [ --enable-zerocopy enable socknal zerocopy],enable_zerocopy=$enable_zerocopy_temp, enable_zerocopy="") + +AC_ARG_ENABLE(affinity, [ --enable-affinity enable process/irq affinity],enable_affinity="-DCPU_AFFINITY=1", enable_affinity=$enable_affinity_temp) +##################################### + +AC_MSG_CHECKING(if quadrics kernel headers are present) +if test -d $LINUX/drivers/net/qsnet ; then + AC_MSG_RESULT(yes) + QSWNAL="qswnal" + with_quadrics="-I$LINUX/drivers/net/qsnet/include" + : +elif test -d $LINUX/drivers/qsnet1 ; then + AC_MSG_RESULT(yes) + QSWNAL="qswnal" + 
with_quadrics="-I$LINUX/drivers/qsnet1/include -DPROPRIETARY_ELAN" + : +elif test -d $LINUX/drivers/quadrics ; then + AC_MSG_RESULT(yes) + QSWNAL="qswnal" + with_quadrics="-I$LINUX/drivers/quadrics/include -DPROPRIETARY_ELAN" + : +#elif test -d /usr/include/elan3 ; then +# AC_MSG_RESULT(yes) +# QSWNAL="qswnal" +# with_quadrics="" +# : +else + AC_MSG_RESULT(no) + QSWNAL="" + with_quadrics="" + : +fi +AC_SUBST(with_quadrics) +AC_SUBST(QSWNAL) + +# R. Read 5/02 +GMNAL="" +echo "checking with-gm=" ${with_gm} +if test "${with_gm+set}" = set; then + if test "${with_gm}" = yes; then + with_gm="-I/usr/local/gm/include" + else + with_gm=-I"$with_gm/include" + fi + GMNAL="gmnal" +else +# default case - no GM + with_gm="" +fi +AC_SUBST(with_gm) +AC_SUBST(GMNAL) + + +def_scamac=/opt/scali/include +AC_ARG_WITH(scamac, [ --with-scamac=[yes/no/path] Path to ScaMAC includes (default=/opt/scali/include)], with_scamac=$withval, with_scamac=$def_scamac) +AC_MSG_CHECKING(if ScaMAC headers are present) +if test "$with_scamac" = yes; then + with_scamac=$def_scamac +fi +if test "$with_scamac" != no -a -f ${with_scamac}/scamac.h; then + AC_MSG_RESULT(yes) + SCIMACNAL="scimacnal" + with_scamac="-I${with_scamac} -I${with_scamac}/icm" +else + AC_MSG_RESULT(no) + SCIMACNAL="" + with_scamac="" +fi + +AC_SUBST(with_scamac) +AC_SUBST(SCIMACNAL) + +CFLAGS="$KCFLAGS" +CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS $enable_zerocopy $enable_affinity $with_quadrics $with_gm $with_scamac " + +AC_SUBST(MOD_LINK) +AC_SUBST(LINUX25) +AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib) + +# ---------- Red Hat 2.4.20 backports some 2.5 bits -------- +# This needs to run after we've defined the KCPPFLAGS + +AC_MSG_CHECKING(for kernel version) +AC_TRY_LINK([#define __KERNEL__ + #include ], + [struct task_struct p; + p.sighand = NULL;], + [RH_2_4_20=1], + [RH_2_4_20=0]) + +if test $RH_2_4_20 = 1; then + AC_MSG_RESULT(redhat-2.4.20) + CPPFLAGS="$CPPFLAGS -DCONFIG_RH_2_4_20" +else + AC_MSG_RESULT($RELEASE) +fi diff 
--git a/lnet/autogen.sh b/lnet/autogen.sh new file mode 100644 index 0000000..9deed73 --- /dev/null +++ b/lnet/autogen.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +aclocal && +automake --add-missing && +${AUTOCONF:-autoconf} diff --git a/lnet/build.m4 b/lnet/build.m4 new file mode 100644 index 0000000..025f243 --- /dev/null +++ b/lnet/build.m4 @@ -0,0 +1,95 @@ +# ---------- other tests and settings --------- + + +# --------- unsigned long long sane? ------- + +AC_CHECK_SIZEOF(unsigned long long, 0) +echo "---> size SIZEOF $SIZEOF_unsigned_long_long" +echo "---> size SIZEOF $ac_cv_sizeof_unsigned_long_long" +if test $ac_cv_sizeof_unsigned_long_long != 8 ; then + AC_MSG_ERROR([** we assume that sizeof(long long) == 8. Tell phil@clusterfs.com]) +fi + +# directories for binaries +ac_default_prefix= +bindir='${exec_prefix}/usr/bin' +sbindir='${exec_prefix}/usr/sbin' +includedir='${prefix}/usr/include' + +# Directories for documentation and demos. +docdir='${prefix}/usr/share/doc/$(PACKAGE)' +AC_SUBST(docdir) +demodir='$(docdir)/demo' +AC_SUBST(demodir) +pkgexampledir='${prefix}/usr/lib/$(PACKAGE)/examples' +AC_SUBST(pkgexampledir) +pymoddir='${prefix}/usr/lib/${PACKAGE}/python/Lustre' +AC_SUBST(pymoddir) +modulenetdir='$(moduledir)/net/$(PACKAGE)' +AC_SUBST(modulenetdir) + + +# ---------- BAD gcc? ------------ +AC_PROG_RANLIB +AC_PROG_CC +AC_MSG_CHECKING(for buggy compiler) +CC_VERSION=`$CC -v 2>&1 | grep "^gcc version"` +bad_cc() { + echo + echo " '$CC_VERSION'" + echo " has been known to generate bad code, " + echo " please get an updated compiler." 
+ AC_MSG_ERROR(sorry) +} +TMP_VERSION=`echo $CC_VERSION | cut -c 1-16` +if test "$TMP_VERSION" = "gcc version 2.95"; then + bad_cc +fi +case "$CC_VERSION" in + # ost_pack_niobuf putting 64bit NTOH temporaries on the stack + # without "sub $0xc,%esp" to protect the stack from being + # stomped on by interrupts (bug 606) + "gcc version 2.96 20000731 (Red Hat Linux 7.1 2.96-98)") + bad_cc + ;; + # mandrake's similar sub 0xc compiler bug + # http://marc.theaimsgroup.com/?l=linux-kernel&m=104748366226348&w=2 + "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)") + bad_cc + ;; + *) + AC_MSG_RESULT(no known problems) + ;; +esac +# end ------ BAD gcc? ------------ + +# -------- Check for required packages -------------- + +# this doesn't seem to work on older autoconf +# AC_CHECK_LIB(readline, readline,,) +AC_ARG_ENABLE(readline, [ --enable-readline use readline library],, + enable_readline="yes") + +if test "$enable_readline" = "yes" ; then + LIBREADLINE="-lreadline -lncurses" + HAVE_LIBREADLINE="-DHAVE_LIBREADLINE=1" +else + LIBREADLINE="" + HAVE_LIBREADLINE="" +fi +AC_SUBST(LIBREADLINE) +AC_SUBST(HAVE_LIBREADLINE) + +AC_ARG_ENABLE(efence, [ --enable-efence use efence library],, + enable_efence="no") + +if test "$enable_efence" = "yes" ; then + LIBEFENCE="-lefence" + HAVE_LIBEFENCE="-DHAVE_LIBEFENCE=1" +else + LIBEFENCE="" + HAVE_LIBEFENCE="" +fi +AC_SUBST(LIBEFENCE) +AC_SUBST(HAVE_LIBEFENCE) + diff --git a/lnet/configure.in b/lnet/configure.in new file mode 100644 index 0000000..31d3492 --- /dev/null +++ b/lnet/configure.in @@ -0,0 +1,34 @@ +# This version is here to make autoconf happy; the name is a file which is +# "unique" to this directory so that configure knows where it should run. +AC_INIT(knals/Makefile.am, 3.0) +AC_CANONICAL_SYSTEM +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +# Automake variables. 
Steal the version number from packaging/intersync.spec +AM_INIT_AUTOMAKE(portals, builtin([esyscmd], [sed -ne '/.*define IVERSION /{ s/.*IVERSION //; p; }' libcfs/module.c])) +# AM_MAINTAINER_MODE + +sinclude(build.m4) +sinclude(archdep.m4) + +if test x$enable_inkernel = xyes ; then +cp Kernelenv.mk Kernelenv.in +cp Makefile.mk Makefile.in +cp libcfs/Makefile.mk libcfs/Makefile.in +cp portals/Makefile.mk portals/Makefile.in +cp knals/Makefile.mk knals/Makefile.in +cp knals/socknal/Makefile.mk knals/socknal/Makefile.in +cp router/Makefile.mk router/Makefile.in +fi + +AM_CONFIG_HEADER(include/config.h) + +AC_OUTPUT([Makefile Kernelenv libcfs/Makefile portals/Makefile \ + unals/Makefile knals/Makefile router/Makefile \ + knals/socknal/Makefile knals/gmnal/Makefile knals/qswnal/Makefile \ + knals/scimacnal/Makefile knals/toenal/Makefile \ + utils/Makefile tests/Makefile doc/Makefile ]) + diff --git a/lnet/doc/.cvsignore b/lnet/doc/.cvsignore new file mode 100644 index 0000000..827dca4 --- /dev/null +++ b/lnet/doc/.cvsignore @@ -0,0 +1,4 @@ +Makefile +Makefile.in +*.eps +*.pdf diff --git a/lnet/doc/Data-structures b/lnet/doc/Data-structures new file mode 100644 index 0000000..b5532b1 --- /dev/null +++ b/lnet/doc/Data-structures @@ -0,0 +1,65 @@ +In this document I will try to draw the data structures and how they +interrelate in the Portals 3 reference implementation. It is probably +best shown with a drawing, so there may be an additional xfig or +Postscript figure. + + +MEMORY POOLS: +------------ + +First, a digression on memory allocation in the library. As mentioned +in the NAL Writer's Guide, the library does not link against any +standard C libraries and as such is unable to dynamically allocate +memory on its own. It requires that the NAL implement a method +for allocation that is appropriate for the protection domain in +which the library lives. This is only called when a network +interface is initialized to allocate the Portals object pools. 
+ +These pools are preallocated blocks of objects that the library +can rapidly make active and manage with a minimum of overhead. +It also cuts down on overhead for setting up structures +since the NAL->malloc() callback does not need to be called +for each object. + +The objects are maintained on a per-object type singly linked free +list and contain a pointer to the next free object. This pointer +is NULL if the object is not on the free list and is non-zero +if it is on the list. The special sentinel value of 0xDEADBEEF +is used to mark the end of the free list since NULL could +indicate that the last object in the list is not free. + +When one of the lib_*_alloc() functions is called, the library +returns the head of the free list and advances the head pointer +to the next item on the list. The special case of 0xDEADBEEF is +checked and a NULL pointer is returned if there are no more +objects of this type available. The lib_*_free() functions +are even simpler -- check to ensure that the object is not already +free, set its next pointer to the current head and then set +the head to be this newly freed object. + +Since C does not have templates, I did the next best thing and wrote +the memory pool allocation code as a macro that expands based on the +type of the argument. The mk_alloc(T) macro expands to +write the _lib_T_alloc() and lib_T_free() functions. +It requires that the object have a pointer of the type T named +"next_free". There are also functions that map _lib_T_alloc() +to lib_T_alloc() so that the library can add some extra +functionality to the T constructor. + + + +LINKED LISTS: +------------ + +Many of the active Portals objects are stored in doubly linked lists +when they are active. These are always implemented with the pointer +to the next object and a pointer to the next pointer of the +previous object. This avoids the "dummy head" object or +special cases for inserting at the beginning or end of the list. 
+The pointer manipulations are a little hairy at times, but +I hope that they are understandable. + +The actual linked list code is implemented as macros in , +although the object has to know about + + diff --git a/lnet/doc/Makefile.am b/lnet/doc/Makefile.am new file mode 100644 index 0000000..7c65e6c --- /dev/null +++ b/lnet/doc/Makefile.am @@ -0,0 +1,46 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +LYX2PDF = lyx --export pdf +LYX2TXT = lyx --export text +LYX2HTML = lyx --export html +SUFFIXES = .lin .lyx .pdf .sgml .html .txt .fig .eps + +DOCS = portals3.pdf +IMAGES = file.eps flow_new.eps get.eps mpi.eps portals.eps put.eps +LYXFILES= portals3.lyx + +MAINTAINERCLEANFILES = $(IMAGES) $(DOCS) $(GENERATED) +GENERATED = +EXTRA_DIST = $(DOCS) $(IMAGES) $(LYXFILES) + +all: $(DOCS) + +# update date and version in document +date := $(shell date +%x) +tag := $(shell echo '$$Name: $$' | sed -e 's/^\$$Na''me: *\$$$$/HEAD/; s/^\$$Na''me: \(.*\) \$$$$/\1/') +addversion = sed -e 's|@T''AG@|$(tag)|g; s|@VER''SION@|$(VERSION)|g; s|@DA''TE@|$(date)|g' + +# Regenerate when the $(VERSION) or $Name: $ changes. 
+.INTERMEDIATE: $(GENERATED) +$(GENERATED) : %.lyx: %.lin Makefile + $(addversion) $< > $@ + +.lyx.pdf: + @$(LYX2PDF) $< || printf "\n*** Warning: not creating PDF docs; install lyx to rectify this\n" + +.lyx.txt: + @$(LYX2TXT) $< || printf "\n*** Warning: not creating text docs; install lyx to rectify this\n" +.lyx.html: + @$(LYX2HTML) $< || printf "\n*** Warning: not creating HTML docs; install lyx to rectify this\n" +.fig.eps: + -fig2dev -L eps $< > $@ + +portals3.pdf portals3.txt portals3.html: $(IMAGES) portals3.lyx + +syncweb: portals3.pdf +# cp lustre.pdf /usr/src/www/content/lustre/docs/lustre.pdf +# ( cd /usr/src/www ; make lustre ; make synclustre ) + diff --git a/lnet/doc/Message-life-cycle b/lnet/doc/Message-life-cycle new file mode 100644 index 0000000..e8cc7e2 --- /dev/null +++ b/lnet/doc/Message-life-cycle @@ -0,0 +1,118 @@ +This documents the life cycle of message as it arrives and is handled by +a basic async, packetized NAL. There are four types of messages that have +slightly different life cycles, so they are addressed independently. + + +Put request +----------- + +1. NAL notices that there is a incoming message header on the network +and reads an ptl_hdr_t in from the wire. + +2. It may store additional NAL specific data that provides context +for this event in a void* that it will interpret in some fashion +later. + +3. The NAL calls lib_parse() with a pointer to the header and its +private data structure. + +4. The library decodes the header and may build a message state +object that describes the event to be written and the ACK to be +sent, if any. It then calls nal->recv() with the private data +that the NAL passed in, a pointer to the message state object +and a translated user address. + + The NAL will have been given a chance to pretranslate + all user addresses when the buffers are created. This + process is described in the NAL-HOWTO. + +5. 
The NAL should restore whatever context it required from the +private data pointer, begin receiving the bytes and possibly store +some extra state of its own. It should return at this point. + + + +Get request +----------- + +1. As with a Put, the NAL notices the incoming message header and +passes it to lib_parse(). + +2. The library decodes the header and calls nal->recv() with a +zero byte length, offset and destination to instruct it to clean +up the wire after reading the header. The private data will +be passed in as well, allowing the NAL to retrieve any state +or context that it requires. + +3. The library may build a message state object to possibly +write an event log or invalidate a memory region. + +4. The library will build a ptl_msg_t header that specifies the +Portals protocol information for delivery at the remote end. + +5. The library calls nal->send() with the pre-built header, +the optional message state object, the four part address +component, a translated user pointer + offset, and some +other things. + +6. The NAL is to put the header on the wire or copy it at +this point (since it is off the stack). It should store some +amount of state about its current position in the message and +the destination address. + +7. And then return to the library. + + +Reply request +------------- + +1. Starting at "The library decodes the header..." + +2. The library decodes the header and calls nal->recv() +to bring in the rest of the message. Flow continues in +exactly the same fashion as with all other receives. + + +Ack request +----------- + +1. The library decodes the header, builds the appropriate data +structures for the event in a message state object and calls nal->recv() +with a zero byte length, etc. + + +Packet arrival +-------------- + +1. 
The NAL should notice the arrival of a packet, retrieve whatever +state it needs from the message ID or other NAL specific header data +and place the data bytes directly into the user address that was +given to nal->recv(). + + How this happens is outside the scope of the Portals library + and solely determined by the NAL... + +2. If this is the last packet in a message, the NAL should retrieve +the lib_msg_t *cookie that it was given in the call to nal->recv() +and pass it to lib_finalize(). lib_finalize() may call nal->send() +to send an ACK, nal->write() to record an entry in the event log, +nal->invalidate() to unregister a region of memory or do nothing at all. + +3. It should then clean up any remaining NAL specific state about +the message and go back into the main loop. + + +Outgoing packets +---------------- + +1. When the NAL has pending output, it should put the packets on +the wire wrapped with whatever implementation specified wrappers. + +2. Once it has output all the packets of a message it should +call lib_finalize() with the message state object that was +handed to nal->send(). This will allow the library to clean +up its state regarding the message and write any pending event +entries. + + + diff --git a/lnet/doc/NAL-HOWTO b/lnet/doc/NAL-HOWTO new file mode 100644 index 0000000..ea38aed --- /dev/null +++ b/lnet/doc/NAL-HOWTO @@ -0,0 +1,293 @@ +This document is a first attempt at describing how to write a NAL +for the Portals 3 library. It also defines the library architecture +and the abstraction of protection domains. + + +First, an overview of the architecture: + + Application + +----|----+-------- + | + API === NAL (User space) + | +---------+---|----- + | + LIB === NAL (Library space) + | +---------+---|----- + + Physical wire (NIC space) + + +Application + API +API-side NAL +------------ +LIB-side NAL + LIB +LIB-side NAL + wire + +Communication is through the indicated paths via well defined +interfaces. 
The API and LIB portions are written to be portable +across platforms and do not depend on the network interface. + +Communication between the application and the API code is +defined in the Portals 3 API specification. This is the +user-visible portion of the interface and should be the most +stable. + + + +API-side NAL: +------------ + +The user space NAL needs to implement only a few functions +that are stored in a nal_t data structure and called by the +API-side library: + + int forward( nal_t *nal, + int index, + void *args, + size_t arg_len, + void *ret, + size_t ret_len + ); + +Most of the data structures in the portals library are held in +the LIB section of the code, so it is necessary to forward API +calls across the protection domain to the library. This is +handled by the NAL's forward method. Once the argument and return +blocks are on the remote side the NAL should call lib_dispatch() +to invoke the appropriate API function. + + int validate( nal_t *nal, + void *base, + size_t extent, + void **trans_base, + void **trans_data + ); + +The validate method provides a means for the NAL to prevalidate +and possibly pretranslate user addresses into a form suitable +for fast use by the network card or kernel module. The trans_base +pointer will be used by the library every time it needs to +refer to the block of memory. The trans_data result is a +cookie that will be handed to the NAL along with the trans_base. + +The library never performs calculations on the trans_base value; +it only computes offsets that are then handed to the NAL. + + + int shutdown( nal_t *nal, int interface ); + +Brings down the network interface. The remote NAL side should +call lib_fini() to bring down the library side of the network. + + void yield( nal_t *nal ); + +This allows the user application to gracefully give up the processor +while busy waiting. 
Performance critical applications may not +want to take the time to call this function, so it should be an +option to the PtlEQWait call. Right now it is not implemented as such. + +Lastly, the NAL must implement a function named PTL_IFACE_*, where +* is the name of the NAL such as PTL_IFACE_IP or PTL_IFACE_MYR. +This initialization function is to set up communication with the +library-side NAL, which should call lib_init() to bring up the +network interface. + + + +LIB-side NAL: +------------ + +On the library-side, the NAL has much more responsibility. It +is responsible for calling lib_dispatch() on behalf of the user, +it is also responsible for bringing packets off the wire and +pushing bits out. As on the user side, the methods are stored +in a nal_cb_t structure that is defined on a per network +interface basis. + +The calls to lib_dispatch() need to be examined. The prototype: + + void lib_dispatch( + nal_cb_t *nal, + void *private, + int index, + void *arg_block, + void *ret_block + ); + +has two complications. The private field is a NAL-specific +value that will be passed to any callbacks produced as a result +of this API call. Kernel module implementations may use this +for task structures, or perhaps network card data. It is ignored +by the library. + +Secondly, the arg_block and ret_block must be in the same protection +domain as the library. The NAL's two halves must communicate the +sizes and perform the copies. After the call, the buffer pointed +to by ret_block will be filled in and should be copied back to +the user space. How this is to be done is NAL specific. + + int lib_parse( + nal_cb_t *nal, + ptl_hdr_t *hdr, + void *private + ); + +This is the only other entry point into the library from the NAL. +When the NAL detects an incoming message on the wire it should read +sizeof(ptl_hdr_t) bytes and pass a pointer to the header to +lib_parse(). 
It may set private to be anything that it needs to +tie the incoming message to callbacks that are made as a result +of this event. + +The method calls are: + + int (*send)( + nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int nid, + int pid, + int gid, + int rid, + user_ptr trans_base, + user_ptr trans_data, + size_t offset, + size_t len + ); + +This is a tricky function -- it must support async output +of messages as well as properly synchronized event log writing. +The private field is the same that was passed into lib_dispatch() +or lib_parse() and may be used to tie this call to the event +that initiated the entry to the library. + +The cookie is a pointer to a library private value that must +be passed to lib_finalize() once the message has been completely +sent. It should not be examined by the NAL for any meaning. + +The four ID fields are passed in, although some implementations +may not use all of them. + +The single base pointer has been replaced with the translated +address that the API NAL generated in the api_nal->validate() +call. The trans_data is unchanged and the offset is in bytes. + + + int (*recv)( + nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + user_ptr trans_base, + user_ptr trans_data, + size_t offset, + size_t mlen, + size_t rlen + ); + +This callback will only be called in response to lib_parse(). +The cookie, trans_base and trans_data are as discussed in send(). +The NAL should read mlen bytes from the wire, deposit them into +trans_base + offset and then discard (rlen - mlen) bytes. +Once the entire message has been received the NAL should call +lib_finalize() with the lib_msg_t *cookie. + +The special arguments of base=NULL, data=NULL, offset=0, mlen=0, rlen=0 +are used to indicate that the NAL should clean up the wire. This could +be implemented as a blocking call, although having it return as quickly +as possible is desirable. 
+ + int (*write)( + nal_cb_t *nal, + void *private, + user_ptr trans_addr, + user_ptr trans_data, + size_t offset, + + void *src_addr, + size_t len + ); + +This is essentially a cross-protection domain memcpy(). The user address +has been pretranslated by the api_nal->validate() call. + + void *(*malloc)( + nal_cb_t *nal, + size_t len + ); + + void (*free)( + nal_cb_t *nal, + void *buf + ); + +Since the NAL may be in a non-standard hosted environment it can +not call malloc(). This allows the library side NAL to implement +the system specific malloc(). In the current reference implementation +the library only calls nal->malloc() when the network interface is +initialized and then calls free when it is brought down. The library +maintains its own pool of objects for allocation so only one call to +malloc is made per object type. + + void (*invalidate)( + nal_cb_t *nal, + user_ptr trans_base, + user_ptr trans_data, + size_t extent + ); + +User addresses are validated/translated at the user-level API NAL +method, which is likely to push them to this level. Meanwhile, +the library NAL will be notified when the library no longer +needs the buffer. Overlapped buffers are not detected by the +library, so the NAL should ref count each page involved. + +Unfortunately we have a few bugs when the invalidate method is +called. It is still in progress... + + void (*printf)( + nal_cb_t *nal, + const char *fmt, + ... + ); + +As with malloc(), the library does not have any way to do printf +or printk. It is not necessary for the NAL to implement this +call, although omitting it will make debugging difficult. + + void (*cli)( + nal_cb_t *nal, + unsigned long *flags + ); + + void (*sti)( + nal_cb_t *nal, + unsigned long *flags + ); + +These are used by the library to mark critical sections. 
+ + int (*gidrid2nidpid)( + nal_cb_t *nal, + ptl_id_t gid, + ptl_id_t rid, + ptl_id_t *nid, + ptl_id_t *pid + ); + + + int (*nidpid2gidrid)( + nal_cb_t *nal, + ptl_id_t nid, + ptl_id_t pid, + ptl_id_t *gid, + ptl_id_t *rid + ); + +Rolf added these. I haven't looked at how they have to work yet. diff --git a/lnet/doc/file.fig b/lnet/doc/file.fig new file mode 100644 index 0000000..914c294 --- /dev/null +++ b/lnet/doc/file.fig @@ -0,0 +1,111 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 1200 750 1650 1050 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 1050 1650 750 1200 750 1200 1050 1650 1050 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 952 FS0\001 +-6 +6 1200 2325 1650 2625 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 2625 1650 2325 1200 2325 1200 2625 1650 2625 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 2527 FS3\001 +-6 +6 1200 1800 1650 2100 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 2100 1650 1800 1200 1800 1200 2100 1650 2100 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 2002 FS2\001 +-6 +6 1200 1275 1650 1575 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 1575 1650 1275 1200 1275 1200 1575 1650 1575 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 1477 FS1\001 +-6 +6 450 750 900 1200 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 750.000 450 1050 675 1125 900 1050 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 825 225 75 450 900 900 750 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 825 450 1050 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1050 900 825 +-6 +6 450 2325 900 2775 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 2325.000 450 2625 675 2700 900 2625 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 2400 225 75 450 2475 900 2325 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 2400 450 2625 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 2625 900 2400 +-6 +6 450 1800 900 2250 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1800.000 450 2100 675 2175 900 2100 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1875 225 75 450 1950 900 1800 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 
0 0 2 + 450 1875 450 2100 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 2100 900 1875 +-6 +6 450 1275 900 1725 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1275.000 450 1575 675 1650 900 1575 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1350 225 75 450 1425 900 1275 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 1350 450 1575 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1575 900 1350 +-6 +6 2250 750 3450 2625 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 1200 3150 1200 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 1500 3150 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 1800 3150 1800 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 2100 3150 2100 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2550 975 3150 975 3150 2625 2550 2625 2550 975 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 2400 3150 2400 +4 1 0 100 0 0 10 0.0000 0 135 1185 2850 900 Application Buffer\001 +-6 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 2400 2550 1350 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 1875 2550 1050 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 1425 2550 1950 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 900 2550 1650 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 900 1200 900 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1425 1200 1425 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1950 1200 1950 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 2475 1200 2475 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 2025 2550 2250 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 2550 2550 2475 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1875 2850 1875 600 225 600 225 2850 1875 2850 +4 1 0 100 0 0 10 0.0000 0 105 1215 1050 525 Parallel File Server\001 diff --git a/lnet/doc/flow_new.fig 
b/lnet/doc/flow_new.fig new file mode 100644 index 0000000..d828dea --- /dev/null +++ b/lnet/doc/flow_new.fig @@ -0,0 +1,213 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 525 2175 1575 2925 +6 675 2287 1425 2812 +4 1 0 50 0 0 10 0.0000 4 105 255 1050 2437 MD\001 +4 1 0 50 0 0 10 0.0000 4 105 645 1050 2587 Exists and\001 +4 1 0 50 0 0 10 0.0000 4 135 555 1050 2737 Accepts?\001 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 1575 2550 1050 2175 525 2550 1050 2925 1575 2550 +-6 +6 3450 1275 4350 1725 +6 3600 1312 4200 1687 +4 1 0 100 0 0 10 0.0000 0 135 525 3900 1612 Message\001 +4 1 0 100 0 0 10 0.0000 0 105 465 3900 1462 Discard\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3450 1275 4350 1275 4350 1725 3450 1725 3450 1275 +-6 +6 4650 1275 5550 1725 +6 4725 1312 5475 1687 +4 1 0 100 0 0 10 0.0000 0 135 735 5100 1612 Drop Count\001 +4 1 0 100 0 0 10 0.0000 0 105 630 5100 1462 Increment\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 4650 1275 5550 1275 5550 1725 4650 1725 4650 1275 +-6 +6 1350 525 2250 975 +6 1350 562 2250 937 +4 1 0 100 0 0 10 0.0000 0 135 795 1800 862 Match Entry\001 +4 1 0 100 0 0 10 0.0000 0 105 585 1800 712 Get Next\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1350 525 2250 525 2250 975 1350 975 1350 525 +-6 +6 525 1125 1575 1875 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 1575 1500 1050 1125 525 1500 1050 1875 1575 1500 +4 1 0 100 0 0 10 0.0000 0 105 465 1049 1552 Match?\001 +-6 +6 2340 1237 2940 1687 +6 2340 1237 2940 1687 +4 1 0 100 0 0 10 0.0000 0 105 345 2640 1387 More\001 +4 1 0 100 0 0 10 0.0000 0 105 405 2640 1537 Match\001 +4 1 0 100 0 0 10 0.0000 0 105 510 2640 1687 Entries?\001 +-6 +-6 +6 525 3225 1575 3975 +6 675 3375 1425 3750 +4 1 0 50 0 0 10 0.0000 4 105 255 1050 3525 MD\001 +4 1 0 50 0 0 10 0.0000 4 105 615 1050 3720 has room?\001 +-6 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 525 3600 1050 3225 1575 3600 1050 3975 525 3600 +-6 +6 3300 3375 4350 3825 +6 3300 3412 4350 3787 +4 1 0 50 0 0 
10 0.0000 4 105 735 3825 3562 Unlink MD\001 +4 1 0 50 0 0 10 0.0000 4 135 945 3825 3712 & Match Entry\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3300 3375 4350 3375 4350 3825 3300 3825 3300 3375 +-6 +6 1950 3225 3000 3975 +6 2250 3450 2700 3750 +4 1 0 50 0 0 10 0.0000 4 105 450 2475 3600 Unlink\001 +4 1 0 50 0 0 10 0.0000 4 105 315 2475 3750 full?\001 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 3000 3600 2475 3225 1950 3600 2475 3975 3000 3600 +-6 +6 3150 4500 4200 4950 +6 3150 4537 4200 4912 +4 1 0 50 0 0 10 0.0000 4 105 735 3675 4687 Unlink MD\001 +4 1 0 50 0 0 10 0.0000 4 135 945 3675 4837 & Match Entry\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3150 4500 4200 4500 4200 4950 3150 4950 3150 4500 +-6 +6 600 4500 1500 4950 +6 675 4537 1425 4912 +4 1 0 50 0 0 10 0.0000 4 135 615 1050 4837 Operation\001 +4 1 0 50 0 0 10 0.0000 4 105 525 1050 4687 Perform\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 600 4500 1500 4500 1500 4950 600 4950 600 4500 +-6 +6 4650 4350 5700 5100 +6 4950 4537 5400 4912 +6 4950 4537 5400 4912 +4 1 0 50 0 0 10 0.0000 4 135 435 5175 4837 Queue?\001 +4 1 0 50 0 0 10 0.0000 4 105 360 5175 4687 Event\001 +-6 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 5700 4725 5175 4350 4650 4725 5175 5100 5700 4725 +-6 +6 6000 4500 6900 4950 +6 6225 4575 6675 4875 +4 1 0 50 0 0 10 0.0000 4 105 360 6450 4875 Event\001 +4 1 0 50 0 0 10 0.0000 4 105 435 6450 4725 Record\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 6000 4500 6900 4500 6900 4950 6000 4950 6000 4500 +-6 +6 1800 4350 2850 5100 +6 2100 4575 2550 4875 +4 1 0 50 0 0 10 0.0000 4 105 450 2325 4725 Unlink\001 +4 1 0 50 0 0 10 0.0000 4 105 450 2325 4875 thresh?\001 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 2850 4725 2325 4350 1800 4725 2325 5100 2850 4725 +-6 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 1875 1050 2175 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1575 1500 2100 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 
0 1.00 60.00 120.00 + 1050 450 1050 1125 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1350 750 1050 750 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 2925 1050 3225 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3150 1500 3450 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 4350 1500 4650 1500 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 2100 1500 2625 1125 3150 1500 2625 1875 2100 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1575 3600 1950 3600 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 3975 1050 4500 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3000 3600 3300 3600 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 4725 1800 4725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 5700 4725 6000 4725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2850 4725 3150 4725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 4200 4725 4650 4725 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 6900 4725 7950 4725 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 1575 2550 1650 2550 1800 2550 1800 2400 1800 1500 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5 + 0 0 1.00 60.00 120.00 + 2250 750 2475 750 2625 750 2625 900 2625 1125 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5 + 0 0 1.00 60.00 120.00 + 7500 4725 7500 1650 7500 1500 7350 1500 5550 1500 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 2475 3225 2475 2400 2475 2250 2325 2250 1800 2250 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 3825 3375 3825 2175 3825 2025 3675 2025 1800 2025 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8 + 0 0 1.00 60.00 120.00 + 2325 4350 2325 4275 2325 4125 2475 4125 4275 4125 4425 
4125 + 4425 4275 4425 4725 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8 + 0 0 1.00 60.00 120.00 + 5175 4350 5175 4275 5175 4125 5325 4125 7125 4125 7275 4125 + 7275 4275 7275 4725 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +4 1 0 100 0 0 10 0.0000 0 75 150 1575 1425 no\001 +4 1 0 100 0 0 10 0.0000 0 135 360 825 525 Entry\001 +4 1 0 100 0 0 10 0.0000 0 75 150 1575 2475 no\001 +4 1 0 100 0 0 10 0.0000 0 105 195 1200 1950 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 1200 3000 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 2775 1050 yes\001 +4 1 0 100 0 0 10 0.0000 0 75 150 3225 1425 no\001 +4 1 0 100 0 0 10 0.0000 0 75 150 1650 3525 no\001 +4 1 0 100 0 0 10 0.0000 0 105 195 1200 4050 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 3150 3525 yes\001 +4 1 0 100 0 0 10 0.0000 0 75 150 2625 3150 no\001 +4 1 0 100 0 0 10 0.0000 0 105 195 3000 4650 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 5850 4650 yes\001 +4 1 0 100 0 0 10 0.0000 0 75 150 2475 4275 no\001 +4 1 0 100 0 0 10 0.0000 0 75 150 5325 4275 no\001 +4 1 0 50 0 0 10 0.0000 4 105 285 7800 4650 Exit\001 diff --git a/lnet/doc/get.fig b/lnet/doc/get.fig new file mode 100644 index 0000000..28db949 --- /dev/null +++ b/lnet/doc/get.fig @@ -0,0 +1,33 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 2775 900 3525 1200 +4 0 0 100 0 0 10 0.0000 0 105 720 2775 1200 Translation\001 +4 0 0 100 0 0 10 0.0000 0 105 405 2850 1050 Portal\001 +-6 +6 1350 1725 2175 2025 +4 0 0 100 0 0 10 0.0000 0 105 825 1350 2025 Transmission\001 +4 0 0 100 0 0 10 0.0000 0 105 285 1620 1875 Data\001 +-6 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 900 525 2700 750 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2700 825 2700 1275 +2 1 0 1 0 7 100 0 -1 3.000 0 0 7 1 0 2 + 0 0 1.00 60.00 120.00 + 2700 1350 900 1950 +2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5 + 2400 300 3600 300 3600 2250 2400 2250 2400 300 +2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5 + 0 
300 1200 300 1200 2250 0 2250 0 300 +4 1 0 100 0 0 10 0.0000 4 135 495 1800 825 Request\001 +4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001 +4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001 diff --git a/lnet/doc/ieee.bst b/lnet/doc/ieee.bst new file mode 100644 index 0000000..4df7c50 --- /dev/null +++ b/lnet/doc/ieee.bst @@ -0,0 +1,1112 @@ +% --------------------------------------------------------------- +% +% by Paolo.Ienne@di.epfl.ch +% +% --------------------------------------------------------------- +% +% no guarantee is given that the format corresponds perfectly to +% IEEE 8.5" x 11" Proceedings, but most features should be ok. +% +% --------------------------------------------------------------- +% +% `ieee' from BibTeX standard bibliography style `abbrv' +% version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09. +% Copyright (C) 1985, all rights reserved. +% Copying of this file is authorized only if either +% (1) you make absolutely no changes to your copy, including name, or +% (2) if you do make changes, you name it something other than +% btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst. +% This restriction helps ensure that all standard styles are identical. +% The file btxbst.doc has the documentation for this style. 
+ +ENTRY + { address + author + booktitle + chapter + edition + editor + howpublished + institution + journal + key + month + note + number + organization + pages + publisher + school + series + title + type + volume + year + } + {} + { label } + +INTEGERS { output.state before.all mid.sentence after.sentence after.block } + +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := +} + +STRINGS { s t } + +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { ", " * write$ } + { output.state after.block = + { add.period$ write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { add.period$ " " * write$ } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} + +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} + +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} + +FUNCTION {output.bibitem} +{ newline$ + "\bibitem{" write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {fin.entry} +{ add.period$ + write$ + newline$ +} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { after.block 'output.state := } + if$ +} + +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} + +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} + +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} + +FUNCTION {new.block.checka} +{ empty$ + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.sentence.checka} +{ empty$ + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {new.sentence.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {field.or.null} +{ duplicate$ empty$ 
+ { pop$ "" } + 'skip$ + if$ +} + +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "{\em " swap$ * "}" * } + if$ +} + +INTEGERS { nameptr namesleft numnames } + +FUNCTION {format.names} +{ 's := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't := + nameptr #1 > + { namesleft #1 > + { ", " * t * } + { numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " et~al." * } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {format.authors} +{ author empty$ + { "" } + { author format.names } + if$ +} + +FUNCTION {format.editors} +{ editor empty$ + { "" } + { editor format.names + editor num.names$ #1 > + { ", editors" * } + { ", editor" * } + if$ + } + if$ +} + +FUNCTION {format.title} +{ title empty$ + { "" } + { title "t" change.case$ } + if$ +} + +FUNCTION {n.dashify} +{ 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {format.date} +{ year empty$ + { month empty$ + { "" } + { "there's a month but no year in " cite$ * warning$ + month + } + if$ + } + { month empty$ + 'year + { month " " * year * } + if$ + } + if$ +} + +FUNCTION {format.btitle} +{ title emphasize +} + +FUNCTION {tie.or.space.connect} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ * * +} + +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} + +FUNCTION {format.bvolume} +{ volume empty$ + { "" } + { "volume" volume tie.or.space.connect + series empty$ + 'skip$ + { " of " * series emphasize * } + if$ + "volume and number" number 
either.or.check + } + if$ +} + +FUNCTION {format.number.series} +{ volume empty$ + { number empty$ + { series field.or.null } + { output.state mid.sentence = + { "number" } + { "Number" } + if$ + number tie.or.space.connect + series empty$ + { "there's a number but no series in " cite$ * warning$ } + { " in " * series * } + if$ + } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition empty$ + { "" } + { output.state mid.sentence = + { edition "l" change.case$ " edition" * } + { edition "t" change.case$ " edition" * } + if$ + } + if$ +} + +INTEGERS { multiresult } + +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} + +FUNCTION {format.pages} +{ pages empty$ + { "" } + { pages multi.page.check + { "pages" pages n.dashify tie.or.space.connect } + { "page" pages tie.or.space.connect } + if$ + } + if$ +} + +FUNCTION {format.vol.num.pages} +{ volume field.or.null + number empty$ + 'skip$ + { "(" number * ")" * * + volume empty$ + { "there's a number but no volume in " cite$ * warning$ } + 'skip$ + if$ + } + if$ + pages empty$ + 'skip$ + { duplicate$ empty$ + { pop$ format.pages } + { ":" * pages n.dashify * } + if$ + } + if$ +} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { "chapter" } + { type "l" change.case$ } + if$ + chapter tie.or.space.connect + pages empty$ + 'skip$ + { ", " * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.in.ed.booktitle} +{ booktitle empty$ + { "" } + { editor empty$ + { "In " booktitle emphasize * } + { "In " format.editors * ", " * booktitle emphasize * } + if$ + } + if$ +} + +FUNCTION {empty.misc.check} +{ author empty$ title empty$ howpublished empty$ + month empty$ year empty$ note empty$ + and and and and and + key empty$ not and + { 
"all relevant fields are empty in " cite$ * warning$ } + 'skip$ + if$ +} + +FUNCTION {format.thesis.type} +{ type empty$ + 'skip$ + { pop$ + type "t" change.case$ + } + if$ +} + +FUNCTION {format.tr.number} +{ type empty$ + { "Technical Report" } + 'type + if$ + number empty$ + { "t" change.case$ } + { number tie.or.space.connect } + if$ +} + +FUNCTION {format.article.crossref} +{ key empty$ + { journal empty$ + { "need key or journal for " cite$ * " to crossref " * crossref * + warning$ + "" + } + { "In {\em " journal * "\/}" * } + if$ + } + { "In " key * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.crossref.editor} +{ editor #1 "{vv~}{ll}" format.name$ + editor num.names$ duplicate$ + #2 > + { pop$ " et~al." * } + { #2 < + 'skip$ + { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = + { " et~al." * } + { " and " * editor #2 "{vv~}{ll}" format.name$ * } + if$ + } + if$ + } + if$ +} + +FUNCTION {format.book.crossref} +{ volume empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + "In " + } + { "Volume" volume tie.or.space.connect + " of " * + } + if$ + editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { series empty$ + { "need editor, key, or series for " cite$ * " to crossref " * + crossref * warning$ + "" * + } + { "{\em " * series * "\/}" * } + if$ + } + { key * } + if$ + } + { format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.incoll.inproc.crossref} +{ editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { booktitle empty$ + { "need editor, key, or booktitle for " cite$ * " to crossref " * + crossref * warning$ + "" + } + { "In {\em " booktitle * "\/}" * } + if$ + } + { "In " key * } + if$ + } + { "In " format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref 
missing$ + { journal emphasize "journal" output.check + format.vol.num.pages output + format.date "year" output.check + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {book} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {booklet} +{ output.bibitem + format.authors output + new.block + format.title "title" output.check + howpublished address new.block.checkb + howpublished output + address output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + format.chapter.pages "chapter and pages" output.check + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { 
format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.chapter.pages output + new.sentence + publisher "publisher" output.check + address output + format.edition output + format.date "year" output.check + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {inproceedings} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.pages output + address empty$ + { organization publisher new.sentence.checkb + organization output + publisher output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + new.sentence + organization output + publisher output + } + if$ + } + { format.incoll.inproc.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {conference} { inproceedings } + +FUNCTION {manual} +{ output.bibitem + author empty$ + { organization empty$ + 'skip$ + { organization output.nonnull + address output + } + if$ + } + { format.authors output.nonnull } + if$ + new.block + format.btitle "title" output.check + author empty$ + { organization empty$ + { address new.block.checka + address output + } + 'skip$ + if$ + } + { organization address new.block.checkb + organization output + address output + } + if$ + format.edition output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + "Master's thesis" format.thesis.type output.nonnull + school "school" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {misc} +{ 
output.bibitem + format.authors output + title howpublished new.block.checkb + format.title output + howpublished new.block.checka + howpublished output + format.date output + new.block + note output + fin.entry + empty.misc.check +} + +FUNCTION {phdthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.btitle "title" output.check + new.block + "PhD thesis" format.thesis.type output.nonnull + school "school" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + editor empty$ + { organization output } + { format.editors output.nonnull } + if$ + new.block + format.btitle "title" output.check + format.bvolume output + format.number.series output + address empty$ + { editor empty$ + { publisher new.sentence.checka } + { organization publisher new.sentence.checkb + organization output + } + if$ + publisher output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + new.sentence + editor empty$ + 'skip$ + { organization output } + if$ + publisher output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + format.tr.number output.nonnull + institution "institution" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + note "note" output.check + format.date output + fin.entry +} + +FUNCTION {default.type} { misc } + +MACRO {jan} {"Jan."} + +MACRO {feb} {"Feb."} + +MACRO {mar} {"Mar."} + +MACRO {apr} {"Apr."} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"Aug."} + +MACRO {sep} {"Sept."} + +MACRO {oct} {"Oct."} + +MACRO {nov} 
{"Nov."} + +MACRO {dec} {"Dec."} + +MACRO {acmcs} {"ACM Comput. Surv."} + +MACRO {acta} {"Acta Inf."} + +MACRO {cacm} {"Commun. ACM"} + +MACRO {ibmjrd} {"IBM J. Res. Dev."} + +MACRO {ibmsj} {"IBM Syst.~J."} + +MACRO {ieeese} {"IEEE Trans. Softw. Eng."} + +MACRO {ieeetc} {"IEEE Trans. Comput."} + +MACRO {ieeetcad} + {"IEEE Trans. Comput.-Aided Design Integrated Circuits"} + +MACRO {ipl} {"Inf. Process. Lett."} + +MACRO {jacm} {"J.~ACM"} + +MACRO {jcss} {"J.~Comput. Syst. Sci."} + +MACRO {scp} {"Sci. Comput. Programming"} + +MACRO {sicomp} {"SIAM J. Comput."} + +MACRO {tocs} {"ACM Trans. Comput. Syst."} + +MACRO {tods} {"ACM Trans. Database Syst."} + +MACRO {tog} {"ACM Trans. Gr."} + +MACRO {toms} {"ACM Trans. Math. Softw."} + +MACRO {toois} {"ACM Trans. Office Inf. Syst."} + +MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."} + +MACRO {tcs} {"Theoretical Comput. Sci."} + +READ + +FUNCTION {sortify} +{ purify$ + "l" change.case$ +} + +INTEGERS { len } + +FUNCTION {chop.word} +{ 's := + 'len := + s #1 len substring$ = + { s len #1 + global.max$ substring$ } + 's + if$ +} + +FUNCTION {sort.format.names} +{ 's := + #1 'nameptr := + "" + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { nameptr #1 > + { " " * } + 'skip$ + if$ + s nameptr "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" format.name$ 't := + nameptr numnames = t "others" = and + { "et al" * } + { t sortify * } + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {sort.format.title} +{ 't := + "A " #2 + "An " #3 + "The " #4 t chop.word + chop.word + chop.word + sortify + #1 global.max$ substring$ +} + +FUNCTION {author.sort} +{ author empty$ + { key empty$ + { "to sort, need author or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.editor.sort} +{ author empty$ + { editor empty$ + { key empty$ + { "to sort, need author, editor, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ 
+ } + { editor sort.format.names } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.organization.sort} +{ author empty$ + { organization empty$ + { key empty$ + { "to sort, need author, organization, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { "The " #4 organization chop.word sortify } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {editor.organization.sort} +{ editor empty$ + { organization empty$ + { key empty$ + { "to sort, need editor, organization, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { "The " #4 organization chop.word sortify } + if$ + } + { editor sort.format.names } + if$ +} + +FUNCTION {presort} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.sort + { type$ "proceedings" = + 'editor.organization.sort + { type$ "manual" = + 'author.organization.sort + 'author.sort + if$ + } + if$ + } + if$ + " " + * + year field.or.null sortify + * + " " + * + title field.or.null + sort.format.title + * + #1 entry.max$ substring$ + 'sort.key$ := +} + +ITERATE {presort} + +SORT + +STRINGS { longest.label } + +INTEGERS { number.label longest.label.width } + +FUNCTION {initialize.longest.label} +{ "" 'longest.label := + #1 'number.label := + #0 'longest.label.width := +} + +FUNCTION {longest.label.pass} +{ number.label int.to.str$ 'label := + number.label #1 + 'number.label := + label width$ longest.label.width > + { label 'longest.label := + label width$ 'longest.label.width := + } + 'skip$ + if$ +} + +EXECUTE {initialize.longest.label} + +ITERATE {longest.label.pass} + +FUNCTION {begin.bib} +{ preamble$ empty$ + 'skip$ + { preamble$ write$ newline$ } + if$ + "\begin{thebibliography}{" longest.label * + "}\setlength{\itemsep}{-1ex}\small" * write$ newline$ +} + +EXECUTE {begin.bib} + +EXECUTE {init.state.consts} + +ITERATE {call.type$} + +FUNCTION {end.bib} +{ newline$ + "\end{thebibliography}" write$ newline$ +} + +EXECUTE {end.bib} + +% end of file ieee.bst +% 
--------------------------------------------------------------- diff --git a/lnet/doc/mpi.fig b/lnet/doc/mpi.fig new file mode 100644 index 0000000..e1a91b5 --- /dev/null +++ b/lnet/doc/mpi.fig @@ -0,0 +1,117 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 150 1650 900 2025 +4 1 0 100 0 0 10 0.0000 0 135 735 525 1800 Unexpected\001 +4 1 0 100 0 0 10 0.0000 0 135 585 525 1995 Messages\001 +-6 +6 150 150 900 525 +4 1 0 100 0 0 10 0.0000 0 135 615 525 300 Preposted\001 +4 1 0 100 0 0 10 0.0000 0 105 525 525 495 Receives\001 +-6 +6 2550 4125 3150 4725 +4 1 0 100 0 0 10 0.0000 0 135 600 2850 4275 Length=0\001 +4 1 0 100 0 0 10 0.0000 0 105 540 2850 4470 Truncate\001 +4 1 0 100 0 0 10 0.0000 0 105 480 2850 4665 No Ack\001 +-6 +6 1050 1575 1950 1875 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 1575 1950 1575 1950 1875 1050 1875 1050 1575 +4 1 0 100 0 0 10 0.0000 0 105 780 1500 1725 Match Short\001 +-6 +6 5400 1575 6300 2175 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 5400 1575 6300 1575 6300 2175 5400 2175 5400 1575 +4 1 0 100 0 0 10 0.0000 0 105 405 5850 1875 Buffer\001 +-6 +6 5400 2400 6300 3000 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 5400 2400 6300 2400 6300 3000 5400 3000 5400 2400 +4 1 0 100 0 0 10 0.0000 0 105 405 5850 2700 Buffer\001 +-6 +6 1050 2400 1950 2700 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 2400 1950 2400 1950 2700 1050 2700 1050 2400 +4 1 0 100 0 0 10 0.0000 0 105 780 1500 2550 Match Short\001 +-6 +6 1050 825 1950 1125 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 825 1950 825 1950 1125 1050 1125 1050 825 +4 1 0 100 0 0 10 0.0000 0 105 765 1500 975 Match None\001 +-6 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 1125 1500 1575 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 2025 4050 3375 +2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 + 150 675 6600 675 +2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 + 150 1350 6600 1350 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 4125 
3300 4125 3300 4725 2400 4725 2400 4125 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 4500 4050 3675 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 1725 5400 1725 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 2550 5400 2550 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 2850 4050 3450 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 1800 1500 2400 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 825 3300 825 3300 1275 2400 1275 2400 825 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 2625 1500 4125 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 4125 1950 4125 1950 4425 1050 4425 1050 4125 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 300 1500 825 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 975 2400 975 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 1725 2400 1725 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 2550 2400 2550 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 4275 2400 4275 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 1575 3300 1575 3300 2175 2400 2175 2400 1575 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 2400 3300 2400 3300 3000 2400 3000 2400 2400 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 4050 3300 5250 3300 5250 3750 4050 3750 4050 3300 +4 1 0 100 0 0 10 0.0000 0 105 885 1500 150 Match Entries\001 +4 1 0 100 0 0 10 0.0000 0 135 1290 2850 150 Memory Descriptors\001 +4 1 0 100 0 0 10 0.0000 0 135 1065 5850 150 Memory Regions\001 +4 1 0 100 0 0 10 0.0000 0 135 825 4500 150 Event Queues\001 +4 1 0 100 0 0 10 0.0000 0 105 585 525 1050 RcvMark\001 +4 1 0 100 0 0 10 0.0000 0 105 330 2850 1102 None\001 +4 1 0 100 0 0 10 0.0000 0 135 705 1500 4275 Match Any\001 +4 1 0 50 0 0 10 0.0000 0 150 810 2850 1725 max_offset=\001 +4 1 0 50 0 0 10 0.0000 0 150 840 2850 1875 n - short_len\001 
+4 1 0 50 0 0 10 0.0000 0 150 810 2850 2550 max_offset=\001 +4 1 0 50 0 0 10 0.0000 0 150 840 2850 2700 n - short_len\001 +4 1 0 50 0 0 10 0.0000 0 105 405 2850 2100 unlink\001 +4 1 0 50 0 0 10 0.0000 0 105 405 2850 2925 unlink\001 +4 1 0 100 0 0 10 0.0000 0 135 930 4650 3675 Message Queue\001 +4 1 0 100 0 0 10 0.0000 0 135 735 4650 3525 Unexpected\001 diff --git a/lnet/doc/portals.fig b/lnet/doc/portals.fig new file mode 100644 index 0000000..9b1271b --- /dev/null +++ b/lnet/doc/portals.fig @@ -0,0 +1,68 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1350 900 1650 900 1650 1200 1350 1200 1350 900 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1800 1350 2100 1350 2100 1650 1800 1650 1800 1350 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2250 1800 2550 1800 2550 2100 2250 2100 2250 1800 +2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 + 4200 375 4200 2100 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 525 600 1125 600 1125 2100 525 2100 525 600 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 4425 1275 4875 1275 4875 1950 4425 1950 4425 1275 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2550 1200 3150 1200 3150 1500 2550 1500 2550 1200 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3000 1425 4425 1425 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3600 825 3750 825 3750 1125 3600 1125 3600 825 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2025 1425 2550 1425 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 4425 750 4875 750 4875 1125 4425 1125 4425 750 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3675 975 4425 975 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 2 + 0 0 1.00 60.00 120.00 + 825 1050 1350 1050 + 0.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 1500 1125 1500 1350 1500 1500 1650 1500 1800 1500 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 1950 1575 1950 1800 1950 1950 2100 1950 2250 1950 + 0.000 1.000 
1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2 + 525 975 1125 975 + 0.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2 + 525 1125 1125 1125 + 0.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 7 + 0 0 1.00 60.00 120.00 + 3000 1275 3150 1275 3300 1275 3300 1125 3300 975 3450 975 + 3600 975 + 0.000 1.000 1.000 1.000 1.000 1.000 0.000 +4 0 0 100 0 0 10 0.0000 0 105 690 1275 750 Match List\001 +4 1 0 100 0 0 10 0.0000 0 105 780 825 525 Portal Table\001 +4 2 0 100 0 0 10 0.0000 0 135 825 4050 2025 Library Space\001 +4 0 0 100 0 0 10 0.0000 0 135 1110 4350 2175 Application Space\001 +4 1 0 100 0 0 10 0.0000 0 135 660 2850 1050 Descriptor\001 +4 1 0 100 0 0 10 0.0000 0 135 540 2850 825 Memory\001 +4 1 0 100 0 0 10 0.0000 0 135 765 3750 675 Event Queue\001 +4 1 0 100 0 0 10 0.0000 0 135 495 4650 675 Regions\001 +4 1 0 100 0 0 10 0.0000 0 135 540 4650 525 Memory\001 diff --git a/lnet/doc/portals3.bib b/lnet/doc/portals3.bib new file mode 100644 index 0000000..323b99f --- /dev/null +++ b/lnet/doc/portals3.bib @@ -0,0 +1,124 @@ +@Article{ Cplant, + title = { {M}assively {P}arallel {C}omputing with + {C}ommodity {C}omponents }, + author = { Ron Brightwell and David S. Greenberg and Arthur + B. 
Maccabe and Rolf Riesen }, + journal = { Parallel Computing }, + volume = { 26 }, + month = { February }, + pages = { 243-266 }, + year = { 2000 } +} + +@Manual{ Portals, + organization = { Sandia National Laboratories }, + title = { {P}uma {P}ortals }, + note = { http://www.cs.sandia.gov/puma/portals }, + year = { 1997 } +} + +@Techreport{ VIA, + title = { {V}irtual {I}nterface {A}rchitecture + {S}pecification {V}ersion 1.0 }, + author = { {Compaq, Microsoft, and Intel} }, + institution = { Compaq, Microsoft, and Intel }, + month = { December }, + year = { 1997 } +} + +@Techreport{ ST, + title = { {I}nformation {T}echnology - {S}cheduled + {T}ransfer {P}rotocol - {W}orking {D}raft 2.0 }, + author = { {Task Group of Technical Committee T11} }, + institution = { Accredited Standards Committee NCITS }, + month = { July }, + year = { 1998 } +} + +@Manual{ TFLOPS, + organization = { Sandia National Laboratories }, + title = { ASCI Red }, + note = { http://www.sandia.gov/ASCI/TFLOP }, + year = { 1996 } +} + +@Techreport{ GM, + title = { The {GM} {M}essage {P}assing {S}ystem }, + author = { {Myricom, Inc.} }, + institution = { {Myricom, Inc.} }, + year = { 1997 }, +} + +@Article{ MPIstandard, + title = { {MPI}: {A} {M}essage-{P}assing {I}nterface standard }, + author = { {Message Passing Interface Forum} }, + journal = { The International Journal of Supercomputer Applications + and High Performance Computing }, + volume = { 8 }, + year = { 1994 } +} + +@Inproceedings{ PumaOS, + author = "Lance Shuler and Chu Jong and Rolf Riesen and + David van Dresser and Arthur B. Maccabe and + Lee Ann Fisk and T. Mack Stallcup", + booktitle = "Proceeding of the 1995 Intel Supercomputer + User's Group Conference", + title = "The {P}uma Operating System for Massively Parallel Computers", + organization = "Intel Supercomputer User's Group", + year = 1995 +} + +@InProceedings{ SUNMOS, +author = "Arthur B. Maccabe and Kevin S. McCurley and Rolf Riesen and + Stephen R. 
Wheat", +title = "{SUNMOS} for the {Intel} {Paragon}: A Brief User's Guide", +booktitle = "Proceedings of the {Intel} Supercomputer Users' Group. 1994 + Annual North America Users' Conference.", +year = 1994, +pages = "245--251", +month = "June", +location = "ftp.cs.sandia.gov /pub/sunmos/papers/ISUG94-1.ps" +} + +@InProceedings { PumaMPI, + title = { Design and Implementation of {MPI} on {P}uma Portals }, + author = { Ron Brightwell and Lance Shuler }, + booktitle = { Proceedings of the Second MPI Developer's Conference }, + pages = { 18-25 }, + month = { July }, + year = { 1996 } +} + +@Inproceedings{ FM2, + author = { Mario Lauria and Scott Pakin and Andrew Chien }, + title = { {E}fficient {L}ayering for {H}igh {S}peed + {C}ommunication: {F}ast {M}essages 2.x }, + Booktitle = { Proceedings of the IEEE International Symposium + on High Performance Distributed Computing }, + year = { 1998 } +} + +@Manual { CraySHMEM, + title = "SHMEM Technical Note for C, SG-2516 2.3", + organization = "Cray Research, Inc.", + month = "October", + year = 1994 +} + +@Manual { MPI2, + title = "{MPI}-2: {E}xtensions to the {M}essage-{P}assing {I}nterface", + organization = "Message Passing Interface Forum", + note = "http://www.mpi-forum.org/docs/mpi-20-html/mpi2-report.html", + month = "July", + year = 1997 +} + +@InProceedings { PMMPI, + title = { {The Design and Implementation of Zero Copy MPI Using + Commodity Hardware with a High Performance Network} }, + author = { Francis O'Carroll and Hiroshi Tezuka and Atsushi Hori + and Yutaka Ishikawa }, + booktitle = { Proceedings of the ICS }, + year = { 1998 } +} diff --git a/lnet/doc/portals3.lyx b/lnet/doc/portals3.lyx new file mode 100644 index 0000000..8429280 --- /dev/null +++ b/lnet/doc/portals3.lyx @@ -0,0 +1,15944 @@ +#LyX 1.2 created this file. 
For more info see http://www.lyx.org/ +\lyxformat 220 +\textclass report +\begin_preamble +\usepackage{fullpage} +\renewenvironment{comment}% +{\begin{quote}\textbf{Discussion}: \slshape}% +{\end{quote}} +\pagestyle{myheadings} +\end_preamble +\language american +\inputencoding auto +\fontscheme pslatex +\graphics default +\paperfontsize 10 +\spacing single +\papersize letterpaper +\paperpackage a4 +\use_geometry 0 +\use_amsmath 0 +\use_natbib 0 +\use_numerical_citations 0 +\paperorientation portrait +\secnumdepth 2 +\tocdepth 2 +\paragraph_separation indent +\defskip medskip +\quotes_language english +\quotes_times 2 +\papercolumns 1 +\papersides 2 +\paperpagestyle headings + +\layout Title + +The Portals 3.2 Message Passing Interface +\newline + Revision 1.1 +\layout Author + +Ron Brightwell +\begin_inset Foot +collapsed true + +\layout Standard + +R. + Brightwell and R. + Riesen are with the Scalable Computing Systems Department, Sandia National + Laboratories, P.O. + Box 5800, Albuquerque, NM\SpecialChar ~ +\SpecialChar ~ +87111-1110, bright@cs.sandia.gov, rolf@cs.sandia.gov. +\end_inset + +, Arthur B. + Maccabe +\begin_inset Foot +collapsed true + +\layout Standard + +A. + B. + Maccabe is with the Computer Science Department, University of New Mexico, + Albuquerque, NM\SpecialChar ~ +\SpecialChar ~ +87131-1386, maccabe@cs.unm.edu. +\end_inset + +, Rolf Riesen and Trammell Hudson +\layout Abstract + +This report presents a specification for the Portals 3.2 message passing + interface. + Portals 3.2 is intended to allow scalable, high-performance network communicatio +n between nodes of a parallel computing system. + Specifically, it is designed to support a parallel computing platform composed + of clusters of commodity workstations connected by a commodity system area + network fabric. + In addition, Portals 3.2 is well suited to massively parallel processing + and embedded systems. 
+ Portals 3.2 represents an adaption of the data movement layer developed + for massively parallel processing platforms, such as the 4500-node Intel + TeraFLOPS machine. + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +clearpage +\backslash +pagenumbering{roman} +\backslash +setcounter{page}{3} +\end_inset + + +\layout Standard + + +\begin_inset LatexCommand \tableofcontents{} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\end_inset + + +\layout Standard + + +\begin_inset FloatList figure + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\end_inset + + +\layout Standard + + +\begin_inset FloatList table + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\end_inset + + +\layout Chapter* + +Summary of Changes for Revision 1.1 +\layout Enumerate + +Updated version number to 3.2 throughout the document +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sub:PtlGetId} + +\end_inset + +: added +\family typewriter +PTL_SEGV +\family default + to error list for +\shape italic +PtlGetId +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +: added +\family typewriter +PTL_ML_TOOLONG +\family default + to error list for +\shape italic +PtlMEAttach +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:meunlink} + +\end_inset + +: removed text referring to a list of associated memory descriptors. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + +: added text to describe unlinking a free-floating memory descriptor. 
+\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:types} + +\end_inset + +: added entry for +\family typewriter +ptl_seq_t +\family default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +added definition of +\family typewriter +max_offset +\family default +. +\layout Enumerate + +added text to clarify +\family typewriter +PTL_MD_MANAGE_REMOTE +\family default +. +\end_deeper +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + +: modified text for +\family typewriter +unlink_op +\family default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +: added text to clarify multiple calls to +\shape italic +PtlNIInit +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + +: added text to clarify +\family typewriter +unlink_nofit +\family default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:receiving} + +\end_inset + +: removed text indicating that an MD will reject a message if the associated + EQ is full. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + +: added +\family typewriter +PTL_MD_INUSE +\family default + error code and text to indicate that only MDs with no pending operations + can be unlinked. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:retcodes} + +\end_inset + +: added +\family typewriter +PTL_MD_INUSE +\family default + return code. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + +: added user id field, MD handle field, and NI specific failure field to + the +\family typewriter +ptl_event_t +\family default + structure. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:types} + +\end_inset + +: added +\family typewriter +ptl_ni_fail_t +\family default +. 
+\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + +: added +\family typewriter +PTL_EVENT_UNLINK +\family default + event type. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:func} + +\end_inset + +: removed +\shape slanted +PtlTransId +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, Section +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + +, Section +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + +: listed allowable constants with relevant fields. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:func} + +\end_inset + +: added +\shape italic +PtlMEAttachAny +\shape default + function. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:retcodes} + +\end_inset + +: added +\family typewriter +PTL_PT_FULL +\family default + return code for +\shape italic +PtlMEAttachAny +\shape default +. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:oconsts} + +\end_inset + +: updated to reflect new event types. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + +: added +\family typewriter +ptl_nid_t +\family default +, +\family typewriter +ptl_pid_t +\family default +, and +\family typewriter +ptl_uid_t +\family default +. +\layout Chapter* + +Summary of Changes for Version 3.1 +\layout Section* + +Thread Issues +\layout Standard + +The most significant change to the interface from version 3.0 to 3.1 involves + the clarification of how the interface interacts with multi-threaded applicatio +ns. + We adopted a generic thread model in which processes define an address + space and threads share the address space. 
+ Consideration of the API in the light of threads led to several clarifications + throughout the document: +\layout Enumerate + +Glossary: +\begin_deeper +\layout Enumerate + +added a definition for +\emph on +thread +\emph default +, +\layout Enumerate + +reworded the definition for +\emph on +process +\emph default +. + +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:apiover} + +\end_inset + +: added section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:threads} + +\end_inset + + to describe the multi-threading model used by the Portals API. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ptlinit} + +\end_inset + +: +\emph on +PtlInit +\emph default + must be called at least once and may be called any number of times. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ptlfini} + +\end_inset + +: +\emph on +PtlFini +\emph default + should be called once as the process is terminating and not as each thread + terminates. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:pid} + +\end_inset + +: Portals does not define thread ids. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + +: network interfaces are associated with processes, not threads. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +: +\emph on +PtlNIInit +\emph default + must be called at least once and may be called any number of times. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:eqget} + +\end_inset + +: +\emph on +PtlEQGet +\emph default + returns +\family typewriter +PTL_EQ_EMPTY +\family default + if a thread is blocked on +\emph on +PtlEQWait +\emph default +. 
+ +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:eqwait} + +\end_inset + +: waiting threads are awakened in FIFO order. + +\layout Standard + +Two functions, +\emph on +PtlNIBarrier +\emph default + and +\emph on +PtlEQCount +\emph default + were removed from the API. + +\emph on +PtlNIBarrier +\emph default + was defined to block the calling process until all of the processes in + the application group had invoked +\emph on +PtlNIBarrier +\emph default +. + We now consider this functionality, along with the concept of groups (see + the discussion under +\begin_inset Quotes eld +\end_inset + +other changes +\begin_inset Quotes erd +\end_inset + +), to be part of the runtime system, not part of the Portals API. + +\emph on +PtlEQCount +\emph default + was defined to return the number of events in an event queue. + Because external operations may lead to new events being added and other + threads may remove events, the value returned by +\emph on +PtlEQCount +\emph default + would have to be a hint about the number of events in the event queue. +\layout Section* + +Handling small, unexpected messages +\layout Standard + +Another set of changes relates to handling small unexpected messages in + MPI. + In designing version 3.0, we assumed that each unexpected message would + be placed in a unique memory descriptor. + To avoid the need to process a long list of memory descriptors, we moved + the memory descriptors out of the match list and hung them off of a single + match list entry. + In this way, large unexpected messages would only encounter a single +\begin_inset Quotes eld +\end_inset + +short message +\begin_inset Quotes erd +\end_inset + + match list entry before encountering the +\begin_inset Quotes eld +\end_inset + +long message +\begin_inset Quotes erd +\end_inset + + match list entry. + Experience with this strategy identified resource management problems with + this approach. 
+ In particular, a long sequence of very short (or zero length) messages + could quickly exhaust the memory descriptors constructed for handling unexpecte +d messages. + Our new strategy involves the use of several very large memory descriptors + for small unexpected messages. + Consecutive unexpected messages will be written into the first of these + memory descriptors until the memory descriptor fills up. + When the first of the +\begin_inset Quotes eld +\end_inset + +small memory +\begin_inset Quotes erd +\end_inset + + descriptors fills up, it will be unlinked and subsequent short messages + will be written into the next +\begin_inset Quotes eld +\end_inset + +short message +\begin_inset Quotes erd +\end_inset + + memory descriptor. + In this case, a +\begin_inset Quotes eld +\end_inset + +short message +\begin_inset Quotes erd +\end_inset + + memory descriptor will be declared full when it does not have sufficient + space for the largest small unexpected message. +\layout Standard + +This led to two significant changes. + First, each match list entry now has a single memory descriptor rather + than a list of memory descriptors. + Second, in addition to exceeding the operation threshold, a memory descriptor + can be unlinked when the local offset exceeds a specified value. + These changes have led to several changes in this document: +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{subsec:paddress} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +removed references to the memory descriptor list, +\layout Enumerate + +changed the portals address translation description to indicate that unlinking + a memory descriptor implies unlinking the associated match list entry--match + list entries can no longer be unlinked independently from the memory descriptor. 
+ +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +removed unlink from argument list, +\layout Enumerate + +removed description of +\family typewriter +ptl_unlink +\family default + type, +\layout Enumerate + +changed wording of the error condition when the Portal table index already + has an associated match list. + +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + +: removed unlink from argument list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + +: added +\family typewriter +max_offset +\family default +. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +added description of +\family typewriter +ptl_unlink +\family default + type, +\layout Enumerate + +removed reference to memory descriptor lists, +\layout Enumerate + +changed wording of the error condition when match list entry already has + an associated memory descriptor, +\layout Enumerate + +changed the description of the +\family typewriter +unlink +\family default + argument. + +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +: removed +\family typewriter +PtlMDInsert +\family default + operation. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdbind} + +\end_inset + +: removed references to memory descriptor list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + +: removed reference to memory descriptor list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:summary} + +\end_inset + +: removed references to PtlMDInsert. 
+ +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:semantics} + +\end_inset + +: removed reference to memory descriptor list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:exmpi} + +\end_inset + +: revised the MPI example to reflect the changes to the interface. + +\layout Standard + +Several changes have been made to improve the general documentation of the + interface. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + +: documented the special value +\family typewriter +PTL_EQ_NONE +\family default +. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + +: documented the special value +\family typewriter +PTL_ID_ANY +\family default +. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdbind} + +\end_inset + +: documented the return value +\family typewriter +PTL_INV_EQ +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdupdate} + +\end_inset + +: clarified the description of the +\emph on +PtlMDUpdate +\emph default + function. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:implvals} + +\end_inset + +: introduced a new section to document the implementation defined values. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:summary} + +\end_inset + +: modified Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:oconsts} + +\end_inset + + to indicate where each constant is introduced and where it is used. + +\layout Section* + +Other changes +\layout Subsection* + +Implementation defined limits (Section +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +) +\layout Standard + +The earlier version provided implementation defined limits for the maximum + number of match entries, the maximum number of memory descriptors, etc. 
+ Rather than spanning the entire implementation, these limits are now associated + with individual network interfaces. +\layout Subsection* + +Added User Ids (Section +\begin_inset LatexCommand \ref{sec:uid} + +\end_inset + +) +\layout Standard + +Group Ids had been used to simplify access control entries. + In particular, a process could allow access for all of the processes in + a group. + User Ids have been introduced to regain this functionality. + We use user ids to fill this role. +\layout Subsection* + +Removed Group Ids and Rank Ids (Section +\begin_inset LatexCommand \ref{sec:pid} + +\end_inset + +) +\layout Standard + +The earlier version of Portals had two forms for addressing processes: <node id, process id> and <group id, rank id>. + A process group was defined as the collection of processes created during + application launch. + Each process in the group was given a unique rank id in the range 0 to + +\begin_inset Formula $n-1$ +\end_inset + + where +\begin_inset Formula $n$ +\end_inset + + was the number of processes in the group. + We removed groups because they are better handled in the runtime system. +\layout Subsection* + +Match lists (Section +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +) +\layout Standard + +It is no longer illegal to have an existing match entry when calling PtlMEAttach. + A position argument was added to the list of arguments supplied to +\emph on +PtlMEAttach +\emph default + to specify whether the new match entry is prepended or appended to the + existing list. + If there is no existing match list, the position argument is ignored. +\layout Subsection* + +Unlinking Memory Descriptors (Section +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +) +\layout Standard + +Previously, a memory descriptor could be unlinked if the offset exceeded + a threshold upon the completion of an operation. + In this version, the unlinking is delayed until there is a matching operation + which requires more memory than is currently available in the descriptor. 
+ In addition to changes in section, this led to a revision of Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:flow} + +\end_inset + +. +\layout Subsection* + +Split Phase Operations and Events (Section +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + +) +\layout Standard + +Previously, there were five types of events: +\family typewriter +PTL_EVENT_PUT +\family default +, +\family typewriter +PTL_EVENT_GET +\family default +, +\family typewriter +PTL_EVENT_REPLY +\family default +, +\family typewriter +PTL_EVENT_SENT +\family default +, and +\family typewriter +PTL_EVENT_ACK. + +\family default +The first four of these reflected the completion of potentially long operations. + We have introduced new event types to reflect the fact that long operations + have a distinct starting point and a distinct completion point. + Moreover, the completion may be successful or unsuccessful. +\layout Standard + +In addition to providing a mechanism for reporting failure to higher levels + of software, this split provides an opportunity for improved ordering + semantics. + Previously, if one process initiated two operations (e.g., two put operations) + on a remote process, these operations were guaranteed to complete in the + same order that they were initiated. + Now, we only guarantee that the initiation events are delivered in the + same order. + In particular, the operations do not need to complete in the order that + they were initiated. +\layout Subsection* + +Well known process ids (Section +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +) +\layout Standard + +To support the notion of +\begin_inset Quotes eld +\end_inset + +well known process ids, +\begin_inset Quotes erd +\end_inset + + we added a process id argument to the arguments for PtlNIInit. +\layout Chapter* + +Glossary +\layout Description + +API Application Programming Interface. + A definition of the functions and semantics provided by a library of functions. 
+ +\layout Description + +Initiator A +\emph on +process +\emph default + that initiates a message operation. + +\layout Description + +Message An application-defined unit of data that is exchanged between +\emph on +processes +\emph default +. + +\layout Description + +Message\SpecialChar ~ +Operation Either a put operation, which writes data, or a get operation, + which reads data. + +\layout Description + +Network A network provides point-to-point communication between +\emph on +nodes +\emph default +. + Internally, a network may provide multiple routes between endpoints (to + improve fault tolerance or to improve performance characteristics); however, + multiple paths will not be exposed outside of the network. + +\layout Description + +Node A node is an endpoint in a +\emph on +network +\emph default +. + Nodes provide processing capabilities and memory. + A node may provide multiple processors (an SMP node) or it may act as a + +\emph on +gateway +\emph default + between networks. + +\layout Description + +Process A context of execution. + A process defines a virtual memory (VM) context. + This context is not shared with other processes. + Several threads may share the VM context defined by a process. + +\layout Description + +Target A +\emph on +process +\emph default + that is acted upon by a message operation. + +\layout Description + +Thread A context of execution that shares a VM context with other threads. + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\layout Standard + +\backslash +setcounter{page}{1} +\backslash +pagenumbering{arabic} +\end_inset + + +\layout Chapter + +Introduction +\begin_inset LatexCommand \label{sec:intro} + +\end_inset + + +\layout Section + +Overview +\layout Standard + +This document describes an application programming interface for message + passing between nodes in a system area network. 
+ The goal of this interface is to improve the scalability and performance + of network communication by defining the functions and semantics of message + passing required for scaling a parallel computing system to ten thousand + nodes. + This goal is achieved by providing an interface that will allow a quality + implementation to take advantage of the inherently scalable design of Portals. +\layout Standard + +This document is divided into several sections: +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:intro} + +\end_inset + +---Introduction This section describes the purpose and scope of the Portals + API. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:apiover} + +\end_inset + +---An\SpecialChar ~ +Overview\SpecialChar ~ +of\SpecialChar ~ +the\SpecialChar ~ +Portals\SpecialChar ~ +3.2\SpecialChar ~ +API This section gives a brief overview of the + Portals API. + The goal is to introduce the key concepts and terminology used in the descripti +on of the API. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:api} + +\end_inset + +---The\SpecialChar ~ +Portals\SpecialChar ~ +3.2\SpecialChar ~ +API This section describes the functions and semantics of + the Portals application programming interface. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:semantics} + +\end_inset + +---The\SpecialChar ~ +Semantics\SpecialChar ~ +of\SpecialChar ~ +Message\SpecialChar ~ +Transmission This section describes the semantics + of message transmission. + In particular, it describes the information transmitted in each type of message and + the processing of incoming messages. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:examples} + +\end_inset + +---Examples This section presents several examples intended to illustrate + the use of the Portals API. 
+ +\layout Section + +Purpose +\layout Standard + +Existing message passing technologies available for commodity cluster networking + hardware do not meet the scalability goals required by the Cplant\SpecialChar ~ + +\begin_inset LatexCommand \cite{Cplant} + +\end_inset + + project at Sandia National Laboratories. + The goal of the Cplant project is to construct a commodity cluster that + can scale to the order of ten thousand nodes. + This number greatly exceeds the capacity for which existing message passing + technologies have been designed and implemented. +\layout Standard + +In addition to the scalability requirements of the network, these technologies + must also be able to support a scalable implementation of the Message Passing + Interface (MPI)\SpecialChar ~ + +\begin_inset LatexCommand \cite{MPIstandard} + +\end_inset + + standard, which has become the +\shape italic +de facto +\shape default + standard for parallel scientific computing. + While MPI does not impose any scalability limitations, existing message + passing technologies do not provide the functionality needed to allow implement +ations of MPI to meet the scalability requirements of Cplant. +\layout Standard + +The following are properties of a network architecture that do not impose + any inherent scalability limitations: +\layout Itemize + +Connectionless - Many connection-oriented architectures, such as VIA\SpecialChar ~ + +\begin_inset LatexCommand \cite{VIA} + +\end_inset + + and TCP/IP sockets, have limitations on the number of peer connections + that can be established. + +\layout Itemize + +Network independence - Many communication systems depend on the host processor + to perform operations in order for messages in the network to be consumed. + Message consumption from the network should not be dependent on host processor + activity, such as the operating system scheduler or user-level thread scheduler. 
+ +\layout Itemize + +User-level flow control - Many communication systems manage flow control + internally to avoid depleting resources, which can significantly impact + performance as the number of communicating processes increases. + +\layout Itemize + +OS Bypass - High performance network communication should not involve memory + copies into or out of a kernel-managed protocol stack. + +\layout Standard + +The following are properties of a network architecture that do not impose + scalability limitations for an implementation of MPI: +\layout Itemize + +Receiver-managed - Sender-managed message passing implementations require + a persistent block of memory to be available for every process, requiring + memory resources to increase with job size and requiring user-level flow + control mechanisms to manage these resources. + +\layout Itemize + +User-level Bypass - While OS Bypass is necessary for high-performance, it + alone is not sufficient to support the Progress Rule of MPI asynchronous + operations. + +\layout Itemize + +Unexpected messages - Few communication systems have support for receiving + messages for which there is no prior notification. + Support for these types of messages is necessary to avoid flow control + and protocol overhead. + +\layout Section + +Background +\layout Standard + +Portals was originally designed for and implemented on the nCube machine + as part of the SUNMOS (Sandia/UNM OS)\SpecialChar ~ + +\begin_inset LatexCommand \cite{SUNMOS} + +\end_inset + + and Puma\SpecialChar ~ + +\begin_inset LatexCommand \cite{PumaOS} + +\end_inset + + lightweight kernel development projects. + Portals went through two design phases, the latter of which is used on + the 4500-node Intel TeraFLOPS machine\SpecialChar ~ + +\begin_inset LatexCommand \cite{TFLOPS} + +\end_inset + +. 
+ Portals have been very successful in meeting the needs of such a large + machine, not only as a layer for a high-performance MPI implementation\SpecialChar ~ + +\begin_inset LatexCommand \cite{PumaMPI} + +\end_inset + +, but also for implementing the scalable run-time environment and parallel + I/O capabilities of the machine. +\layout Standard + +The second generation Portals implementation was designed to take full advantage + of the hardware architecture of large MPP machines. + However, efforts to implement this same design on commodity cluster technology + identified several limitations, due to the differences in network hardware + as well as to shortcomings in the design of Portals. +\layout Section + +Scalability +\layout Standard + +The primary goal in the design of Portals is scalability. + Portals are designed specifically for an implementation capable of supporting + a parallel job running on tens of thousands of nodes. + Performance is critical only in terms of scalability. + That is, the level of message passing performance is characterized by how + far it allows an application to scale and not by how it performs in micro-bench +marks (e.g., a two node bandwidth or latency test). +\layout Standard + +The Portals API is designed to allow for scalability, not to guarantee it. + Portals cannot overcome the shortcomings of a poorly designed application + program. + Applications that have inherent scalability limitations, either through + design or implementation, will not be transformed by Portals into scalable + applications. + Scalability must be addressed at all levels. + Portals do not inhibit scalability, but do not guarantee it either. +\layout Standard + +To support scalability, the Portals interface maintains a minimal amount + of state. + Portals provide reliable, ordered delivery of messages between pairs of + processes. 
+ They are connectionless: a process is not required to explicitly establish + a point-to-point connection with another process in order to communicate. + Moreover, all buffers used in the transmission of messages are maintained + in user space. + The target process determines how to respond to incoming messages, and + messages for which there are no buffers are discarded. +\layout Section + +Communication Model +\layout Standard + +Portals combine the characteristics of both one-sided and two-sided communication. + They define a +\begin_inset Quotes eld +\end_inset + +matching put +\begin_inset Quotes erd +\end_inset + + operation and a +\begin_inset Quotes eld +\end_inset + +matching get +\begin_inset Quotes erd +\end_inset + + operation. + The destination of a put (or send) is not an explicit address; instead, + each message contains a set of match bits that allow the receiver to determine + where incoming messages should be placed. + This flexibility allows Portals to support both traditional one-sided operation +s and two-sided send/receive operations. +\layout Standard + +Portals allow the target to determine whether incoming messages are acceptable. + A target process can choose to accept message operations from any specific + process or can choose to ignore message operations from any specific process. +\layout Section + +Zero Copy, OS Bypass and Application Bypass +\layout Standard + +In traditional system architectures, network packets arrive at the network + interface card (NIC), are passed through one or more protocol layers in + the operating system, and eventually copied into the address space of the + application. + As network bandwidth began to approach memory copy rates, reduction of + memory copies became a critical concern. + This concern led to the development of zero-copy message passing protocols + in which message copies are eliminated or pipelined to avoid the loss of + bandwidth.
+\layout Standard + +A typical zero-copy protocol has the NIC generate an interrupt for the CPU + when a message arrives from the network. + The interrupt handler then controls the transfer of the incoming message + into the address space of the appropriate application. + The interrupt latency, the time from the initiation of an interrupt until + the interrupt handler is running, is fairly significant. + To avoid this cost, some modern NICs have processors that can be programmed + to implement part of a message passing protocol. + Given a properly designed protocol, it is possible to program the NIC to + control the transfer of incoming messages, without needing to interrupt + the CPU. + Because this strategy does not need to involve the OS on every message + transfer, it is frequently called +\begin_inset Quotes eld +\end_inset + +OS Bypass. +\begin_inset Quotes erd +\end_inset + + ST\SpecialChar ~ + +\begin_inset LatexCommand \cite{ST} + +\end_inset + +, VIA\SpecialChar ~ + +\begin_inset LatexCommand \cite{VIA} + +\end_inset + +, FM\SpecialChar ~ + +\begin_inset LatexCommand \cite{FM2} + +\end_inset + +, GM\SpecialChar ~ + +\begin_inset LatexCommand \cite{GM} + +\end_inset + +, and Portals are examples of OS Bypass protocols. +\layout Standard + +Many protocols that support OS Bypass still require that the application + actively participate in the protocol to ensure progress. + As an example, the long message protocol of PM requires that the application + receive and reply to a request to put or get a long message. + This complicates the runtime environment, requiring a thread to process + incoming requests, and significantly increases the latency required to + initiate a long message protocol. + The Portals message passing protocol does not require activity on the part + of the application to ensure progress. 
+ We use the term +\begin_inset Quotes eld +\end_inset + +Application Bypass +\begin_inset Quotes erd +\end_inset + + to refer to this aspect of the Portals protocol. +\layout Section + +Faults +\layout Standard + +Given the number of components that we are dealing with and the fact that + we are interested in supporting applications that run for very long times, + failures are inevitable. + The Portals API recognizes that the underlying transport may not be able + to successfully complete an operation once it has been initiated. + This is reflected in the fact that the Portals API reports three types + of events: events indicating the initiation of an operation, events indicating + the successful completion of an operation, and events indicating the unsuccessf +ul completion of an operation. + Every initiation event is eventually followed by a successful completion + event or an unsuccessful completion event. +\layout Standard + +Between the time an operation is started and the time that the operation + completes (successfully or unsuccessfully), any memory associated with + the operation should be considered volatile. + That is, the memory may be changed in unpredictable ways while the operation + is progressing. + Once the operation completes, the memory associated with the operation + will not be subject to further modification (from this operation). + Notice that unsuccessful operations may alter memory in an essentially + unpredictable fashion. +\layout Chapter + +An Overview of the Portals API +\begin_inset LatexCommand \label{sec:apiover} + +\end_inset + + +\layout Standard + +In this section, we give a conceptual overview of the Portals API. + The goal is to provide a context for understanding the detailed description + of the API presented in the next section. +\layout Section + +Data Movement +\begin_inset LatexCommand \label{sec:dmsemantics} + +\end_inset + + +\layout Standard + +A Portal represents an opening in the address space of a process. 
+ Other processes can use a Portal to read (get) or write (put) the memory + associated with the portal. + Every data movement operation involves two processes, the +\series bold +initiator +\series default + and the +\series bold +target +\series default +. + The initiator is the process that initiates the data movement operation. + The target is the process that responds to the operation by either accepting + the data for a put operation, or replying with the data for a get operation. +\layout Standard + +In this discussion, activities attributed to a process may refer to activities + that are actually performed by the process or +\emph on +on behalf of the process +\emph default +. + The inclusiveness of our terminology is important in the context of +\emph on +application bypass +\emph default +. + In particular, when we note that the target sends a reply in the case of + a get operation, it is possible that reply will be generated by another + component in the system, bypassing the application. +\layout Standard + +Figures\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:put} + +\end_inset + + and +\begin_inset LatexCommand \ref{fig:get} + +\end_inset + + present graphical interpretations of the Portal data movement operations: + put and get. + In the case of a put operation, the initiator sends a put request message + containing the data to the target. + The target translates the Portal addressing information in the request + using its local Portal structures. + When the request has been processed, the target optionally sends an acknowledge +ment message. 
+\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename put.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 218pt + lyxheight 119pt +\end_inset + + +\layout Caption + +Portal Put (Send) +\begin_inset LatexCommand \label{fig:put} + +\end_inset + + +\end_inset + + +\layout Standard + +In the case of a get operation, the initiator sends a get request to the + target. + As with the put operation, the target translates the Portal addressing + information in the request using its local Portal structures. + Once it has translated the Portal addressing information, the target sends + a reply that includes the requested data. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename get.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 218pt + lyxheight 119pt +\end_inset + + +\layout Caption + +Portal Get +\begin_inset LatexCommand \label{fig:get} + +\end_inset + + +\end_inset + + +\layout Standard + +We should note that Portal address translations are only performed on nodes + that respond to operations initiated by other nodes. + Acknowledgements and replies to get operations bypass the portals address + translation structures. +\layout Section + +Portal Addressing +\begin_inset LatexCommand \label{subsec:paddress} + +\end_inset + + +\layout Standard + +One-sided data movement models (e.g., shmem\SpecialChar ~ + +\begin_inset LatexCommand \cite{CraySHMEM} + +\end_inset + +, ST\SpecialChar ~ + +\begin_inset LatexCommand \cite{ST} + +\end_inset + +, MPI-2\SpecialChar ~ + +\begin_inset LatexCommand \cite{MPI2} + +\end_inset + +) typically use a triple to address memory on a remote node. + This triple consists of a process id, memory buffer id, and offset. 
+ The process id identifies the target process, the memory buffer id specifies + the region of memory to be used for the operation, and the offset specifies + an offset within the memory buffer. +\layout Standard + +In addition to the standard address components (process id, memory buffer + id, and offset), a Portal address includes a set of match bits. + This addressing model is appropriate for supporting one-sided operations + as well as traditional two-sided message passing operations. + Specifically, the Portals API provides the flexibility needed for an efficient + implementation of MPI-1, which defines two-sided operations with one-sided + completion semantics. +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:portals} + +\end_inset + + presents a graphical representation of the structures used by a target + in the interpretation of a Portal address. + The process id is used to route the message to the appropriate node and + is not reflected in this diagram. + The memory buffer id, called the +\series bold +portal id +\series default +, is used as an index into the Portal table. + Each element of the Portal table identifies a match list. + Each element of the match list specifies two bit patterns: a set of +\begin_inset Quotes eld +\end_inset + +don't care +\begin_inset Quotes erd +\end_inset + + bits, and a set of +\begin_inset Quotes eld +\end_inset + +must match +\begin_inset Quotes erd +\end_inset + + bits. + In addition to the two sets of match bits, each match list element has + at most one memory descriptor. + Each memory descriptor identifies a memory region and an optional event + queue. + The memory region specifies the memory to be used in the operation and + the event queue is used to record information about these operations. 
+\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename portals.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 305pt + lyxheight 106pt +\end_inset + + +\layout Caption + +Portal Addressing Structures +\begin_inset LatexCommand \label{fig:portals} + +\end_inset + + +\end_inset + + +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:flow} + +\end_inset + + illustrates the steps involved in translating a Portal address, starting + from the first element in a match list. + If the match criteria specified in the match list entry are met and the + memory descriptor list accepts the operation +\begin_inset Foot +collapsed true + +\layout Standard + +Memory descriptors can reject operations because a threshold has been exceeded + or because the memory region does not have sufficient space, see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + +, the operation (put or get) is performed using the memory region specified + in the memory descriptor. + If the memory descriptor specifies that it is to be unlinked when a threshold + has been exceeded, the match list entry is removed from the match list + and the resources associated with the memory descriptor and match list + entry are reclaimed. + Finally, if there is an event queue specified in the memory descriptor, + the operation is logged in the event queue. 
+\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename flow_new.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 447pt + lyxheight 282pt +\end_inset + + +\layout Caption + +Portals Address Translation +\begin_inset LatexCommand \label{fig:flow} + +\end_inset + + +\end_inset + + +\layout Standard + +If the match criteria specified in the match list entry are not met, or + there is no memory descriptor associated with the match list entry, or + the memory descriptor associated with the match list entry rejects the + operation, the address translation continues with the next match list entry. + If the end of the match list has been reached, the address translation + is aborted and the incoming request is discarded. +\layout Section + +Access Control +\layout Standard + +A process can control access to its portals using an access control list. + Each entry in the access control list specifies a process id and a Portal + table index. + The access control list is actually an array of entries. + Each incoming request includes an index into the access control list (i.e., + a +\begin_inset Quotes eld +\end_inset + +cookie +\begin_inset Quotes erd +\end_inset + + or hint). + If the id of the process issuing the request doesn't match the id specified + in the access control list entry or the Portal table index specified in + the request doesn't match the Portal table index specified in the access + control list entry, the request is rejected. + Process identifiers and Portal table indexes may include wild card values + to increase the flexibility of this mechanism. + +\layout Standard + +Two aspects of this design merit further discussion. + First, the model assumes that the information in a message header, the + sender's id in particular, is trustworthy.
+ In most contexts, we assume that the entity that constructs the header + is trustworthy; however, using cryptographic techniques, we could easily + devise a protocol that would ensure the authenticity of the sender. +\layout Standard + +Second, because the access check is performed by the receiver, it is possible + that a malicious process will generate thousands of messages that will + be denied by the receiver. + This could saturate the network and/or the receiver, resulting in a +\emph on +denial of service +\emph default + attack. + Moving the check to the sender using capabilities, would remove the potential + for this form of attack. + However, the solution introduces the complexities of capability management + (exchange of capabilities, revocation, protections, etc). +\layout Section + +Multi-threaded Applications +\begin_inset LatexCommand \label{sec:threads} + +\end_inset + + +\layout Standard + +The Portals API supports a generic view of multi-threaded applications. + From the perspective of the Portals API, an application program is defined + by a set of processes. + Each process defines a unique address space. + The Portals API defines access to this address space from other processes + (using portals addressing and the data movement operations). + A process may have one or more +\emph on +threads +\emph default + executing in its address space. + +\layout Standard + +With the exception of +\emph on +PtlEQWait +\emph default + every function in the Portals API is non-blocking and atomic with respect + to both other threads and external operations that result from data movement + operations. + While individual operations are atomic, sequences of these operations may + be interleaved between different threads and with external operations. + The Portals API does not provide any mechanisms to control this interleaving. + It is expected that these mechanisms will be provided by the API used to + create threads. 
+\layout Chapter + +The Portals API +\begin_inset LatexCommand \label{sec:api} + +\end_inset + + +\layout Section + +Naming Conventions +\begin_inset LatexCommand \label{sec:conv} + +\end_inset + + +\layout Standard + +The Portals API defines two types of entities: functions and types. + Function names always start with +\emph on +Ptl +\emph default + and use mixed upper and lower case. + When used in the body of this report, function names appear in italic face, + e.g., +\emph on +PtlInit +\emph default +. + The functions associated with an object type will have names that start + with +\emph on +Ptl +\emph default +, followed by the two letter object type code shown in Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:objcodes} + +\end_inset + +. + As an example, the function +\emph on +PtlEQAlloc +\emph default + allocates resources for an event queue. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Object Type Codes +\begin_inset LatexCommand \label{tab:objcodes} + +\end_inset + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\newline + +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\emph on +xx +\end_inset + + +\begin_inset Text + +\layout Standard + + Name +\end_inset + + +\begin_inset Text + +\layout Standard + + Section +\end_inset + + + + +\begin_inset Text + +\layout Standard + +EQ +\end_inset + + +\begin_inset Text + +\layout Standard + + Event Queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + MD +\end_inset + + +\begin_inset Text + +\layout Standard + + Memory Descriptor +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + + + + +\begin_inset
Text + +\layout Standard + + ME +\end_inset + + +\begin_inset Text + +\layout Standard + + Match list Entry +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + NI +\end_inset + + +\begin_inset Text + +\layout Standard + + Network Interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Type names use lower case with underscores to separate words. + Each type name starts with +\family typewriter +ptl_ +\family default + and ends with +\family typewriter +_t +\family default +. + When used in the body of this report, type names appear in a fixed font, + e.g., +\family typewriter +ptl_match_bits_t +\family default +. +\layout Standard + +Names for constants use upper case with underscores to separate words. + Each constant name starts with +\family typewriter +PTL_ +\family default +. + When used in the body of this report, constant names appear in a fixed font, + e.g., +\family typewriter +PTL_OK +\family default +. +\layout Section + +Base Types +\layout Standard + +The Portals API defines a variety of base types. + These types represent a simple renaming of the base types provided by the + C programming language. + In most cases these new type names have been introduced to improve type + safety and to avoid issues arising from differences in representation sizes + (e.g., 16-bit or 32-bit integers). +\layout Subsection + +Sizes +\begin_inset LatexCommand \label{sec:size-t} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_size_t +\family default + is an unsigned 64-bit integral type used for representing sizes.
+\layout Subsection + +Handles +\begin_inset LatexCommand \label{sec:handle-type} + +\end_inset + + +\layout Standard + +Objects maintained by the API are accessed through handles. + Handle types have names of the form +\family typewriter +ptl_handle_ +\emph on +xx +\emph default +_t +\family default +, where +\emph on +xx +\emph default + is one of the two letter object type codes shown in Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:objcodes} + +\end_inset + +. + For example, the type +\family typewriter +ptl_handle_ni_t +\family default + is used for network interface handles. +\layout Standard + +Each type of object is given a unique handle type to enhance type checking. + The type, +\family typewriter +ptl_handle_any_t +\family default +, can be used when a generic handle is needed. + Every handle value can be converted into a value of type +\family typewriter +ptl_handle_any_t +\family default + without loss of information. +\layout Standard + +Handles are not simple values. + Every portals object is associated with a specific network interface and + an identifier for this interface (along with an object identifier) is part + of the handle for the object. +\layout Standard + +The special value +\family typewriter +PTL_EQ_NONE +\family default +, of type +\family typewriter +ptl_handle_eq_t +\family default +, is used to indicate the absence of an event queue. + See sections +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + + and\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdupdate} + +\end_inset + + for uses of this value. +\layout Subsection + +Indexes +\begin_inset LatexCommand \label{sec:index-type} + +\end_inset + + +\layout Standard + +The types +\family typewriter +ptl_pt_index_t +\family default + and +\family typewriter +ptl_ac_index_t +\family default + are integral types used for representing Portal table indexes and access + control tables indexes, respectively. 
+ See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + + for limits on values of these types. +\layout Subsection + +Match Bits +\begin_inset LatexCommand \label{sec:mb-type} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_match_bits_t +\family default + is capable of holding unsigned 64-bit integer values. +\layout Subsection + +Network Interfaces +\begin_inset LatexCommand \label{sec:ni-type} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_interface_t +\family default + is an integral type used for identifying different network interfaces. + Users will need to consult the local documentation to determine appropriate + values for the interfaces available. + The special value +\family typewriter +PTL_IFACE_DEFAULT +\family default + identifies the default interface. +\layout Subsection + +Identifiers +\begin_inset LatexCommand \label{sec:id-type} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_nid_t +\family default + is an integral type used for representing node ids +\family typewriter +, ptl_pid_t +\family default + is an integral type for representing process ids, and +\family typewriter +ptl_uid_t +\family default + is an integral type for representing user ids. +\layout Standard + +The special value +\family typewriter +PTL_PID_ANY +\family default + matches any process identifier, +\family typewriter +PTL_NID_ANY +\family default + matches any node identifier, + and +\family typewriter +PTL_UID_ANY +\family default + matches any user identifier. + See sections +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + + and\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + for uses of these values.
+\layout Subsection + +Status Registers +\begin_inset LatexCommand \label{sec:stat-type} + +\end_inset + + +\layout Standard + +Each network interface maintains an array of status registers that can be + accessed using the +\family typewriter +PtlNIStatus +\family default + function (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + +). + The type +\family typewriter +ptl_sr_index_t +\family default + defines the types of indexes that can be used to access the status registers. + The only index defined for all implementations is +\family typewriter +PTL_SR_DROP_COUNT +\family default + which identifies the status register that counts the dropped requests for + the interface. + Other indexes (and registers) may be defined by the implementation. +\layout Standard + +The type +\family typewriter +ptl_sr_value_t +\family default + defines the types of values held in status registers. + This is a signed integer type. + The size is implementation dependent, but must be at least 32 bits. +\layout Section + +Initialization and Cleanup +\begin_inset LatexCommand \label{sec:init} + +\end_inset + + +\layout Standard + +The Portals API includes a function, +\emph on +PtlInit +\emph default +, to initialize the library and a function, +\emph on +PtlFini +\emph default +, to cleanup after the application is done using the library. +\layout Subsection + +PtlInit +\begin_inset LatexCommand \label{sec:ptlinit} + +\end_inset + + +\layout LyX-Code + +int PtlInit( int *max_interfaces ); +\layout Standard +\noindent +The +\emph on +PtlInit +\emph default + function initializes the Portals library. + PtlInit must be called at least once by a process before any thread makes + a Portals function call, but may be safely called more than once. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_FAIL Indicates an error during initialization. 
+ +\layout Description + +PTL_SEGV Indicates that +\family typewriter +max_interfaces +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +max_interfaces +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the maximum number of interfaces + that can be initialized. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlFini +\begin_inset LatexCommand \label{sec:ptlfini} + +\end_inset + + +\layout LyX-Code + +void PtlFini( void ); +\layout Standard +\noindent +The +\emph on +PtlFini +\emph default + function cleans up after the Portals library is no longer needed by a process. + After this function is called, calls to any of the functions defined by + the Portal API or use of the structures set up by the Portals API will + result in undefined behavior. + This function should be called once and only once during termination by + a process. + Typically, this function will be called in the exit sequence of a process. + Individual threads should not call PtlFini when they terminate. +\layout Section + +Network Interfaces +\begin_inset LatexCommand \label{sec:ni} + +\end_inset + + +\layout Standard + +The Portals API supports the use of multiple network interfaces. + However, each interface is treated as an independent entity. + Combining interfaces (e.g., +\begin_inset Quotes eld +\end_inset + +bonding +\begin_inset Quotes erd +\end_inset + + to create a higher bandwidth connection) must be implemented by the application + or embedded in the underlying network. + Interfaces are treated as independent entities to make it easier to cache + information on individual network interface cards. 
+\layout Standard + +Once initialized, each interface provides a Portal table, an access control + table, and a collection of status registers. + See Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + for a discussion of updating Portal table entries using the +\emph on +PtlMEAttach +\emph default + function. + See Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ac} + +\end_inset + + for a discussion of the initialization and updating of entries in the access + control table. + See Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + + for a discussion of the +\emph on +PtlNIStatus +\emph default + function which can be used to determine the value of a status register. +\layout Standard + +Every other type of Portal object (e.g., memory descriptor, event queue, or + match list entry) is associated with a specific network interface. + The association to a network interface is established when the object is + created and is encoded in the handle for the object. +\layout Standard + +Each network interface is initialized and shutdown independently. + The initialization routine, +\emph on +PtlNIInit +\emph default +, returns a handle for an interface object which is used in all subsequent + Portal operations. + The +\emph on +PtlNIFini +\emph default + function is used to shutdown an interface and release any resources that + are associated with the interface. + Network interface handles are associated with processes, not threads. + All threads in a process share all of the network interface handles. 
+\layout Standard + +The Portals API also defines the +\emph on +PtlNIStatus +\emph default + function to query the status registers for a network interface, the +\emph on +PtlNIDist +\emph default + function to determine the +\begin_inset Quotes eld +\end_inset + +distance +\begin_inset Quotes erd +\end_inset + + to another process, and the +\emph on +PtlNIHandle +\emph default + function to determine the network interface that an object is associated + with. +\layout Subsection + +PtlNIInit +\begin_inset LatexCommand \label{sec:niinit} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + int max_match_entries; +\newline + int max_mem_descriptors; +\newline + int max_event_queues; +\newline + ptl_ac_index_t max_atable_index; +\newline + ptl_pt_index_t max_ptable_index; +\newline +} ptl_ni_limits_t; +\newline + +\newline +int PtlNIInit( ptl_interface_t interface, +\newline + ptl_pid_t pid, +\newline + ptl_ni_limits_t* desired, +\newline + ptl_ni_limits_t* actual, +\newline + ptl_handle_ni_t* handle ); +\layout Standard + +Values of type +\family typewriter +ptl_ni_limits_t +\family default + include the following members: +\layout Description + +max_match_entries Maximum number of match entries that can be allocated + at any one time. +\layout Description + +max_mem_descriptors Maximum number of memory descriptors that can be allocated + at any one time. +\layout Description + +max_event_queues Maximum number of event queues that can be allocated at + any one time. +\layout Description + +max_atable_index Largest access control table index for this interface, + valid indexes range from zero to +\family typewriter +max_atable_index +\family default +, inclusive. +\layout Description + +max_ptable_index Largest Portal table index for this interface, valid indexes + range from zero to +\family typewriter +max_ptable_index +\family default +, inclusive.
+\layout Standard +\noindent +The +\emph on +PtlNIInit +\emph default + function is used to initialize the Portals API for a network interface. + This function must be called at least once by each process before any other + operations that apply to the interface by any process or thread. + For subsequent calls to +\shape italic +PtlNIInit +\shape default + from within the same process (either by different threads or the same thread), + the desired limits will be ignored and the call will return the existing + NI handle. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INIT_DUP Indicates a duplicate initialization of +\family typewriter +interface +\family default +. + +\layout Description + +PTL_INIT_INV Indicates that +\family typewriter +interface +\family default + is not a valid network interface. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to initialize the + interface. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +pid +\family default + is not a valid process id. +\layout Description + +PTL_SEGV Indicates that +\family typewriter +actual +\family default + or +\family typewriter + handle +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the network interface to be initialized. + (See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ni-type} + +\end_inset + + for a discussion of values used to identify network interfaces.)
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +pid +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the desired process id (for well known process ids). + The value +\family typewriter +PTL_PID_ANY +\family default + may be used to have the process id assigned by the underlying library. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +desired +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +If non-NULL, points to a structure that holds the desired limits. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +actual +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, the location pointed to by actual will hold the actual + limits. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the interface. +\end_inset + + + + +\end_inset + + +\layout Comment + +The use of desired is implementation dependent. + In particular, an implementation may choose to ignore this argument. +\layout Subsection + +PtlNIFini +\begin_inset LatexCommand \label{sec:nifini} + +\end_inset + + +\layout LyX-Code + +int PtlNIFini( ptl_handle_ni_t interface ); +\layout Standard +\noindent +The +\emph on +PtlNIFini +\emph default + function is used to release the resources allocated for a network interface. 
+ Once the +\emph on +PtlNIFini +\emph default + operation has been started, the results of pending API operations (e.g., + operations initiated by another thread) for this interface are undefined. + Similarly, the effects of incoming operations (puts and gets) or return + values (acknowledgements and replies) for this interface are undefined. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard + +A handle for the interface to shut down. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlNIStatus +\begin_inset LatexCommand \label{sec:nistatus} + +\end_inset + + +\layout LyX-Code + +int PtlNIStatus( ptl_handle_ni_t interface, +\newline + ptl_sr_index_t status_register, +\newline + ptl_sr_value_t* status ); +\layout Standard +\noindent +The +\emph on +PtlNIStatus +\emph default + function returns the value of a status register for the specified interface. + (See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + for more information on status register indexes and status register values.) +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle.
+ +\layout Description + +PTL_INV_SR_INDX Indicates that +\family typewriter +status_register +\family default + is not a valid status register. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +status +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +status_register +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +An index for the status register to read. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +status +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the current value of the status + register. +\end_inset + + + + +\end_inset + + +\layout Comment + +The only status register that must be defined is a drop count register ( +\family typewriter +PTL_SR_DROP_COUNT +\family default +). + Implementations may define additional status registers. + Identifiers for the indexes associated with these registers should start + with the prefix +\family typewriter +PTL_SR_ +\family default +. +\layout Subsection + +PtlNIDist +\layout LyX-Code + +int PtlNIDist( ptl_handle_ni_t interface, +\newline + ptl_process_id_t process, +\newline + unsigned long* distance ); +\layout Standard +\noindent +The +\emph on +PtlNIDist +\emph default + function returns the distance to another process using the specified interface. 
+ Distances are only defined relative to an interface. + Distance comparisons between different interfaces on the same process may + be meaningless. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +process +\family default + is not a valid process identifier. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +distance +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +process +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +An identifier for the process whose distance is being requested. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +distance +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the distance to the remote + process. +\end_inset + + + + +\end_inset + + +\layout Comment + +This function should return a static measure of distance. + Examples include minimum latency, the inverse of available bandwidth, or + the number of switches between the two endpoints. 
+\layout Subsection + +PtlNIHandle +\layout LyX-Code + +int PtlNIHandle( ptl_handle_any_t handle, +\newline + ptl_handle_ni_t* interface ); +\layout Standard +\noindent +The +\emph on +PtlNIHandle +\emph default + function returns a handle for the network interface with which the object + identified by +\family typewriter +handle +\family default + is associated. + If the object identified by +\family typewriter +handle +\family default + is a network interface, this function returns the same value it is passed. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_HANDLE Indicates that +\family typewriter +handle +\family default + is not a valid handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +interface +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the object. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the network interface + associated with +\family typewriter +handle +\family default +. +\end_inset + + + + +\end_inset + + +\layout Comment + +Every handle should encode the network interface and the object id relative + to this handle. + Both are presumably encoded using integer values. 
+\layout Section + +User Identification +\begin_inset LatexCommand \label{sec:uid} + +\end_inset + + +\layout Standard + +Every process runs on behalf of a user. + +\layout Subsection + +PtlGetUid +\layout LyX-Code + +int PtlGetUid( ptl_handle_ni_t ni_handle, +\newline + ptl_uid_t* uid ); +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +ni_handle +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +uid +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ni_handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A network interface handle. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +uid +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the user id for the calling + process. +\end_inset + + + + +\end_inset + + +\layout Comment + +Note that user identifiers are dependent on the network interface(s). + In particular, if a node has multiple interfaces, a process may have multiple + user identifiers. +\layout Section + +Process Identification +\begin_inset LatexCommand \label{sec:pid} + +\end_inset + + +\layout Standard + +Processes that use the Portals API can be identified using a node id and + process id.
+ Every node accessible through a network interface has a unique node identifier + and every process running on a node has a unique process identifier. + As such, any process in the computing system can be identified by its node + id and process id. + +\layout Standard + +The Portals API defines a type, +\family typewriter +ptl_process_id_t +\family default + for representing process ids and a function, +\emph on +PtlGetId +\emph default +, which can be used to obtain the id of the current process. +\layout Comment + +The portals API does not include thread identifiers. + Messages are delivered to processes (address spaces) not threads (contexts + of execution). +\layout Subsection + +The Process Id Type +\begin_inset LatexCommand \label{sec:pid-type} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + ptl_nid_t nid; /* node id */ +\newline + ptl_pid_t pid; /* process id */ +\newline +} ptl_process_id_t; +\layout Standard +\noindent +The +\family typewriter +ptl_process_id_t +\family default + type uses two identifiers to represent a process id: a node id and a process + id. + +\layout Subsection + +PtlGetId +\begin_inset LatexCommand \label{sub:PtlGetId} + +\end_inset + + +\layout LyX-Code + +int PtlGetId( ptl_handle_ni_t ni_handle, +\newline + ptl_process_id_t* id ); +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +ni_handle +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +id +\family default + is not a legal address. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ni_handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A network interface handle. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +id +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the id for the calling process. +\end_inset + + + + +\end_inset + + +\layout Comment + +Note that process identifiers are dependent on the network interface(s). + In particular, if a node has multiple interfaces, it may have multiple + node identifiers. +\layout Section + +Match List Entries and Match Lists +\begin_inset LatexCommand \label{sec:me} + +\end_inset + + +\layout Standard + +A match list is a chain of match list entries. + Each match list entry includes a memory descriptor and a set of match criteria. + The match criteria can be used to reject incoming requests based on process + id or the match bits provided in the request. + A match list is created using the +\emph on +PtlMEAttach +\emph default + or +\shape italic +PtlMEAttachAny +\shape default + functions, which create a match list consisting of a single match list + entry, attach the match list to the specified Portal index, and return + a handle for the match list entry. + Match entries can be dynamically inserted and removed from a match list + using the +\emph on +PtlMEInsert +\emph default + and +\emph on +PtlMEUnlink +\emph default + functions.
+\layout Subsection + +PtlMEAttach +\begin_inset LatexCommand \label{sec:meattach} + +\end_inset + + +\layout LyX-Code + +typedef enum { PTL_RETAIN, PTL_UNLINK } ptl_unlink_t; +\newline + +\layout LyX-Code + +typedef enum { PTL_INS_BEFORE, PTL_INS_AFTER } ptl_ins_pos_t; +\newline + +\layout LyX-Code + +int PtlMEAttach( ptl_handle_ni_t interface, +\newline + ptl_pt_index_t index, +\newline + ptl_process_id_t matchid, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_match_bits_t ignorebits, +\newline + ptl_unlink_t unlink, +\newline + ptl_ins_pos_t position, +\newline + ptl_handle_me_t* handle ); +\layout Standard +\noindent +Values of the type +\family typewriter +ptl_ins_pos_t +\family default + are used to control where a new item is inserted. + The value +\family typewriter +PTL_INS_BEFORE +\family default + is used to insert the new item before the current item or before the head + of the list. + The value +\family typewriter +PTL_INS_AFTER +\family default + is used to insert the new item after the current item or after the last + item in the list. + +\layout Standard + +The +\emph on +PtlMEAttach +\emph default + function creates a match list consisting of a single entry and attaches + this list to the Portal table for +\family typewriter +interface +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_PTINDEX Indicates that +\family typewriter +index +\family default + is not a valid Portal table index. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier. 
+ +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + match list entry. + +\layout Description + +PTL_ML_TOOLONG Indicates that the resulting match list is too long. + The maximum length for a match list is defined by the interface. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. + +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The Portal table index where the match list should be attached. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +matchid +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Specifies the match criteria for the process id of the requestor. + The constants +\family typewriter +PTL_PID_ANY +\family default + and +\family typewriter +PTL_NID_ANY +\family default + can be used to wildcard either of the ids in the +\family typewriter +ptl_process_id_t +\family default + structure. + +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +match_bits, ignorebits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Specify the match criteria to apply to the match bits in the incoming request. + The +\family typewriter +ignorebits +\family default + are used to mask out insignificant bits in the incoming match bits. 
+ The resulting bits are then compared to the match list entry's match + bits to determine if the incoming request meets the match criteria. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +unlink +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Indicates the match list entry should be unlinked when the last memory descripto +r associated with this match list entry is unlinked. + (Note, the check for unlinking a match entry only occurs when a memory + descriptor is unlinked.) +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +position +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Indicates whether the new match entry should be prepended or appended to + the existing match list. + If there is no existing list, this argument is ignored and the new match + entry becomes the only entry in the list. + Allowed constants: +\family typewriter +PTL_INS_BEFORE +\family default +, +\family typewriter +PTL_INS_AFTER +\family default +. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + match list entry. 
+\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMEAttachAny +\begin_inset LatexCommand \label{sec:attachany} + +\end_inset + + +\layout LyX-Code + +int PtlMEAttachAny( ptl_handle_ni_t interface, +\newline + ptl_pt_index_t *index, +\newline + ptl_process_id_t matchid, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_match_bits_t ignorebits, +\newline + ptl_unlink_t unlink, +\newline + ptl_handle_me_t* handle ); +\layout Standard + +The +\emph on +PtlMEAttachAny +\emph default + function creates a match list consisting of a single entry and attaches + this list to an unused Portal table entry for +\family typewriter +interface +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + match list entry. + +\layout Description + +PTL_PT_FULL Indicates that there are no free entries in the Portal table. +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. 
+ +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the Portal index where the + match list has been attached. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +matchid, match_bits, ignorebits, unlink +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +See the discussion for +\shape italic +PtlMEAttach +\shape default +. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + match list entry. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMEInsert +\begin_inset LatexCommand \label{sec:meinsert} + +\end_inset + + +\layout LyX-Code + +int PtlMEInsert( ptl_handle_me_t current, +\newline + ptl_process_id_t matchid, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_match_bits_t ignorebits, +\newline + ptl_ins_pos_t position, +\newline + ptl_handle_me_t* handle ); +\layout Standard + +The +\emph on +PtlMEInsert +\emph default + function creates a new match list entry and inserts this entry into the + match list containing +\family typewriter +current +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier.
+ +\layout Description + +PTL_INV_ME Indicates that +\family typewriter +current +\family default + is not a valid match entry handle. + +\layout Description + +PTL_ML_TOOLONG Indicates that the resulting match list is too long. + The maximum length for a match list is defined by the interface. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + match entry. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +current +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for a match entry. + The new match entry will be inserted immediately before or immediately + after this match entry. +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +matchid +\family default +, +\family typewriter +match_bits +\family default +, +\family typewriter +ignorebits +\family default +, +\family typewriter +unlink +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +See the discussion for +\emph on +PtlMEAttach +\emph default + +\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +position +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Indicates whether the new match entry should be inserted before or after + the +\family typewriter +current +\family default + entry. + Allowed constants: +\family typewriter +PTL_INS_BEFORE +\family default +, +\family typewriter +PTL_INS_AFTER +\family default +. 
+\end_inset + + + + +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +See the discussion for +\emph on +PtlMEAttach +\emph default +. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMEUnlink +\begin_inset LatexCommand \label{sec:meunlink} + +\end_inset + + +\layout LyX-Code + +int PtlMEUnlink( ptl_handle_me_t entry ); +\layout Standard +\noindent +The +\emph on +PtlMEUnlink +\emph default + function can be used to unlink a match entry from a match list. + This operation also releases any resources associated with the match entry + (including the associated memory descriptor). + It is an error to use the match entry handle after calling +\emph on +PtlMEUnlink +\emph default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_ME Indicates that +\family typewriter +entry +\family default + is not a valid match entry handle. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +entry +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard + +A handle for the match entry to be unlinked. +\end_inset + + + + +\end_inset + + +\layout Section + +Memory Descriptors +\begin_inset LatexCommand \label{sec:md} + +\end_inset + + +\layout Standard + +A memory descriptor contains information about a region of an application + process' memory and an event queue where information about the operations + performed on the memory descriptor is recorded.
+ The Portals API provides two operations to create memory descriptors: +\emph on +PtlMDAttach +\emph default +, and +\emph on +PtlMDBind +\emph default +; an operation to update a memory descriptor, +\emph on +PtlMDUpdate +\emph default +; and an operation to unlink and release the resources associated with a + memory descriptor, +\emph on +PtlMDUnlink +\emph default +. +\layout Subsection + +The Memory Descriptor Type +\begin_inset LatexCommand \label{sec:md-type} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + void* start; +\newline + ptl_size_t length; +\newline + int threshold; +\newline + unsigned int max_offset; +\newline + unsigned int options; +\newline + void* user_ptr; +\newline + ptl_handle_eq_t eventq; +\newline +} ptl_md_t; +\layout Standard +\noindent +The +\family typewriter +ptl_md_t +\family default + type defines the application view of a memory descriptor. + Values of this type are used to initialize and update the memory descriptors. +\layout Subsubsection + +Members +\layout Description + +start,\SpecialChar ~ +length Specify the memory region associated with the memory descriptor. + The +\family typewriter +start +\family default + member specifies the starting address for the memory region and the +\family typewriter +length +\family default + member specifies the length of the region. + The +\family typewriter +start member +\family default + can be NULL provided that the +\family typewriter +length +\family default + member is zero. + (Zero length buffers are useful to record events.) There are no alignment + restrictions on the starting address or the length of the region; although, + unaligned messages may be slower (i.e., lower bandwidth and/or longer latency) + on some implementations. + +\layout Description + +threshold Specifies the maximum number of operations that can be performed + on the memory descriptor. 
+ An operation is any action that could possibly generate an event (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + for the different types of events). + In the usual case, the threshold value is decremented for each operation + on the memory descriptor. + When the threshold value is zero, the memory descriptor is +\emph on +inactive +\emph default +, and does not respond to operations. + A memory descriptor can have an initial threshold value of zero to allow + for manipulation of an inactive memory descriptor by the local process. + A threshold value of +\family typewriter +PTL_MD_THRESH_INF +\family default + indicates that there is no bound on the number of operations that may be + applied to a memory descriptor. + Note that local operations (e.g., +\emph on +PtlMDUpdate +\emph default +) are not applied to the threshold count. + +\layout Description + +max_offset Specifies the maximum local offset of a memory descriptor. + When the local offset of a memory descriptor exceeds this maximum, the + memory descriptor becomes +\shape italic +inactive +\shape default + and does not respond to further operations. +\layout Description + +options Specifies the behavior of the memory descriptor. + There are five options that can be selected: enable put operations (yes + or no), enable get operations (yes or no), offset management (local or + remote), message truncation (yes or no), and acknowledgement (yes or no). + Values for this argument can be constructed using a bitwise or of the following + values: +\begin_deeper +\begin_deeper +\layout Description + +PTL_MD_OP_PUT Specifies that the memory descriptor will respond to +\emph on +put +\emph default + operations. + By default, memory descriptors reject +\emph on +put +\emph default + operations. + +\layout Description + +PTL_MD_OP_GET Specifies that the memory descriptor will respond to +\emph on +get +\emph default + operations. 
+ By default, memory descriptors reject +\emph on +get +\emph default + operations. + +\layout Description + +PTL_MD_MANAGE_REMOTE Specifies that the offset used in accessing the memory + region is provided by the incoming request. + By default, the offset is maintained locally. + When the offset is maintained locally, the offset is incremented by the + length of the request so that the next operation (put and/or get) will + access the next part of the memory region. +\layout Description + +PTL_MD_TRUNCATE Specifies that the length provided in the incoming request + can be reduced to match the memory available in the region. + (The memory available in a memory region is determined by subtracting the + offset from the length of the memory region.) By default, if the length + in the incoming operation is greater than the amount of memory available, + the operation is rejected. + +\layout Description + +PTL_MD_ACK_DISABLE Specifies that an acknowledgement should +\emph on +not +\emph default + be sent for incoming +\emph on +put +\emph default + operations, even if requested. + By default, acknowledgements are sent for +\emph on +put +\emph default + operations that request an acknowledgement. + Acknowledgements are never sent for +\emph on +get +\emph default + operations. + The value sent in the reply serves as an implicit acknowledgement. + +\end_deeper +\layout Standard + + +\series bold +Note +\series default +: It is not considered an error to have a memory descriptor that does not + respond to either +\emph on +put +\emph default + or +\emph on +get +\emph default + operations: Every memory descriptor responds to +\emph on +reply +\emph default + operations. + Nor is it considered an error to have a memory descriptor that responds + to both +\emph on +put +\emph default + and +\emph on +get +\emph default + operations. + +\end_deeper +\layout Description + +user_ptr A user-specified value that is associated with the memory descriptor. 
+ The value does not need to be a pointer, but must fit in the space used + by a pointer. + This value (along with other values) is recorded in events associated with + operations on this memory descriptor. +\begin_inset Foot +collapsed true + +\layout Standard + +Tying the memory descriptor to a user-defined value can be useful when multiple + memory descriptors share the same event queue or when the memory descriptor + needs to be associated with a data structure maintained by the application. + For example, an MPI implementation can set the +\family typewriter +user_ptr +\family default + argument to the value of an MPI Request. + This direct association allows for processing of memory descriptors by + the MPI implementation without a table lookup or a search for the appropriate + MPI Request. +\end_inset + + +\layout Description + +eventq A handle for the event queue used to log the operations performed + on the memory region. + If this argument is +\family typewriter +PTL_EQ_NONE +\family default +, operations performed on this memory descriptor are not logged. + +\layout Subsection + +PtlMDAttach +\begin_inset LatexCommand \label{sec:mdattach} + +\end_inset + + +\layout LyX-Code + +int PtlMDAttach( ptl_handle_me_t match, +\newline + ptl_md_t mem_desc, +\newline + ptl_unlink_t unlink_op, +\newline + ptl_unlink_t unlink_nofit, +\newline + ptl_handle_md_t* handle ); +\layout Standard +\noindent +Values of the type +\family typewriter +ptl_unlink_t +\family default + are used to control whether an item is unlinked from a list. + The value +\family typewriter +PTL_UNLINK +\family default + enables unlinking. + The value +\family typewriter +PTL_RETAIN +\family default + disables unlinking. +\layout Standard + +The +\emph on +PtlMDAttach +\emph default + operation is used to create a memory descriptor and attach it to a match + list entry. + An error code is returned if this match list entry already has an associated + memory descriptor.
+\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INUSE Indicates that +\family typewriter +match +\family default + already has a memory descriptor attached. + +\layout Description + +PTL_INV_ME Indicates that +\family typewriter +match +\family default + is not a valid match entry handle. + +\layout Description + +PTL_ILL_MD Indicates that +\family typewriter +mem_desc +\family default + is not a legal memory descriptor. + This may happen because the memory region defined in +\family typewriter +mem_desc +\family default + is invalid or because the network interface associated with the +\family typewriter +eventq +\family default + in +\family typewriter +mem_desc +\family default + is not the same as the network interface associated with +\family typewriter +match +\family default +. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + memory descriptor. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the match entry that the memory descriptor will be associated + with. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Provides initial values for the application visible parts of a memory descriptor. + Other than its use for initialization, there is no linkage between this + structure and the memory descriptor maintained by the API. 
+ +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +unlink_op +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A flag to indicate whether the memory descriptor is unlinked when it becomes + inactive, either because the operation threshold drops to zero or because + the maximum offset has been exceeded. + (Note, the check for unlinking a memory descriptor only occurs after + the completion of a successful operation. + If the threshold is set to zero during initialization or using +\emph on +PtlMDUpdate +\emph default +, the memory descriptor is +\series bold +not +\series default + unlinked.) +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +unlink_nofit +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A flag to indicate whether the memory descriptor is unlinked when the space + remaining in the memory descriptor is not sufficient for a matching operation. + If an incoming message arrives at a memory descriptor that does + not have sufficient space and the +\series bold +PTL_MD_TRUNCATE +\series default + option is not specified, the memory descriptor will be unlinked. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + memory descriptor. + The +\family typewriter +handle +\family default + argument can be NULL, in which case the handle will not be returned.
+\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMDBind +\begin_inset LatexCommand \label{sec:mdbind} + +\end_inset + + +\layout LyX-Code + +int PtlMDBind( ptl_handle_ni_t interface, +\newline + ptl_md_t mem_desc, +\newline + ptl_handle_md_t* handle ); +\layout Standard +\noindent +The +\emph on +PtlMDBind +\emph default + operation is used to create a +\begin_inset Quotes eld +\end_inset + +free floating +\begin_inset Quotes erd +\end_inset + + memory descriptor, i.e., a memory descriptor that is not associated with + a match list entry. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_ILL_MD Indicates that +\family typewriter +mem_desc +\family default + is not a legal memory descriptor. + This may happen because the memory region defined in +\family typewriter +mem_desc +\family default + is invalid or because the network interface associated with the +\family typewriter +eventq +\family default + in +\family typewriter +mem_desc +\family default + is not the same as the network interface, +\family typewriter +interface +\family default +. + +\layout Description + +PTL_INV_EQ Indicates that the event queue associated with +\family typewriter +mem_desc +\family default + is not valid. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + memory descriptor. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +handle +\family default + is not a legal address.
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the network interface with which the memory descriptor will + be associated. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Provides initial values for the application visible parts of a memory descriptor. + Other than its use for initialization, there is no linkage between this + structure and the memory descriptor maintained by the API. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + memory descriptor. + The +\family typewriter +handle +\family default + argument must be a valid address and cannot be NULL. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMDUnlink +\begin_inset LatexCommand \label{sec:mdfree} + +\end_inset + + +\layout LyX-Code + +int PtlMDUnlink( ptl_handle_md_t mem_desc ); +\layout Standard +\noindent +The +\emph on +PtlMDUnlink +\emph default + function unlinks the memory descriptor from any match list entry it may + be linked to and releases the resources associated with a memory descriptor. + (This function does not free the memory region associated with the memory + descriptor.) This function also releases the resources associated with a + floating memory descriptor. + Only memory descriptors with no pending operations may be unlinked. 
+\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor handle. +\layout Description + +PTL_MD_INUSE Indicates that +\family typewriter +mem_desc +\family default + has pending operations and cannot be unlinked. +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor to be released. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlMDUpdate +\begin_inset LatexCommand \label{sec:mdupdate} + +\end_inset + + +\layout LyX-Code + +int PtlMDUpdate( ptl_handle_md_t mem_desc, +\newline + ptl_md_t* old_md, +\newline + ptl_md_t* new_md, +\newline + ptl_handle_eq_t testq ); +\layout Standard +\noindent +The +\emph on +PtlMDUpdate +\emph default + function provides a conditional, atomic update operation for memory descriptors. + The memory descriptor identified by +\family typewriter +mem_desc +\family default + is only updated if the event queue identified by +\family typewriter +testq +\family default + is empty. + The intent is to only enable updates to the memory descriptor when no new + messages have arrived since the last time the queue was checked. + See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:exmpi} + +\end_inset + + for an example of how this function can be used. 
+\layout Standard + +If +\family typewriter +new_md +\family default + is not NULL, the memory descriptor identified by +\family typewriter +mem_desc +\family default + will be updated + to reflect the values in the structure pointed to by +\family typewriter +new_md +\family default + if +\family typewriter +testq +\family default + has the value +\family typewriter +PTL_EQ_NONE +\family default + or if the event queue identified by +\family typewriter +testq +\family default + is empty. + If +\family typewriter +old_md +\family default + is not NULL, the current value of the memory descriptor identified by +\family typewriter +mem_desc +\family default + is recorded in the location identified by +\family typewriter +old_md +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_NOUPDATE Indicates that the update was not performed because +\family typewriter +testq +\family default + was not empty. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor handle. + +\layout Description + +PTL_ILL_MD Indicates that the value pointed to by +\family typewriter +new_md +\family default + is not a legal memory descriptor (e.g., the memory region specified by the + memory descriptor may be invalid). + +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +testq +\family default + is not a valid event queue handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +new_md +\family default + or +\family typewriter +old_md +\family default + is not a legal address.
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor to update. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +old_md +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +If +\family typewriter +old_md +\family default + is not the value +\family typewriter +NULL +\family default +, the current value of the memory descriptor will be stored in the location + identified by +\family typewriter +old_md +\family default +. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +new_md +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +If +\family typewriter +new_md +\family default + is not the value +\family typewriter +NULL +\family default +, this argument provides the new values for the memory descriptor, if the + update is performed. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +testq +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for an event queue used to predicate the update. + If +\family typewriter +testq +\family default + is equal to +\family typewriter +PTL_EQ_NONE +\family default +, the update is performed unconditionally. + Otherwise, the update is performed if and only if +\family typewriter +testq +\family default + is empty. + If the update is not performed, the function returns the value +\family typewriter +PTL_NOUPDATE +\family default +.
+ (Note, the +\family typewriter +testq +\family default + argument does not need to be the same as the event queue associated with + the memory descriptor.) +\end_inset + + + + +\end_inset + + +\layout Standard + +The conditional update can be used to ensure that the memory descriptor + has not changed between the time it was examined and the time it is updated. + In particular, it is needed to support an MPI implementation where the + activity of searching an unexpected message queue and posting a receive + must be atomic. +\layout Section + +Events and Event Queues +\begin_inset LatexCommand \label{sec:eq} + +\end_inset + + +\layout Standard + +Event queues are used to log operations performed on memory descriptors. + They can also be used to hold acknowledgements for completed +\emph on +put +\emph default + operations and to note when the data specified in a +\emph on +put +\emph default + operation has been sent (i.e., when it is safe to reuse the buffer that holds + this data). + Multiple memory descriptors can share a single event queue. +\layout Standard + +In addition to the +\family typewriter +ptl_handle_eq_t +\family default + type, the Portals API defines two types associated with events: The +\family typewriter + +\newline +ptl_event_kind_t +\family default + type defines the kinds of events that can be stored in an event queue. + The +\family typewriter +ptl_event_t +\family default + type defines a structure that holds the information associated with an + event. 
+\layout Standard + +The Portals API also provides four functions for dealing with event queues: + The +\emph on +PtlEQAlloc +\emph default + function is used to allocate the API resources needed for an event queue, + the +\emph on +PtlEQFree +\emph default + function is used to release these resources, the +\emph on +PtlEQGet +\emph default + function can be used to get the next event from an event queue, and the + +\emph on +PtlEQWait +\emph default + function can be used to block a process (or thread) until an event queue + has at least one event. +\layout Subsection + +Kinds of Events +\begin_inset LatexCommand \label{sec:ek-type} + +\end_inset + + +\layout LyX-Code + +typedef enum { +\newline + PTL_EVENT_GET_START, PTL_EVENT_GET_END, PTL_EVENT_GET_FAIL, +\newline + PTL_EVENT_PUT_START, PTL_EVENT_PUT_END, PTL_EVENT_PUT_FAIL, +\newline + PTL_EVENT_REPLY_START, PTL_EVENT_REPLY_END, PTL_EVENT_REPLY_FAIL, +\newline + PTL_EVENT_SEND_START, PTL_EVENT_SEND_END, PTL_EVENT_SEND_FAIL, +\newline + PTL_EVENT_ACK, +\newline + PTL_EVENT_UNLINK +\newline +} ptl_event_kind_t; +\layout Standard +\noindent +The Portals API defines fourteen types of events that can be logged in an + event queue: +\layout Description + +PTL_EVENT_GET_START A remote +\emph on +get +\emph default + operation has been started on the memory descriptor. + The memory region associated with this descriptor should not be altered + until the corresponding END or FAIL event is logged. +\layout Description + +PTL_EVENT_GET_END A previously initiated +\emph on +get +\emph default + operation completed successfully. + This event is logged after the reply has been sent by the local node. + As such, the process could free the memory descriptor once it sees this + event. + +\layout Description + +PTL_EVENT_GET_FAIL A previously initiated +\emph on +get +\emph default + operation completed unsuccessfully. + This event is logged after the reply has been sent by the local node. 
+ As such, the process could free the memory descriptor once it sees this + event. + +\layout Description + +PTL_EVENT_PUT_START A remote +\emph on +put +\emph default + operation has been started on the memory descriptor. + The memory region associated with this descriptor should be considered + volatile until the corresponding END or FAIL event is logged. +\layout Description + +PTL_EVENT_PUT_END A previously initiated +\emph on +put +\emph default + operation completed successfully. + The underlying layers will not alter the memory (on behalf of this operation) + once this event has been logged. + +\layout Description + +PTL_EVENT_PUT_FAIL A previously initiated +\emph on +put +\emph default + operation completed unsuccessfully. + The underlying layers will not alter the memory (on behalf of this operation) + once this event has been logged. + +\layout Description + +PTL_EVENT_REPLY_START A +\emph on +reply +\emph default + operation has been started on the memory descriptor. + +\layout Description + +PTL_EVENT_REPLY_END A previously initiated +\emph on +reply +\emph default + operation has completed successfully. + This event is logged after the data (if any) from the reply has been written + into the memory descriptor. + +\layout Description + +PTL_EVENT_REPLY_FAIL A previously initiated +\emph on +reply +\emph default + operation has completed unsuccessfully. + This event is logged after the data (if any) from the reply has been written + into the memory descriptor. + +\layout Description + +PTL_EVENT_ACK An +\emph on +acknowledgement +\emph default + was received. + This event is logged when the acknowledgement is received. +\layout Description + +PTL_EVENT_SEND_START An outgoing +\emph on +send +\emph default + operation has been started. + The memory region associated with this descriptor should not be altered + until the corresponding END or FAIL event is logged.
+\layout Description + +PTL_EVENT_SEND_END A previously initiated +\emph on +send +\emph default + operation has completed successfully. + This event is logged after the entire buffer has been sent and it is safe + for the application to reuse the buffer. + +\layout Description + +PTL_EVENT_SEND_FAIL A previously initiated +\emph on +send +\emph default + operation has completed unsuccessfully. + The process can safely manipulate the memory or free the memory descriptor + once it sees this event. +\layout Description + +PTL_EVENT_UNLINK A memory descriptor associated with this event queue has + been automatically unlinked. + This event is not generated when a memory descriptor is explicitly unlinked + by calling +\shape italic +PtlMDUnlink +\shape default +. + This event does not decrement the threshold count. +\layout Subsection + +Event Ordering +\layout Standard + +The Portals API guarantees that when a process initiates two operations + on a remote process, the operations will be initiated on the remote process + in the same order that they were initiated on the original process. + As an example, if process A initiates two +\emph on +put +\emph default + operations, +\emph on +x +\emph default + and +\emph on +y +\emph default +, on process B, the Portals API guarantees that process A will receive the + +\family typewriter +PTL_EVENT_SEND_START +\family default + events for +\emph on +x +\emph default + and +\emph on +y +\emph default + in the same order that process B receives the +\family typewriter +PTL_EVENT_PUT_START +\family default + events for +\emph on +x +\emph default + and +\emph on +y +\emph default +. + Notice that the API does not guarantee that the start events will be delivered + in the same order that process A initiated the +\emph on +x +\emph default + and +\emph on +y +\emph default + operations.
+ If process A needs to ensure the ordering of these operations, it should + include code to wait for the initiation of +\emph on +x +\emph default + before it initiates +\emph on +y +\emph default +. +\layout Subsection + +Failure Notification +\layout Standard + +Operations may fail to complete successfully; however, unless the node itself + fails, every operation that is started will eventually complete. + While an operation is in progress, the memory associated with the operation + should not be viewed (in the case of a put or a reply) or altered (in the + case of a send or get). + Operation completion, whether successful or unsuccessful, is final. + That is, when an operation completes, the memory associated with the operation + will no longer be read or altered by the operation. + A network interface can use the +\family typewriter +ptl_ni_fail_t +\family default + to define more specific information regarding the failure of the operation + and record this information in the +\family typewriter +ni_fail_type +\family default + field of the event. +\layout Subsection + +The Event Type +\begin_inset LatexCommand \label{sec:event-type} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + ptl_event_kind_t type; +\newline + ptl_process_id_t initiator; +\newline + ptl_uid_t uid; +\layout LyX-Code + + ptl_pt_index_t portal; +\newline + ptl_match_bits_t match_bits; +\newline + ptl_size_t rlength; +\newline + ptl_size_t mlength; +\newline + ptl_size_t offset; +\newline + ptl_handle_md_t md_handle; +\newline + ptl_md_t mem_desc; +\newline + ptl_hdr_data_t hdr_data; +\newline + ptl_seq_t link; +\newline + ptl_ni_fail_t ni_fail_type; +\newline + volatile ptl_seq_t sequence; +\newline +} ptl_event_t; +\layout Standard +\noindent +An event structure includes the following members: +\layout Description + +type Indicates the type of the event. + +\layout Description + +initiator The id of the initiator. 
+ +\layout Description + +portal The Portal table index specified in the request. + +\layout Description + +match_bits A copy of the match bits specified in the request. + See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + for more information on match bits. + +\layout Description + +rlength The length (in bytes) specified in the request. + +\layout Description + +mlength The length (in bytes) of the data that was manipulated by the operation. + For truncated operations, the manipulated length will be the number of + bytes specified by the memory descriptor (possibly with an offset) operation. + For all other operations, the manipulated length will be the length of + the requested operation. + +\layout Description + +offset Is the displacement (in bytes) into the memory region that the operation + used. + The offset can be determined by the operation (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:datamovement} + +\end_inset + +) for a remote managed memory descriptor, or by the local memory descriptor + (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +). + +\layout Description + +md_handle Is the handle to the memory descriptor associated with the event. +\layout Description + +mem_desc Is the state of the memory descriptor immediately after the event + has been processed. + +\layout Description + +hdr_data 64 bits of out-of-band user data (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + +). + +\layout Description + +link The +\emph on +link +\emph default + member is used to link +\family typewriter +START +\family default + events with the +\family typewriter +END +\family default + or +\family typewriter +FAIL +\family default + event that signifies completion of the operation. + The +\emph on +link +\emph default + member will be the same for the two events associated with an operation. 
+ The link member is also used to link an +\family typewriter +UNLINK +\family default + event with the event that caused the memory descriptor to be unlinked. +\layout Description + +sequence The sequence number for this event. + Sequence numbers are unique to each event. +\layout Comment + +The +\emph on +sequence +\emph default + member is the last member and is volatile to support SMP implementations. + When an event structure is filled in, the +\emph on +sequence +\emph default + member should be written after all other members have been updated. + Moreover, a memory barrier should be inserted between the updating of other + members and the updating of the +\emph on +sequence +\emph default + member. +\layout Subsection + +PtlEQAlloc +\begin_inset LatexCommand \label{sec:eqalloc} + +\end_inset + + +\layout LyX-Code + +int PtlEQAlloc( ptl_handle_ni_t interface, +\newline + ptl_size_t count, +\newline + ptl_handle_eq_t* handle ); +\layout Standard +\noindent +The +\emph on +PtlEQAlloc +\emph default + function is used to build an event queue. + +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + event queue. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +handle +\family default + is not a legal address. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface with which the event queue will be associated. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +count +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The number of events that can be stored in the event queue. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + event queue. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlEQFree +\begin_inset LatexCommand \label{sec:eqfree} + +\end_inset + + +\layout LyX-Code + +int PtlEQFree( ptl_handle_eq_t eventq ); +\layout Standard +\noindent +The +\emph on +PtlEQFree +\emph default + function releases the resources associated with an event queue. + It is up to the user to insure that no memory descriptors are associated + with the event queue once it is freed. + +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +eventq +\family default + is not a valid event queue handle. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +eventq +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard + +A handle for the event queue to be released. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlEQGet +\begin_inset LatexCommand \label{sec:eqget} + +\end_inset + + +\layout LyX-Code + +int PtlEQGet( ptl_handle_eq_t eventq, +\newline + ptl_event_t* event ); +\layout Standard +\noindent +The +\emph on +PtlEQGet +\emph default + function is a nonblocking function that can be used to get the next event + in an event queue. + The event is removed from the queue. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at + least one event between this event and the last event obtained (using +\emph on +PtlEQGet +\emph default + or +\emph on +PtlEQWait +\emph default +) from this event queue has been dropped due to limited space in the event + queue. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_EQ_EMPTY Indicates that +\family typewriter +eventq +\family default + is empty or another thread is waiting on +\emph on +PtlEQWait +\emph default +. + +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +eventq +\family default + is not a valid event queue handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +event +\family default + is not a legal address.
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +eventq +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the event queue. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +event +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the values associated with + the next event in the event queue. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlEQWait +\begin_inset LatexCommand \label{sec:eqwait} + +\end_inset + + +\layout LyX-Code + +int PtlEQWait( ptl_handle_eq_t eventq, +\newline + ptl_event_t* event ); +\layout Standard +\noindent +The +\emph on +PtlEQWait +\emph default + function can be used to block the calling process (thread) until there + is an event in an event queue. + This function also returns the next event in the event queue and removes + this event from the queue. + This is the only blocking operation in the Portals 3.2 API. + In the event that multiple threads are waiting on the same event queue, + PtlEQWait is guaranteed to wake exactly one thread, but the order in which + they are awakened is not specified. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at + least one event between this event and the last event obtained (using +\emph on +PtlEQGet +\emph default + or +\emph on +PtlEQWait +\emph default +) from this event queue has been dropped due to limited space in the event + queue. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized.
+ +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +eventq +\family default + is not a valid event queue handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +event +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard +\noindent + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +eventq +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the event queue to wait on. + The calling process (thread) will be blocked until +\family typewriter +eventq +\family default + is not empty. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +event +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the values associated with + the next event in the event queue. +\end_inset + + + + +\end_inset + + +\layout Section + +The Access Control Table +\begin_inset LatexCommand \label{sec:ac} + +\end_inset + + +\layout Standard + +Processes can use the access control table to control which processes are + allowed to perform operations on Portal table entries. + Each communication interface has a Portal table and an access control table. + The access control table for the default interface contains an entry at + index zero that allows all processes with the same user id to communicate. + Entries in the access control table can be manipulated using the +\emph on +PtlACEntry +\emph default + function. 
+\layout Subsection + +PtlACEntry +\begin_inset LatexCommand \label{sec:acentry} + +\end_inset + + +\layout LyX-Code + +int PtlACEntry( ptl_handle_ni_t interface, +\newline + ptl_ac_index_t index, +\newline + ptl_process_id_t matchid, +\newline + ptl_uid_t user_id, +\newline + ptl_pt_index_t portal ); +\layout Standard +\noindent +The +\emph on +PtlACEntry +\emph default + function can be used to update an entry in the access control table for + an interface. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_AC_INV_INDEX Indicates that +\family typewriter +index +\family default + is not a valid access control table index. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier. + +\layout Description + +PTL_PT_INV_INDEX Indicates that +\family typewriter +portal +\family default + is not a valid Portal table index. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the interface to use. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index of the entry in the access control table to update. 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +matchid +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the process(es) that are allowed to perform operations. + The constants +\family typewriter +PTL_PID_ANY +\family default + and +\family typewriter +PTL_NID_ANY +\family default + can be used to wildcard either of the ids in the +\family typewriter +ptl_process_id_t +\family default + structure. + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +user_id +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the user that is allowed to perform operations. + The value +\family typewriter +PTL_UID_ANY +\family default + can be used to wildcard the user. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Identifies the Portal index(es) that can be used. + The value +\family typewriter +PTL_PT_INDEX_ANY +\family default + can be used to wildcard the Portal index. +\end_inset + + + + +\end_inset + + +\layout Section + +Data Movement Operations +\begin_inset LatexCommand \label{sec:datamovement} + +\end_inset + + +\layout Standard + +The Portals API provides two data movement operations: +\emph on +PtlPut +\emph default + and +\emph on +PtlGet +\emph default +. 
+\layout Subsection + +PtlPut +\begin_inset LatexCommand \label{sec:put} + +\end_inset + + +\layout LyX-Code + +typedef enum { PTL_ACK_REQ, PTL_NOACK_REQ } ptl_ack_req_t; +\newline + +\newline +int PtlPut( ptl_handle_md_t mem_desc, +\newline + ptl_ack_req_t ack_req, +\newline + ptl_process_id_t target, +\newline + ptl_pt_index_t portal, +\newline + ptl_ac_index_t cookie, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_size_t offset, +\newline + ptl_hdr_data_t hdr_data ); +\layout Standard +\noindent +Values of the type +\family typewriter +ptl_ack_req_t +\family default + are used to control whether an acknowledgement should be sent when the + operation completes (i.e., when the data has been written to a memory descriptor + of the +\family typewriter +target +\family default + process). + The value +\family typewriter +PTL_ACK_REQ +\family default + requests an acknowledgement, the value +\family typewriter +PTL_NOACK_REQ +\family default + requests that no acknowledgement should be generated. +\layout Standard + +The +\emph on +PtlPut +\emph default + function initiates an asynchronous put operation. + There are several events associated with a put operation: initiation of + the send on the local node ( +\family typewriter +PTL_EVENT_SEND_START +\family default +), completion of the send on the local node ( +\family typewriter +PTL_EVENT_SEND_END +\family default + or +\family typewriter +PTL_EVENT_SEND_FAIL +\family default +), and, when the send completes successfully, the receipt of an acknowledgement + ( +\family typewriter +PTL_EVENT_ACK +\family default +) indicating that the operation was accepted by the target. + These events will be logged in the event queue associated with the memory + descriptor ( +\family typewriter +mem_desc +\family default +) used in the put operation. + Using a memory descriptor that does not have an associated event queue + results in these events being discarded. 
+ In this case, the application must have another mechanism (e.g., a higher + level protocol) for determining when it is safe to modify the memory region + associated with the memory descriptor. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +target +\family default + is not a valid process id. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor that describes the memory to be sent. + If the memory descriptor has an event queue associated with it, it will + be used to record events when the message has been sent (PTL_EVENT_SEND_START, + PTL_EVENT_SEND_END). + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ack_req +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +Controls whether an acknowledgement event is requested. + Acknowledgements are only sent when they are requested by the initiating + process +\series bold +and +\series default + the memory descriptor has an event queue +\series bold +and +\series default + the target memory descriptor enables them. + Allowed constants: +\family typewriter +PTL_ACK_REQ +\family default +, +\family typewriter +PTL_NOACK_REQ +\family default +. 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A process id for the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index in the remote Portal table. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index into the access control table of the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The match bits to use for message selection at the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The offset into the target memory descriptor (only used when the target + memory descriptor has the +\family typewriter +PTL_MD_MANAGE_REMOTE +\family default + option set). +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +hdr_data +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +64 bits of user data that can be included in message header. 
+ This data is written to an event queue entry at the target if an event + queue is present on the matching memory descriptor. +\end_inset + + + + +\end_inset + + +\layout Subsection + +PtlGet +\begin_inset LatexCommand \label{sec:get} + +\end_inset + + +\layout LyX-Code + +int PtlGet( ptl_handle_md_t mem_desc, +\newline + ptl_process_id_t target, +\newline + ptl_pt_index_t portal, +\newline + ptl_ac_index_t cookie, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_size_t offset ); +\layout Standard +\noindent +The +\emph on +PtlGet +\emph default + function initiates a remote read operation. + There are two event pairs associated with a get operation: when the data + is sent from the remote node, a +\family typewriter +PTL_EVENT_GET_{START|END} +\family default + event pair is registered on the remote node; and when the data is returned + from the remote node a +\family typewriter +PTL_EVENT_REPLY_{START|END} +\family default + event pair is registered on the local node. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +target +\family default + is not a valid process id. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor that describes the memory into which + the requested data will be received. 
+ The memory descriptor can have an event queue associated with it to record + events, such as when the message receive has started ( +\family typewriter +PTL_EVENT_REPLY +\family default +_ +\family typewriter +START +\family default +). +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +A process id for the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index in the remote Portal table. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The index into the access control table of the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The match bits to use for message selection at the target process. +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +The offset into the target memory descriptor (only used when the target + memory descriptor has the +\family typewriter +PTL_MD_MANAGE_REMOTE +\family default + option set). 
+\end_inset + + + + +\end_inset + + +\layout Section + +Summary +\layout Standard + + +\begin_inset LatexCommand \label{sec:summary} + +\end_inset + + We conclude this section by summarizing the names introduced by the Portals + 3.2 API. + We start by summarizing the names of the types introduced by the API. + This is followed by a summary of the functions introduced by the API, + which is followed by a summary of the function return codes. + Finally, we conclude with a summary of the other constant values introduced + by the API. +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:types} + +\end_inset + + presents a summary of the types defined by the Portals API. + The first column in this table gives the type name, the second column gives + a brief description of the type, the third column identifies the section + where the type is defined, and the fourth column lists the functions that + have arguments of this type. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Types Defined by the Portals 3.2 API +\begin_inset LatexCommand \label{tab:types} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\noindent + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold + Name +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold + Meaning +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold + Sect +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold + Functions +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ac_index_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +indexes for an access control table +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:index-type} + 
+\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlACEntry, PtlPut, PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ack_req_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +acknowledgement request types +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlPut +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +kinds of events +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +information about events +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlEQGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_seq_t +\end_inset + + +\begin_inset Text + +\layout Standard + +event sequence number +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlEQGet, PtlEQWait +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_any_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for any object +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + 
+\begin_inset Text + +\layout Standard +\noindent +PtlNIHandle +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_eq_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for event queues +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlEQAlloc, PtlEQFree, PtlEQGet, PtlEQWait, PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for memory descriptors +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAlloc, PtlMDUnlink, PtlMDUpdate, PtlMEAttach, PtlMEAttachAny, PtlMEInsert, + PtlPut, PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_me_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for match entries +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMEUnlink +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_ni_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +handles for network interfaces +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlNIFini, PtlNIStatus, PtlNIDist, PtlEQAlloc, PtlACEntry, PtlPut, + PtlGet +\end_inset + + + + +\begin_inset Text + +\layout 
Standard + + +\family typewriter +ptl_nid_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +node identifiers +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlGetId, PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pid_t +\end_inset + + +\begin_inset Text + +\layout Standard + +process identifier +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlGetId, PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset + + +\begin_inset Text + +\layout Standard + +user identifier +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlGetUid, PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ins_pos_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +insertion position (before or after) +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_interface_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +identifiers for network interfaces +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + 
+\family typewriter +ptl_match_bits_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +match (and ignore) bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mb-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlPut, PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_md_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +memory descriptors +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach, PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ni_fail_t +\end_inset + + +\begin_inset Text + +\layout Standard + +network interface-specific failures +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlEQGet, PtlEQWait +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +process identifiers +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:pid-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlGetId, PtlNIDist, PtlMEAttach, PtlMEAttachAny, PtlACEntry, PtlPut, PtlGet + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +indexes for Portal tables +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:index-type} + +\end_inset + + +\end_inset + 
+ +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +sizes +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:size-t} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlEQAlloc, PtlPut, PtlGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_sr_index_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +indexes for status registers +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIStatus +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_sr_value_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +values in status registers +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIStatus +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_unlink_t +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +unlink options +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMDAttach +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:func} + +\end_inset + + presents a summary of the functions defined by the Portals API. 
+ The first column in this table gives the name for the function, the second + column gives a brief description of the operation implemented by the function, + and the third column identifies the section where the function is defined. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Functions Defined by the Portals 3.2 API +\begin_inset LatexCommand \label{tab:func} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + +Name +\end_inset + + +\begin_inset Text + +\layout Standard + + Operation +\end_inset + + +\begin_inset Text + +\layout Standard + + Section +\end_inset + + + + +\begin_inset Text + +\layout Standard + +PtlACEntry +\end_inset + + +\begin_inset Text + +\layout Standard + + update an entry in an access control table +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ac} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlEQAlloc +\end_inset + + +\begin_inset Text + +\layout Standard + + create an event queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlEQGet +\end_inset + + +\begin_inset Text + +\layout Standard + + get the next event from an event queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlEQFree +\end_inset + + +\begin_inset Text + +\layout Standard + + release the resources for an event queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + 
+\begin_inset Text + +\layout Standard + + PtlEQWait +\end_inset + + +\begin_inset Text + +\layout Standard + + wait for a new event in an event queue +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlFini +\end_inset + + +\begin_inset Text + +\layout Standard + + shutdown the Portals API +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:init} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlGet +\end_inset + + +\begin_inset Text + +\layout Standard + + perform a get operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:datamovement} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlGetId +\end_inset + + +\begin_inset Text + +\layout Standard + + get the id for the current process +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:pid} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlInit +\end_inset + + +\begin_inset Text + +\layout Standard + + initialize the Portals API +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:init} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMDAttach +\end_inset + + +\begin_inset Text + +\layout Standard + + create a memory descriptor and attach it to a match entry +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMDBind +\end_inset + + +\begin_inset Text + +\layout Standard + + create a free-floating memory descriptor +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mdbind} + +\end_inset + + +\end_inset + 
+ + + +\begin_inset Text + +\layout Standard + + PtlMDUnlink +\end_inset + + +\begin_inset Text + +\layout Standard + + remove a memory descriptor from a list and release its resources +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMDUpdate +\end_inset + + +\begin_inset Text + +\layout Standard + + update a memory descriptor +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMEAttach +\end_inset + + +\begin_inset Text + +\layout Standard + +create a match entry and attach it to a Portal table +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +PtlMEAttachAny +\end_inset + + +\begin_inset Text + +\layout Standard + +create a match entry and attach it to a free Portal table entry +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:attachany} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMEInsert +\end_inset + + +\begin_inset Text + +\layout Standard + + create a match entry and insert it in a list +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlMEUnlink +\end_inset + + +\begin_inset Text + +\layout Standard + + remove a match entry from a list and release its resources +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIDist +\end_inset + + +\begin_inset Text + +\layout Standard + + get the distance to another process +\end_inset + + 
+\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIFini +\end_inset + + +\begin_inset Text + +\layout Standard + + shutdown a network interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIHandle +\end_inset + + +\begin_inset Text + +\layout Standard + + get the network interface handle for an object +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIInit +\end_inset + + +\begin_inset Text + +\layout Standard + + initialize a network interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlNIStatus +\end_inset + + +\begin_inset Text + +\layout Standard + + read a network interface status register +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + PtlPut +\end_inset + + +\begin_inset Text + +\layout Standard + + perform a put operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:datamovement} + +\end_inset + + +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:retcodes} + +\end_inset + + summarizes the return codes used by functions defined by the Portals API. + All of these constants are integer values. 
+ The first column of this table gives the symbolic name for the constant, + the second column gives a brief description of the value, and the third + column identifies the functions that can return this value. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Function Return Codes for the Portals 3.2 API +\begin_inset LatexCommand \label{tab:retcodes} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Name +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Meaning +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Functions +\series default + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_AC_INV_INDEX +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid access control table index +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlACEntry +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EQ_DROPPED +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +at least one event has been dropped +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlEQGet, PtlEQWait +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EQ_EMPTY +\end_inset + + +\begin_inset Text + +\layout Standard + +no events available in an event queue +\end_inset + + +\begin_inset Text + +\layout Standard + +\noindent + PtlEQGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +error during initialization or cleanup +\end_inset + + +\begin_inset Text + +\layout Standard 
+\noindent + PtlInit, PtlFini +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_ILL_MD +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +illegal memory descriptor values +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach, PtlMDBind, PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INIT_DUP +\end_inset + + +\begin_inset Text + +\layout Standard + +duplicate initialization of an interface +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INIT_INV +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +initialization of an invalid interface +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INUSE +\end_inset + + +\begin_inset Text + +\layout Standard + +the ME already has an MD +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_ASIZE +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid access control table size +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_EQ +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid event queue handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDUpdate, PtlEQFree, PtlEQGet +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_HANDLE +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIHandle 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_MD +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid memory descriptor handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDUnlink, PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_ME +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid match entry handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_NI +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid network interface handle +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIDist, PtlNIFini, PtlMDBind, PtlEQAlloc +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_PROC +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid process identifier +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlNIDist, PtlMEAttach, PtlMEInsert, PtlACEntry, PtlPut, PtlGet + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_PTINDEX +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid Portal table index +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlMEAttach +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_REG +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid status register +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlNIStatus +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_SR_INDX +\end_inset + + +\begin_inset Text + +\layout Standard + +invalid status register index +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlNIStatus 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_ML_TOOLONG +\end_inset + + +\begin_inset Text + +\layout Standard + +match list too long +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlMEAttach, PtlMEInsert +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_INUSE +\end_inset + + +\begin_inset Text + +\layout Standard + +MD has pending operations +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlMDUnlink +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOINIT +\end_inset + + +\begin_inset Text + +\layout Standard + +uninitialized API +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + +\emph on +all +\emph default +, except PtlInit +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOSPACE +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +insufficient memory +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlMDAttach, PtlMDBind, PtlEQAlloc, PtlMEAttach, PtlMEInsert + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOUPDATE +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + no update was performed +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + PtlMDUpdate +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_PT_FULL +\end_inset + + +\begin_inset Text + +\layout Standard + +Portal table is full +\end_inset + + +\begin_inset Text + +\layout Standard + +PtlMEAttachAny +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_OK +\end_inset + + +\begin_inset Text + +\layout Standard + + success +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent + +\emph on +all +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter 
+PTL_SEGV +\end_inset + + +\begin_inset Text + +\layout Standard + +addressing violation +\end_inset + + +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlNIStatus, PtlNIDist, PtlNIHandle, PtlMDBind, PtlMDUpdate, + PtlEQAlloc, PtlEQGet, PtlEQWait +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:oconsts} + +\end_inset + + summarizes the remaining constant values introduced by the Portals API. + The first column in this table presents the symbolic name for the constant, + the second column gives a brief description of the value, the third column + identifies the type for the value, the fourth column identifies the + section in which the value is introduced, and the fifth column identifies + other sections in which the value is referenced. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Other Constants Defined by the Portals 3.2 API +\begin_inset LatexCommand \label{tab:oconsts} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Name +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Meaning +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Base type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Intr. +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Ref. 
+\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_ACK_REQ +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +request an acknowledgement +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ack_req_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EQ_NONE +\end_inset + + +\begin_inset Text + +\layout Standard + +a NULL event queue handle +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_eq_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:mdupdate} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_GET_START +\end_inset + + +\begin_inset Text + +\layout Standard + +get event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_GET_END +\end_inset + + +\begin_inset Text + +\layout Standard + +get event end +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + 
+\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_GET_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +get event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_PUT_START +\end_inset + + +\begin_inset Text + +\layout Standard + +put event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_PUT_END +\end_inset + + +\begin_inset Text + +\layout Standard + +put event end +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_PUT_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +put event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + 
+\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_REPLY_START +\end_inset + + +\begin_inset Text + +\layout Standard + +reply event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_REPLY_END +\end_inset + + +\begin_inset Text + +\layout Standard + +reply event end +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_REPLY_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +reply event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_ACK_START +\end_inset + + +\begin_inset Text + +\layout Standard + +acknowledgement event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + 
+\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_ACK_END +\end_inset + + +\begin_inset Text + +\layout Standard + +acknowledgement event end +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_ACK_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +acknowledgement event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_SEND_START +\end_inset + + +\begin_inset Text + +\layout Standard + +send event start +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_SEND_END +\end_inset + + +\begin_inset Text + +\layout Standard + +send event end +\end_inset + + +\begin_inset Text 
+ +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_SEND_FAIL +\end_inset + + +\begin_inset Text + +\layout Standard + +send event fail +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_UNLINK +\end_inset + + +\begin_inset Text + +\layout Standard + +unlink event +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_PID_ANY +\end_inset + + +\begin_inset Text + +\layout Standard + +wildcard for process id fields +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pid_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter 
+PTL_NID_ANY +\end_inset + + +\begin_inset Text + +\layout Standard + +wildcard for node id fields +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_nid_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_UID_ANY +\end_inset + + +\begin_inset Text + +\layout Standard + +wildcard for user id +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_IFACE_DEFAULT +\end_inset + + +\begin_inset Text + +\layout Standard + +default interface +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_interface_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INS_AFTER +\end_inset + + +\begin_inset Text + +\layout Standard + +insert after +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ins_pos_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset 
+ + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INS_BEFORE +\end_inset + + +\begin_inset Text + +\layout Standard + +insert before +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ins_pos_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_ACK_DISABLE +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to disable acknowledgements +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_MANAGE_REMOTE +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to enable the use of remote offsets +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_OP_GET +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to enable get operations +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + 
+\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_OP_PUT +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to enable put operations +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_THRESH_INF +\end_inset + + +\begin_inset Text + +\layout Standard + +infinite threshold for a memory descriptor +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_TRUNCATE +\end_inset + + +\begin_inset Text + +\layout Standard + +a flag to enable truncation of a request +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOACK_REQ +\end_inset + + +\begin_inset Text + +\layout Standard + +request no acknowledgement +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ack_req_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_PT_INDEX_ANY +\end_inset + + +\begin_inset Text + +\layout Standard + +wildcard 
for Portal indexes +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_RETAIN +\end_inset + + +\begin_inset Text + +\layout Standard + +disable unlinking +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_unlink_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_SR_DROP_COUNT +\end_inset + + +\begin_inset Text + +\layout Standard + +index for the dropped count register +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_sr_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_UNLINK +\end_inset + + +\begin_inset Text + +\layout Standard + +enable unlinking +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_unlink_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Chapter + +The Semantics of Message Transmission +\begin_inset LatexCommand \label{sec:semantics} + +\end_inset + + +\layout Standard + +The Portals API uses four types of messages: put requests, 
acknowledgements, + get requests, and replies. + In this section, we describe the information passed on the wire for each + type of message. + We also describe how this information is used to process incoming messages. +\layout Section + +Sending Messages +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:put-wire} + +\end_inset + + summarizes the information that is transmitted for a put request. + The first column provides a descriptive name for the information, the second + column provides the type for this information, the third column identifies + the source of the information, and the fourth column provides additional + notes. + Most information that is transmitted is obtained directly from the +\emph on +PtlPut +\emph default + operation. + Notice that the handle for the memory descriptor used in the +\emph on +PtlPut +\emph default + operation is transmitted even though this value cannot be interpreted by + the target. + Any value other than +\family typewriter +PTL_MD_NONE +\family default + is interpreted as a request for an acknowledgement. 
+\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in a Put Request +\begin_inset LatexCommand \label{tab:put-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Information +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +\emph on +PtlPut +\emph default + arg +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset + + + + +\begin_inset Text + +\layout Standard + +operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +indicates a put request +\end_inset + + + + +\begin_inset Text + +\layout Standard + +initiator +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +local information +\end_inset + + + + +\begin_inset Text + +\layout Standard + +user +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +local information +\end_inset + + + + +\begin_inset Text + +\layout Standard + +target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + 
+\begin_inset Text + +\layout Standard + +portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ac_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +no ack if +\family typewriter +PTL_MD_NONE +\end_inset + + + + +\begin_inset Text + +\layout Standard + +length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +length 
+\family default + member +\end_inset + + + + +\begin_inset Text + +\layout Standard + +data +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family roman +\emph on +bytes +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +start +\family default + and +\family typewriter +length +\family default + members +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:ack-wire} + +\end_inset + + summarizes the information transmitted in an acknowledgement. + Most of the information is simply echoed from the put request. + Notice that the initiator and target are obtained directly from the put + request, but are swapped in generating the acknowledgement. + The only new piece of information in the acknowledgement is the manipulated + length which is determined as the put request is satisfied. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in an Acknowledgement +\begin_inset LatexCommand \label{tab:ack-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Information +\series default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Put Information +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset + + + + +\begin_inset Text + +\layout Standard + +operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + 
+\begin_inset Text + +\layout Standard + + indicates an acknowledgement +\end_inset + + + + +\begin_inset Text + +\layout Standard + + initiator +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + + target +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + + initiator +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + + portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset + + +\begin_inset Text + +\layout Standard + + match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + offset +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter + ptl_handle_md_t +\end_inset + + +\begin_inset Text + +\layout Standard + + memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + requested length +\end_inset + + +\begin_inset Text + +\layout 
Standard + + +\family typewriter + ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + + length +\end_inset + + +\begin_inset Text + +\layout Standard + + echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + + manipulated length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter + ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + + obtained from the operation +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:get-wire} + +\end_inset + + summarizes the information that is transmitted for a get request. + Like the information transmitted in a put request, most of the information + transmitted in a get request is obtained directly from the +\emph on +PtlGet +\emph default + operation. + Unlike put requests, get requests do not include the event queue handle. + In this case, the reply is generated whenever the operation succeeds and + the memory descriptor must not be unlinked until the reply is received. + As such, there is no advantage to explicitly sending the event queue handle. 
+\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in a Get Request +\begin_inset LatexCommand \label{tab:get-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Information +\series default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +\emph on +PtlGet +\emph default + argument +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset + + + + +\begin_inset Text + +\layout Standard + +operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +indicates a get operation +\end_inset + + + + +\begin_inset Text + +\layout Standard + +initiator +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +local information +\end_inset + + + + +\begin_inset Text + +\layout Standard + +user +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +local information +\end_inset + + + + +\begin_inset Text + +\layout Standard + +target +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + 
+ + + +\begin_inset Text + +\layout Standard + +portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ac_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +length +\family default + member +\end_inset + + + + +\end_inset + 
+ +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:reply-wire} + +\end_inset + + summarizes the information transmitted in a reply. + Like an acknowledgement, most of the information is simply echoed from + the get request. + The initiator and target are obtained directly from the get request, but + are swapped in generating the reply. + The only new pieces of information in the reply are the manipulated length + and the data, which are determined as the get request is satisfied. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in a Reply +\begin_inset LatexCommand \label{tab:reply-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular + + + + + + + + +\begin_inset Text + +\layout Standard + + +\series bold +Information +\series default + +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Get Information +\end_inset + + +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset + + + + +\begin_inset Text + +\layout Standard + +operation +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +indicates a reply +\end_inset + + + + +\begin_inset Text + +\layout Standard + +initiator +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + +target +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +target +\end_inset + + 
+\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset + + +\begin_inset Text + +\layout Standard + +initiator +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + + + +\begin_inset Text + +\layout Standard + +portal index +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset + + +\begin_inset Text + +\layout Standard + +portal index +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +match bits +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset + + +\begin_inset Text + +\layout Standard + +match bits +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +offset +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +offset +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +memory desc +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +requested length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout Standard + +length +\end_inset + + +\begin_inset Text + +\layout Standard + +echo +\end_inset + + + + +\begin_inset Text + +\layout Standard + +manipulated length +\end_inset + + +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset + + +\begin_inset Text + +\layout 
Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +obtained from the operation +\end_inset + + + + +\begin_inset Text + +\layout Standard + +data +\end_inset + + +\begin_inset Text + +\layout Standard + + +\emph on +bytes +\end_inset + + +\begin_inset Text + +\layout Standard + +\end_inset + + +\begin_inset Text + +\layout Standard + +obtained from the operation +\end_inset + + + + +\end_inset + + +\end_inset + + +\layout Section + +Receiving Messages +\begin_inset LatexCommand \label{sec:receiving} + +\end_inset + + +\layout Standard + +When an incoming message arrives on a network interface, the communication + system first checks that the target process identified in the request is + a valid process that has initialized the network interface (i.e., that the + target process has a valid Portal table). + If this test fails, the communication system discards the message and increment +s the dropped message count for the interface. + The remainder of the processing depends on the type of the incoming message. + Put and get messages are subject to access control checks and translation + (searching a match list), while acknowledgement and reply messages bypass + the access control checks and the translation step. +\layout Standard + +Acknowledgement messages include a handle for the memory descriptor used + in the original +\emph on +PtlPut +\emph default + operation. + This memory descriptor will identify the event queue where the event should + be recorded. + Upon receipt of an acknowledgement, the runtime system only needs to confirm + that the memory descriptor and event queue still exist and that there is + space for another event. + Should any of these conditions fail, the message is simply discarded + and the dropped message count for the interface is incremented. + Otherwise, the system builds an acknowledgement event from the information + in the acknowledgement message and adds it to the event queue. 
+\layout Standard + +Reception of reply messages is also relatively straightforward. + Each reply message includes a handle for a memory descriptor. + If this descriptor exists, it is used to receive the message. + A reply message will be dropped if the memory descriptor identified in + the request doesn't exist. + In this case, the dropped message count for the interface is + incremented. + This is the only reason for dropping reply messages. + Every memory descriptor accepts and truncates incoming reply messages, + eliminating the other potential reasons for rejecting a reply message. +\layout Standard + +The critical step in processing an incoming put or get request involves + mapping the request to a memory descriptor. + This step starts by using the Portal index in the incoming request to identify + a list of match entries. + This list of match entries is searched in order until a match entry is + found whose match criteria match the match bits in the incoming request + and whose memory descriptor accepts the request. +\layout Standard + +Because acknowledgement and reply messages are generated in response to requests + made by the process receiving these messages, the checks performed by the + runtime system for acknowledgements and replies are minimal. + In contrast, put and get messages are generated by remote processes and + the checks performed for these messages are more extensive. 
+ Incoming put or get messages may be rejected because: +\layout Itemize + +the Portal index supplied in the request is not valid; +\layout Itemize + +the cookie supplied in the request is not a valid access control entry; + +\layout Itemize + +the access control entry identified by the cookie does not match the identifier + of the requesting process; +\layout Itemize + +the access control entry identified by the cookie does not + match the Portal index supplied in the request; or +\layout Itemize + +the match bits supplied in the request do not match any of the match entries + with a memory descriptor that accepts the request. + +\layout Standard + +In all cases, if the message is rejected, the incoming message is discarded + and the dropped message count for the interface is incremented. +\layout Standard + +A memory descriptor may reject an incoming request for any of the following + reasons: +\layout Itemize + +the +\family typewriter +PTL_MD_PUT +\family default + or +\family typewriter +PTL_MD_GET +\family default + option has not been enabled and the operation is put or get, respectively; + +\layout Itemize + +the length specified in the request is too long for the memory descriptor + and the +\family typewriter +PTL_MD_TRUNCATE +\family default + option has not been enabled. +\layout Chapter + +Examples +\begin_inset LatexCommand \label{sec:examples} + +\end_inset + + +\layout Comment + +The examples presented in this chapter have not been updated to reflect + the current API. +\layout Standard + +In this section we present several examples to illustrate expected usage + patterns for the Portals 3.2 API. + The first example describes how to implement parallel servers using the + features of the Portals 3.2 API. + This example covers the access control list and the use of remote managed + offsets. + The second example presents an approach to dealing with dropped requests. + This example covers aspects of match lists and memory descriptors. 
+ The final example covers message reception in MPI. + This example illustrates more sophisticated uses of matching and a procedure + to update a memory descriptor. +\layout Section + +Parallel File Servers +\begin_inset LatexCommand \label{sec:expfs} + +\end_inset + + +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:file} + +\end_inset + + illustrates the logical structure of a parallel file server. + In this case, the parallel server consists of four servers that stripe + application data across four disks. + We would like to present applications with the illusion that the file server + is a single entity. + We will assume that all of the processes that constitute the parallel server + have the same user id. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename file.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 196pt + lyxheight 147pt +\end_inset + + +\layout Caption + +Parallel File Server +\begin_inset LatexCommand \label{fig:file} + +\end_inset + + +\end_inset + + +\layout Standard + +When an application establishes a connection to the parallel file server, + it will allocate a Portal and access control list entry for communicating + with the server. + The access control list entry will include the Portal and match any process + in the parallel file server, so all of the file server processes will + have access to the portal. + The Portal information and access control entry will be sent to the file + server at this time. + If the application and server need to have multiple, concurrent I/O operations, + they can use additional portals or match entries to keep the operations + from interfering with one another. 
+\layout Standard + +When an application initiates an I/O operation, it first builds a memory + descriptor that describes the memory region involved in the operation. + This memory descriptor will enable the appropriate operation (put for read + operations and get for write operations) and enable the use of remote offsets + (this lets the servers decide where their data should be placed in the + memory region). + After creating the memory descriptor and linking it into the appropriate + Portal entry, the application sends a read or write request (using +\emph on +PtlPut +\emph default +) to one of the file server processes. + The file server processes can then use put or get operations with the appropria +te offsets to fill or retrieve the contents of the application's buffer. + To know when the operation has completed, the application can add an event + queue to the memory descriptor and add up the lengths of the remote operations + until the sum is the size of the requested I/O operation. +\layout Section + +Dealing with Dropped Requests +\begin_inset LatexCommand \label{sec:exdrop} + +\end_inset + + +\layout Standard + +If a process does not anticipate unexpected requests, they will be discarded. + Applications using the Portals API can query the dropped count for the + interface to determine the number of requests that have been dropped (see + Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + +). + While this approach minimizes resource consumption, it does not provide + information that might be critical in debugging the implementation of a + higher level protocol. +\layout Standard + +To keep track of more information about dropped requests, we use a memory + descriptor that truncates each incoming request to zero bytes and logs + the +\begin_inset Quotes eld +\end_inset + +dropped +\begin_inset Quotes erd +\end_inset + + operations in an event queue. 
+ Note that the operations are not dropped in the Portals sense, because + the operation succeeds. +\layout Standard + +The following code fragment illustrates an implementation of this approach. + In this case, we assume that a thread is launched to execute the function + +\family typewriter +watch_drop +\family default +. + This code starts by building an event queue to log truncated operations + and a memory descriptor to truncate the incoming requests. + This example only captures +\begin_inset Quotes eld +\end_inset + +dropped +\begin_inset Quotes erd +\end_inset + + requests for a single portal. + In a more realistic situation, the memory descriptor would be appended + to the match list for every portal. + We also assume that the thread is capable of keeping up with the +\begin_inset Quotes eld +\end_inset + +dropped +\begin_inset Quotes erd +\end_inset + + requests. + If this is not the case, we could use a finite threshold on the memory + descriptor to capture the first few dropped requests. 
+\layout LyX-Code + + +\size small +#include +\newline +#include +\newline +#include +\newline + +\newline +#define DROP_SIZE 32 /* number of dropped requests to track */ +\newline + +\newline +int watch_drop( ptl_handle_ni_t ni, ptl_pt_index_t index ) { +\newline + ptl_handle_eq_t drop_events; +\newline + ptl_event_t event; +\newline + ptl_handle_md_t drop_em; +\newline + ptl_md_t drop_desc; +\newline + ptl_process_id_t any_proc; +\newline + ptl_handle_me_t match_any; +\newline + +\newline + /* create the event queue */ +\newline + if( PtlEQAlloc(ni, DROP_SIZE, &drop_events) != PTL_OK ) { +\newline + fprintf( stderr, "Couldn't create the event queue +\backslash +n" ); +\newline + exit( 1 ); +\newline + } +\newline + +\newline + /* build a match entry */ +\newline + any_proc.nid = PTL_ID_ANY; +\newline + any_proc.pid = PTL_ID_ANY; +\newline + PtlMEAttach( index, any_proc, 0, ~(ptl_match_bits_t)0, PTL_RETAIN, +\newline + &match_any ); +\newline + +\newline + /* create the memory descriptor */ +\newline + drop_desc.start = NULL; +\newline + drop_desc.length = 0; +\newline + drop_desc.threshold = PTL_MD_THRESH_INF; +\newline + drop_desc.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_TRUNCATE; +\newline + drop_desc.user_ptr = NULL; +\newline + drop_desc.eventq = drop_events; +\newline + if( PtlMDAttach(match_any, drop_desc, &drop_em) != PTL_OK ) { +\newline + fprintf( stderr, "Couldn't create the memory descriptor +\backslash +n" ); +\newline + exit( 1 ); +\newline + } +\newline + +\newline + /* watch for "dropped" requests */ +\newline + while( 1 ) { +\newline + if( PtlEQWait( drop_events, &event ) != PTL_OK ) break; +\newline + fprintf( stderr, "Dropped request from gid = %d, rid = %d +\backslash +n", event.initiator.gid, + event.initiator.rid ); +\newline + } +\newline +} +\layout Section + +Message Transmission in MPI +\begin_inset LatexCommand \label{sec:exmpi} + +\end_inset + + +\layout Standard + +We conclude this section with a fairly extensive example that describes + an approach to 
implementing message transmission for MPI. + Like many MPI implementations, we distinguish two message transmission + protocols: a short message protocol and a long message protocol. + We use the constant +\family typewriter +MPI_LONG_LENGTH +\family default + to determine the size of a long message. +\layout Standard + +For small messages, the sender simply sends the message and presumes that + the message will be received (i.e., the receiver has allocated a memory region + to receive the message body). + For large messages, the sender also sends the message, but does not presume + that the message body will be saved. + Instead, the sender builds a memory descriptor for the message and enables + get operations on this descriptor. + If the target does not save the body of the message, it will record an + event for the put operation. + When the process later issues a matching MPI receive, it will perform a + get operation to retrieve the body of the message. +\layout Standard + +To facilitate receive side matching based on the protocol, we use the most + significant bit in the match bits to indicate the protocol: 1 for long + messages and 0 for short messages. +\layout Standard + +The following code presents a function that implements the send side of + the protocol. + The global variable +\family typewriter +EndGet +\family default + is the last match entry attached to the Portal index used for posting long + messages. + This entry does not match any incoming requests (i.e., the memory descriptor + rejects all get operations) and is built during initialization of the MPI + library. + The other global variable, +\family typewriter +MPI_NI +\family default +, is a handle for the network interface used by the MPI implementation. 
+\layout LyX-Code + + +\size small +extern ptl_handle_me_t EndGet; +\newline +extern ptl_handle_ni_t MPI_NI; +\newline + +\newline +void MPIsend( void *buf, ptl_size_t len, void *data, ptl_handle_eq_t eventq, +\newline + ptl_process_id_t target, ptl_match_bits_t match ) +\newline +{ +\newline + ptl_handle_md_t send_handle; +\newline + ptl_md_t mem_desc; +\newline + ptl_ack_req_t want_ack; +\newline + +\newline + mem_desc.start = buf; +\newline + mem_desc.length = len; +\newline + mem_desc.threshold = 1; +\newline + mem_desc.options = PTL_MD_GET_OP; +\newline + mem_desc.user_ptr = data; +\newline + mem_desc.eventq = eventq; +\newline + +\newline + if( len >= MPI_LONG_LENGTH ) { +\newline + ptl_handle_me_t me_handle; +\newline + +\newline + /* add a match entry to the end of the get list */ +\newline + PtlMEInsert( target, match, 0, PTL_UNLINK, PTL_INS_BEFORE, EndGet, + &me_handle ); +\newline + PtlMDAttach( me_handle, mem_desc, PTL_UNLINK, NULL ); +\newline + +\newline + /* we want an ack for long messages */ +\newline + want_ack = PTL_ACK_REQ; +\newline + +\newline + /* set the protocol bit to indicate that this is a long message + */ +\newline + match |= (ptl_match_bits_t)1<<63; +\newline + } else { +\newline + /* we don't want an ack for short messages */ +\newline + want_ack = PTL_NOACK_REQ; +\newline + +\newline + /* set the protocol bit to indicate that this is a short message + */ +\newline + match &= ~((ptl_match_bits_t)1<<63); +\newline + } +\newline + +\newline + /* create a memory descriptor and send it */ +\newline + PtlMDBind( MPI_NI, mem_desc, &send_handle ); +\newline + PtlPut( send_handle, want_ack, target, MPI_SEND_PINDEX, MPI_AINDEX, match, + 0 ); +\newline +} +\layout Standard + +The +\emph on +MPIsend +\emph default + function returns as soon as the message has been scheduled for transmission. + The event queue argument, +\family typewriter +eventq +\family default +, can be used to determine the disposition of the message. 
+ Assuming that +\family typewriter +eventq +\family default + is not +\family typewriter +PTL_EQ_NONE +\family default +, a +\family typewriter +PTL_EVENT_SENT +\family default + event will be recorded for each message as the message is transmitted. + For small messages, this is the only event that will be recorded in +\family typewriter +eventq +\family default +. + In contrast, long messages include an explicit request for an acknowledgement. + If the +\family typewriter +target +\family default + process has posted a matching receive, the acknowledgement will be sent + as the message is received. + If a matching receive has not been posted, the message will be discarded + and no acknowledgement will be sent. + When the +\family typewriter +target +\family default + process later issues a matching receive, the receive will be translated + into a get operation and a +\family typewriter +PTL_EVENT_GET +\family default + event will be recorded in +\family typewriter +eventq +\family default +. +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:mpi} + +\end_inset + + illustrates the organization of the match list used for receiving MPI messages. + The initial entries (not shown in this figure) would be used to match the + MPI receives that have been preposted by the application. + The preposted receives are followed by a match entry, +\emph on +RcvMark +\emph default +, that marks the boundary between preposted receives and the memory descriptors + used for +\begin_inset Quotes eld +\end_inset + +unexpected +\begin_inset Quotes erd +\end_inset + + messages. + The +\emph on +RcvMark +\emph default + entry is followed by a small collection of match entries that match unexpected + +\begin_inset Quotes eld +\end_inset + +short +\begin_inset Quotes erd +\end_inset + + messages, i.e., messages that have a 0 in the most significant bit of their + match bits. 
+ The memory descriptors associated with these match entries will append + the incoming message to the associated memory descriptor and record an + event in an event queue for unexpected messages. + The unexpected short message matching entries are followed by a match entry + that will match messages that were not matched by the preceding match entries, + i.e., the unexpected long messages. + The memory descriptor associated with this match entry truncates the message + body and records an event in the event queue for unexpected messages. + Note that all of the memory descriptors used for unexpected messages share + a common event queue. + This makes it possible to process the unexpected messages in the order + in which they arrived, regardless of message length. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename mpi.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 389pt + lyxheight 284pt +\end_inset + + +\layout Caption + +Message Reception in MPI +\begin_inset LatexCommand \label{fig:mpi} + +\end_inset + + +\end_inset + + +\layout Standard + +When the local MPI process posts an MPI receive, we must first search the + events in the unexpected message queue to see if a matching message has already + arrived. + If no matching message is found, a match entry for the receive is inserted + before the +\emph on +RcvMark +\emph default + entry--after the match entries for all of the previously posted receives + and before the match entries for the unexpected messages. + This ensures that preposted receives are matched in the order that they + were posted (a requirement of MPI). 
+ +\layout Standard + +While this strategy respects the temporal semantics of MPI, it introduces + a race condition: a matching message might arrive after the events in the + unexpected message queue have been searched, but before the match entry + for the receive has been inserted in the match list. + +\layout Standard + +To avoid this race condition we start by setting the +\family typewriter +threshold +\family default + of the memory descriptor to 0, making the descriptor inactive. + We then insert the match entry into the match list and proceed to search + the events in the unexpected message queue. + A matching message that arrives as we are searching the unexpected message + queue will not be accepted by the memory descriptor and, if not matched + by an earlier match list element, will add an event to the unexpected message + queue. + After searching the events in the unexpected message queue, we update the + memory descriptor, setting the threshold to 1 to activate the memory descriptor. + This update is predicated on the condition that the unexpected message + queue is empty. + We repeat the process of searching the unexpected message queue until the + update succeeds. +\layout Standard + +The following code fragment illustrates this approach. + Because events must be removed from the unexpected message queue to be + examined, this code fragment assumes the existence of a user-managed event + list, +\family typewriter +Rcvd +\family default +, for the events that have already been removed from the unexpected message + queue. + In an effort to keep the example focused on the basic protocol, we have + omitted the code that would be needed to manage the memory descriptors + used for unexpected short messages. + In particular, we simply leave messages in these descriptors until they + are received by the application. 
+ In a robust implementation, we would introduce code to ensure that short + unexpected messages are removed from these memory descriptors so that they + can be re-used. +\layout LyX-Code + + +\size small +extern ptl_handle_eq_t UnexpQueue; +\newline +extern ptl_handle_me_t RcvMark; +\newline +extern ptl_handle_me_t ShortMatch; +\newline + +\newline +typedef struct event_list_tag { +\newline + ptl_event_t event; +\newline + struct event_list_tag* next; +\newline +} event_list; +\newline + +\newline +extern event_list Rcvd; +\newline + +\newline +void AppendRcvd( ptl_event_t event ) +\newline +{ +\newline + /* append an event onto the Rcvd list */ +\newline +} +\newline + +\newline +int SearchRcvd( void *buf, ptl_size_t len, ptl_process_id_t sender, ptl_match_bi +ts_t match, +\newline + ptl_match_bits_t ignore, ptl_event_t *event ) +\newline +{ +\newline + /* Search the Rcvd event queue, looking for a message that matches the + requested message. +\newline + * If one is found, remove the event from the Rcvd list and return it. 
+ */ +\newline +} +\newline + +\newline +typedef enum { RECEIVED, POSTED } receive_state; +\newline + +\newline +receive_state CopyMsg( void *buf, ptl_size_t &length, ptl_event_t event, + ptl_md_t md_buf ) +\newline +{ +\newline + ptl_md_t md_buf; +\newline + ptl_handle_me_t me_handle; +\newline + +\newline + if( event.rlength >= MPI_LONG_LENGTH ) { +\newline + PtlMDBind( MPI_NI, md_buf, &md_handle ); +\newline + PtlGet( event.initiator, MPI_GET_PINDEX, 0, event.match_bits, MPI_AINDEX, + md_handle ); +\newline + return POSTED; +\newline + } else { +\newline + /* copy the message */ +\newline + if( event.mlength < *length ) *length = event.mlength; +\newline + memcpy( buf, (char*)event.md_desc.start+event.offset, *length ); +\newline + return RECEIVED; +\newline + } +\newline +} +\newline + +\newline +receive_state MPIreceive( void *buf, ptl_size_t &len, void *MPI_data, ptl_handle +_eq_t eventq, +\newline + ptl_process_id_t sender, ptl_match_bits_t match, + ptl_match_bits_t ignore ) +\newline +{ +\newline + ptl_md_t md_buf; +\newline + ptl_handle_md_t md_handle; +\newline + ptl_handle_me_t me_handle; +\newline + ptl_event_t event; +\newline + +\newline + /* build a memory descriptor for the receive */ +\newline + md_buf.start = buf; +\newline + md_buf.length = *len; +\newline + md_buf.threshold = 0; /* temporarily disabled */ +\newline + md_buf.options = PTL_MD_PUT_OP; +\newline + md_buf.user_ptr = MPI_data; +\newline + md_buf.eventq = eventq; +\newline + +\newline + /* see if we have already received the message */ +\newline + if( SearchRcvd(buf, len, sender, match, ignore, &event) ) +\newline + return CopyMsg( buf, len, event, md_buf ); +\newline + +\newline + /* create the match entry and attach the memory descriptor */ +\newline + PtlMEInsert(sender, match, ignore, PTL_UNLINK, PTL_INS_BEFORE, RcvMark, + &me_handle); +\newline + PtlMDAttach( me_handle, md_buf, PTL_UNLINK, &md_handle ); +\newline + +\newline + md_buf.threshold = 1; +\newline + do +\newline + if( 
PtlEQGet( UnexpQueue, &event ) != PTL_EQ_EMPTY ) { +\newline + if( MPIMatch(event, match, ignore, sender) ) { +\newline + return CopyMsg( buf, len, (char*)event.md_desc.start+event.offset, + md_buf ); +\newline + } else { +\newline + AppendRcvd( event ); +\newline + } +\newline + } +\newline + while( PtlMDUpdate(md_handle, NULL, &md_buf, unexp_queue) == PTL_NOUPDATE + ); +\newline + return POSTED; +\newline +} +\layout Chapter* + +Acknowledgments +\layout Standard + +Several people have contributed to the philosophy, design, and implementation + of the Portals message passing architecture as it has evolved. + We acknowledge the following people for their contributions: Al Audette, + Lee Ann Fisk, David Greenberg, Tramm Hudson, Gabi Istrail, Chu Jong, Mike + Levenhagen, Jim Otto, Mark Sears, Lance Shuler, Mack Stallcup, Jeff VanDyke, + Dave van Dresser, Lee Ward, and Stephen Wheat. + +\layout Standard + + +\begin_inset LatexCommand \BibTeX[ieee]{portals3} + +\end_inset + + +\the_end diff --git a/lnet/doc/put.fig b/lnet/doc/put.fig new file mode 100644 index 0000000..5235b6d --- /dev/null +++ b/lnet/doc/put.fig @@ -0,0 +1,32 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 1350 900 2175 1200 +4 0 0 100 0 0 10 0.0000 0 105 825 1350 1200 Transmission\001 +4 0 0 100 0 0 10 0.0000 0 105 285 1620 1050 Data\001 +-6 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2700 1275 2700 1725 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 900 525 2700 1200 +2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5 + 0 300 1200 300 1200 2250 0 2250 0 300 +2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5 + 2400 300 3600 300 3600 2250 2400 2250 2400 300 +2 1 1 1 0 7 100 0 -1 4.000 0 0 7 1 0 2 + 0 0 1.00 60.00 120.00 + 2699 1788 899 1938 +4 0 0 100 0 0 10 0.0000 0 105 720 2775 1650 Translation\001 +4 1 0 100 0 0 10 0.0000 0 135 555 1800 2025 Optional\001 +4 1 0 100 0 0 10 0.0000 0 135 1170 1800 2175 Acknowledgement\001 +4 0 0 100 0 0 10 0.0000 0 
105 405 2850 1500 Portal\001 +4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001 +4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001 diff --git a/lnet/include/.cvsignore b/lnet/include/.cvsignore new file mode 100644 index 0000000..d45f796 --- /dev/null +++ b/lnet/include/.cvsignore @@ -0,0 +1,4 @@ +config.h +stamp-h +stamp-h1 +stamp-h.in diff --git a/lnet/include/Makefile.am b/lnet/include/Makefile.am new file mode 100644 index 0000000..2cf7f99 --- /dev/null +++ b/lnet/include/Makefile.am @@ -0,0 +1,8 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +SUBDIRS = portals linux +EXTRA_DIST = config.h.in +include $(top_srcdir)/Rules diff --git a/lnet/include/config.h.in b/lnet/include/config.h.in new file mode 100644 index 0000000..b05d0c4 --- /dev/null +++ b/lnet/include/config.h.in @@ -0,0 +1,11 @@ +/* ../include/config.h.in. Generated automatically from configure.in by autoheader. */ + +/* Define if you have the readline library (-lreadline). */ +#undef HAVE_LIBREADLINE + +/* Name of package */ +#undef PACKAGE + +/* Version number of package */ +#undef VERSION + diff --git a/lnet/include/linux/Makefile.am b/lnet/include/linux/Makefile.am new file mode 100644 index 0000000..6a65cb5 --- /dev/null +++ b/lnet/include/linux/Makefile.am @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include $(top_srcdir)/Rules + +linuxincludedir = $(includedir)/linux + +linuxinclude_HEADERS=kp30.h portals_lib.h diff --git a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h new file mode 100644 index 0000000..6d7f3f3 --- /dev/null +++ b/lnet/include/linux/kp30.h @@ -0,0 +1,943 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef _KP30_INCLUDED +#define _KP30_INCLUDED + + +#define PORTAL_DEBUG + +#ifndef offsetof +# define offsetof(typ,memb) ((int)((char *)&(((typ *)0)->memb))) +#endif + +#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) + +#ifndef CONFIG_SMP +# define smp_processor_id() 0 +#endif + +/* + * Debugging + */ +extern unsigned int portal_subsystem_debug; +extern unsigned int portal_stack; +extern unsigned int portal_debug; +extern unsigned int portal_printk; +/* Debugging subsystems (8 bit ID) + * + * If you add debug subsystem #32, you need to send email to phil, because + * you're going to break kernel subsystem debug filtering. */ +#define S_UNDEFINED (0 << 24) +#define S_MDC (1 << 24) +#define S_MDS (2 << 24) +#define S_OSC (3 << 24) +#define S_OST (4 << 24) +#define S_CLASS (5 << 24) +#define S_OBDFS (6 << 24) /* obsolete */ +#define S_LLITE (7 << 24) +#define S_RPC (8 << 24) +#define S_EXT2OBD (9 << 24) /* obsolete */ +#define S_PORTALS (10 << 24) +#define S_SOCKNAL (11 << 24) +#define S_QSWNAL (12 << 24) +#define S_PINGER (13 << 24) +#define S_FILTER (14 << 24) +#define S_TRACE (15 << 24) /* obsolete */ +#define S_ECHO (16 << 24) +#define S_LDLM (17 << 24) +#define S_LOV (18 << 24) +#define S_GMNAL (19 << 24) +#define S_PTLROUTER (20 << 24) +#define S_COBD (21 << 24) +#define S_PTLBD (22 << 24) +#define S_LOG (23 << 24) + +/* If you change these values, please keep portals/linux/utils/debug.c + * up to date! 
*/ + +/* Debugging masks (24 bits, non-overlapping) */ +#define D_TRACE (1 << 0) /* ENTRY/EXIT markers */ +#define D_INODE (1 << 1) +#define D_SUPER (1 << 2) +#define D_EXT2 (1 << 3) /* anything from ext2_debug */ +#define D_MALLOC (1 << 4) /* print malloc, free information */ +#define D_CACHE (1 << 5) /* cache-related items */ +#define D_INFO (1 << 6) /* general information */ +#define D_IOCTL (1 << 7) /* ioctl related information */ +#define D_BLOCKS (1 << 8) /* ext2 block allocation */ +#define D_NET (1 << 9) /* network communications */ +#define D_WARNING (1 << 10) +#define D_BUFFS (1 << 11) +#define D_OTHER (1 << 12) +#define D_DENTRY (1 << 13) +#define D_PORTALS (1 << 14) /* ENTRY/EXIT markers */ +#define D_PAGE (1 << 15) /* bulk page handling */ +#define D_DLMTRACE (1 << 16) +#define D_ERROR (1 << 17) /* CERROR(...) == CDEBUG (D_ERROR, ...) */ +#define D_EMERG (1 << 18) /* CEMERG(...) == CDEBUG (D_EMERG, ...) */ +#define D_HA (1 << 19) /* recovery and failover */ +#define D_RPCTRACE (1 << 20) /* for distributed debugging */ +#define D_VFSTRACE (1 << 21) + +#ifndef __KERNEL__ +#define THREAD_SIZE 8192 +#endif +#ifdef __ia64__ +#define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) +#else +#define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0) & \ + (THREAD_SIZE - 1))) +#endif + +#ifdef __KERNEL__ +#define CHECK_STACK(stack) \ + do { \ + if ((stack) > 3*THREAD_SIZE/4 && (stack) > portal_stack) { \ + portals_debug_msg(DEBUG_SUBSYSTEM, D_ERROR, \ + __FILE__, __FUNCTION__, __LINE__, \ + (stack), \ + "maximum lustre stack %u\n", \ + portal_stack = (stack)); \ + /*panic("LBUG");*/ \ + } \ + } while (0) +#else +#define CHECK_STACK(stack) do { } while(0) +#endif + +#if 1 +#define CDEBUG(mask, format, a...) 
\ +do { \ + CHECK_STACK(CDEBUG_STACK()); \ + if (!(mask) || ((mask) & (D_ERROR | D_EMERG)) || \ + (portal_debug & (mask) && \ + portal_subsystem_debug & (1 << (DEBUG_SUBSYSTEM >> 24)))) \ + portals_debug_msg(DEBUG_SUBSYSTEM, mask, \ + __FILE__, __FUNCTION__, __LINE__, \ + CDEBUG_STACK(), format , ## a); \ +} while (0) + +#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a) +#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a) +#define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a) + +#define GOTO(label, rc) \ +do { \ + long GOTO__ret = (long)(rc); \ + CDEBUG(D_TRACE,"Process leaving via %s (rc=%lu : %ld : %lx)\n", \ + #label, (unsigned long)GOTO__ret, (signed long)GOTO__ret,\ + (signed long)GOTO__ret); \ + goto label; \ +} while (0) + +#define RETURN(rc) \ +do { \ + typeof(rc) RETURN__ret = (rc); \ + CDEBUG(D_TRACE, "Process leaving (rc=%lu : %ld : %lx)\n", \ + (long)RETURN__ret, (long)RETURN__ret, (long)RETURN__ret);\ + return RETURN__ret; \ +} while (0) + +#define ENTRY \ +do { \ + CDEBUG(D_TRACE, "Process entered\n"); \ +} while (0) + +#define EXIT \ +do { \ + CDEBUG(D_TRACE, "Process leaving\n"); \ +} while(0) +#else +#define CDEBUG(mask, format, a...) do { } while (0) +#define CWARN(format, a...) do { } while (0) +#define CERROR(format, a...) printk("<3>" format, ## a) +#define CEMERG(format, a...) 
printk("<0>" format, ## a) +#define GOTO(label, rc) do { (void)(rc); goto label; } while (0) +#define RETURN(rc) return (rc) +#define ENTRY do { } while (0) +#define EXIT do { } while (0) +#endif + + +#ifdef __KERNEL__ +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define schedule_work schedule_task +#define prepare_work(wq,cb,cbdata) \ +do { \ + INIT_TQUEUE((wq), 0, 0); \ + PREPARE_TQUEUE((wq), (cb), (cbdata)); \ +} while (0) + +#define ll_invalidate_inode_pages invalidate_inode_pages +#define PageUptodate Page_Uptodate +#define our_recalc_sigpending(current) recalc_sigpending(current) +#define num_online_cpus() smp_num_cpus +static inline void our_cond_resched(void) +{ + if (current->need_resched) + schedule (); +} + +#else + +#define prepare_work(wq,cb,cbdata) \ +do { \ + INIT_WORK((wq), (void *)(cb), (void *)(cbdata)); \ +} while (0) +#define ll_invalidate_inode_pages(inode) invalidate_inode_pages((inode)->i_mapping) +#define wait_on_page wait_on_page_locked +#define our_recalc_sigpending(current) recalc_sigpending() +#define strtok(a,b) strpbrk(a, b) +static inline void our_cond_resched(void) +{ + cond_resched(); +} +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) */ + +#ifdef PORTAL_DEBUG +extern void kportal_assertion_failed(char *expr,char *file,char *func,int line); +#define LASSERT(e) ((e) ? 
0 : kportal_assertion_failed( #e , __FILE__, \ + __FUNCTION__, __LINE__)) +#else +#define LASSERT(e) +#endif + +#ifdef __arch_um__ +#define LBUG_WITH_LOC(file, func, line) \ +do { \ + CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n"); \ + portals_debug_dumplog(); \ + portals_run_lbug_upcall(file, func, line); \ + panic("LBUG"); \ +} while (0) +#else +#define LBUG_WITH_LOC(file, func, line) \ +do { \ + CEMERG("LBUG\n"); \ + portals_debug_dumplog(); \ + portals_run_lbug_upcall(file, func, line); \ + set_task_state(current, TASK_UNINTERRUPTIBLE); \ + schedule(); \ +} while (0) +#endif /* __arch_um__ */ + +#define LBUG() LBUG_WITH_LOC(__FILE__, __FUNCTION__, __LINE__) + +/* + * Memory + */ +#ifdef PORTAL_DEBUG +extern atomic_t portal_kmemory; + +# define portal_kmem_inc(ptr, size) \ +do { \ + atomic_add(size, &portal_kmemory); \ +} while (0) + +# define portal_kmem_dec(ptr, size) do { \ + atomic_sub(size, &portal_kmemory); \ +} while (0) + +#else +# define portal_kmem_inc(ptr, size) do {} while (0) +# define portal_kmem_dec(ptr, size) do {} while (0) +#endif /* PORTAL_DEBUG */ + +#define PORTAL_VMALLOC_SIZE 16384 + +#define PORTAL_ALLOC(ptr, size) \ +do { \ + long s = size; \ + LASSERT (!in_interrupt()); \ + if (s > PORTAL_VMALLOC_SIZE) \ + (ptr) = vmalloc(s); \ + else \ + (ptr) = kmalloc(s, GFP_NOFS); \ + if ((ptr) == NULL) \ + CERROR("PORTALS: out of memory at %s:%d (tried to alloc" \ + " '" #ptr "' = %ld)\n", __FILE__, __LINE__, s); \ + else { \ + portal_kmem_inc((ptr), s); \ + memset((ptr), 0, s); \ + } \ + CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +#define PORTAL_FREE(ptr, size) \ +do { \ + long s = (size); \ + if ((ptr) == NULL) { \ + CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + if (s > PORTAL_VMALLOC_SIZE) \ + vfree(ptr); \ + else \ + kfree(ptr); \ + portal_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, 
"kfreed '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +#define PORTAL_SLAB_ALLOC(ptr, slab, size) \ +do { \ + long s = (size); \ + LASSERT (!in_interrupt()); \ + (ptr) = kmem_cache_alloc((slab), SLAB_KERNEL); \ + if ((ptr) == NULL) { \ + CERROR("PORTALS: out of memory at %s:%d (tried to alloc" \ + " '" #ptr "' from slab '" #slab "')\n", __FILE__, \ + __LINE__); \ + } else { \ + portal_kmem_inc((ptr), s); \ + memset((ptr), 0, s); \ + } \ + CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +#define PORTAL_SLAB_FREE(ptr, slab, size) \ +do { \ + long s = (size); \ + if ((ptr) == NULL) { \ + CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + memset((ptr), 0x5a, s); \ + kmem_cache_free((slab), ptr); \ + portal_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +/* ------------------------------------------------------------------- */ + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + +#define PORTAL_SYMBOL_REGISTER(x) inter_module_register(#x, THIS_MODULE, &x) +#define PORTAL_SYMBOL_UNREGISTER(x) inter_module_unregister(#x) + +#define PORTAL_SYMBOL_GET(x) ((typeof(&x))inter_module_get(#x)) +#define PORTAL_SYMBOL_PUT(x) inter_module_put(#x) + +#define PORTAL_MODULE_USE MOD_INC_USE_COUNT +#define PORTAL_MODULE_UNUSE MOD_DEC_USE_COUNT +#else + +#define PORTAL_SYMBOL_REGISTER(x) +#define PORTAL_SYMBOL_UNREGISTER(x) + +#define PORTAL_SYMBOL_GET(x) symbol_get(x) +#define PORTAL_SYMBOL_PUT(x) symbol_put(x) + +#define PORTAL_MODULE_USE try_module_get(THIS_MODULE) +#define PORTAL_MODULE_UNUSE module_put(THIS_MODULE) + +#endif + +/******************************************************************************/ +/* Kernel Portals Router interface */ + +typedef void (*kpr_fwd_callback_t)(void 
*arg, int error); // completion callback + +/* space for routing targets to stash "stuff" in a forwarded packet */ +typedef union { + long long _alignment; + void *_space[16]; /* scale with CPU arch */ +} kprfd_scratch_t; + +/* Kernel Portals Routing Forwarded message Descriptor */ +typedef struct { + struct list_head kprfd_list; /* stash in queues (routing target can use) */ + ptl_nid_t kprfd_target_nid; /* final destination NID */ + ptl_nid_t kprfd_gateway_nid; /* gateway NID */ + int kprfd_nob; /* # message bytes (including header) */ + int kprfd_niov; /* # message frags (including header) */ + struct iovec *kprfd_iov; /* message fragments */ + void *kprfd_router_arg; // originating NAL's router arg + kpr_fwd_callback_t kprfd_callback; /* completion callback */ + void *kprfd_callback_arg; /* completion callback arg */ + kprfd_scratch_t kprfd_scratch; // scratchpad for routing targets +} kpr_fwd_desc_t; + +typedef void (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd); + +/* NAL's routing interface (Kernel Portals Routing Nal Interface) */ +typedef const struct { + int kprni_nalid; /* NAL's id */ + void *kprni_arg; /* Arg to pass when calling into NAL */ + kpr_fwd_t kprni_fwd; /* NAL's forwarding entrypoint */ +} kpr_nal_interface_t; + +/* Router's routing interface (Kernel Portals Routing Router Interface) */ +typedef const struct { + /* register the calling NAL with the router and get back the handle for + * subsequent calls */ + int (*kprri_register) (kpr_nal_interface_t *nal_interface, + void **router_arg); + + /* ask the router to find a gateway that forwards to 'nid' and is a peer + * of the calling NAL */ + int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, + ptl_nid_t *gateway_nid); + + /* hand a packet over to the router for forwarding */ + kpr_fwd_t kprri_fwd_start; + + /* hand a packet back to the router for completion */ + void (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd, + int error); + + /* the calling NAL is shutting down */ + void 
(*kprri_shutdown) (void *router_arg); + + /* deregister the calling NAL with the router */ + void (*kprri_deregister) (void *router_arg); + +} kpr_router_interface_t; + +/* Convenient struct for NAL to stash router interface/args */ +typedef struct { + kpr_router_interface_t *kpr_interface; + void *kpr_arg; +} kpr_router_t; + +/* Router's control interface (Kernel Portals Routing Control Interface) */ +typedef const struct { + int (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid); + int (*kprci_del_route)(ptl_nid_t nid); + int (*kprci_get_route)(int index, int *gateway_nal, + ptl_nid_t *gateway, ptl_nid_t *lo_nid, + ptl_nid_t *hi_nid); +} kpr_control_interface_t; + +extern kpr_control_interface_t kpr_control_interface; +extern kpr_router_interface_t kpr_router_interface; + +static inline int +kpr_register (kpr_router_t *router, kpr_nal_interface_t *nalif) +{ + int rc; + + router->kpr_interface = PORTAL_SYMBOL_GET (kpr_router_interface); + if (router->kpr_interface == NULL) + return (-ENOENT); + + rc = (router->kpr_interface)->kprri_register (nalif, &router->kpr_arg); + if (rc != 0) + router->kpr_interface = NULL; + + PORTAL_SYMBOL_PUT (kpr_router_interface); + return (rc); +} + +static inline int +kpr_routing (kpr_router_t *router) +{ + return (router->kpr_interface != NULL); +} + +static inline int +kpr_lookup (kpr_router_t *router, ptl_nid_t nid, ptl_nid_t *gateway_nid) +{ + if (!kpr_routing (router)) + return (-EHOSTUNREACH); + + return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, + gateway_nid)); +} + +static inline void +kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, + int nob, int niov, struct iovec *iov, + kpr_fwd_callback_t callback, void *callback_arg) +{ + fwd->kprfd_target_nid = nid; + fwd->kprfd_gateway_nid = nid; + fwd->kprfd_nob = nob; + fwd->kprfd_niov = niov; + fwd->kprfd_iov = iov; + fwd->kprfd_callback = callback; + fwd->kprfd_callback_arg = callback_arg; +} + +static inline void 
+kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd) +{ + if (!kpr_routing (router)) + fwd->kprfd_callback (fwd->kprfd_callback_arg, -EHOSTUNREACH); + else + router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd); +} + +static inline void +kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error) +{ + LASSERT (kpr_routing (router)); + router->kpr_interface->kprri_fwd_done (router->kpr_arg, fwd, error); +} + +static inline void +kpr_shutdown (kpr_router_t *router) +{ + if (kpr_routing (router)) + router->kpr_interface->kprri_shutdown (router->kpr_arg); +} + +static inline void +kpr_deregister (kpr_router_t *router) +{ + if (!kpr_routing (router)) + return; + router->kpr_interface->kprri_deregister (router->kpr_arg); + router->kpr_interface = NULL; +} + +/******************************************************************************/ + +#ifdef PORTALS_PROFILING +#define prof_enum(FOO) PROF__##FOO +enum { + prof_enum(our_recvmsg), + prof_enum(our_sendmsg), + prof_enum(socknal_recv), + prof_enum(lib_parse), + prof_enum(conn_list_walk), + prof_enum(memcpy), + prof_enum(lib_finalize), + prof_enum(pingcli_time), + prof_enum(gmnal_send), + prof_enum(gmnal_recv), + MAX_PROFS +}; + +struct prof_ent { + char *str; + /* hrmph. wrap-tastic. 
*/ + u32 starts; + u32 finishes; + cycles_t total_cycles; + cycles_t start; + cycles_t end; +}; + +extern struct prof_ent prof_ents[MAX_PROFS]; + +#define PROF_START(FOO) \ + do { \ + struct prof_ent *pe = &prof_ents[PROF__##FOO]; \ + pe->starts++; \ + pe->start = get_cycles(); \ + } while (0) + +#define PROF_FINISH(FOO) \ + do { \ + struct prof_ent *pe = &prof_ents[PROF__##FOO]; \ + pe->finishes++; \ + pe->end = get_cycles(); \ + pe->total_cycles += (pe->end - pe->start); \ + } while (0) +#else /* !PORTALS_PROFILING */ +#define PROF_START(FOO) do {} while(0) +#define PROF_FINISH(FOO) do {} while(0) +#endif /* PORTALS_PROFILING */ + +/* debug.c */ +void portals_run_lbug_upcall(char * file, char *fn, int line); +void portals_debug_dumplog(void); +int portals_debug_init(unsigned long bufsize); +int portals_debug_cleanup(void); +int portals_debug_clear_buffer(void); +int portals_debug_mark_buffer(char *text); +int portals_debug_set_daemon(unsigned int cmd, unsigned int length, + char *file, unsigned int size); +__s32 portals_debug_copy_to_user(char *buf, unsigned long len); +#if (__GNUC__) +/* Use the special GNU C __attribute__ hack to have the compiler check the + * printf style argument string against the actual argument count and + * types. + */ +#ifdef printf +# warning printf has been defined as a macro... +# undef printf +#endif +void portals_debug_msg (int subsys, int mask, char *file, char *fn, int line, + unsigned long stack, const char *format, ...) + __attribute__ ((format (printf, 7, 8))); +#else +void portals_debug_msg (int subsys, int mask, char *file, char *fn, + int line, unsigned long stack, + const char *format, ...); +#endif /* __GNUC__ */ +void portals_debug_set_level(unsigned int debug_level); + +# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b) +# define printf(format, b...) 
CDEBUG(D_OTHER, format , ## b) +# define time(a) CURRENT_TIME + +extern void kportal_daemonize (char *name); +extern void kportal_blockallsigs (void); + +#else /* !__KERNEL__ */ +# include +# include +#ifndef __CYGWIN__ +# include +#endif +# include +# include +# include +# ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +# endif +# ifdef PORTAL_DEBUG +# undef NDEBUG +# include +# define LASSERT(e) assert(e) +# else +# define LASSERT(e) +# endif +# define printk(format, args...) printf (format, ## args) +# define PORTAL_ALLOC(ptr, size) do { (ptr) = malloc(size); } while (0); +# define PORTAL_FREE(a, b) do { free(a); } while (0); +# define portals_debug_msg(subsys, mask, file, fn, line, stack, format, a...) \ + printf ("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format, \ + (subsys) >> 24, (mask), (long)time(0), file, fn, line, \ + getpid() , stack, ## a); +#endif + +#ifndef CURRENT_TIME +# define CURRENT_TIME time(0) +#endif + +#include + +/* + * USER LEVEL STUFF BELOW + */ + +#define PORTAL_IOCTL_VERSION 0x00010007 +#define PING_SYNC 0 +#define PING_ASYNC 1 + +struct portal_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + __u64 ioc_nid; + __u64 ioc_nid2; + __u64 ioc_nid3; + __u32 ioc_count; + __u32 ioc_nal; + __u32 ioc_nal_cmd; + __u32 ioc_fd; + __u32 ioc_id; + + __u32 ioc_flags; + __u32 ioc_size; + + __u32 ioc_wait; + __u32 ioc_timeout; + __u32 ioc_misc; + + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + + __u32 ioc_plen1; /* buffers in userspace */ + char *ioc_pbuf1; + __u32 ioc_plen2; /* buffers in userspace */ + char *ioc_pbuf2; + + char ioc_bulk[0]; +}; + +struct portal_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +struct portals_debug_ioctl_data +{ + struct portal_ioctl_hdr hdr; + unsigned int subs; + unsigned int debug; +}; + +#define PORTAL_IOC_INIT(data) \ +do { \ + memset(&data, 0, sizeof(data)); \ + data.ioc_version = PORTAL_IOCTL_VERSION; \ + data.ioc_len = sizeof(data); \ +} while (0) + +/* 
FIXME check conflict with lustre_lib.h */ +#define PTL_IOC_DEBUG_MASK _IOWR('f', 250, long) + +static inline int portal_ioctl_packlen(struct portal_ioctl_data *data) +{ + int len = sizeof(*data); + len += size_round(data->ioc_inllen1); + len += size_round(data->ioc_inllen2); + return len; +} + +static inline int portal_ioctl_is_invalid(struct portal_ioctl_data *data) +{ + if (data->ioc_len > (1<<30)) { + CERROR ("PORTALS ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen1 > (1<<30)) { + CERROR ("PORTALS ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen2 > (1<<30)) { + CERROR ("PORTALS ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inlbuf1 && !data->ioc_inllen1) { + CERROR ("PORTALS ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf2 && !data->ioc_inllen2) { + CERROR ("PORTALS ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf1 && !data->ioc_plen1) { + CERROR ("PORTALS ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf2 && !data->ioc_plen2) { + CERROR ("PORTALS ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_plen1 && !data->ioc_pbuf1) { + CERROR ("PORTALS ioctl: plen1 nonzero but no pbuf1 pointer\n"); + return 1; + } + if (data->ioc_plen2 && !data->ioc_pbuf2) { + CERROR ("PORTALS ioctl: plen2 nonzero but no pbuf2 pointer\n"); + return 1; + } + if (portal_ioctl_packlen(data) != data->ioc_len ) { + CERROR ("PORTALS ioctl: packlen != ioc_len\n"); + return 1; + } + if (data->ioc_inllen1 && + data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') { + CERROR ("PORTALS ioctl: inlbuf1 not 0 terminated\n"); + return 1; + } + if (data->ioc_inllen2 && + data->ioc_bulk[size_round(data->ioc_inllen1) + + data->ioc_inllen2 - 1] != '\0') { + CERROR ("PORTALS ioctl: inlbuf2 not 0 terminated\n"); + return 1; + } + return 0; +} + +#ifndef __KERNEL__ +static inline int portal_ioctl_pack(struct 
portal_ioctl_data *data, char **pbuf, + int max) +{ + char *ptr; + struct portal_ioctl_data *overlay; + data->ioc_len = portal_ioctl_packlen(data); + data->ioc_version = PORTAL_IOCTL_VERSION; + + if (*pbuf && portal_ioctl_packlen(data) > max) + return 1; + if (*pbuf == NULL) { + *pbuf = malloc(data->ioc_len); + } + if (!*pbuf) + return 1; + overlay = (struct portal_ioctl_data *)*pbuf; + memcpy(*pbuf, data, sizeof(*data)); + + ptr = overlay->ioc_bulk; + if (data->ioc_inlbuf1) + LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr); + if (data->ioc_inlbuf2) + LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr); + if (portal_ioctl_is_invalid(overlay)) + return 1; + + return 0; +} +#else +#include + +/* buffer MUST be at least the size of portal_ioctl_hdr */ +static inline int portal_ioctl_getdata(char *buf, char *end, void *arg) +{ + struct portal_ioctl_hdr *hdr; + struct portal_ioctl_data *data; + int err; + ENTRY; + + hdr = (struct portal_ioctl_hdr *)buf; + data = (struct portal_ioctl_data *)buf; + + err = copy_from_user(buf, (void *)arg, sizeof(*hdr)); + if ( err ) { + EXIT; + return err; + } + + if (hdr->ioc_version != PORTAL_IOCTL_VERSION) { + CERROR ("PORTALS: version mismatch kernel vs application\n"); + return -EINVAL; + } + + if (hdr->ioc_len + buf >= end) { + CERROR ("PORTALS: user buffer exceeds kernel buffer\n"); + return -EINVAL; + } + + + if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) { + CERROR ("PORTALS: user buffer too small for ioctl\n"); + return -EINVAL; + } + + err = copy_from_user(buf, (void *)arg, hdr->ioc_len); + if ( err ) { + EXIT; + return err; + } + + if (portal_ioctl_is_invalid(data)) { + CERROR ("PORTALS: ioctl not correctly formatted\n"); + return -EINVAL; + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + + size_round(data->ioc_inllen1); + } + + EXIT; + return 0; +} +#endif + +/* ioctls for manipulating snapshots 30- */ +#define 
IOC_PORTAL_TYPE 'e' +#define IOC_PORTAL_MIN_NR 30 + +#define IOC_PORTAL_PING _IOWR('e', 30, long) +#define IOC_PORTAL_GET_DEBUG _IOWR('e', 31, long) +#define IOC_PORTAL_CLEAR_DEBUG _IOWR('e', 32, long) +#define IOC_PORTAL_MARK_DEBUG _IOWR('e', 33, long) +#define IOC_PORTAL_PANIC _IOWR('e', 34, long) +#define IOC_PORTAL_ADD_ROUTE _IOWR('e', 35, long) +#define IOC_PORTAL_DEL_ROUTE _IOWR('e', 36, long) +#define IOC_PORTAL_GET_ROUTE _IOWR('e', 37, long) +#define IOC_PORTAL_NAL_CMD _IOWR('e', 38, long) +#define IOC_PORTAL_GET_NID _IOWR('e', 39, long) +#define IOC_PORTAL_FAIL_NID _IOWR('e', 40, long) +#define IOC_PORTAL_SET_DAEMON _IOWR('e', 41, long) + +#define IOC_PORTAL_MAX_NR 41 + +enum { + QSWNAL = 1, + SOCKNAL, + GMNAL, + TOENAL, + TCPNAL, + SCIMACNAL, + NAL_ENUM_END_MARKER +}; + +#ifdef __KERNEL__ +extern ptl_handle_ni_t kqswnal_ni; +extern ptl_handle_ni_t ksocknal_ni; +extern ptl_handle_ni_t ktoenal_ni; +extern ptl_handle_ni_t kgmnal_ni; +extern ptl_handle_ni_t kscimacnal_ni; +#endif + +#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1) + +#define NAL_CMD_REGISTER_PEER_FD 100 +#define NAL_CMD_CLOSE_CONNECTION 101 +#define NAL_CMD_REGISTER_MYNID 102 +#define NAL_CMD_PUSH_CONNECTION 103 + +enum { + DEBUG_DAEMON_START = 1, + DEBUG_DAEMON_STOP = 2, + DEBUG_DAEMON_PAUSE = 3, + DEBUG_DAEMON_CONTINUE = 4, +}; + +/* XXX remove to lustre ASAP */ +struct lustre_peer { + ptl_nid_t peer_nid; + ptl_handle_ni_t peer_ni; +}; + +/* module.c */ +typedef int (*nal_cmd_handler_t)(struct portal_ioctl_data *, void * private); +int kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private); +int kportal_nal_unregister(int nal); + +ptl_handle_ni_t *kportal_get_ni (int nal); +void kportal_put_ni (int nal); + +#ifdef __CYGWIN__ +#ifndef BITS_PER_LONG +#if (~0UL) == 0xffffffffUL +#define BITS_PER_LONG 32 +#else +#define BITS_PER_LONG 64 +#endif +#endif +#endif + +#if (BITS_PER_LONG == 32 || __WORDSIZE == 32) +# define LPU64 "%Lu" +# define LPD64 "%Ld" +# define LPX64 "%#Lx" +# 
define LPSZ "%u" +# define LPSSZ "%d" +#endif +#if (BITS_PER_LONG == 64 || __WORDSIZE == 64) +# define LPU64 "%lu" +# define LPD64 "%ld" +# define LPX64 "%#lx" +# define LPSZ "%lu" +# define LPSSZ "%ld" +#endif +#ifndef LPU64 +# error "No word size defined" +#endif + +#endif diff --git a/lnet/include/linux/portals_compat25.h b/lnet/include/linux/portals_compat25.h new file mode 100644 index 0000000..e28fbac --- /dev/null +++ b/lnet/include/linux/portals_compat25.h @@ -0,0 +1,13 @@ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) || defined(CONFIG_RH_2_4_20) +# define SIGNAL_MASK_LOCK(task, flags) \ + spin_lock_irqsave(&task->sighand->siglock, flags) +# define SIGNAL_MASK_UNLOCK(task, flags) \ + spin_unlock_irqrestore(&task->sighand->siglock, flags) +# define RECALC_SIGPENDING recalc_sigpending() +#else +# define SIGNAL_MASK_LOCK(task, flags) \ + spin_lock_irqsave(&task->sigmask_lock, flags) +# define SIGNAL_MASK_UNLOCK(task, flags) \ + spin_unlock_irqrestore(&task->sigmask_lock, flags) +# define RECALC_SIGPENDING recalc_sigpending(current) +#endif diff --git a/lnet/include/linux/portals_lib.h b/lnet/include/linux/portals_lib.h new file mode 100644 index 0000000..a528a80 --- /dev/null +++ b/lnet/include/linux/portals_lib.h @@ -0,0 +1,188 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Basic library routines. + * + */ + +#ifndef _PORTALS_LIB_H +#define _PORTALS_LIB_H + +#ifndef __KERNEL__ +# include +#else +# include +#endif + +#undef MIN +#define MIN(a,b) (((a)<(b)) ? (a): (b)) +#undef MAX +#define MAX(a,b) (((a)>(b)) ? (a): (b)) +#define MKSTR(ptr) ((ptr) ? (ptr) : "") /* ptr, or "" when ptr is NULL */ +/* round val up to a multiple of 8 */ +static inline int size_round (int val) +{ + return (val + 7) & (~0x7); +} +/* 0 stays 0; otherwise val + 1 rounded up to a multiple of 8 */ +static inline int size_round0(int val) +{ + if (!val) + return 0; + return (val + 1 + 7) & (~0x7); +} + +static inline size_t round_strlen(char *fset) +{ + return size_round(strlen(fset) + 1); +} + +#ifdef __KERNEL__ +static inline char *strdup(const char *str) +{ + int len = strlen(str) + 1; + char *tmp = kmalloc(len, GFP_KERNEL); + if (tmp) + memcpy(tmp, str, len); + + return tmp; +} +#endif + +#ifdef __KERNEL__ +# define NTOH__u32(var) le32_to_cpu(var) +# define NTOH__u64(var) le64_to_cpu(var) +# define HTON__u32(var) cpu_to_le32(var) +# define HTON__u64(var) cpu_to_le64(var) +#else +# define expansion_u64(var) \ + ({ __u64 ret; \ + switch (sizeof(var)) { \ + case 8: (ret) = (var); break; \ + case 4: (ret) = (__u32)(var); break; \ + case 2: (ret) = (__u16)(var); break; \ + case 1: (ret) = (__u8)(var); break; \ + }; \ + (ret); \ + }) +# define NTOH__u32(var) (var) +# define NTOH__u64(var) (expansion_u64(var)) +# define HTON__u32(var) (var) +# define HTON__u64(var) (expansion_u64(var)) +#endif + +/* + * copy sizeof(type) bytes from pointer to var and move ptr forward. 
+ * return -EFAULT, without reading, if that would go beyond end + */ +#define UNLOGV(var,type,ptr,end) \ +do { \ + if ((ptr) + sizeof(type) > (end)) \ + return -EFAULT; \ + var = *(type *)(ptr); \ + ptr += sizeof(type); \ +} while (0) + +/* the following two macros convert to little endian */ +/* type MUST be __u32 or __u64 */ +#define LUNLOGV(var,type,ptr,end) \ +do { \ + if ((ptr) + sizeof(type) > (end)) \ + return -EFAULT; \ + var = NTOH##type(*(type *)(ptr)); \ + ptr += sizeof(type); \ +} while (0) + +/* now log values */ +#define LOGV(var,type,ptr) \ +do { \ + *((type *)ptr) = var; \ + ptr += sizeof(type); \ +} while (0) + +/* and in network order */ +#define LLOGV(var,type,ptr) \ +do { \ + *((type *)ptr) = HTON##type(var); \ + ptr += sizeof(type); \ +} while (0) + + +/* + * set var to point at (type *)ptr, move ptr forward with sizeof(type) + * return from function with EFAULT if ptr goes beyond end + */ +#define UNLOGP(var,type,ptr,end) \ +do { \ + var = (type *)ptr; \ + ptr += sizeof(type); \ + if (ptr > end ) \ + return -EFAULT; \ +} while (0) + +#define LOGP(var,type,ptr) \ +do { \ + memcpy(ptr, var, sizeof(type)); \ + ptr += sizeof(type); \ +} while (0) + +/* + * set var to point at (char *)ptr, move ptr forward by size_round(len); + * return from function with EFAULT if ptr goes beyond end + */ +#define UNLOGL(var,type,len,ptr,end) \ +do { \ + var = (type *)ptr; \ + ptr += size_round(len * sizeof(type)); \ + if (ptr > end ) \ + return -EFAULT; \ +} while (0) + +#define UNLOGL0(var,type,len,ptr,end) \ +do { \ + UNLOGL(var,type,len,ptr,end); \ + if ( *((char *)ptr - size_round(len) + len - 1) != '\0') \ + return -EFAULT; \ +} while (0) + +#define LOGL(var,len,ptr) \ +do { \ + if (var) \ + memcpy((char *)ptr, (const char *)var, len); \ + ptr += size_round(len); \ +} while (0) + +#define LOGU(var,len,ptr) \ +do { \ + if (var) \ + memcpy((char *)var, (const char *)ptr, len); \ + ptr += size_round(len); \ +} while (0) + +#define LOGL0(var,len,ptr) \ +do { \ + if (!len) \ + break; \ + memcpy((char *)ptr, (const char 
*)var, len); \ + *((char *)(ptr) + len) = 0; \ + ptr += size_round(len + 1); \ +} while (0) + +#endif /* _PORTALS_LIB_H */ diff --git a/lnet/include/lnet/Makefile.am b/lnet/include/lnet/Makefile.am new file mode 100644 index 0000000..c61b084 --- /dev/null +++ b/lnet/include/lnet/Makefile.am @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +SUBDIRS = base +include $(top_srcdir)/Rules + +pkginclude_HEADERS=api-support.h api.h arg-blocks.h defines.h errno.h internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h myrnal.h nal.h p30.h ppid.h ptlctl.h stringtab.h types.h nalids.h list.h bridge.h ipmap.h procbridge.h lltrace.h + diff --git a/lnet/include/lnet/api-support.h b/lnet/include/lnet/api-support.h new file mode 100644 index 0000000..af4a2dc --- /dev/null +++ b/lnet/include/lnet/api-support.h @@ -0,0 +1,27 @@ +# define DEBUG_SUBSYSTEM S_PORTALS +# define PORTAL_DEBUG + +#ifndef __KERNEL__ +# include +# include +# include +# include + +/* Lots of POSIX dependencies to support PtlEQWait_timeout */ +# include +# include +# include +#endif + +#include +#include +#include + +#include +#include +#include + +/* Hack for 2.4.18 macro name collision */ +#ifdef yield +#undef yield +#endif diff --git a/lnet/include/lnet/api.h b/lnet/include/lnet/api.h new file mode 100644 index 0000000..a83749b --- /dev/null +++ b/lnet/include/lnet/api.h @@ -0,0 +1,159 @@ +#ifndef P30_API_H +#define P30_API_H + +#include + +#ifndef PTL_NO_WRAP +int PtlInit(void); +int PtlInitialized(void); +void PtlFini(void); + +int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size_in, + ptl_ac_index_t acl_size_in, ptl_pid_t requested_pid, + ptl_handle_ni_t * interface_out); + +int PtlNIInitialized(ptl_interface_t); + +int PtlNIFini(ptl_handle_ni_t interface_in); + +#endif + +int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id); + + +/* + * Network interfaces 
+ */ + +#ifndef PTL_NO_WRAP +int PtlNIBarrier(ptl_handle_ni_t interface_in); +#endif + +int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, + ptl_sr_value_t * status_out); + +int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, + unsigned long *distance_out); + +#ifndef PTL_NO_WRAP +int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out); +#endif + + +/* + * PtlNIDebug: + * + * This is not an official Portals 3 API call. It is provided + * by the reference implementation to allow the maintainers an + * easy way to turn on and off debugging information in the + * library. Do not use it in code that is intended for use + * with any version other than the portable reference library. + */ +unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in); + +/* + * PtlFailNid + * + * Not an official Portals 3 API call. It provides a way of simulating + * communications failures to all (nid == PTL_NID_ANY), or specific peers + * (via multiple calls), either until further notice (threshold == -1), or + * for a specific number of messages. Passing a threshold of zero "heals" + * the given peer. 
+ */ +int PtlFailNid (ptl_handle_ni_t ni, ptl_nid_t nid, unsigned int threshold); + + +/* + * Match entries + */ + +int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in, + ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in, + ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in, + ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out); + +int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, + ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in, + ptl_unlink_t unlink_in, ptl_ins_pos_t position_in, + ptl_handle_me_t * handle_out); + +int PtlMEUnlink(ptl_handle_me_t current_in); + +int PtlMEUnlinkList(ptl_handle_me_t current_in); + +int PtlTblDump(ptl_handle_ni_t ni, int index_in); +int PtlMEDump(ptl_handle_me_t current_in); + + + +/* + * Memory descriptors + */ + +#ifndef PTL_NO_WRAP +int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in, + ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out); + +int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, + ptl_handle_md_t * handle_out); + +int PtlMDUnlink(ptl_handle_md_t md_in); + +int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout, + ptl_md_t * new_inout, ptl_handle_eq_t testq_in); + +#endif + +/* These should not be called by users */ +int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout, + ptl_md_t * new_inout, ptl_handle_eq_t testq_in, + ptl_seq_t sequence_in); + + + + +/* + * Event queues + */ +#ifndef PTL_NO_WRAP + +/* These should be called by users */ +int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in, + int (*callback) (ptl_event_t * event), + ptl_handle_eq_t * handle_out); +int PtlEQFree(ptl_handle_eq_t eventq_in); + +int PtlEQCount(ptl_handle_eq_t eventq_in, ptl_size_t * count_out); + +int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); + + +int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); + +int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out, + int timeout); 
+#endif + +/* + * Access Control Table + */ +int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, + ptl_process_id_t match_id_in, ptl_pt_index_t portal_in); + + +/* + * Data movement + */ + +int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, + ptl_process_id_t target_in, ptl_pt_index_t portal_in, + ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in, + ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in); + +int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, + ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in, + ptl_match_bits_t match_bits_in, ptl_size_t offset_in); + + + +#endif diff --git a/lnet/include/lnet/arg-blocks.h b/lnet/include/lnet/arg-blocks.h new file mode 100644 index 0000000..3c3b154 --- /dev/null +++ b/lnet/include/lnet/arg-blocks.h @@ -0,0 +1,265 @@ +#ifndef PTL_BLOCKS_H +#define PTL_BLOCKS_H + +/* + * blocks.h + * + * Argument block types for the Portals 3.0 library + * Generated by idl + * + */ + +#include + +/* put LIB_MAX_DISPATCH last here -- these must match the + assignements to the dispatch table in lib-p30/dispatch.c */ +#define PTL_GETID 1 +#define PTL_NISTATUS 2 +#define PTL_NIDIST 3 +#define PTL_NIDEBUG 4 +#define PTL_MEATTACH 5 +#define PTL_MEINSERT 6 +// #define PTL_MEPREPEND 7 +#define PTL_MEUNLINK 8 +#define PTL_TBLDUMP 9 +#define PTL_MEDUMP 10 +#define PTL_MDATTACH 11 +// #define PTL_MDINSERT 12 +#define PTL_MDBIND 13 +#define PTL_MDUPDATE 14 +#define PTL_MDUNLINK 15 +#define PTL_EQALLOC 16 +#define PTL_EQFREE 17 +#define PTL_ACENTRY 18 +#define PTL_PUT 19 +#define PTL_GET 20 +#define PTL_FAILNID 21 +#define LIB_MAX_DISPATCH 21 + +typedef struct PtlFailNid_in { + ptl_handle_ni_t interface; + ptl_nid_t nid; + unsigned int threshold; +} PtlFailNid_in; + +typedef struct PtlFailNid_out { + int rc; +} PtlFailNid_out; + +typedef struct PtlGetId_in { + ptl_handle_ni_t handle_in; +} PtlGetId_in; + +typedef struct PtlGetId_out { + int rc; + ptl_process_id_t id_out; +} PtlGetId_out; + +typedef struct 
PtlNIStatus_in { + ptl_handle_ni_t interface_in; + ptl_sr_index_t register_in; +} PtlNIStatus_in; + +typedef struct PtlNIStatus_out { + int rc; + ptl_sr_value_t status_out; +} PtlNIStatus_out; + + +typedef struct PtlNIDist_in { + ptl_handle_ni_t interface_in; + ptl_process_id_t process_in; +} PtlNIDist_in; + +typedef struct PtlNIDist_out { + int rc; + unsigned long distance_out; +} PtlNIDist_out; + + +typedef struct PtlNIDebug_in { + unsigned int mask_in; +} PtlNIDebug_in; + +typedef struct PtlNIDebug_out { + unsigned int rc; +} PtlNIDebug_out; + + +typedef struct PtlMEAttach_in { + ptl_handle_ni_t interface_in; + ptl_pt_index_t index_in; + ptl_ins_pos_t position_in; + ptl_process_id_t match_id_in; + ptl_match_bits_t match_bits_in; + ptl_match_bits_t ignore_bits_in; + ptl_unlink_t unlink_in; +} PtlMEAttach_in; + +typedef struct PtlMEAttach_out { + int rc; + ptl_handle_me_t handle_out; +} PtlMEAttach_out; + + +typedef struct PtlMEInsert_in { + ptl_handle_me_t current_in; + ptl_process_id_t match_id_in; + ptl_match_bits_t match_bits_in; + ptl_match_bits_t ignore_bits_in; + ptl_unlink_t unlink_in; + ptl_ins_pos_t position_in; +} PtlMEInsert_in; + +typedef struct PtlMEInsert_out { + int rc; + ptl_handle_me_t handle_out; +} PtlMEInsert_out; + +typedef struct PtlMEUnlink_in { + ptl_handle_me_t current_in; + ptl_unlink_t unlink_in; +} PtlMEUnlink_in; + +typedef struct PtlMEUnlink_out { + int rc; +} PtlMEUnlink_out; + + +typedef struct PtlTblDump_in { + int index_in; +} PtlTblDump_in; + +typedef struct PtlTblDump_out { + int rc; +} PtlTblDump_out; + + +typedef struct PtlMEDump_in { + ptl_handle_me_t current_in; +} PtlMEDump_in; + +typedef struct PtlMEDump_out { + int rc; +} PtlMEDump_out; + + +typedef struct PtlMDAttach_in { + ptl_handle_me_t me_in; + ptl_handle_eq_t eq_in; + ptl_md_t md_in; + ptl_unlink_t unlink_in; +} PtlMDAttach_in; + +typedef struct PtlMDAttach_out { + int rc; + ptl_handle_md_t handle_out; +} PtlMDAttach_out; + + +typedef struct PtlMDBind_in { + 
ptl_handle_ni_t ni_in; + ptl_handle_eq_t eq_in; + ptl_md_t md_in; +} PtlMDBind_in; + +typedef struct PtlMDBind_out { + int rc; + ptl_handle_md_t handle_out; +} PtlMDBind_out; + + +typedef struct PtlMDUpdate_internal_in { + ptl_handle_md_t md_in; + ptl_handle_eq_t testq_in; + ptl_seq_t sequence_in; + + ptl_md_t old_inout; + int old_inout_valid; + ptl_md_t new_inout; + int new_inout_valid; +} PtlMDUpdate_internal_in; + +typedef struct PtlMDUpdate_internal_out { + int rc; + ptl_md_t old_inout; + ptl_md_t new_inout; +} PtlMDUpdate_internal_out; + + +typedef struct PtlMDUnlink_in { + ptl_handle_md_t md_in; +} PtlMDUnlink_in; + +typedef struct PtlMDUnlink_out { + int rc; + ptl_md_t status_out; +} PtlMDUnlink_out; + + +typedef struct PtlEQAlloc_in { + ptl_handle_ni_t ni_in; + ptl_size_t count_in; + void *base_in; + int len_in; + int (*callback_in) (ptl_event_t * event); +} PtlEQAlloc_in; + +typedef struct PtlEQAlloc_out { + int rc; + ptl_handle_eq_t handle_out; +} PtlEQAlloc_out; + + +typedef struct PtlEQFree_in { + ptl_handle_eq_t eventq_in; +} PtlEQFree_in; + +typedef struct PtlEQFree_out { + int rc; +} PtlEQFree_out; + + +typedef struct PtlACEntry_in { + ptl_handle_ni_t ni_in; + ptl_ac_index_t index_in; + ptl_process_id_t match_id_in; + ptl_pt_index_t portal_in; +} PtlACEntry_in; + +typedef struct PtlACEntry_out { + int rc; +} PtlACEntry_out; + + +typedef struct PtlPut_in { + ptl_handle_md_t md_in; + ptl_ack_req_t ack_req_in; + ptl_process_id_t target_in; + ptl_pt_index_t portal_in; + ptl_ac_index_t cookie_in; + ptl_match_bits_t match_bits_in; + ptl_size_t offset_in; + ptl_hdr_data_t hdr_data_in; +} PtlPut_in; + +typedef struct PtlPut_out { + int rc; +} PtlPut_out; + + +typedef struct PtlGet_in { + ptl_handle_md_t md_in; + ptl_process_id_t target_in; + ptl_pt_index_t portal_in; + ptl_ac_index_t cookie_in; + ptl_match_bits_t match_bits_in; + ptl_size_t offset_in; +} PtlGet_in; + +typedef struct PtlGet_out { + int rc; +} PtlGet_out; + + +#endif diff --git 
a/lnet/include/lnet/defines.h b/lnet/include/lnet/defines.h new file mode 100644 index 0000000..785ce73 --- /dev/null +++ b/lnet/include/lnet/defines.h @@ -0,0 +1,116 @@ +/* +** +** This file contains definitions that are used throughout the cplant code. +*/ + +#ifndef CPLANT_H +#define CPLANT_H + +#define TITLE(fname,zmig) + + +/* +** TRUE and FALSE +*/ +#undef TRUE +#define TRUE (1) +#undef FALSE +#define FALSE (0) + + +/* +** Return codes from functions +*/ +#undef OK +#define OK (0) +#undef ERROR +#define ERROR (-1) + + + +/* +** max()/min() for arithmetic types; note each argument may be evaluated twice. +*/ +#ifndef MAX +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif /* MAX */ + +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif /* MIN */ + +/* +** The rest is from the old qkdefs.h +*/ + +#ifndef __linux__ +#define __inline__ +#endif + +#ifndef NULL +#define NULL ((void *)0) +#endif + +#ifndef __osf__ +#define PRIVATE static +#define PUBLIC +#endif + +#ifndef __osf__ +typedef unsigned char uchar; +#endif + +typedef char CHAR; +typedef unsigned char UCHAR; +typedef char INT8; +typedef unsigned char UINT8; +typedef short int INT16; +typedef unsigned short int UINT16; +typedef int INT32; +typedef unsigned int UINT32; +typedef long LONG32; +typedef unsigned long ULONG32; + +/* long may be 32 or 64, so we can't really append the size to the definition */ +typedef long LONG; +typedef unsigned long ULONG; + +#ifdef __alpha__ +typedef long int_t; +#ifndef __osf__ +typedef unsigned long uint_t; +#endif +#endif + +#ifdef __i386__ +typedef int int_t; +typedef unsigned int uint_t; +#endif + +typedef float FLOAT32; +typedef double FLOAT64; +typedef void VOID; +typedef INT32 BOOLEAN; +typedef void (*FCN_PTR)(void); + +#ifndef off64_t + +#if defined (__alpha__) || defined (__ia64__) +typedef long off64_t; +#else +typedef long long off64_t; +#endif + +#endif + +/* +** Process related typedefs +*/ +typedef UINT16 PID_TYPE; /* Type of Local process ID */ +typedef 
UINT16 NID_TYPE; /* Type of Physical node ID */ +typedef UINT16 GID_TYPE; /* Type of Group ID */ +typedef UINT16 RANK_TYPE; /* Type of Logical rank/process within a group */ + + + +#endif /* CPLANT_H */ diff --git a/lnet/include/lnet/errno.h b/lnet/include/lnet/errno.h new file mode 100644 index 0000000..817936a --- /dev/null +++ b/lnet/include/lnet/errno.h @@ -0,0 +1,61 @@ +#ifndef _P30_ERRNO_H_ +#define _P30_ERRNO_H_ + +/* + * include/portals/errno.h + * + * Shared error number lists + */ + +/* If you change these, you must update the string table in api-errno.c */ +typedef enum { + PTL_OK = 0, + PTL_SEGV = 1, + + PTL_NOSPACE = 2, + PTL_INUSE = 3, + PTL_VAL_FAILED = 4, + + PTL_NAL_FAILED = 5, + PTL_NOINIT = 6, + PTL_INIT_DUP = 7, + PTL_INIT_INV = 8, + PTL_AC_INV_INDEX = 9, + + PTL_INV_ASIZE = 10, + PTL_INV_HANDLE = 11, + PTL_INV_MD = 12, + PTL_INV_ME = 13, + PTL_INV_NI = 14, +/* If you change these, you must update the string table in api-errno.c */ + PTL_ILL_MD = 15, + PTL_INV_PROC = 16, + PTL_INV_PSIZE = 17, + PTL_INV_PTINDEX = 18, + PTL_INV_REG = 19, + + PTL_INV_SR_INDX = 20, + PTL_ML_TOOLONG = 21, + PTL_ADDR_UNKNOWN = 22, + PTL_INV_EQ = 23, + PTL_EQ_DROPPED = 24, + + PTL_EQ_EMPTY = 25, + PTL_NOUPDATE = 26, + PTL_FAIL = 27, + PTL_NOT_IMPLEMENTED = 28, + PTL_NO_ACK = 29, + + PTL_IOV_TOO_MANY = 30, + PTL_IOV_TOO_SMALL = 31, + + PTL_EQ_INUSE = 32, + PTL_MD_INUSE = 33, + + PTL_MAX_ERRNO = 33 +} ptl_err_t; +/* If you change these, you must update the string table in api-errno.c */ + +extern const char *ptl_err_str[]; + +#endif diff --git a/lnet/include/lnet/internal.h b/lnet/include/lnet/internal.h new file mode 100644 index 0000000..d78cad4 --- /dev/null +++ b/lnet/include/lnet/internal.h @@ -0,0 +1,45 @@ +/* +*/ +#ifndef _P30_INTERNAL_H_ +#define _P30_INTERNAL_H_ + +/* + * p30/internal.h + * + * Internals for the API level library that are not needed + * by the user application + */ + +#include + +extern int ptl_init; /* Has the library be initialized */ + 
+extern int ptl_ni_init(void); +extern int ptl_me_init(void); +extern int ptl_md_init(void); +extern int ptl_eq_init(void); + +extern int ptl_me_ni_init(nal_t * nal); +extern int ptl_md_ni_init(nal_t * nal); +extern int ptl_eq_ni_init(nal_t * nal); + +extern void ptl_ni_fini(void); +extern void ptl_me_fini(void); +extern void ptl_md_fini(void); +extern void ptl_eq_fini(void); + +extern void ptl_me_ni_fini(nal_t * nal); +extern void ptl_md_ni_fini(nal_t * nal); +extern void ptl_eq_ni_fini(nal_t * nal); + +static inline ptl_eq_t * +ptl_handle2usereq (ptl_handle_eq_t *handle) +{ + /* EQ handles are a little weird. On the "user" side, the cookie + * is just a pointer to a queue of events in shared memory. Its + * cb_eq_handle is the "real" handle which we pass when we + * call do_forward(). */ + return (ptl_eq_t *)((unsigned long)handle->cookie); +} + +#endif diff --git a/lnet/include/lnet/lib-dispatch.h b/lnet/include/lnet/lib-dispatch.h new file mode 100644 index 0000000..f87ff83 --- /dev/null +++ b/lnet/include/lnet/lib-dispatch.h @@ -0,0 +1,45 @@ +#ifndef PTL_DISPATCH_H +#define PTL_DISPATCH_H + +/* + * include/dispatch.h + * + * Dispatch table header and externs for remote side + * operations + * + * Generated by idl + * + */ + +#include +#include + +extern int do_PtlGetId(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlNIStatus(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlNIDist(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlNIDebug(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEAttach(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEInsert(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEPrepend(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlTblDump(nal_cb_t * nal, void *private, void *args, void 
*ret); +extern int do_PtlMEDump(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMDAttach(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMDBind(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlPut(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlGet(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlFailNid (nal_cb_t *nal, void *private, void *args, void *ret); + +extern char *dispatch_name(int index); +#endif diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h new file mode 100644 index 0000000..b623b93 --- /dev/null +++ b/lnet/include/lnet/lib-lnet.h @@ -0,0 +1,385 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib-p30.h + * + * Top level include for library side routines + */ + +#ifndef _LIB_P30_H_ +#define _LIB_P30_H_ + +#ifdef __KERNEL__ +# include +# include +#else +# include +# include +#endif +#include +#include +#include +#include +#include +#include +#include + +static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) +{ + return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie && + wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie); +} + +#ifdef __KERNEL__ +#define state_lock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "taking state lock\n"); \ + nal->cb_cli(nal, flagsp); \ +} while (0) +/* do { } while (0) keeps the expansion a single statement, as in state_lock */ +#define state_unlock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "releasing state lock\n"); \ + nal->cb_sti(nal, flagsp); \ +} while (0) +#else +/* not needed in user space until we 
thread there */ +#define state_lock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "taking state lock\n"); \ + CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \ +} while (0) +/* do { } while (0) keeps the expansion a single statement, as in state_lock */ +#define state_unlock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "releasing state lock\n"); \ + CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \ +} while (0) +#endif /* __KERNEL__ */ + +#ifndef PTL_USE_SLAB_CACHE + +#define MAX_MES 2048 +#define MAX_MDS 2048 +#define MAX_MSGS 2048 /* Outstanding messages */ +#define MAX_EQS 512 + +extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize); +extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl); + +static inline void * +lib_freelist_alloc (lib_freelist_t *fl) +{ + /* ALWAYS called with statelock held */ + lib_freeobj_t *o; + + if (list_empty (&fl->fl_list)) + return (NULL); + + o = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list); + list_del (&o->fo_list); + return ((void *)&o->fo_contents); +} + +static inline void +lib_freelist_free (lib_freelist_t *fl, void *obj) +{ + /* ALWAYS called with statelock held */ + lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents); + + list_add (&o->fo_list, &fl->fl_list); +} + + +static inline lib_eq_t * +lib_eq_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_eq_t *eq; + + state_lock (nal, &flags); + eq = (lib_eq_t *)lib_freelist_alloc (&nal->ni.ni_free_eqs); + state_unlock (nal, &flags); + + return (eq); +} + +static inline void +lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_eqs, eq); +} + +static inline lib_md_t * +lib_md_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_md_t *md; + + state_lock (nal, &flags); + md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds); + state_unlock (nal, &flags); + + return (md); +} + +static inline void +lib_md_free (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called with 
statelock held */ + lib_freelist_free (&nal->ni.ni_free_mds, md); +} + +static inline lib_me_t * +lib_me_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_me_t *me; + + state_lock (nal, &flags); + me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes); + state_unlock (nal, &flags); + + return (me); +} + +static inline void +lib_me_free (nal_cb_t *nal, lib_me_t *me) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_mes, me); +} + +static inline lib_msg_t * +lib_msg_alloc (nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs)); +} + +static inline void +lib_msg_free (nal_cb_t *nal, lib_msg_t *msg) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_msgs, msg); +} + +#else + +extern kmem_cache_t *ptl_md_slab; +extern kmem_cache_t *ptl_msg_slab; +extern kmem_cache_t *ptl_me_slab; +extern kmem_cache_t *ptl_eq_slab; +extern atomic_t md_in_use_count; +extern atomic_t msg_in_use_count; +extern atomic_t me_in_use_count; +extern atomic_t eq_in_use_count; + +static inline lib_eq_t * +lib_eq_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_eq_t *eq = kmem_cache_alloc(ptl_eq_slab, GFP_NOFS); + + if (eq == NULL) + return (NULL); + + atomic_inc (&eq_in_use_count); + return (eq); +} + +static inline void +lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&eq_in_use_count); + kmem_cache_free(ptl_eq_slab, eq); +} + +static inline lib_md_t * +lib_md_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_md_t *md = kmem_cache_alloc(ptl_md_slab, GFP_NOFS); + + if (md == NULL) + return (NULL); + + atomic_inc (&md_in_use_count); + return (md); +} + +static inline void +lib_md_free (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&md_in_use_count); + kmem_cache_free(ptl_md_slab, 
md); +} + +static inline lib_me_t * +lib_me_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_me_t *me = kmem_cache_alloc(ptl_me_slab, GFP_NOFS); + + if (me == NULL) + return (NULL); + + atomic_inc (&me_in_use_count); + return (me); +} + +static inline void +lib_me_free(nal_cb_t *nal, lib_me_t *me) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&me_in_use_count); + kmem_cache_free(ptl_me_slab, me); +} + +static inline lib_msg_t * +lib_msg_alloc(nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_msg_t *msg = kmem_cache_alloc(ptl_msg_slab, GFP_ATOMIC); + + if (msg == NULL) + return (NULL); + + atomic_inc (&msg_in_use_count); + return (msg); +} + +static inline void +lib_msg_free(nal_cb_t *nal, lib_msg_t *msg) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&msg_in_use_count); + kmem_cache_free(ptl_msg_slab, msg); +} +#endif + +extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type); +extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type); +extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh); + +static inline void +ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq) +{ + handle->cookie = eq->eq_lh.lh_cookie; +} + +static inline lib_eq_t * +ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, + PTL_COOKIE_TYPE_EQ); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_eq_t, eq_lh)); +} + +static inline void +ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md) +{ + handle->cookie = md->md_lh.lh_cookie; +} + +static inline lib_md_t * +ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, + PTL_COOKIE_TYPE_MD); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_md_t, md_lh)); +} + +static inline lib_md_t * 
+ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh; + + if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie) + return (NULL); + + lh = lib_lookup_cookie (nal, wh->wh_object_cookie, + PTL_COOKIE_TYPE_MD); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_md_t, md_lh)); +} + +static inline void +ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me) +{ + handle->cookie = me->me_lh.lh_cookie; +} + +static inline lib_me_t * +ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, + PTL_COOKIE_TYPE_ME); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_me_t, me_lh)); +} + +extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize, + ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size); +extern int lib_fini(nal_cb_t * cb); +extern void lib_dispatch(nal_cb_t * cb, void *private, int index, + void *arg_block, void *ret_block); +extern char *dispatch_name(int index); + +/* + * When the NAL detects an incoming message, it should call + * lib_parse() to decode it. The NAL callbacks will be handed + * the private cookie as a way for the NAL to maintain state + * about which transaction is being processed. An extra parameter, + * lib_cookie will contain the necessary information for + * finalizing the message. + * + * After it has finished handling the message, it should + * call lib_finalize() with the lib_cookie parameter. + * Callbacks will be made to write events, send acks or + * replies and so on. 
+ */ +extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private); +extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg); +extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr); + +extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); +extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len); +extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len); + +extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov); +extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len); +extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len); +extern void lib_assert_wire_constants (void); + +extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, + ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); +extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + lib_md_t *md, ptl_size_t offset, ptl_size_t len); + +extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in, + ptl_md_t * md_out); +extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in); +extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in); +#endif diff --git a/lnet/include/lnet/lib-nal.h b/lnet/include/lnet/lib-nal.h new file mode 100644 index 0000000..4052c0c --- /dev/null +++ b/lnet/include/lnet/lib-nal.h @@ -0,0 +1,102 @@ +#ifndef _LIB_NAL_H_ +#define _LIB_NAL_H_ + +/* + * nal.h + * + * Library side headers that define the abstraction layer's + * responsibilities and interfaces + */ + +#include + +struct nal_cb_t { + /* + * Per interface portal table, access control table + * and NAL private data field; + */ + lib_ni_t ni; + void *nal_data; + /* + * send: Sends a preformatted header and user data to a + * specified remote process. + * Can overwrite iov. 
+ */ + int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, struct iovec *iov, size_t mlen); + + /* as send, but with a set of page fragments (NULL if not supported) */ + int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, ptl_kiov_t *iov, size_t mlen); + /* + * recv: Receives an incoming message from a remote process + * Type of iov depends on options. Can overwrite iov. + */ + int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + unsigned int niov, struct iovec *iov, size_t mlen, + size_t rlen); + + /* as recv, but with a set of page fragments (NULL if not supported) */ + int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + unsigned int niov, ptl_kiov_t *iov, size_t mlen, + size_t rlen); + /* + * read: Reads a block of data from a specified user address + */ + int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len); + + /* + * write: Writes a block of data into a specified user address + */ + int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dsr_addr, + void *src_addr, size_t len); + + /* + * callback: Calls an event callback + */ + int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev); + + /* + * malloc: Acquire a block of memory in a system independent + * fashion. + */ + void *(*cb_malloc) (nal_cb_t * nal, size_t len); + + void (*cb_free) (nal_cb_t * nal, void *buf, size_t len); + + /* + * (un)map: Tell the NAL about some memory it will access. + * *addrkey passed to cb_unmap() is what cb_map() set it to. + * type of *iov depends on options. + * Set to NULL if not required. 
+ */ + int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, + void **addrkey); + void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, + void **addrkey); + + /* as (un)map, but with a set of page fragments */ + int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + + void (*cb_printf) (nal_cb_t * nal, const char *fmt, ...); + + /* Turn interrupts off (begin of protected area) */ + void (*cb_cli) (nal_cb_t * nal, unsigned long *flags); + + /* Turn interrupts on (end of protected area) */ + void (*cb_sti) (nal_cb_t * nal, unsigned long *flags); + + /* + * Calculate a network "distance" to given node + */ + int (*cb_dist) (nal_cb_t * nal, ptl_nid_t nid, unsigned long *dist); +}; + +#endif diff --git a/lnet/include/lnet/lib-p30.h b/lnet/include/lnet/lib-p30.h new file mode 100644 index 0000000..b623b93 --- /dev/null +++ b/lnet/include/lnet/lib-p30.h @@ -0,0 +1,385 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib-p30.h + * + * Top level include for library side routines + */ + +#ifndef _LIB_P30_H_ +#define _LIB_P30_H_ + +#ifdef __KERNEL__ +# include +# include +#else +# include +# include +#endif +#include +#include +#include +#include +#include +#include +#include + +static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) +{ + return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie && + wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie); +} + +#ifdef __KERNEL__ /* kernel build: the state lock is taken/released via the NAL's cb_cli()/cb_sti() callbacks */ +#define state_lock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "taking state lock\n"); \ + (nal)->cb_cli((nal), (flagsp)); \ +} while (0) + +#define state_unlock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "releasing state lock\n"); \ + (nal)->cb_sti((nal), (flagsp)); \ +} while (0) /* single statement: safe as the body of an unbraced if/else */ +#else +/* not needed in user space until we thread there */ +#define 
state_lock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "taking state lock\n"); \ + CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \ +} while (0) + +#define state_unlock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "releasing state lock\n"); \ + CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \ +} while (0) /* single statement: safe as the body of an unbraced if/else */ +#endif /* __KERNEL__ */ + +#ifndef PTL_USE_SLAB_CACHE + +#define MAX_MES 2048 +#define MAX_MDS 2048 +#define MAX_MSGS 2048 /* Outstanding messages */ +#define MAX_EQS 512 + +extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize); +extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl); + +static inline void * +lib_freelist_alloc (lib_freelist_t *fl) +{ + /* ALWAYS called with statelock held */ + lib_freeobj_t *o; + + if (list_empty (&fl->fl_list)) + return (NULL); + + o = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list); + list_del (&o->fo_list); + return ((void *)&o->fo_contents); +} + +static inline void +lib_freelist_free (lib_freelist_t *fl, void *obj) +{ + /* ALWAYS called with statelock held */ + lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents); + + list_add (&o->fo_list, &fl->fl_list); +} + + +static inline lib_eq_t * +lib_eq_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_eq_t *eq; + + state_lock (nal, &flags); + eq = (lib_eq_t *)lib_freelist_alloc (&nal->ni.ni_free_eqs); + state_unlock (nal, &flags); + + return (eq); +} + +static inline void +lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_eqs, eq); +} + +static inline lib_md_t * +lib_md_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_md_t *md; + + state_lock (nal, &flags); + md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds); + state_unlock (nal, &flags); + + return (md); +} + +static inline void +lib_md_free (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called with statelock held */ + 
lib_freelist_free (&nal->ni.ni_free_mds, md); +} + +static inline lib_me_t * +lib_me_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_me_t *me; + + state_lock (nal, &flags); + me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes); + state_unlock (nal, &flags); + + return (me); +} + +static inline void +lib_me_free (nal_cb_t *nal, lib_me_t *me) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_mes, me); +} + +static inline lib_msg_t * +lib_msg_alloc (nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs)); +} + +static inline void +lib_msg_free (nal_cb_t *nal, lib_msg_t *msg) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_msgs, msg); +} + +#else + +extern kmem_cache_t *ptl_md_slab; +extern kmem_cache_t *ptl_msg_slab; +extern kmem_cache_t *ptl_me_slab; +extern kmem_cache_t *ptl_eq_slab; +extern atomic_t md_in_use_count; +extern atomic_t msg_in_use_count; +extern atomic_t me_in_use_count; +extern atomic_t eq_in_use_count; + +static inline lib_eq_t * +lib_eq_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_eq_t *eq = kmem_cache_alloc(ptl_eq_slab, GFP_NOFS); + + if (eq == NULL) + return (NULL); + + atomic_inc (&eq_in_use_count); + return (eq); +} + +static inline void +lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&eq_in_use_count); + kmem_cache_free(ptl_eq_slab, eq); +} + +static inline lib_md_t * +lib_md_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_md_t *md = kmem_cache_alloc(ptl_md_slab, GFP_NOFS); + + if (md == NULL) + return (NULL); + + atomic_inc (&md_in_use_count); + return (md); +} + +static inline void +lib_md_free (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&md_in_use_count); + kmem_cache_free(ptl_md_slab, md); +} + +static 
inline lib_me_t * +lib_me_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_me_t *me = kmem_cache_alloc(ptl_me_slab, GFP_NOFS); + + if (me == NULL) + return (NULL); + + atomic_inc (&me_in_use_count); + return (me); +} + +static inline void +lib_me_free(nal_cb_t *nal, lib_me_t *me) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&me_in_use_count); + kmem_cache_free(ptl_me_slab, me); +} + +static inline lib_msg_t * +lib_msg_alloc(nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_msg_t *msg = kmem_cache_alloc(ptl_msg_slab, GFP_ATOMIC); + + if (msg == NULL) + return (NULL); + + atomic_inc (&msg_in_use_count); + return (msg); +} + +static inline void +lib_msg_free(nal_cb_t *nal, lib_msg_t *msg) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&msg_in_use_count); + kmem_cache_free(ptl_msg_slab, msg); +} +#endif + +extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type); +extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type); +extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh); + +static inline void +ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq) +{ + handle->cookie = eq->eq_lh.lh_cookie; +} + +static inline lib_eq_t * +ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, + PTL_COOKIE_TYPE_EQ); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_eq_t, eq_lh)); +} + +static inline void +ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md) +{ + handle->cookie = md->md_lh.lh_cookie; +} + +static inline lib_md_t * +ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, + PTL_COOKIE_TYPE_MD); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_md_t, md_lh)); +} + +static inline lib_md_t * +ptl_wire_handle2md 
(ptl_handle_wire_t *wh, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh; + + if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie) + return (NULL); + + lh = lib_lookup_cookie (nal, wh->wh_object_cookie, + PTL_COOKIE_TYPE_MD); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_md_t, md_lh)); +} + +static inline void +ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me) +{ + handle->cookie = me->me_lh.lh_cookie; +} + +static inline lib_me_t * +ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, + PTL_COOKIE_TYPE_ME); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_me_t, me_lh)); +} + +extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize, + ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size); +extern int lib_fini(nal_cb_t * cb); +extern void lib_dispatch(nal_cb_t * cb, void *private, int index, + void *arg_block, void *ret_block); +extern char *dispatch_name(int index); + +/* + * When the NAL detects an incoming message, it should call + * lib_parse() to decode it. The NAL callbacks will be handed + * the private cookie as a way for the NAL to maintain state + * about which transaction is being processed. An extra parameter, + * lib_cookie will contain the necessary information for + * finalizing the message. + * + * After it has finished handling the message, it should + * call lib_finalize() with the lib_cookie parameter. + * Callbacks will be made to write events, send acks or + * replies and so on. 
+ */ +extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private); +extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg); +extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr); + +extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); +extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len); +extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *dest, ptl_size_t len); + +extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov); +extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len); +extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len); +extern void lib_assert_wire_constants (void); + +extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, + ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); +extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + lib_md_t *md, ptl_size_t offset, ptl_size_t len); + +extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in, + ptl_md_t * md_out); +extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in); +extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in); +#endif diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h new file mode 100644 index 0000000..47c0dd2 --- /dev/null +++ b/lnet/include/lnet/lib-types.h @@ -0,0 +1,282 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * p30/lib-types.h + * + * Types used by the library side routines that do not need to be + * exposed to the user application + */ + +#ifndef _LIB_TYPES_H_ +#define _LIB_TYPES_H_ + +#include +#ifdef __KERNEL__ +# define PTL_USE_SLAB_CACHE +# include +# include +# include +#else +# include +#endif + +/* struct nal_cb_t is defined in lib-nal.h */ +typedef struct nal_cb_t nal_cb_t; + +typedef char *user_ptr; +typedef struct 
lib_msg_t lib_msg_t; +typedef struct lib_ptl_t lib_ptl_t; +typedef struct lib_ac_t lib_ac_t; +typedef struct lib_me_t lib_me_t; +typedef struct lib_md_t lib_md_t; +typedef struct lib_eq_t lib_eq_t; + +#define WIRE_ATTR __attribute__((packed)) + +/* The wire handle's interface cookie only matches one network interface in + * one epoch (i.e. new cookie when the interface restarts or the node + * reboots). The object cookie only matches one object on that interface + * during that object's lifetime (i.e. no cookie re-use). */ +typedef struct { + __u64 wh_interface_cookie; + __u64 wh_object_cookie; +} WIRE_ATTR ptl_handle_wire_t; + +/* byte-flip insensitive! */ +#define PTL_WIRE_HANDLE_NONE \ +((const ptl_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1}) + +typedef enum { + PTL_MSG_ACK = 0, + PTL_MSG_PUT, + PTL_MSG_GET, + PTL_MSG_REPLY, + PTL_MSG_HELLO, +} ptl_msg_type_t; + +/* Each of these structs should start with an odd number of + * __u32, or the compiler could add its own padding and confuse + * everyone. + * + * Also, "length" needs to be at offset 28 of each struct. 
+ */ +typedef struct ptl_ack { + ptl_size_t mlength; + ptl_handle_wire_t dst_wmd; + ptl_match_bits_t match_bits; + ptl_size_t length; /* common length (0 for acks) moving out RSN */ +} WIRE_ATTR ptl_ack_t; + +typedef struct ptl_put { + ptl_pt_index_t ptl_index; + ptl_handle_wire_t ack_wmd; + ptl_match_bits_t match_bits; + ptl_size_t length; /* common length moving out RSN */ + ptl_size_t offset; + ptl_hdr_data_t hdr_data; +} WIRE_ATTR ptl_put_t; + +typedef struct ptl_get { + ptl_pt_index_t ptl_index; + ptl_handle_wire_t return_wmd; + ptl_match_bits_t match_bits; + ptl_size_t length; /* common length (0 for gets) moving out RSN */ + ptl_size_t src_offset; + ptl_size_t return_offset; /* unused: going RSN */ + ptl_size_t sink_length; +} WIRE_ATTR ptl_get_t; + +typedef struct ptl_reply { + __u32 unused1; /* unused fields going RSN */ + ptl_handle_wire_t dst_wmd; + ptl_size_t dst_offset; /* unused: going RSN */ + __u32 unused2; + ptl_size_t length; /* common length moving out RSN */ +} WIRE_ATTR ptl_reply_t; + +typedef struct { + ptl_nid_t dest_nid; + ptl_nid_t src_nid; + ptl_pid_t dest_pid; + ptl_pid_t src_pid; + __u32 type; /* ptl_msg_type_t */ + union { + ptl_ack_t ack; + ptl_put_t put; + ptl_get_t get; + ptl_reply_t reply; + } msg; +} WIRE_ATTR ptl_hdr_t; + +/* All length fields in individual unions at same offset */ +/* LASSERT for same in lib-move.c */ +#define PTL_HDR_LENGTH(h) ((h)->msg.ack.length) + +/* A HELLO message contains the portals magic number and protocol version + * code in the header's dest_nid, the peer's NID in the src_nid, and + * PTL_MSG_HELLO in the type field. All other fields are zero (including + * PTL_HDR_LENGTH; i.e. no payload). + * This is for use by byte-stream NALs (e.g. TCP/IP) to check the peer is + * running the same protocol and to find out its NID, so that hosts with + * multiple IP interfaces can have a single NID. These NALs should exchange + * HELLO messages when a connection is first established. 
*/ +typedef struct { + __u32 magic; /* PORTALS_PROTO_MAGIC */ + __u16 version_major; /* increment on incompatible change */ + __u16 version_minor; /* increment on compatible change */ +} WIRE_ATTR ptl_magicversion_t; + +#define PORTALS_PROTO_MAGIC 0xeebc0ded + +#define PORTALS_PROTO_VERSION_MAJOR 0 +#define PORTALS_PROTO_VERSION_MINOR 1 + +typedef struct { + long recv_count, recv_length, send_count, send_length, drop_count, + drop_length, msgs_alloc, msgs_max; +} lib_counters_t; + +/* temporary expedient: limit number of entries in discontiguous MDs */ +#if PTL_LARGE_MTU +# define PTL_MD_MAX_IOV 64 +#else +# define PTL_MD_MAX_IOV 16 +#endif + +struct lib_msg_t { + struct list_head msg_list; + int send_ack; + lib_md_t *md; + ptl_nid_t nid; + ptl_pid_t pid; + ptl_event_t ev; + ptl_handle_wire_t ack_wmd; + union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + } msg_iov; +}; + +struct lib_ptl_t { + ptl_pt_index_t size; + struct list_head *tbl; +}; + +struct lib_ac_t { + int next_free; +}; + +typedef struct { + struct list_head lh_hash_chain; + __u64 lh_cookie; +} lib_handle_t; + +#define lh_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +struct lib_eq_t { + struct list_head eq_list; + lib_handle_t eq_lh; + ptl_seq_t sequence; + ptl_size_t size; + ptl_event_t *base; + int eq_refcount; + int (*event_callback) (ptl_event_t * event); + void *eq_addrkey; +}; + +struct lib_me_t { + struct list_head me_list; + lib_handle_t me_lh; + ptl_process_id_t match_id; + ptl_match_bits_t match_bits, ignore_bits; + ptl_unlink_t unlink; + lib_md_t *md; +}; + +struct lib_md_t { + struct list_head md_list; + lib_handle_t md_lh; + lib_me_t *me; + user_ptr start; + ptl_size_t offset; + ptl_size_t length; + ptl_size_t max_size; + int threshold; + int pending; + ptl_unlink_t unlink; + unsigned int options; + unsigned int md_flags; + void *user_ptr; + lib_eq_t *eq; + void *md_addrkey; + unsigned int md_niov; /* # frags */ 
+ union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + } md_iov; +}; + +#define PTL_MD_FLAG_UNLINK (1 << 0) +#define PTL_MD_FLAG_AUTO_UNLINKED (1 << 1) + +#ifndef PTL_USE_SLAB_CACHE +typedef struct +{ + void *fl_objs; /* single contiguous array of objects */ + int fl_nobjs; /* the number of them */ + int fl_objsize; /* the size (including overhead) of each of them */ + struct list_head fl_list; /* where they are enqueued */ +} lib_freelist_t; + +typedef struct +{ + struct list_head fo_list; /* enqueue on fl_list */ + void *fo_contents; /* aligned contents */ +} lib_freeobj_t; +#endif + +typedef struct { + /* info about peers we are trying to fail */ + struct list_head tp_list; /* stash in ni.ni_test_peers */ + ptl_nid_t tp_nid; /* matching nid */ + unsigned int tp_threshold; /* # failures to simulate */ +} lib_test_peer_t; + +#define PTL_COOKIE_TYPE_MD 1 +#define PTL_COOKIE_TYPE_ME 2 +#define PTL_COOKIE_TYPE_EQ 3 +#define PTL_COOKIE_TYPES 4 +/* PTL_COOKIE_TYPES must be a power of 2, so the cookie type can be + * extracted by masking with (PTL_COOKIE_TYPES - 1) */ + +typedef struct { + int up; + int refcnt; + ptl_nid_t nid; + ptl_pid_t pid; + int num_nodes; + unsigned int debug; + lib_ptl_t tbl; + lib_ac_t ac; + lib_counters_t counters; + + int ni_lh_hash_size; /* size of lib handle hash table */ + struct list_head *ni_lh_hash_table; /* all extant lib handles, this interface */ + __u64 ni_next_object_cookie; /* cookie generator */ + __u64 ni_interface_cookie; /* uniquely identifies this ni in this epoch */ + + struct list_head ni_test_peers; + +#ifndef PTL_USE_SLAB_CACHE + lib_freelist_t ni_free_mes; + lib_freelist_t ni_free_msgs; + lib_freelist_t ni_free_mds; + lib_freelist_t ni_free_eqs; +#endif + struct list_head ni_active_msgs; + struct list_head ni_active_mds; + struct list_head ni_active_eqs; +} lib_ni_t; + +#endif diff --git a/lnet/include/lnet/list.h b/lnet/include/lnet/list.h new file mode 100644 index 0000000..2b63312 --- 
/dev/null +++ b/lnet/include/lnet/list.h @@ -0,0 +1,245 @@ +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +#define prefetch(a) ((void)a) + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head * new, + struct list_head * prev, + struct list_head * next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! 
+ */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. 
+ */ +static inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + +/** + * list_for_each_prev - iterate over a list in reverse order + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ + pos = pos->prev, prefetch(pos->prev)) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#endif + +#ifndef list_for_each_entry +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter. 
+ * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + prefetch(pos->member.next); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member), \ + prefetch(pos->member.next)) +#endif + +#ifndef list_for_each_entry_safe +/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop counter. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) +#endif diff --git a/lnet/include/lnet/lltrace.h b/lnet/include/lnet/lltrace.h new file mode 100644 index 0000000..7d1b304 --- /dev/null +++ b/lnet/include/lnet/lltrace.h @@ -0,0 +1,175 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Compile with: + * cc -I../../portals/include -o fio fio.c -L../../portals/linux/utils -lptlctl + */ +#ifndef __LTRACE_H_ +#define __LTRACE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline int ltrace_write_file(char* fname) +{ + char* argv[3]; + + argv[0] = "debug_kernel"; + argv[1] = fname; + argv[2] = "1"; + + fprintf(stderr, "[ptlctl] %s %s %s\n", argv[0], argv[1], argv[2]); + + return jt_dbg_debug_kernel(3, argv); +} + +static inline int ltrace_clear() +{ + char* argv[1]; + + argv[0] = "clear"; + + fprintf(stderr, "[ptlctl] %s\n", argv[0]); + + return 
jt_dbg_clear_debug_buf(1, argv); +} + +static inline int ltrace_mark(int indent_level, char* text) +{ + char* argv[2]; + char mark_buf[PATH_MAX]; + + snprintf(mark_buf, PATH_MAX, "====%d=%s", indent_level, text); + + argv[0] = "mark"; + argv[1] = mark_buf; + return jt_dbg_mark_debug_buf(2, argv); +} + +static inline int ltrace_applymasks() +{ + char* argv[2]; + argv[0] = "list"; + argv[1] = "applymasks"; + + fprintf(stderr, "[ptlctl] %s %s\n", argv[0], argv[1]); + + return jt_dbg_list(2, argv); +} + + +static inline int ltrace_filter(char* subsys_or_mask) +{ + char* argv[2]; + argv[0] = "filter"; + argv[1] = subsys_or_mask; + return jt_dbg_filter(2, argv); +} + +static inline int ltrace_show(char* subsys_or_mask) +{ + char* argv[2]; + argv[0] = "show"; + argv[1] = subsys_or_mask; + return jt_dbg_show(2, argv); +} + +static inline int ltrace_start() +{ + int rc = 0; + dbg_initialize(0, NULL); +#ifdef PORTALS_DEV_ID + rc = register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); +#endif + ltrace_filter("class"); + ltrace_filter("socknal"); + ltrace_filter("qswnal"); + ltrace_filter("gmnal"); + ltrace_filter("portals"); + + ltrace_show("all_types"); + ltrace_filter("trace"); + ltrace_filter("malloc"); + ltrace_filter("net"); + ltrace_filter("page"); + ltrace_filter("other"); + ltrace_filter("info"); + ltrace_applymasks(); + + return rc; +} + + +static inline void ltrace_stop() +{ +#ifdef PORTALS_DEV_ID + unregister_ioc_dev(PORTALS_DEV_ID); +#endif +} + +static inline int not_uml() +{ + /* Return Values: + * 0 when run under UML + * 1 when run on host + * <0 when lookup failed + */ + struct stat buf; + int rc = stat("/dev/ubd", &buf); + rc = ((rc<0) && (errno == ENOENT)) ? 
1 : rc; + if (rc<0) { + fprintf(stderr, "Cannot stat /dev/ubd: %s\n", strerror(errno)); + rc = 1; /* Assume host */ + } + return rc; +} + +#define LTRACE_MAX_NOB 256 +static inline void ltrace_add_processnames(char* fname) +{ + char cmdbuf[LTRACE_MAX_NOB]; + struct timeval tv; + struct timezone tz; + int nob; + int underuml = !not_uml(); + + gettimeofday(&tv, &tz); + + nob = snprintf(cmdbuf, LTRACE_MAX_NOB, "ps --no-headers -eo \""); + + /* Careful - these format strings need to match the CDEBUG + * formats in portals/linux/debug.c EXACTLY. Every append below is limited to the space still left in cmdbuf. + */ + nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB-nob, "%02x:%06x:%d:%lu.%06lu ", + S_RPC >> 24, D_VFSTRACE, 0, tv.tv_sec, tv.tv_usec); + + if (underuml && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))) { + nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB-nob, + "(%s:%d:%s() %d | %d+%lu): ", + "lltrace.h", __LINE__, __FUNCTION__, 0, 0, 0L); + } + else { + nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB-nob, + "(%s:%d:%s() %d+%lu): ", + "lltrace.h", __LINE__, __FUNCTION__, 0, 0L); + } + + nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB-nob, " %%p %%c\" >> %s", fname); /* snprintf returns the untruncated length, so nob >= LTRACE_MAX_NOB means fname did not fit */ + if (nob < LTRACE_MAX_NOB) { system(cmdbuf); } else { fprintf(stderr, "[ptlctl] process-name command too long, not run\n"); } /* never hand a truncated command to the shell */ +} + +#endif diff --git a/lnet/include/lnet/lnet.h b/lnet/include/lnet/lnet.h new file mode 100644 index 0000000..a4ea39b --- /dev/null +++ b/lnet/include/lnet/lnet.h @@ -0,0 +1,72 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef _P30_H_ +#define _P30_H_ + +/* + * p30.h + * + * User application interface file + */ + +#if defined (__KERNEL__) +#include +#include +#else +#include +#include +#endif + +#include +#include +#include +#include +#include + +extern int __p30_initialized; /* for libraries & test codes */ +extern int __p30_myr_initialized; /* that don't know if p30 */ +extern int __p30_ip_initialized; /* had been initialized yet */ +extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle; + +extern int __p30_myr_timeout; /* in seconds, for PtlNIBarrier, */ +extern int __p30_ip_timeout; /* PtlReduce_all, 
& PtlBroadcast_all */ + +/* + * Debugging flags reserved for the Portals reference library. + * These are not part of the API as described in the SAND report + * but are for the use of the maintainers of the reference implementation. + * + * It is not expected that the real implementations will export + * this functionality. + */ +#define PTL_DEBUG_NONE 0ul +#define PTL_DEBUG_ALL (0x0FFFul) /* Only the Portals flags */ + +#define __bit(x) ((unsigned long) 1<<(x)) +#define PTL_DEBUG_PUT __bit(0) +#define PTL_DEBUG_GET __bit(1) +#define PTL_DEBUG_REPLY __bit(2) +#define PTL_DEBUG_ACK __bit(3) +#define PTL_DEBUG_DROP __bit(4) +#define PTL_DEBUG_REQUEST __bit(5) +#define PTL_DEBUG_DELIVERY __bit(6) +#define PTL_DEBUG_UNLINK __bit(7) +#define PTL_DEBUG_THRESHOLD __bit(8) +#define PTL_DEBUG_API __bit(9) + +/* + * These eight are reserved for the NAL to define + * It should probably give them better names... + */ +#define PTL_DEBUG_NI_ALL (0xF000ul) /* Only the NAL flags */ +#define PTL_DEBUG_NI0 __bit(24) +#define PTL_DEBUG_NI1 __bit(25) +#define PTL_DEBUG_NI2 __bit(26) +#define PTL_DEBUG_NI3 __bit(27) +#define PTL_DEBUG_NI4 __bit(28) +#define PTL_DEBUG_NI5 __bit(29) +#define PTL_DEBUG_NI6 __bit(30) +#define PTL_DEBUG_NI7 __bit(31) + +#endif diff --git a/lnet/include/lnet/lnetctl.h b/lnet/include/lnet/lnetctl.h new file mode 100644 index 0000000..dc02780 --- /dev/null +++ b/lnet/include/lnet/lnetctl.h @@ -0,0 +1,75 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * header for libptlctl.a + */ +#ifndef _PTLCTL_H_ +#define _PTLCTL_H_ + +#define PORTALS_DEV_ID 0 +#define PORTALS_DEV_PATH "/dev/portals" +#define OBD_DEV_ID 1 +#define OBD_DEV_PATH "/dev/obd" + +int ptl_name2nal(char *str); +int ptl_parse_nid (ptl_nid_t *nidp, char *str); +char * ptl_nid2str (char *buffer, ptl_nid_t nid); + +int ptl_initialize(int argc, char **argv); +int jt_ptl_network(int argc, char **argv); +int jt_ptl_connect(int argc, char **argv); +int jt_ptl_disconnect(int argc, char **argv); +int jt_ptl_push_connection(int argc, char **argv); +int jt_ptl_ping(int argc, char **argv); +int jt_ptl_shownid(int argc, char **argv); +int jt_ptl_mynid(int argc, char **argv); +int jt_ptl_add_uuid(int argc, char **argv); +int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ +int jt_ptl_close_uuid(int argc, char **argv); +int jt_ptl_del_uuid(int argc, char **argv); +int jt_ptl_rxmem (int argc, char **argv); +int jt_ptl_txmem (int argc, char **argv); +int jt_ptl_nagle (int argc, char **argv); +int jt_ptl_add_route (int argc, char **argv); +int jt_ptl_del_route (int argc, char **argv); +int jt_ptl_print_routes (int argc, char **argv); +int jt_ptl_fail_nid (int argc, char **argv); + +int dbg_initialize(int argc, char **argv); +int jt_dbg_filter(int argc, char **argv); +int jt_dbg_show(int argc, char **argv); +int jt_dbg_list(int argc, char **argv); +int jt_dbg_debug_kernel(int argc, char **argv); +int jt_dbg_debug_daemon(int argc, char **argv); +int jt_dbg_debug_file(int argc, char **argv); +int 
jt_dbg_clear_debug_buf(int argc, char **argv); +int jt_dbg_mark_debug_buf(int argc, char **argv); +int jt_dbg_modules(int argc, char **argv); +int jt_dbg_panic(int argc, char **argv); + +/* l_ioctl.c */ +int register_ioc_dev(int dev_id, const char * dev_name); +void unregister_ioc_dev(int dev_id); +int set_ioctl_dump(char * file); +int l_ioctl(int dev_id, int opc, void *buf); +int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)); +int jt_ioc_dump(int argc, char **argv); + +#endif diff --git a/lnet/include/lnet/myrnal.h b/lnet/include/lnet/myrnal.h new file mode 100644 index 0000000..12b1925 --- /dev/null +++ b/lnet/include/lnet/myrnal.h @@ -0,0 +1,26 @@ +/* +*/ + +#ifndef MYRNAL_H +#define MYRNAL_H + +#define MAX_ARGS_LEN (256) +#define MAX_RET_LEN (128) +#define MYRNAL_MAX_ACL_SIZE (64) +#define MYRNAL_MAX_PTL_SIZE (64) + +#define P3CMD (100) +#define P3SYSCALL (200) +#define P3REGISTER (300) + +enum { PTL_MLOCKALL }; + +typedef struct { + void *args; + size_t args_len; + void *ret; + size_t ret_len; + int p3cmd; +} myrnal_forward_t; + +#endif /* MYRNAL_H */ diff --git a/lnet/include/lnet/nal.h b/lnet/include/lnet/nal.h new file mode 100644 index 0000000..88be63c --- /dev/null +++ b/lnet/include/lnet/nal.h @@ -0,0 +1,49 @@ +/* +*/ +#ifndef _NAL_H_ +#define _NAL_H_ + +/* + * p30/nal.h + * + * The API side NAL declarations + */ + +#include + +#ifdef yield +#undef yield +#endif + +typedef struct nal_t nal_t; + +struct nal_t { + ptl_ni_t ni; + int refct; + void *nal_data; + int *timeout; /* for libp30api users */ + int (*forward) (nal_t * nal, int index, /* Function ID */ + void *args, size_t arg_len, void *ret, size_t ret_len); + + int (*shutdown) (nal_t * nal, int interface); + + int (*validate) (nal_t * nal, void *base, size_t extent); + + void (*yield) (nal_t * nal); + + void (*lock) (nal_t * nal, unsigned long *flags); + + void (*unlock) (nal_t * nal, unsigned long *flags); +}; + +typedef nal_t *(ptl_interface_t) (int, ptl_pt_index_t, 
ptl_ac_index_t, ptl_pid_t requested_pid); +extern nal_t *PTL_IFACE_IP(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid); +extern nal_t *PTL_IFACE_MYR(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid); + +extern nal_t *ptl_hndl2nal(ptl_handle_any_t * any); + +#ifndef PTL_IFACE_DEFAULT +#define PTL_IFACE_DEFAULT (PTL_IFACE_IP) +#endif + +#endif diff --git a/lnet/include/lnet/nalids.h b/lnet/include/lnet/nalids.h new file mode 100644 index 0000000..1b837b4 --- /dev/null +++ b/lnet/include/lnet/nalids.h @@ -0,0 +1,4 @@ +#define PTL_IFACE_TCP 1 +#define PTL_IFACE_ER 2 +#define PTL_IFACE_SS 3 +#define PTL_IFACE_MAX 4 diff --git a/lnet/include/lnet/p30.h b/lnet/include/lnet/p30.h new file mode 100644 index 0000000..a4ea39b --- /dev/null +++ b/lnet/include/lnet/p30.h @@ -0,0 +1,72 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef _P30_H_ +#define _P30_H_ + +/* + * p30.h + * + * User application interface file + */ + +#if defined (__KERNEL__) +#include +#include +#else +#include +#include +#endif + +#include +#include +#include +#include +#include + +extern int __p30_initialized; /* for libraries & test codes */ +extern int __p30_myr_initialized; /* that don't know if p30 */ +extern int __p30_ip_initialized; /* had been initialized yet */ +extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle; + +extern int __p30_myr_timeout; /* in seconds, for PtlNIBarrier, */ +extern int __p30_ip_timeout; /* PtlReduce_all, & PtlBroadcast_all */ + +/* + * Debugging flags reserved for the Portals reference library. + * These are not part of the API as described in the SAND report + * but are for the use of the maintainers of the reference implementation. + * + * It is not expected that the real implementations will export + * this functionality. 
+ */ +#define PTL_DEBUG_NONE 0ul +#define PTL_DEBUG_ALL (0x0FFFul) /* Only the Portals flags */ + +#define __bit(x) ((unsigned long) 1<<(x)) +#define PTL_DEBUG_PUT __bit(0) +#define PTL_DEBUG_GET __bit(1) +#define PTL_DEBUG_REPLY __bit(2) +#define PTL_DEBUG_ACK __bit(3) +#define PTL_DEBUG_DROP __bit(4) +#define PTL_DEBUG_REQUEST __bit(5) +#define PTL_DEBUG_DELIVERY __bit(6) +#define PTL_DEBUG_UNLINK __bit(7) +#define PTL_DEBUG_THRESHOLD __bit(8) +#define PTL_DEBUG_API __bit(9) + +/* + * These eight are reserved for the NAL to define + * It should probably give them better names... + */ +#define PTL_DEBUG_NI_ALL (0xFF000000ul) /* Only the NAL flags: bits 24-31, i.e. PTL_DEBUG_NI0..NI7 */ +#define PTL_DEBUG_NI0 __bit(24) +#define PTL_DEBUG_NI1 __bit(25) +#define PTL_DEBUG_NI2 __bit(26) +#define PTL_DEBUG_NI3 __bit(27) +#define PTL_DEBUG_NI4 __bit(28) +#define PTL_DEBUG_NI5 __bit(29) +#define PTL_DEBUG_NI6 __bit(30) +#define PTL_DEBUG_NI7 __bit(31) + +#endif diff --git a/lnet/include/lnet/ppid.h b/lnet/include/lnet/ppid.h new file mode 100644 index 0000000..4727599 --- /dev/null +++ b/lnet/include/lnet/ppid.h @@ -0,0 +1,52 @@ +/* + */ + +#ifndef _INCppidh_ +#define _INCppidh_ + +#include "defines.h" +// #include "idtypes.h" + + +#define MAX_PPID 1000 /* this needs to fit into 16 bits so the + maximum value is 65535. having it "large" + can help w/ debugging process accounting + but there are reasons for making it + somewhat smaller than the maximum -- + requiring storage for arrays that index + on the ppid, eg... */ + +#define MAX_GID 1000 /* this needs to fit into 16 bits... */ + +#define MAX_FIXED_PPID 100 +#define MAX_FIXED_GID 100 +#define PPID_FLOATING (MAX_FIXED_PPID+1) /* Floating area starts here */ +#define GID_FLOATING (MAX_FIXED_GID+1) /* Floating area starts here */ +#define NUM_PTL_TASKS (MAX_FIXED_PPID+80) /* Maximum no. 
portals tasks */ + +#define PPID_AUTO 0 + +/* Minimum PPID is 1 */ +#define PPID_BEBOPD 1 /* bebopd */ +#define GID_BEBOPD 1 /* bebopd */ + +#define PPID_PCT 2 /* pct */ +#define GID_PCT 2 /* pct */ + +#define PPID_FYOD 3 /* fyod */ +#define GID_FYOD 3 /* fyod */ + +#define PPID_GDBWRAP 11 /* portals proxy for gdb */ +#define GID_GDBWRAP 11 /* portals proxy for gdb */ + +#define PPID_TEST 15 /* for portals tests */ +#define GID_TEST 15 + +#define GID_YOD 5 /* yod */ +#define GID_PINGD 6 /* pingd */ +#define GID_BT 7 /* bt */ +#define GID_PTLTEST 8 /* ptltest */ +#define GID_CGDB 9 /* cgdb */ +#define GID_TVDSVR 10 /* start-tvdsvr */ + +#endif /* _INCppidh_ */ diff --git a/lnet/include/lnet/ptlctl.h b/lnet/include/lnet/ptlctl.h new file mode 100644 index 0000000..dc02780 --- /dev/null +++ b/lnet/include/lnet/ptlctl.h @@ -0,0 +1,75 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * header for libptlctl.a + */ +#ifndef _PTLCTL_H_ +#define _PTLCTL_H_ + +#define PORTALS_DEV_ID 0 +#define PORTALS_DEV_PATH "/dev/portals" +#define OBD_DEV_ID 1 +#define OBD_DEV_PATH "/dev/obd" + +int ptl_name2nal(char *str); +int ptl_parse_nid (ptl_nid_t *nidp, char *str); +char * ptl_nid2str (char *buffer, ptl_nid_t nid); + +int ptl_initialize(int argc, char **argv); +int jt_ptl_network(int argc, char **argv); +int jt_ptl_connect(int argc, char **argv); +int jt_ptl_disconnect(int argc, char **argv); +int jt_ptl_push_connection(int argc, char **argv); +int jt_ptl_ping(int argc, char **argv); +int jt_ptl_shownid(int argc, char **argv); +int jt_ptl_mynid(int argc, char **argv); +int jt_ptl_add_uuid(int argc, char **argv); +int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ +int jt_ptl_close_uuid(int argc, char **argv); +int jt_ptl_del_uuid(int argc, char **argv); +int jt_ptl_rxmem (int argc, char **argv); +int jt_ptl_txmem (int argc, char **argv); +int jt_ptl_nagle (int argc, char **argv); +int jt_ptl_add_route (int argc, char **argv); +int jt_ptl_del_route (int argc, char **argv); +int jt_ptl_print_routes (int argc, char **argv); +int jt_ptl_fail_nid (int argc, char **argv); + +int dbg_initialize(int argc, char **argv); +int jt_dbg_filter(int argc, char **argv); +int jt_dbg_show(int argc, char **argv); +int jt_dbg_list(int argc, char **argv); +int jt_dbg_debug_kernel(int argc, char **argv); +int jt_dbg_debug_daemon(int argc, char **argv); +int jt_dbg_debug_file(int argc, char **argv); +int jt_dbg_clear_debug_buf(int argc, char **argv); +int jt_dbg_mark_debug_buf(int argc, char **argv); +int jt_dbg_modules(int argc, char **argv); +int jt_dbg_panic(int argc, char **argv); + +/* l_ioctl.c */ +int register_ioc_dev(int dev_id, const char * dev_name); +void unregister_ioc_dev(int dev_id); +int set_ioctl_dump(char * file); +int l_ioctl(int dev_id, int opc, void *buf); +int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, 
void *)); +int jt_ioc_dump(int argc, char **argv); + +#endif diff --git a/lnet/include/lnet/stringtab.h b/lnet/include/lnet/stringtab.h new file mode 100644 index 0000000..c9683f7 --- /dev/null +++ b/lnet/include/lnet/stringtab.h @@ -0,0 +1,5 @@ +/* +*/ +/* + * stringtab.h + */ diff --git a/lnet/include/lnet/types.h b/lnet/include/lnet/types.h new file mode 100644 index 0000000..d4038b6 --- /dev/null +++ b/lnet/include/lnet/types.h @@ -0,0 +1,157 @@ +#ifndef _P30_TYPES_H_ +#define _P30_TYPES_H_ + +#ifdef __linux__ +#include +#include +#else +#include +typedef u_int32_t __u32; +typedef u_int64_t __u64; +typedef unsigned long long cycles_t; +static inline cycles_t get_cycles(void) { return 0; } +#endif + +typedef __u64 ptl_nid_t; +typedef __u32 ptl_pid_t; +typedef __u32 ptl_pt_index_t; +typedef __u32 ptl_ac_index_t; +typedef __u64 ptl_match_bits_t; +typedef __u64 ptl_hdr_data_t; +typedef __u32 ptl_size_t; + +typedef struct { + unsigned long nal_idx; /* which network interface */ + __u64 cookie; /* which thing on that interface */ +} ptl_handle_any_t; + +typedef ptl_handle_any_t ptl_handle_ni_t; +typedef ptl_handle_any_t ptl_handle_eq_t; +typedef ptl_handle_any_t ptl_handle_md_t; +typedef ptl_handle_any_t ptl_handle_me_t; + +#define PTL_HANDLE_NONE \ +((const ptl_handle_any_t){.nal_idx = -1, .cookie = -1}) +#define PTL_EQ_NONE PTL_HANDLE_NONE + +static inline int PtlHandleEqual (ptl_handle_any_t h1, ptl_handle_any_t h2) +{ + return (h1.nal_idx == h2.nal_idx && h1.cookie == h2.cookie); +} + +#define PTL_NID_ANY ((ptl_nid_t) -1) +#define PTL_PID_ANY ((ptl_pid_t) -1) + +typedef struct { + ptl_nid_t nid; + ptl_pid_t pid; /* node id / process id */ +} ptl_process_id_t; + +typedef enum { + PTL_RETAIN = 0, + PTL_UNLINK +} ptl_unlink_t; + +typedef enum { + PTL_INS_BEFORE, + PTL_INS_AFTER +} ptl_ins_pos_t; + +typedef struct { + struct page *kiov_page; + unsigned int kiov_len; + unsigned int kiov_offset; +} ptl_kiov_t; + +typedef struct { + void *start; + ptl_size_t length; + 
int threshold; + int max_size; + unsigned int options; + void *user_ptr; + ptl_handle_eq_t eventq; + unsigned int niov; +} ptl_md_t; + +/* Options for the MD structure */ +#define PTL_MD_OP_PUT (1 << 0) +#define PTL_MD_OP_GET (1 << 1) +#define PTL_MD_MANAGE_REMOTE (1 << 2) +#define PTL_MD_AUTO_UNLINK (1 << 3) +#define PTL_MD_TRUNCATE (1 << 4) +#define PTL_MD_ACK_DISABLE (1 << 5) +#define PTL_MD_IOV (1 << 6) +#define PTL_MD_MAX_SIZE (1 << 7) +#define PTL_MD_KIOV (1 << 8) + +#define PTL_MD_THRESH_INF (-1) + +typedef enum { + PTL_EVENT_GET, + PTL_EVENT_PUT, + PTL_EVENT_REPLY, + PTL_EVENT_ACK, + PTL_EVENT_SENT +} ptl_event_kind_t; + +#define PTL_SEQ_BASETYPE long +typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t; +#define PTL_SEQ_GT(a,b) (((signed PTL_SEQ_BASETYPE)((a) - (b))) > 0) + +typedef struct { + ptl_event_kind_t type; + ptl_process_id_t initiator; + ptl_pt_index_t portal; + ptl_match_bits_t match_bits; + ptl_size_t rlength, mlength, offset; + ptl_handle_me_t unlinked_me; + ptl_md_t mem_desc; + ptl_hdr_data_t hdr_data; + cycles_t arrival_time; + volatile ptl_seq_t sequence; +} ptl_event_t; + + +typedef enum { + PTL_ACK_REQ, + PTL_NOACK_REQ +} ptl_ack_req_t; + + +typedef struct { + volatile ptl_seq_t sequence; + ptl_size_t size; + ptl_event_t *base; + ptl_handle_any_t cb_eq_handle; +} ptl_eq_t; + +typedef struct { + ptl_eq_t *eq; +} ptl_ni_t; + + +typedef struct { + int max_match_entries; /* max number of match entries */ + int max_mem_descriptors; /* max number of memory descriptors */ + int max_event_queues; /* max number of event queues */ + int max_atable_index; /* maximum access control list table index */ + int max_ptable_index; /* maximum portals table index */ +} ptl_ni_limits_t; + +/* + * Status registers + */ +typedef enum { + PTL_SR_DROP_COUNT, + PTL_SR_DROP_LENGTH, + PTL_SR_RECV_COUNT, + PTL_SR_RECV_LENGTH, + PTL_SR_SEND_COUNT, + PTL_SR_SEND_LENGTH, + PTL_SR_MSGS_MAX, +} ptl_sr_index_t; + +typedef int ptl_sr_value_t; + +#endif diff --git 
a/lnet/klnds/.cvsignore b/lnet/klnds/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lnet/klnds/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lnet/klnds/Makefile.am b/lnet/klnds/Makefile.am new file mode 100644 index 0000000..fed2785 --- /dev/null +++ b/lnet/klnds/Makefile.am @@ -0,0 +1,7 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +DIST_SUBDIRS= socknal toenal qswnal gmnal scimacnal +SUBDIRS= socknal toenal @QSWNAL@ @GMNAL@ @SCIMACNAL@ diff --git a/lnet/klnds/Makefile.mk b/lnet/klnds/Makefile.mk new file mode 100644 index 0000000..ce40a60 --- /dev/null +++ b/lnet/klnds/Makefile.mk @@ -0,0 +1,4 @@ +include ../Kernelenv + +obj-y = socknal/ +# more coming... \ No newline at end of file diff --git a/lnet/klnds/gmlnd/.cvsignore b/lnet/klnds/gmlnd/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lnet/klnds/gmlnd/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lnet/klnds/gmlnd/Makefile.am b/lnet/klnds/gmlnd/Makefile.am new file mode 100644 index 0000000..1dc6f4e --- /dev/null +++ b/lnet/klnds/gmlnd/Makefile.am @@ -0,0 +1,13 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = kgmnal +modulenet_DATA = kgmnal.o +EXTRA_PROGRAMS = kgmnal + +DEFS = +kgmnal_SOURCES = gmnal.c gmnal_cb.c gmnal.h diff --git a/lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch b/lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch new file mode 100644 index 0000000..23c80d9 --- /dev/null +++ b/lnet/klnds/gmlnd/gm-1.5.2.1-exports.patch @@ -0,0 +1,43 @@ +diff -ru gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c +--- gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c Mon Jul 1 10:35:09 2002 ++++ gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c Thu Sep 19 14:19:38 2002 +@@ -30,6 +30,8 @@ + * + ************************************************************************/ + ++#define EXPORT_SYMTAB ++ + #include + #include + +@@ -4075,6 +4077,28 @@ + return 0; + } + ++EXPORT_SYMBOL(gm_blocking_receive_no_spin); ++EXPORT_SYMBOL(gm_close); ++EXPORT_SYMBOL(gm_dma_free); ++EXPORT_SYMBOL(gm_dma_malloc); ++EXPORT_SYMBOL(gm_drop_sends); ++EXPORT_SYMBOL(gm_finalize); ++EXPORT_SYMBOL(gm_get_node_id); ++EXPORT_SYMBOL(gm_init); ++EXPORT_SYMBOL(gm_initialize_alarm); ++EXPORT_SYMBOL(gm_max_node_id_in_use); ++EXPORT_SYMBOL(gm_min_size_for_length); ++EXPORT_SYMBOL(gm_num_receive_tokens); ++EXPORT_SYMBOL(gm_num_send_tokens); ++EXPORT_SYMBOL(gm_open); ++EXPORT_SYMBOL(gm_provide_receive_buffer); ++EXPORT_SYMBOL(gm_resume_sending); ++EXPORT_SYMBOL(gm_send_with_callback); ++EXPORT_SYMBOL(gm_set_acceptable_sizes); ++EXPORT_SYMBOL(gm_set_alarm); ++EXPORT_SYMBOL(gm_unknown); ++ ++ + /* + This file uses GM standard indentation. 
+ +Only in gm-1.5.2.1_Linux-cfs/drivers/linux/gm: gm_arch.c~ +Only in gm-1.5.2.1_Linux-cfs/: trace diff --git a/lnet/klnds/gmlnd/gmlnd.h b/lnet/klnds/gmlnd/gmlnd.h new file mode 100644 index 0000000..47e8c3c --- /dev/null +++ b/lnet/klnds/gmlnd/gmlnd.h @@ -0,0 +1,101 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef _GMNAL_H +#define _GMNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_GMNAL + +#include +#include +#include + +#include + + +/* + * Myrinet GM NAL + */ +#define NPAGES_LARGE 16 +#define NPAGES_SMALL 1 +#define MSG_LEN_LARGE (NPAGES_LARGE*PAGE_SIZE) +#define MSG_LEN_SMALL (NPAGES_SMALL*PAGE_SIZE) +#define MSG_SIZE_LARGE (gm_min_size_for_length(MSG_LEN_LARGE)) +#define MSG_SIZE_SMALL (gm_min_size_for_length(MSG_LEN_SMALL)) + +#define TXMSGS 64 /* Number of Transmit Messages */ +#define ENVELOPES 8 /* Number of outstanding receive msgs */ + +#define KGM_PORT_NUM 3 +#define KGM_HOSTNAME "kgmnal" + + +typedef struct { + char *krx_buffer; + unsigned long krx_len; + unsigned int krx_size; + unsigned int krx_priority; + struct list_head krx_item; +} kgmnal_rx_t; + + +typedef struct { + nal_cb_t *ktx_nal; + void *ktx_private; + lib_msg_t *ktx_cookie; + char *ktx_buffer; + size_t ktx_len; + unsigned long ktx_size; + int ktx_ndx; + unsigned int ktx_priority; + unsigned int ktx_tgt_node; + unsigned int ktx_tgt_port_id; +} kgmnal_tx_t; + + +typedef struct { + char kgm_init; + char kgm_shuttingdown; + struct gm_port *kgm_port; + struct list_head kgm_list; + ptl_nid_t kgm_nid; + nal_cb_t *kgm_cb; + struct kgm_trans *kgm_trans; + struct tq_struct kgm_ready_tq; + spinlock_t kgm_dispatch_lock; + spinlock_t kgm_update_lock; + spinlock_t kgm_send_lock; +} kgmnal_data_t; + +int kgm_init(kgmnal_data_t *kgm_data); +int 
kgmnal_recv_thread(void *); +int gm_return_mynid(void); +void kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); + +extern kgmnal_data_t kgmnal_data; +extern nal_t kgmnal_api; +extern nal_cb_t kgmnal_lib; + +#endif /* _GMNAL_H */ + diff --git a/lnet/klnds/gmlnd/gmlnd_cb.c b/lnet/klnds/gmlnd/gmlnd_cb.c new file mode 100644 index 0000000..3d4c86d --- /dev/null +++ b/lnet/klnds/gmlnd/gmlnd_cb.c @@ -0,0 +1,517 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Based on ksocknal and qswnal + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Robert Read + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* TODO + * preallocate send buffers, store on list + * put receive buffers on queue, handle with receive threads + * use routing + */ + +#include "gmnal.h" + +extern kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *,int); + +static kgmnal_tx_t * +get_trans(void) +{ + kgmnal_tx_t *t; + PORTAL_ALLOC(t, (sizeof(kgmnal_tx_t))); + return t; +} + +static void +put_trans(kgmnal_tx_t *t) +{ + PORTAL_FREE(t, sizeof(kgmnal_tx_t)); +} + +int +kgmnal_ispeer (ptl_nid_t nid) +{ + unsigned int gmnid = (unsigned int)nid; + unsigned int nnids; + + gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids); + + return ((ptl_nid_t)gmnid == nid &&/* didn't lose high bits on conversion ? */ + gmnid < nnids); /* it's in this machine */ +} + +/* + * LIB functions follow + * + */ +static int +kgmnal_read (nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, + size_t len) +{ + CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + +static int +kgmnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, + size_t len) +{ + CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + +static void * +kgmnal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + return buf; +} + +static void +kgmnal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + +static void +kgmnal_printf(nal_cb_t *nal, const char *fmt, ...) 
+{ + va_list ap; + char msg[256]; + + if (portal_debug & D_NET) { + va_start( ap, fmt ); + vsnprintf( msg, sizeof(msg), fmt, ap ); + va_end( ap ); + + printk("CPUId: %d %s",smp_processor_id(), msg); + } +} + + +static void +kgmnal_cli(nal_cb_t *nal, unsigned long *flags) +{ + kgmnal_data_t *data= nal->nal_data; + + spin_lock_irqsave(&data->kgm_dispatch_lock,*flags); +} + + +static void +kgmnal_sti(nal_cb_t *nal, unsigned long *flags) +{ + kgmnal_data_t *data= nal->nal_data; + + spin_unlock_irqrestore(&data->kgm_dispatch_lock,*flags); +} + + +static int +kgmnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* network distance doesn't mean much for this nal */ + if ( nal->ni.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +/* FIXME rmr: add rounting code here */ +static void +kgmnal_tx_done(kgmnal_tx_t *trans, int error) +{ + lib_finalize(trans->ktx_nal, trans->ktx_private, trans->ktx_cookie); + + gm_dma_free(kgmnal_data.kgm_port, trans->ktx_buffer); + + trans->ktx_buffer = NULL; + trans->ktx_len = 0; + + put_trans(trans); +} +static char * gm_error_strings[GM_NUM_STATUS_CODES] = { + [GM_SUCCESS] = "GM_SUCCESS", + [GM_SEND_TIMED_OUT] = "GM_SEND_TIMED_OUT", + [GM_SEND_REJECTED] = "GM_SEND_REJECTED", + [GM_SEND_TARGET_PORT_CLOSED] = "GM_SEND_TARGET_PORT_CLOSED", + [GM_SEND_TARGET_NODE_UNREACHABLE] = "GM_SEND_TARGET_NODE_UNREACHABLE", + [GM_SEND_DROPPED] = "GM_SEND_DROPPED", + [GM_SEND_PORT_CLOSED] = "GM_SEND_PORT_CLOSED", +}; + +inline char * get_error(int status) +{ + if (gm_error_strings[status] != NULL) + return gm_error_strings[status]; + else + return "Unknown error"; +} + +static void +kgmnal_errhandler(struct gm_port *p, void *context, gm_status_t status) +{ + CDEBUG(D_NET,"error callback: ktx %p status %d\n", context, status); +} + +static void +kgmnal_txhandler(struct gm_port *p, void *context, gm_status_t status) +{ + kgmnal_tx_t *ktx = (kgmnal_tx_t *)context; + int err = 0; + + LASSERT (p != NULL); + LASSERT (ktx != 
NULL); + + CDEBUG(D_NET,"ktx %p status %d nid 0x%x pid %d\n", ktx, status, + ktx->ktx_tgt_node, ktx->ktx_tgt_port_id); + + switch((int)status) { + case GM_SUCCESS: /* normal */ + break; + case GM_SEND_TIMED_OUT: /* application error */ + case GM_SEND_REJECTED: /* size of msg unacceptable */ + case GM_SEND_TARGET_PORT_CLOSED: + CERROR("%s (%d):\n", get_error(status), status); + gm_resume_sending(kgmnal_data.kgm_port, ktx->ktx_priority, + ktx->ktx_tgt_node, ktx->ktx_tgt_port_id, + kgmnal_errhandler, NULL); + err = -EIO; + break; + case GM_SEND_TARGET_NODE_UNREACHABLE: + case GM_SEND_PORT_CLOSED: + CERROR("%s (%d):\n", get_error(status), status); + gm_drop_sends(kgmnal_data.kgm_port, ktx->ktx_priority, + ktx->ktx_tgt_node, ktx->ktx_tgt_port_id, + kgmnal_errhandler, NULL); + err = -EIO; + break; + case GM_SEND_DROPPED: + CERROR("%s (%d):\n", get_error(status), status); + err = -EIO; + break; + default: + CERROR("Unknown status: %d\n", status); + err = -EIO; + break; + } + + kgmnal_tx_done(ktx, err); +} + +/* + */ + +static int +kgmnal_send(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + int options, + unsigned int niov, + lib_md_iov_t *iov, + size_t len) +{ + /* + * ipnal assumes that this is the private as passed to lib_dispatch.. + * so do we :/ + */ + kgmnal_tx_t *ktx=NULL; + int rc=0; + void * buf; + int buf_len = sizeof(ptl_hdr_t) + len; + int buf_size = 0; + + LASSERT ((options & PTL_MD_KIOV) == 0); + + PROF_START(gmnal_send); + + + CDEBUG(D_NET, "sending %d bytes from %p to nid: 0x%Lx pid %d\n", + len, iov, nid, KGM_PORT_NUM); + + /* ensure there is an available tx handle */ + + /* save transaction info to trans for later finalize and cleanup */ + ktx = get_trans(); + if (ktx == NULL) { + rc = -ENOMEM; + goto send_exit; + } + + /* hmmm... GM doesn't support vectored write, so need to allocate buffer to coalesce + header and data. + Also, memory must be dma'able or registered with GM. 
*/ + + if (buf_len <= MSG_LEN_SMALL) { + buf_size = MSG_SIZE_SMALL; + } else if (buf_len <= MSG_LEN_LARGE) { + buf_size = MSG_SIZE_LARGE; + } else { + printk("kgmnal:request exceeds TX MTU size (%d).\n", + MSG_SIZE_LARGE); + rc = -1; + goto send_exit; + } + + buf = gm_dma_malloc(kgmnal_data.kgm_port, buf_len); + if (buf == NULL) { + rc = -ENOMEM; + goto send_exit; + } + memcpy(buf, hdr, sizeof(ptl_hdr_t)); + + if (len != 0) + lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t), + options, niov, iov, len); + + ktx->ktx_nal = nal; + ktx->ktx_private = private; + ktx->ktx_cookie = cookie; + ktx->ktx_len = buf_len; + ktx->ktx_size = buf_size; + ktx->ktx_buffer = buf; + ktx->ktx_priority = GM_LOW_PRIORITY; + ktx->ktx_tgt_node = nid; + ktx->ktx_tgt_port_id = KGM_PORT_NUM; + + CDEBUG(D_NET, "gm_send %d bytes (size %d) from %p to nid: 0x%Lx " + "pid %d pri %d\n", buf_len, buf_size, iov, nid, KGM_PORT_NUM, + GM_LOW_PRIORITY); + + gm_send_with_callback(kgmnal_data.kgm_port, buf, buf_size, + buf_len, GM_LOW_PRIORITY, + nid, KGM_PORT_NUM, + kgmnal_txhandler, ktx); + + PROF_FINISH(gmnal_send); + send_exit: + return rc; +} +void +kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + CERROR ("forwarding not implemented\n"); +} + +void +kqswnal_fwd_callback (void *arg, int error) +{ + CERROR ("forwarding not implemented\n"); +} + + +static inline void +kgmnal_requeue_rx(kgmnal_rx_t *krx) +{ + gm_provide_receive_buffer(kgmnal_data.kgm_port, krx->krx_buffer, + krx->krx_size, krx->krx_priority); +} + +/* Process a received portals packet */ + +/* Receive Interrupt Handler */ +static void kgmnal_rx(kgmnal_data_t *kgm, unsigned long len, unsigned int size, + void * buf, unsigned int pri) +{ + ptl_hdr_t *hdr = buf; + kgmnal_rx_t krx; + + CDEBUG(D_NET,"buf %p, len %ld\n", buf, len); + + if ( len < sizeof( ptl_hdr_t ) ) { + /* XXX what's this for? 
*/ + if (kgm->kgm_shuttingdown) + return; + CERROR("kgmnal: did not receive complete portal header, " + "len= %ld", len); + gm_provide_receive_buffer(kgm->kgm_port, buf, size, pri); + return; + } + + /* might want to use seperate threads to handle receive */ + krx.krx_buffer = buf; + krx.krx_len = len; + krx.krx_size = size; + krx.krx_priority = pri; + + if ( hdr->dest_nid == kgmnal_lib.ni.nid ) { + PROF_START(lib_parse); + lib_parse(&kgmnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx); + PROF_FINISH(lib_parse); + } else if (kgmnal_ispeer(hdr->dest_nid)) { + /* should have gone direct to peer */ + CERROR("dropping packet from 0x%llx to 0x%llx: target is " + "a peer", hdr->src_nid, hdr->dest_nid); + kgmnal_requeue_rx(&krx); + } else { + /* forward to gateway */ + CERROR("forwarding not implemented yet"); + kgmnal_requeue_rx(&krx); + } + + return; +} + + +static int kgmnal_recv(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + int options, + unsigned int niov, + lib_md_iov_t *iov, + size_t mlen, + size_t rlen) +{ + kgmnal_rx_t *krx = private; + + LASSERT ((options & PTL_MD_KIOV) == 0); + + CDEBUG(D_NET,"mlen=%d, rlen=%d\n", mlen, rlen); + + /* What was actually received must be >= what sender claims to + * have sent. This is an LASSERT, since lib-move doesn't + * check cb return code yet. */ + LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen); + LASSERT (mlen <= rlen); + + PROF_START(gmnal_recv); + + if(mlen != 0) { + PROF_START(memcpy); + lib_copy_buf2iov (options, niov, iov, + krx->krx_buffer + sizeof (ptl_hdr_t), mlen); + PROF_FINISH(memcpy); + } + + PROF_START(lib_finalize); + lib_finalize(nal, private, cookie); + PROF_FINISH(lib_finalize); + + kgmnal_requeue_rx(krx); + + PROF_FINISH(gmnal_recv); + + return rlen; +} + + +static void kgmnal_shutdown(void * none) +{ + CERROR("called\n"); + return; +} + +/* + * Set terminate and use alarm to wake up the recv thread. 
+ */
+static void recv_shutdown(kgmnal_data_t *kgm)
+{
+        /* The alarm is handed to GM by address and is only serviced after
+         * this function has returned (the receive thread passes
+         * GM_ALARM_EVENT to gm_unknown()), so it must not live on this
+         * function's stack.  Static storage keeps it valid until it fires.
+         * NOTE(review): assumes one shutdown in flight at a time, which
+         * matches the single global kgmnal_data -- confirm. */
+        static gm_alarm_t alarm;
+
+        kgm->kgm_shuttingdown = 1;
+        gm_initialize_alarm(&alarm);
+        gm_set_alarm(kgm->kgm_port, &alarm, 1, kgmnal_shutdown, NULL);
+}
+
+/*
+ * Begin NAL shutdown: flags the receive thread to stop and wakes it.
+ * Always returns 0.  Draining sends and reclaiming receive buffers are
+ * still to be done (see the notes in the body).
+ */
+int kgmnal_end(kgmnal_data_t *kgm)
+{
+
+        /* wait for sends to finish ? */
+        /* remove receive buffers */
+        /* shutdown receive thread */
+
+        recv_shutdown(kgm);
+
+        return 0;
+}
+
+/* Used only for the spinner */
+/*
+ * Receive thread body.  'arg' is the kgmnal_data_t for the port to
+ * service.  Blocks in GM for events and dispatches them until
+ * kgm_shuttingdown is set or GM returns no event; returns 0.
+ */
+int kgmnal_recv_thread(void *arg)
+{
+        kgmnal_data_t *kgm = arg;
+
+        LASSERT(kgm != NULL);
+
+        kportal_daemonize("kgmnal_rx");
+
+        while(1) {
+                gm_recv_event_t *e;
+                int priority = GM_LOW_PRIORITY;
+
+                /* checked once per event: recv_shutdown() sets the flag and
+                 * then arms an alarm so the blocking receive below returns */
+                if (kgm->kgm_shuttingdown)
+                        break;
+
+                e = gm_blocking_receive_no_spin(kgm->kgm_port);
+                if (e == NULL) {
+                        CERROR("gm_blocking_receive returned NULL\n");
+                        break;
+                }
+
+                switch(gm_ntohc(e->recv.type)) {
+                case GM_HIGH_RECV_EVENT:
+                        priority = GM_HIGH_PRIORITY;
+                        /* fall through */
+                case GM_RECV_EVENT:
+                        kgmnal_rx(kgm, gm_ntohl(e->recv.length),
+                                  gm_ntohc(e->recv.size),
+                                  gm_ntohp(e->recv.buffer), priority);
+                        break;
+                case GM_ALARM_EVENT:
+                        CERROR("received alarm\n");
+                        gm_unknown(kgm->kgm_port, e);
+                        break;
+                case GM_BAD_SEND_DETECTED_EVENT: /* ?? */
+                        CERROR("received bad send!\n");
+                        break;
+                default:
+                        /* everything else is GM's own housekeeping */
+                        gm_unknown(kgm->kgm_port, e);
+                        break;
+                }
+        }
+
+        CERROR("shuttting down.\n");
+        return 0;
+}
+
+/* Callback table handed to the portals library for this NAL */
+nal_cb_t kgmnal_lib = {
+        nal_data: &kgmnal_data, /* NAL private data */
+        cb_send: kgmnal_send,
+        cb_recv: kgmnal_recv,
+        cb_read: kgmnal_read,
+        cb_write: kgmnal_write,
+        cb_malloc: kgmnal_malloc,
+        cb_free: kgmnal_free,
+        cb_printf: kgmnal_printf,
+        cb_cli: kgmnal_cli,
+        cb_sti: kgmnal_sti,
+        cb_dist: kgmnal_dist
+};
diff --git a/lnet/klnds/gmlnd/gmnal.c b/lnet/klnds/gmlnd/gmnal.c
new file mode 100644
index 0000000..ceeea2a
--- /dev/null
+++ b/lnet/klnds/gmlnd/gmnal.c
@@ -0,0 +1,284 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Based on ksocknal and qswnal
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ * Author: Robert Read
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "gmnal.h"
+
+ptl_handle_ni_t kgmnal_ni;
+nal_t kgmnal_api;
+
+kgmnal_data_t kgmnal_data;
+int gmnal_debug = 0;
+
+/* NOTE(review): carries the QSW NAL's symbol name although it describes the
+ * GM NAL; left unchanged because other files may refer to it by this name. */
+kpr_nal_interface_t kqswnal_router_interface = {
+        kprni_nalid: GMNAL,
+        kprni_arg: NULL,
+        kprni_fwd: kgmnal_fwd_packet,
+};
+
+/* API -> library crossing: pass request 'id' with its argument and return
+ * blocks to lib_dispatch().  Always returns PTL_OK. */
+static int kgmnal_forward(nal_t *nal,
+                          int id,
+                          void *args, size_t args_len,
+                          void *ret, size_t ret_len)
+{
+        kgmnal_data_t *k = nal->nal_data;
+        nal_cb_t *nal_cb = k->kgm_cb;
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (k == &kgmnal_data);
+        LASSERT (nal_cb == &kgmnal_lib);
+
+        lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */
+        return PTL_OK;
+}
+
+/* API lock hook: delegates to the library's cb_cli callback */
+static void kgmnal_lock(nal_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *k = nal->nal_data;
+        nal_cb_t *nal_cb = k->kgm_cb;
+
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (k == &kgmnal_data);
+        LASSERT (nal_cb == &kgmnal_lib);
+
+        nal_cb->cb_cli(nal_cb,flags);
+}
+
+/* API unlock hook: delegates to the library's cb_sti callback */
+static void kgmnal_unlock(nal_t *nal, unsigned long *flags)
+{
+        kgmnal_data_t *k = nal->nal_data;
+        nal_cb_t *nal_cb = k->kgm_cb;
+
+
+        LASSERT (nal == &kgmnal_api);
+        LASSERT (k == &kgmnal_data);
+        LASSERT (nal_cb == &kgmnal_lib);
+
+        nal_cb->cb_sti(nal_cb,flags);
+}
+
+/* API shutdown hook: nothing to do here; returns 0 */
+static int kgmnal_shutdown(nal_t *nal, int ni)
+{
+        LASSERT (nal == &kgmnal_api);
+        return 0;
+}
+
+/* API yield hook: give up the CPU if a reschedule is pending */
+static void kgmnal_yield( nal_t *nal )
+{
+        LASSERT (nal == &kgmnal_api);
+
+        if (current->need_resched)
+                schedule();
+        return;
+}
+
+/* Allocate an rx descriptor and link it onto data->kgm_list.
+ * Returns the descriptor, or NULL if the allocation failed.
+ * 'ndx' is currently unused. */
+kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *data,int ndx)
+{
+        kgmnal_rx_t *conn;
+
+        PORTAL_ALLOC(conn, sizeof(kgmnal_rx_t));
+        /* Check for out of mem here */
+        if (conn==NULL) {
+                printk("kgm_add_recv: memory alloc failed\n");
+                return NULL;
+        }
+
+        list_add(&conn->krx_item,(struct list_head *)&data->kgm_list);
+        //        conn->ndx=ndx;
+        //        conn->len=conn->ptlhdr_copied=0;
+        //        conn->loopback=0;
+        return conn;
+}
+
+/* PtlNIInit() startup callback: initialise the portals library with this
+ * node's GM node id and the highest node id in use, and return the API
+ * object.  'interface' and 'requested_pid' are not used. */
+static nal_t *kgmnal_init(int interface, ptl_pt_index_t ptl_size,
+                          ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
+{
+        unsigned int nnids;
+
+        gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids);
+
+        CDEBUG(D_NET, "calling lib_init with nid 0x%Lx of %d\n",
+               kgmnal_data.kgm_nid, nnids);
+        lib_init(&kgmnal_lib, kgmnal_data.kgm_nid, 0, nnids,ptl_size, ac_size);
+        return &kgmnal_api;
+}
+
+/* Module unload: tear down the network interface, close the GM port and
+ * free the tx descriptors and every rx descriptor on kgm_list. */
+static void __exit
+kgmnal_finalize(void)
+{
+        PORTAL_SYMBOL_UNREGISTER (kgmnal_ni);
+        PtlNIFini(kgmnal_ni);
+        /* lib_fini() tears down the library side, i.e. the nal_cb_t that
+         * lib_init() was given in kgmnal_init(), not the API object */
+        lib_fini(&kgmnal_lib);
+
+        if (kgmnal_data.kgm_port) {
+                gm_close(kgmnal_data.kgm_port);
+        }
+
+        /* FIXME: free dma buffers */
+        /* FIXME: kill receiver thread */
+
+        /* size must match the PORTAL_ALLOC in kgmnal_initialize() */
+        PORTAL_FREE (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
+
+        /* Pop entries off the head until the list is empty; walking the list
+         * with list_for_each() while deleting skipped every other entry. */
+        while (!list_empty(&kgmnal_data.kgm_list)) {
+                kgmnal_rx_t *conn = list_entry(kgmnal_data.kgm_list.next,
+                                               kgmnal_rx_t, krx_item);
+
+                CDEBUG(D_IOCTL, "freeing conn %p\n",conn);
+                list_del(&conn->krx_item);
+                PORTAL_FREE(conn, sizeof(*conn));
+        }
+
+        CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory));
+
+        return;
+}
+
+/* Module load: set up the API hooks and global state, open the GM port,
+ * post small and large receive buffers, bring up the portals network
+ * interface and start the receiver thread.
+ * Returns 0 on success and a negative value on failure; on failure the tx
+ * descriptors and the GM port acquired so far are released again. */
+static int __init
+kgmnal_initialize(void)
+{
+        int rc;
+        int ntok;
+        unsigned long sizemask;
+        unsigned int nid;
+
+        CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
+
+        kgmnal_api.forward = kgmnal_forward;
+        kgmnal_api.shutdown = kgmnal_shutdown;
+        kgmnal_api.yield = kgmnal_yield;
+        kgmnal_api.validate = NULL;         /* our api validate is a NOOP */
+        kgmnal_api.lock= kgmnal_lock;
+        kgmnal_api.unlock= kgmnal_unlock;
+        kgmnal_api.nal_data = &kgmnal_data;
+
+        kgmnal_lib.nal_data = &kgmnal_data;
+
+        memset(&kgmnal_data, 0, sizeof(kgmnal_data));
+
+        INIT_LIST_HEAD(&kgmnal_data.kgm_list);
+        kgmnal_data.kgm_cb = &kgmnal_lib;
+
+        /* Allocate transmit descriptors */
+        PORTAL_ALLOC (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
+        if (kgmnal_data.kgm_trans==NULL) {
+                printk("kgmnal: init: failed to allocate transmit "
+                       "descriptors\n");
+                return -1;
+        }
+        memset(kgmnal_data.kgm_trans,-1,sizeof(kgmnal_tx_t)*(TXMSGS));
+
+        spin_lock_init(&kgmnal_data.kgm_dispatch_lock);
+        spin_lock_init(&kgmnal_data.kgm_update_lock);
+        spin_lock_init(&kgmnal_data.kgm_send_lock);
+
+        /* Do the receiver and xmtr allocation */
+
+        rc = gm_init();
+        if (rc != GM_SUCCESS) {
+                CERROR("gm_init failed: %d\n", rc);
+                rc = -1;
+                goto failed_trans;
+        }
+
+        rc = gm_open(&kgmnal_data.kgm_port, 0 , KGM_PORT_NUM, KGM_HOSTNAME,
+                     GM_API_VERSION_1_1);
+        if (rc != GM_SUCCESS) {
+                CERROR("gm_open failed: %d\n", rc);
+                kgmnal_data.kgm_port = NULL;
+                rc = -1;
+                goto failed_gm;
+        }
+        gm_get_node_id(kgmnal_data.kgm_port, &nid);
+        kgmnal_data.kgm_nid = nid;
+        /* Allocate 2 different sizes of buffers. For now, use half
+           the tokens for each. */
+        ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
+        CDEBUG(D_NET, "gmnal_init: creating %d large %d byte recv buffers\n",
+               ntok, MSG_LEN_LARGE);
+        while (ntok-- > 0) {
+                void * buffer = gm_dma_malloc(kgmnal_data.kgm_port,
+                                              MSG_LEN_LARGE);
+                if (buffer == NULL) {
+                        CERROR("gm_dma_malloc of large recv buffer failed\n");
+                        rc = -ENOMEM;
+                        goto failed_port;
+                }
+                CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d "
+                       "pri %d\n ", kgmnal_data.kgm_port, buffer,
+                       MSG_LEN_LARGE, MSG_SIZE_LARGE, GM_LOW_PRIORITY);
+
+                gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer,
+                                          MSG_SIZE_LARGE, GM_LOW_PRIORITY);
+        }
+
+        ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
+        CDEBUG(D_NET, "gmnal_init: creating %d small %d byte recv buffers\n",
+               ntok, MSG_LEN_SMALL);
+        while (ntok-- > 0) {
+                void * buffer = gm_dma_malloc(kgmnal_data.kgm_port,
+                                              MSG_LEN_SMALL);
+                if (buffer == NULL) {
+                        CERROR("gm_dma_malloc of small recv buffer failed\n");
+                        rc = -ENOMEM;
+                        goto failed_port;
+                }
+                CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d "
+                       "pri %d\n ", kgmnal_data.kgm_port, buffer,
+                       MSG_LEN_SMALL, MSG_SIZE_SMALL, GM_LOW_PRIORITY);
+
+                gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer,
+                                          MSG_SIZE_SMALL, GM_LOW_PRIORITY);
+
+        }
+        sizemask = (1 << MSG_SIZE_LARGE) | (1 << MSG_SIZE_SMALL);
+        CDEBUG(D_NET, "gm_set_acceptable_sizes port %p pri %d mask 0x%lx\n",
+               kgmnal_data.kgm_port, GM_LOW_PRIORITY, sizemask);
+        gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_LOW_PRIORITY,
+                                sizemask);
+        gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_HIGH_PRIORITY, 0);
+
+        /* Initialize Network Interface */
+        rc = PtlNIInit(kgmnal_init, 32, 4, 0, &kgmnal_ni);
+        if (rc) {
+                CERROR("PtlNIInit failed %d\n", rc);
+                rc = -ENOMEM;
+                goto failed_port;
+        }
+
+        /* Start receiver thread */
+        rc = kernel_thread(kgmnal_recv_thread, &kgmnal_data, 0);
+        if (rc < 0) {
+                CERROR("failed to start receiver thread: %d\n", rc);
+                /* undo PtlNIInit the same way kgmnal_finalize() does */
+                PtlNIFini(kgmnal_ni);
+                lib_fini(&kgmnal_lib);
+                goto failed_port;
+        }
+
+        PORTAL_SYMBOL_REGISTER(kgmnal_ni);
+
+        kgmnal_data.kgm_init = 1;
+
+        return 0;
+
+ failed_port:
+        /* NOTE(review): receive buffers already handed to GM are not
+         * reclaimed here, as in kgmnal_finalize() ("FIXME: free dma
+         * buffers"); presumably gm_close() releases them -- confirm. */
+        gm_close(kgmnal_data.kgm_port);
+        kgmnal_data.kgm_port = NULL;
+ failed_gm:
+        gm_finalize();
+ failed_trans:
+        PORTAL_FREE (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
+        kgmnal_data.kgm_trans = NULL;
+        return rc;
+}
+
+MODULE_AUTHOR("Robert Read ");
+MODULE_DESCRIPTION("Kernel Myrinet GM NAL v0.1");
+MODULE_LICENSE("GPL");
+
+module_init (kgmnal_initialize);
+module_exit (kgmnal_finalize);
+
+EXPORT_SYMBOL (kgmnal_ni);
diff --git a/lnet/klnds/qswlnd/.cvsignore b/lnet/klnds/qswlnd/.cvsignore
new file mode 100644
index 0000000..e995588
--- /dev/null
+++ b/lnet/klnds/qswlnd/.cvsignore
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lnet/klnds/qswlnd/Makefile.am b/lnet/klnds/qswlnd/Makefile.am
new file mode 100644
index 0000000..3eb4dd5
--- /dev/null
+++ b/lnet/klnds/qswlnd/Makefile.am
@@ -0,0 +1,17 @@
+# Copyright (C) 2001 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+include ../../Rules.linux
+
+MODULE = kqswnal
+modulenet_DATA = kqswnal.o
+EXTRA_PROGRAMS = kqswnal
+
+
+#CFLAGS:= @KCFLAGS@
+#CPPFLAGS:=@KCPPFLAGS@
+DEFS =
+CPPFLAGS=@CPPFLAGS@ @with_quadrics@
+kqswnal_SOURCES = qswnal.c qswnal_cb.c qswnal.h
diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c
new file mode 100644
index 0000000..1a8fb74
--- /dev/null
+++ b/lnet/klnds/qswlnd/qswlnd.c
@@ -0,0 +1,608 @@
+/*
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ * Author: Eric Barton
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * W.
Marcus Miller - Based on ksocknal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "qswnal.h"
+
+ptl_handle_ni_t kqswnal_ni;
+nal_t kqswnal_api;
+kqswnal_data_t kqswnal_data;
+
+kpr_nal_interface_t kqswnal_router_interface = {
+        kprni_nalid: QSWNAL,
+        kprni_arg: NULL,
+        kprni_fwd: kqswnal_fwd_packet,
+};
+
+
+/* API -> library crossing: pass request 'id' with its argument and return
+ * blocks to lib_dispatch().  Always returns PTL_OK. */
+static int
+kqswnal_forward(nal_t *nal,
+                int id,
+                void *args, size_t args_len,
+                void *ret, size_t ret_len)
+{
+        kqswnal_data_t *k = nal->nal_data;
+        nal_cb_t *nal_cb = k->kqn_cb;
+
+        LASSERT (nal == &kqswnal_api);
+        LASSERT (k == &kqswnal_data);
+        LASSERT (nal_cb == &kqswnal_lib);
+
+        lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */
+        return (PTL_OK);
+}
+
+/* API lock hook: delegates to the library's cb_cli callback */
+static void
+kqswnal_lock (nal_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *k = nal->nal_data;
+        nal_cb_t *nal_cb = k->kqn_cb;
+
+        LASSERT (nal == &kqswnal_api);
+        LASSERT (k == &kqswnal_data);
+        LASSERT (nal_cb == &kqswnal_lib);
+
+        nal_cb->cb_cli(nal_cb,flags);
+}
+
+/* API unlock hook: delegates to the library's cb_sti callback */
+static void
+kqswnal_unlock(nal_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *k = nal->nal_data;
+        nal_cb_t *nal_cb = k->kqn_cb;
+
+        LASSERT (nal == &kqswnal_api);
+        LASSERT (k == &kqswnal_data);
+        LASSERT (nal_cb == &kqswnal_lib);
+
+        nal_cb->cb_sti(nal_cb,flags);
+}
+
+/* API shutdown hook: only logs; returns 0 */
+static int
+kqswnal_shutdown(nal_t *nal, int ni)
+{
+        CDEBUG (D_NET, "shutdown\n");
+
+        LASSERT (nal == &kqswnal_api);
+        return (0);
+}
+
+/* API yield hook: give up the CPU if a reschedule is pending */
+static void
+kqswnal_yield( nal_t *nal )
+{
+        CDEBUG (D_NET, "yield\n");
+
+        if (current->need_resched)
+                schedule();
+        return;
+}
+
+/* PtlNIInit() startup callback: initialise the portals library with this
+ * node's NID (derived from its elan id) and the number of elan nodes, and
+ * return the API object.  'interface' and 'requested_pid' are not used. */
+static nal_t *
+kqswnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
+             ptl_pid_t requested_pid)
+{
+        ptl_nid_t mynid = kqswnal_elanid2nid (kqswnal_data.kqn_elanid);
+        int nnids = kqswnal_data.kqn_nnodes;
+
+        CDEBUG(D_NET, "calling lib_init with nid "LPX64" of %d\n", mynid, nnids);
+
+        lib_init(&kqswnal_lib, mynid, 0, nnids, ptl_size, ac_size);
+
+        return (&kqswnal_api);
+}
+
+/* NAL command handler registered with kportal_nal_register().
+ * NAL_CMD_REGISTER_MYNID records the offset between the requested NID and
+ * the local elan id and installs the NID in the library; returns 0.
+ * Any other command returns -EINVAL. */
+int
+kqswnal_cmd (struct portal_ioctl_data *data, void *private)
+{
+        LASSERT (data != NULL);
+
+        switch (data->ioc_nal_cmd) {
+        case NAL_CMD_REGISTER_MYNID:
+                CDEBUG (D_IOCTL, "setting NID offset to "LPX64" (was "LPX64")\n",
+                        data->ioc_nid - kqswnal_data.kqn_elanid,
+                        kqswnal_data.kqn_nid_offset);
+                kqswnal_data.kqn_nid_offset =
+                        data->ioc_nid - kqswnal_data.kqn_elanid;
+                kqswnal_lib.ni.nid = data->ioc_nid;
+                return (0);
+
+        default:
+                return (-EINVAL);
+        }
+}
+
+/* Tear the NAL down from whatever stage kqn_init records.  Used both as the
+ * module exit routine and as the error path of kqswnal_initialise(), so
+ * every resource is checked before it is released.
+ * NOTE(review): the command interface registered with
+ * kportal_nal_register() is not unregistered here -- confirm whether a
+ * matching unregister call is needed. */
+void __exit
+kqswnal_finalise (void)
+{
+        switch (kqswnal_data.kqn_init)
+        {
+        default:
+                LASSERT (0);
+
+        case KQN_INIT_ALL:
+                PORTAL_SYMBOL_UNREGISTER (kqswnal_ni);
+                /* fall through */
+
+        case KQN_INIT_PTL:
+                PtlNIFini (kqswnal_ni);
+                lib_fini (&kqswnal_lib);
+                /* fall through */
+
+        case KQN_INIT_DATA:
+                break;
+
+        case KQN_INIT_NOTHING:
+                return;
+        }
+
+        /**********************************************************************/
+        /* Make router stop her calling me and fail any more call-ins */
+        kpr_shutdown (&kqswnal_data.kqn_router);
+
+        /**********************************************************************/
+        /* flag threads to terminate, wake them and wait for them to die */
+
+        kqswnal_data.kqn_shuttingdown = 1;
+        wake_up_all (&kqswnal_data.kqn_sched_waitq);
+
+        while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) {
+                CDEBUG(D_NET, "waiting for %d threads to terminate\n",
+                       atomic_read (&kqswnal_data.kqn_nthreads));
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (HZ);
+        }
+
+        /**********************************************************************/
+        /* close elan comms */
+
+        if (kqswnal_data.kqn_eprx_small != NULL)
+                ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small);
+
+        if (kqswnal_data.kqn_eprx_large != NULL)
+                ep_remove_large_rcvr (kqswnal_data.kqn_eprx_large);
+
+        if (kqswnal_data.kqn_eptx != NULL)
+                ep_free_large_xmtr (kqswnal_data.kqn_eptx);
+
+        /**********************************************************************/
+        /* No more threads.  No more portals, router or comms callbacks!
+         * I control the horizontals and the verticals...
+         */
+
+        /**********************************************************************/
+        /* Complete any blocked forwarding packets with error
+         */
+
+        while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq))
+        {
+                kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
+                                                  kpr_fwd_desc_t, kprfd_list);
+                list_del (&fwd->kprfd_list);
+                kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+        }
+
+        while (!list_empty (&kqswnal_data.kqn_delayedfwds))
+        {
+                kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_delayedfwds.next,
+                                                  kpr_fwd_desc_t, kprfd_list);
+                list_del (&fwd->kprfd_list);
+                kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+        }
+
+        /**********************************************************************/
+        /* Wait for router to complete any packets I sent her
+         */
+
+        kpr_deregister (&kqswnal_data.kqn_router);
+
+
+        /**********************************************************************/
+        /* Unmap message buffers and free all descriptors and buffers
+         */
+
+        if (kqswnal_data.kqn_eprxdmahandle != NULL)
+        {
+                elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                                  kqswnal_data.kqn_eprxdmahandle, 0,
+                                  KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
+                                  KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE);
+
+                elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+                                  kqswnal_data.kqn_eprxdmahandle);
+        }
+
+        if (kqswnal_data.kqn_eptxdmahandle != NULL)
+        {
+                elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                                  kqswnal_data.kqn_eptxdmahandle, 0,
+                                  KQSW_NTXMSGPAGES * (KQSW_NTXMSGS +
+                                                      KQSW_NNBLK_TXMSGS));
+
+                elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+                                  kqswnal_data.kqn_eptxdmahandle);
+        }
+
+        if (kqswnal_data.kqn_txds != NULL)
+        {
+                int i;
+
+                for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++)
+                {
+                        kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
+
+                        /* descriptors past a failed allocation are still
+                         * zeroed, so their buffer pointer is NULL */
+                        if (ktx->ktx_buffer != NULL)
+                                PORTAL_FREE(ktx->ktx_buffer,
+                                            KQSW_TX_BUFFER_SIZE);
+                }
+
+                PORTAL_FREE(kqswnal_data.kqn_txds,
+                            sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS +
+                                                     KQSW_NNBLK_TXMSGS));
+        }
+
+        if (kqswnal_data.kqn_rxds != NULL)
+        {
+                int i;
+                int j;
+
+                for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+                {
+                        kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+                        for (j = 0; j < krx->krx_npages; j++)
+                                if (krx->krx_pages[j] != NULL)
+                                        __free_page (krx->krx_pages[j]);
+                }
+
+                PORTAL_FREE(kqswnal_data.kqn_rxds,
+                            sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL +
+                                                    KQSW_NRXMSGS_LARGE));
+        }
+
+        /* resets flags, pointers to NULL etc */
+        memset(&kqswnal_data, 0, sizeof (kqswnal_data));
+
+        CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory));
+
+        printk (KERN_INFO "Routing QSW NAL unloaded (final mem %d)\n",
+                atomic_read(&portal_kmemory));
+}
+
+/* Module load: initialise global state, claim the first Elan device, its
+ * transmitter and receivers, reserve and map DMA space for the tx and rx
+ * buffers, bring up the portals network interface, post the receives, start
+ * one scheduler thread per CPU, and register with the router and the
+ * command interface.
+ * Returns 0 on success.  On failure everything set up so far is released
+ * through kqswnal_finalise() and a negative errno is returned. */
+static int __init
+kqswnal_initialise (void)
+{
+        ELAN3_DMA_REQUEST dmareq;
+        int               rc;
+        int               i;
+        int               elan_page_idx;
+        int               pkmem = atomic_read(&portal_kmemory);
+
+        LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING);
+
+        CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory));
+
+        kqswnal_api.forward  = kqswnal_forward;
+        kqswnal_api.shutdown = kqswnal_shutdown;
+        kqswnal_api.yield    = kqswnal_yield;
+        kqswnal_api.validate = NULL;         /* our api validate is a NOOP */
+        kqswnal_api.lock     = kqswnal_lock;
+        kqswnal_api.unlock   = kqswnal_unlock;
+        kqswnal_api.nal_data = &kqswnal_data;
+
+        kqswnal_lib.nal_data = &kqswnal_data;
+
+        /* ensure all pointers NULL etc */
+        memset (&kqswnal_data, 0, sizeof (kqswnal_data));
+
+        kqswnal_data.kqn_cb = &kqswnal_lib;
+
+        INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds);
+        INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds);
+        spin_lock_init (&kqswnal_data.kqn_idletxd_lock);
+        init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq);
+        INIT_LIST_HEAD (&kqswnal_data.kqn_idletxd_fwdq);
+
+        INIT_LIST_HEAD (&kqswnal_data.kqn_delayedfwds);
+        INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds);
+        INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds);
+
+        spin_lock_init (&kqswnal_data.kqn_sched_lock);
+        init_waitqueue_head (&kqswnal_data.kqn_sched_waitq);
+
+        spin_lock_init (&kqswnal_data.kqn_statelock);
+
+        /* pointers/lists/locks initialised */
+        kqswnal_data.kqn_init = KQN_INIT_DATA;
+
+        /**********************************************************************/
+        /* Find the first Elan device */
+
+        kqswnal_data.kqn_epdev = ep_device (0);
+        if (kqswnal_data.kqn_epdev == NULL)
+        {
+                CERROR ("Can't get elan device 0\n");
+                return (-ENOMEM);
+        }
+
+        kqswnal_data.kqn_nid_offset = 0;
+        kqswnal_data.kqn_nnodes     = ep_numnodes (kqswnal_data.kqn_epdev);
+        kqswnal_data.kqn_elanid     = ep_nodeid (kqswnal_data.kqn_epdev);
+
+        /**********************************************************************/
+        /* Get the transmitter */
+
+        kqswnal_data.kqn_eptx = ep_alloc_large_xmtr (kqswnal_data.kqn_epdev);
+        if (kqswnal_data.kqn_eptx == NULL)
+        {
+                CERROR ("Can't allocate transmitter\n");
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        /**********************************************************************/
+        /* Get the receivers */
+
+        kqswnal_data.kqn_eprx_small = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
+                                                             EP_SVC_LARGE_PORTALS_SMALL,
+                                                             KQSW_EP_ENVELOPES_SMALL);
+        if (kqswnal_data.kqn_eprx_small == NULL)
+        {
+                CERROR ("Can't install small msg receiver\n");
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        kqswnal_data.kqn_eprx_large = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
+                                                             EP_SVC_LARGE_PORTALS_LARGE,
+                                                             KQSW_EP_ENVELOPES_LARGE);
+        if (kqswnal_data.kqn_eprx_large == NULL)
+        {
+                CERROR ("Can't install large msg receiver\n");
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        /**********************************************************************/
+        /* Reserve Elan address space for transmit buffers */
+
+        dmareq.Waitfn   = DDI_DMA_SLEEP;
+        dmareq.ElanAddr = (E3_Addr) 0;
+        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
+        dmareq.Perm     = ELAN_PERM_REMOTEREAD;
+
+        rc = elan3_dma_reserve(kqswnal_data.kqn_epdev->DmaState,
+                               KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS),
+                               &dmareq, &kqswnal_data.kqn_eptxdmahandle);
+        if (rc != DDI_SUCCESS)
+        {
+                /* this is the transmit reservation; the rx one follows */
+                CERROR ("Can't reserve tx dma space\n");
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        /**********************************************************************/
+        /* Reserve Elan address space for receive buffers */
+
+        dmareq.Waitfn   = DDI_DMA_SLEEP;
+        dmareq.ElanAddr = (E3_Addr) 0;
+        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
+        dmareq.Perm     = ELAN_PERM_REMOTEWRITE;
+
+        rc = elan3_dma_reserve (kqswnal_data.kqn_epdev->DmaState,
+                                KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
+                                KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE,
+                                &dmareq, &kqswnal_data.kqn_eprxdmahandle);
+        if (rc != DDI_SUCCESS)
+        {
+                CERROR ("Can't reserve rx dma space\n");
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        /**********************************************************************/
+        /* Allocate/Initialise transmit descriptors */
+
+        PORTAL_ALLOC(kqswnal_data.kqn_txds,
+                     sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+        if (kqswnal_data.kqn_txds == NULL)
+        {
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        /* clear flags, null pointers etc */
+        memset(kqswnal_data.kqn_txds, 0,
+               sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+        for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++)
+        {
+                int           premapped_pages;
+                kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
+                int           basepage = i * KQSW_NTXMSGPAGES;
+
+                PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
+                if (ktx->ktx_buffer == NULL)
+                {
+                        kqswnal_finalise ();
+                        return (-ENOMEM);
+                }
+
+                /* Map pre-allocated buffer NOW, to save latency on transmit */
+                premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer,
+                                                        KQSW_TX_BUFFER_SIZE);
+
+                elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                       kqswnal_data.kqn_eptxdmahandle,
+                                       ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE,
+                                       basepage, &ktx->ktx_ebuffer);
+
+                ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */
+                ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */
+
+                /* the first KQSW_NTXMSGS descriptors go on the normal idle
+                 * list, the remainder on the non-blocking reserve list */
+                if (i < KQSW_NTXMSGS)
+                        ktx->ktx_idle = &kqswnal_data.kqn_idletxds;
+                else
+                        ktx->ktx_idle = &kqswnal_data.kqn_nblk_idletxds;
+
+                list_add_tail (&ktx->ktx_list, ktx->ktx_idle);
+        }
+
+        /**********************************************************************/
+        /* Allocate/Initialise receive descriptors */
+
+        PORTAL_ALLOC (kqswnal_data.kqn_rxds,
+                      sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE));
+        if (kqswnal_data.kqn_rxds == NULL)
+        {
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */
+               sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE));
+
+        elan_page_idx = 0;
+        for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+        {
+                E3_Addr       elanaddr;
+                int           j;
+                kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+                if (i < KQSW_NRXMSGS_SMALL)
+                {
+                        krx->krx_npages = KQSW_NRXMSGPAGES_SMALL;
+                        krx->krx_eprx   = kqswnal_data.kqn_eprx_small;
+                }
+                else
+                {
+                        krx->krx_npages = KQSW_NRXMSGPAGES_LARGE;
+                        krx->krx_eprx   = kqswnal_data.kqn_eprx_large;
+                }
+
+                LASSERT (krx->krx_npages > 0);
+                for (j = 0; j < krx->krx_npages; j++)
+                {
+                        krx->krx_pages[j] = alloc_page(GFP_KERNEL);
+                        if (krx->krx_pages[j] == NULL)
+                        {
+                                kqswnal_finalise ();
+                                return (-ENOMEM);
+                        }
+
+                        LASSERT(page_address(krx->krx_pages[j]) != NULL);
+
+                        elan3_dvma_kaddr_load(kqswnal_data.kqn_epdev->DmaState,
+                                              kqswnal_data.kqn_eprxdmahandle,
+                                              page_address(krx->krx_pages[j]),
+                                              PAGE_SIZE, elan_page_idx,
+                                              &elanaddr);
+                        elan_page_idx++;
+
+                        if (j == 0)
+                                krx->krx_elanaddr = elanaddr;
+
+                        /* NB we assume a contiguous  */
+                        LASSERT (elanaddr == krx->krx_elanaddr + j * PAGE_SIZE);
+                }
+        }
+        LASSERT (elan_page_idx ==
+                 (KQSW_NRXMSGS_SMALL * KQSW_NRXMSGPAGES_SMALL) +
+                 (KQSW_NRXMSGS_LARGE * KQSW_NRXMSGPAGES_LARGE));
+
+        /**********************************************************************/
+        /* Network interface ready to initialise */
+
+        rc = PtlNIInit(kqswnal_init, 32, 4, 0, &kqswnal_ni);
+        if (rc != 0)
+        {
+                CERROR ("PtlNIInit failed %d\n", rc);
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        kqswnal_data.kqn_init = KQN_INIT_PTL;
+
+        /**********************************************************************/
+        /* Queue receives, now that it's OK to run their completion callbacks */
+
+        for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+        {
+                kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+                /* NB this enqueue can allocate/sleep (attr == 0) */
+                rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
+                                      krx->krx_elanaddr,
+                                      krx->krx_npages * PAGE_SIZE, 0);
+                if (rc != 0)
+                {
+                        CERROR ("failed ep_queue_receive %d\n", rc);
+                        kqswnal_finalise ();
+                        return (-ENOMEM);
+                }
+        }
+
+        /**********************************************************************/
+        /* Spawn scheduling threads */
+        for (i = 0; i < smp_num_cpus; i++)
+        {
+                rc = kqswnal_thread_start (kqswnal_scheduler, NULL);
+                if (rc != 0)
+                {
+                        CERROR ("failed to spawn scheduling thread: %d\n", rc);
+                        kqswnal_finalise ();
+                        return (rc);
+                }
+        }
+
+        /**********************************************************************/
+        /* Connect to the router */
+        rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface);
+        /* failing to register is not fatal: the NAL just does not route, so
+         * only report it when it actually happened */
+        if (rc != 0)
+                CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n",rc);
+
+        rc = kportal_nal_register (QSWNAL, &kqswnal_cmd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                kqswnal_finalise ();
+                return (rc);
+        }
+
+        PORTAL_SYMBOL_REGISTER(kqswnal_ni);
+        kqswnal_data.kqn_init = KQN_INIT_ALL;
+
+        printk(KERN_INFO "Routing QSW NAL loaded on node %d of %d "
+               "(Routing %s, initial mem %d)\n",
+               kqswnal_data.kqn_elanid, kqswnal_data.kqn_nnodes,
+               kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled",
+               pkmem);
+
+        return (0);
+}
+
+
+MODULE_AUTHOR("W. Marcus Miller ");
+MODULE_DESCRIPTION("Kernel Quadrics Switch NAL v1.00");
+MODULE_LICENSE("GPL");
+
+module_init (kqswnal_initialise);
+module_exit (kqswnal_finalise);
+
+EXPORT_SYMBOL (kqswnal_ni);
diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h
new file mode 100644
index 0000000..88ab74f
--- /dev/null
+++ b/lnet/klnds/qswlnd/qswlnd.h
@@ -0,0 +1,270 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines.
+ * + */ + +#ifndef _QSWNAL_H +#define _QSWNAL_H +#define EXPORT_SYMTAB + +#ifdef PROPRIETARY_ELAN +# include +#else +# include +#endif + +#undef printf /* nasty QSW #define */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_QSWNAL + +#include +#include +#include + +#define KQSW_CHECKSUM 0 +#if KQSW_CHECKSUM +typedef unsigned long kqsw_csum_t; +#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t)) +#else +#define KQSW_CSUM_SIZE 0 +#endif +#define KQSW_HDR_SIZE (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE) + +/* + * Elan NAL + */ +#define EP_SVC_LARGE_PORTALS_SMALL (0x10) /* Portals over elan port number (large payloads) */ +#define EP_SVC_LARGE_PORTALS_LARGE (0x11) /* Portals over elan port number (small payloads) */ +/* NB small/large message sizes are GLOBAL constants */ + +/* + * Performance Tuning defines + * NB no mention of PAGE_SIZE for interoperability + */ +#if PTL_LARGE_MTU +# define KQSW_MAXPAYLOAD (256<<10) /* biggest message this NAL will cope with */ +#else +# define KQSW_MAXPAYLOAD (64<<10) /* biggest message this NAL will cope with */ +#endif + +#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */ + +#define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ + +#define KQSW_NTXMSGS 8 /* # normal transmit messages */ +#define KQSW_NNBLK_TXMSGS 128 /* # reserved transmit messages if can't block */ + +#define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ +#define KQSW_EP_ENVELOPES_LARGE 128 /* # large ep envelopes */ + +#define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ +#define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ + +#define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield 
*/ + +/* + * derived constants + */ + +#define KQSW_TX_BUFFER_SIZE (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG) +/* The pre-allocated tx buffer (hdr + small payload) */ + +#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1) +/* Reserve elan address space for pre-allocated and pre-mapped transmit + * buffer and a full payload too. Extra pages allow for page alignment */ + +#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD)) +/* receive hdr/payload always contiguous and page aligned */ +#define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE) + +#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD)) +/* receive hdr/payload always contiguous and page aligned */ +#define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE) +/* biggest complete packet we can receive (or transmit) */ + + +typedef struct +{ + struct list_head krx_list; /* enqueue -> thread */ + EP_RCVR *krx_eprx; /* port to post receives to */ + EP_RXD *krx_rxd; /* receive descriptor (for repost) */ + E3_Addr krx_elanaddr; /* Elan address of buffer (contiguous in elan vm) */ + int krx_npages; /* # pages in receive buffer */ + int krx_nob; /* Number Of Bytes received into buffer */ + kpr_fwd_desc_t krx_fwd; /* embedded forwarding descriptor */ + struct page *krx_pages[KQSW_NRXMSGPAGES_LARGE]; /* pages allocated */ + struct iovec krx_iov[KQSW_NRXMSGPAGES_LARGE]; /* iovec for forwarding */ +} kqswnal_rx_t; + +typedef struct +{ + struct list_head ktx_list; /* enqueue idle/delayed */ + struct list_head *ktx_idle; /* where to put when idle */ + char ktx_state; /* What I'm doing */ + uint32_t ktx_basepage; /* page offset in reserved elan tx vaddrs for mapping pages */ + int ktx_npages; /* pages reserved for mapping messages */ + int ktx_nmappedpages; /* # pages mapped for current message */ + EP_IOVEC ktx_iov[EP_MAXFRAG]; /* msg frags (elan vaddrs) */ + int ktx_niov; /* # message frags */ + int ktx_port; /* destination ep port */ 
+ ptl_nid_t ktx_nid; /* destination node */ + void *ktx_args[2]; /* completion passthru */ + E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */ + char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ +} kqswnal_tx_t; + +#define KTX_IDLE 0 /* MUST BE ZERO (so zeroed ktx is idle) */ +#define KTX_SENDING 1 /* local send */ +#define KTX_FORWARDING 2 /* routing a packet */ + +typedef struct +{ + char kqn_init; /* what's been initialised */ + char kqn_shuttingdown; /* I'm trying to shut down */ + atomic_t kqn_nthreads; /* # threads still running */ + + kqswnal_rx_t *kqn_rxds; /* all the receive descriptors */ + kqswnal_tx_t *kqn_txds; /* all the transmit descriptors */ + + struct list_head kqn_idletxds; /* transmit descriptors free to use */ + struct list_head kqn_nblk_idletxds; /* reserve of */ + spinlock_t kqn_idletxd_lock; /* serialise idle txd access */ + wait_queue_head_t kqn_idletxd_waitq; /* sender blocks here waiting for idle txd */ + struct list_head kqn_idletxd_fwdq; /* forwarded packets block here waiting for idle txd */ + + spinlock_t kqn_sched_lock; /* serialise packet schedulers */ + wait_queue_head_t kqn_sched_waitq; /* scheduler blocks here */ + + struct list_head kqn_readyrxds; /* rxds full of data */ + struct list_head kqn_delayedfwds; /* delayed forwards */ + struct list_head kqn_delayedtxds; /* delayed transmits */ + + spinlock_t kqn_statelock; /* cb_cli/cb_sti */ + nal_cb_t *kqn_cb; /* -> kqswnal_lib */ + EP_DEV *kqn_epdev; /* elan device */ + EP_XMTR *kqn_eptx; /* elan transmitter */ + EP_RCVR *kqn_eprx_small; /* elan receiver (small messages) */ + EP_RCVR *kqn_eprx_large; /* elan receiver (large messages) */ + ELAN3_DMA_HANDLE *kqn_eptxdmahandle; /* elan reserved tx vaddrs */ + ELAN3_DMA_HANDLE *kqn_eprxdmahandle; /* elan reserved rx vaddrs */ + kpr_router_t kqn_router; /* connection to Kernel Portals Router module */ + + ptl_nid_t kqn_nid_offset; /* this cluster's NID offset */ + int kqn_nnodes; /* this cluster's size 
*/ + int kqn_elanid; /* this nodes's elan ID */ +} kqswnal_data_t; + +/* kqn_init state */ +#define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */ +#define KQN_INIT_DATA 1 +#define KQN_INIT_PTL 2 +#define KQN_INIT_ALL 3 + +extern nal_cb_t kqswnal_lib; +extern nal_t kqswnal_api; +extern kqswnal_data_t kqswnal_data; + +extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg); +extern void kqswnal_rxhandler(EP_RXD *rxd); +extern int kqswnal_scheduler (void *); +extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); + +static inline ptl_nid_t +kqswnal_elanid2nid (int elanid) +{ + return (kqswnal_data.kqn_nid_offset + elanid); +} + +static inline int +kqswnal_nid2elanid (ptl_nid_t nid) +{ + /* not in this cluster? */ + if (nid < kqswnal_data.kqn_nid_offset || + nid >= kqswnal_data.kqn_nid_offset + kqswnal_data.kqn_nnodes) + return (-1); + + return (nid - kqswnal_data.kqn_nid_offset); +} + +static inline void +kqswnal_requeue_rx (kqswnal_rx_t *krx) +{ + ep_requeue_receive (krx->krx_rxd, kqswnal_rxhandler, krx, + krx->krx_elanaddr, krx->krx_npages * PAGE_SIZE); +} + +static inline int +kqswnal_pages_spanned (void *base, int nob) +{ + unsigned long first_page = ((unsigned long)base) >> PAGE_SHIFT; + unsigned long last_page = (((unsigned long)base) + (nob - 1)) >> PAGE_SHIFT; + + LASSERT (last_page >= first_page); /* can't wrap address space */ + return (last_page - first_page + 1); +} + +#if KQSW_CHECKSUM +static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob) +{ + unsigned char *ptr = (unsigned char *)base; + + while (nob-- > 0) + sum += *ptr++; + + return (sum); +} +#endif + +#endif /* _QSWNAL_H */ diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c new file mode 100644 index 0000000..3b47a25 --- /dev/null +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -0,0 +1,1239 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 
2002 Cluster File Systems, Inc. + * Author: Eric Barton + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * W. Marcus Miller - Based on ksocknal + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "qswnal.h" + +atomic_t kqswnal_packets_launched; +atomic_t kqswnal_packets_transmitted; +atomic_t kqswnal_packets_received; + + +/* + * LIB functions follow + * + */ +static int +kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, + size_t len) +{ + CDEBUG (D_NET, LPX64": reading "LPSZ" bytes from %p -> %p\n", + nal->ni.nid, len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + + return (0); +} + +static int +kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, + size_t len) +{ + CDEBUG (D_NET, LPX64": writing "LPSZ" bytes from %p -> %p\n", + nal->ni.nid, len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + + return (0); +} + +static void * +kqswnal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + return (buf); +} + +static void +kqswnal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + +static void +kqswnal_printf (nal_cb_t * nal, const char *fmt, ...) 
+{ + va_list ap; + char msg[256]; + + va_start (ap, fmt); + vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ + va_end (ap); + + msg[sizeof (msg) - 1] = 0; /* ensure terminated */ + + CDEBUG (D_NET, "%s", msg); +} + + +static void +kqswnal_cli(nal_cb_t *nal, unsigned long *flags) +{ + kqswnal_data_t *data= nal->nal_data; + + spin_lock_irqsave(&data->kqn_statelock, *flags); +} + + +static void +kqswnal_sti(nal_cb_t *nal, unsigned long *flags) +{ + kqswnal_data_t *data= nal->nal_data; + + spin_unlock_irqrestore(&data->kqn_statelock, *flags); +} + + +static int +kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + if (nid == nal->ni.nid) + *dist = 0; /* it's me */ + else if (kqswnal_nid2elanid (nid) >= 0) + *dist = 1; /* it's my peer */ + else + *dist = 2; /* via router */ + return (0); +} + +void +kqswnal_unmap_tx (kqswnal_tx_t *ktx) +{ + if (ktx->ktx_nmappedpages == 0) + return; + + CDEBUG (D_NET, "%p[%d] unloading pages %d for %d\n", + ktx, ktx->ktx_niov, ktx->ktx_basepage, ktx->ktx_nmappedpages); + + LASSERT (ktx->ktx_nmappedpages <= ktx->ktx_npages); + LASSERT (ktx->ktx_basepage + ktx->ktx_nmappedpages <= + kqswnal_data.kqn_eptxdmahandle->NumDvmaPages); + + elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle, + ktx->ktx_basepage, ktx->ktx_nmappedpages); + ktx->ktx_nmappedpages = 0; +} + +int +kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov) +{ + int nfrags = ktx->ktx_niov; + const int maxfrags = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]); + int nmapped = ktx->ktx_nmappedpages; + int maxmapped = ktx->ktx_npages; + uint32_t basepage = ktx->ktx_basepage + nmapped; + char *ptr; + + LASSERT (nmapped <= maxmapped); + LASSERT (nfrags <= maxfrags); + LASSERT (niov > 0); + LASSERT (nob > 0); + + do { + int fraglen = kiov->kiov_len; + + /* nob exactly spans the iovs */ + LASSERT (fraglen <= nob); + /* each frag fits in a page */ + LASSERT (kiov->kiov_offset + kiov->kiov_len <= 
PAGE_SIZE); + + nmapped++; + if (nmapped > maxmapped) { + CERROR("Can't map message in %d pages (max %d)\n", + nmapped, maxmapped); + return (-EMSGSIZE); + } + + if (nfrags == maxfrags) { + CERROR("Message too fragmented in Elan VM (max %d frags)\n", + maxfrags); + return (-EMSGSIZE); + } + + /* XXX this is really crap, but we'll have to kmap until + * EKC has a page (rather than vaddr) mapping interface */ + + ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + + CDEBUG(D_NET, + "%p[%d] loading %p for %d, page %d, %d total\n", + ktx, nfrags, ptr, fraglen, basepage, nmapped); + + elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle, + ptr, fraglen, + basepage, &ktx->ktx_iov[nfrags].Base); + + kunmap (kiov->kiov_page); + + /* keep in loop for failure case */ + ktx->ktx_nmappedpages = nmapped; + + if (nfrags > 0 && /* previous frag mapped */ + ktx->ktx_iov[nfrags].Base == /* contiguous with this one */ + (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len)) + /* just extend previous */ + ktx->ktx_iov[nfrags - 1].Len += fraglen; + else { + ktx->ktx_iov[nfrags].Len = fraglen; + nfrags++; /* new frag */ + } + + basepage++; + kiov++; + niov--; + nob -= fraglen; + + /* iov must not run out before end of data */ + LASSERT (nob == 0 || niov > 0); + + } while (nob > 0); + + ktx->ktx_niov = nfrags; + CDEBUG (D_NET, "%p got %d frags over %d pages\n", + ktx, ktx->ktx_niov, ktx->ktx_nmappedpages); + + return (0); +} + +int +kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov) +{ + int nfrags = ktx->ktx_niov; + const int maxfrags = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]); + int nmapped = ktx->ktx_nmappedpages; + int maxmapped = ktx->ktx_npages; + uint32_t basepage = ktx->ktx_basepage + nmapped; + + LASSERT (nmapped <= maxmapped); + LASSERT (nfrags <= maxfrags); + LASSERT (niov > 0); + LASSERT (nob > 0); + + do { + int fraglen = iov->iov_len; + long npages = kqswnal_pages_spanned (iov->iov_base, 
fraglen); + + /* nob exactly spans the iovs */ + LASSERT (fraglen <= nob); + + nmapped += npages; + if (nmapped > maxmapped) { + CERROR("Can't map message in %d pages (max %d)\n", + nmapped, maxmapped); + return (-EMSGSIZE); + } + + if (nfrags == maxfrags) { + CERROR("Message too fragmented in Elan VM (max %d frags)\n", + maxfrags); + return (-EMSGSIZE); + } + + CDEBUG(D_NET, + "%p[%d] loading %p for %d, pages %d for %ld, %d total\n", + ktx, nfrags, iov->iov_base, fraglen, basepage, npages, + nmapped); + + elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState, + kqswnal_data.kqn_eptxdmahandle, + iov->iov_base, fraglen, + basepage, &ktx->ktx_iov[nfrags].Base); + /* keep in loop for failure case */ + ktx->ktx_nmappedpages = nmapped; + + if (nfrags > 0 && /* previous frag mapped */ + ktx->ktx_iov[nfrags].Base == /* contiguous with this one */ + (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len)) + /* just extend previous */ + ktx->ktx_iov[nfrags - 1].Len += fraglen; + else { + ktx->ktx_iov[nfrags].Len = fraglen; + nfrags++; /* new frag */ + } + + basepage += npages; + iov++; + niov--; + nob -= fraglen; + + /* iov must not run out before end of data */ + LASSERT (nob == 0 || niov > 0); + + } while (nob > 0); + + ktx->ktx_niov = nfrags; + CDEBUG (D_NET, "%p got %d frags over %d pages\n", + ktx, ktx->ktx_niov, ktx->ktx_nmappedpages); + + return (0); +} + +void +kqswnal_put_idle_tx (kqswnal_tx_t *ktx) +{ + kpr_fwd_desc_t *fwd = NULL; + struct list_head *idle = ktx->ktx_idle; + unsigned long flags; + + kqswnal_unmap_tx (ktx); /* release temporary mappings */ + ktx->ktx_state = KTX_IDLE; + + spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); + + list_add (&ktx->ktx_list, idle); + + /* reserved for non-blocking tx */ + if (idle == &kqswnal_data.kqn_nblk_idletxds) { + spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + return; + } + + /* anything blocking for a tx descriptor? 
*/ + if (!list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */ + { + CDEBUG(D_NET,"wakeup fwd\n"); + + fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next, + kpr_fwd_desc_t, kprfd_list); + list_del (&fwd->kprfd_list); + } + + if (waitqueue_active (&kqswnal_data.kqn_idletxd_waitq)) /* process? */ + { + /* local sender waiting for tx desc */ + CDEBUG(D_NET,"wakeup process\n"); + wake_up (&kqswnal_data.kqn_idletxd_waitq); + } + + spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + + if (fwd == NULL) + return; + + /* schedule packet for forwarding again */ + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail (&fwd->kprfd_list, &kqswnal_data.kqn_delayedfwds); + if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) + wake_up (&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); +} + +kqswnal_tx_t * +kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) +{ + unsigned long flags; + kqswnal_tx_t *ktx = NULL; + + for (;;) { + spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); + + /* "normal" descriptor is free */ + if (!list_empty (&kqswnal_data.kqn_idletxds)) { + ktx = list_entry (kqswnal_data.kqn_idletxds.next, + kqswnal_tx_t, ktx_list); + list_del (&ktx->ktx_list); + break; + } + + /* "normal" descriptor pool is empty */ + + if (fwd != NULL) { /* forwarded packet => queue for idle txd */ + CDEBUG (D_NET, "blocked fwd [%p]\n", fwd); + list_add_tail (&fwd->kprfd_list, + &kqswnal_data.kqn_idletxd_fwdq); + break; + } + + /* doing a local transmit */ + if (!may_block) { + if (list_empty (&kqswnal_data.kqn_nblk_idletxds)) { + CERROR ("intr tx desc pool exhausted\n"); + break; + } + + ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next, + kqswnal_tx_t, ktx_list); + list_del (&ktx->ktx_list); + break; + } + + /* block for idle tx */ + + spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + + CDEBUG (D_NET, "blocking for tx desc\n"); + wait_event 
(kqswnal_data.kqn_idletxd_waitq, + !list_empty (&kqswnal_data.kqn_idletxds)); + } + + spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); + + /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */ + LASSERT (ktx == NULL || ktx->ktx_nmappedpages == 0); + return (ktx); +} + +void +kqswnal_tx_done (kqswnal_tx_t *ktx, int error) +{ + switch (ktx->ktx_state) { + case KTX_FORWARDING: /* router asked me to forward this packet */ + kpr_fwd_done (&kqswnal_data.kqn_router, + (kpr_fwd_desc_t *)ktx->ktx_args[0], error); + break; + + case KTX_SENDING: /* packet sourced locally */ + lib_finalize (&kqswnal_lib, ktx->ktx_args[0], + (lib_msg_t *)ktx->ktx_args[1]); + break; + + default: + LASSERT (0); + } + + kqswnal_put_idle_tx (ktx); +} + +static void +kqswnal_txhandler(EP_TXD *txd, void *arg, int status) +{ + kqswnal_tx_t *ktx = (kqswnal_tx_t *)arg; + + LASSERT (txd != NULL); + LASSERT (ktx != NULL); + + CDEBUG(D_NET, "txd %p, arg %p status %d\n", txd, arg, status); + + if (status == EP_SUCCESS) + atomic_inc (&kqswnal_packets_transmitted); + + if (status != EP_SUCCESS) + { + CERROR ("kqswnal: Transmit failed with %d\n", status); + status = -EIO; + } + + kqswnal_tx_done (ktx, status); +} + +int +kqswnal_launch (kqswnal_tx_t *ktx) +{ + /* Don't block for transmit descriptor if we're in interrupt context */ + int attr = in_interrupt() ? 
(EP_NO_SLEEP | EP_NO_ALLOC) : 0; + int dest = kqswnal_nid2elanid (ktx->ktx_nid); + long flags; + int rc; + + LASSERT (dest >= 0); /* must be a peer */ + rc = ep_transmit_large(kqswnal_data.kqn_eptx, dest, + ktx->ktx_port, attr, kqswnal_txhandler, + ktx, ktx->ktx_iov, ktx->ktx_niov); + if (rc == 0) + atomic_inc (&kqswnal_packets_launched); + + if (rc != ENOMEM) + return (rc); + + /* can't allocate ep txd => queue for later */ + + LASSERT (in_interrupt()); /* not called by thread (not looping) */ + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_delayedtxds); + if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) + wake_up (&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + + return (0); +} + + +static char * +hdr_type_string (ptl_hdr_t *hdr) +{ + switch (hdr->type) { + case PTL_MSG_ACK: + return ("ACK"); + case PTL_MSG_PUT: + return ("PUT"); + case PTL_MSG_GET: + return ("GET"); + case PTL_MSG_REPLY: + return ("REPLY"); + default: + return (""); + } +} + +static void +kqswnal_cerror_hdr(ptl_hdr_t * hdr) +{ + char *type_str = hdr_type_string (hdr); + + CERROR("P3 Header at %p of type %s\n", hdr, type_str); + CERROR(" From nid/pid "LPU64"/%u", NTOH__u64(hdr->src_nid), + NTOH__u32(hdr->src_pid)); + CERROR(" To nid/pid "LPU64"/%u\n", NTOH__u64(hdr->dest_nid), + NTOH__u32(hdr->dest_pid)); + + switch (NTOH__u32(hdr->type)) { + case PTL_MSG_PUT: + CERROR(" Ptl index %d, ack md "LPX64"."LPX64", " + "match bits "LPX64"\n", + NTOH__u32 (hdr->msg.put.ptl_index), + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + NTOH__u64 (hdr->msg.put.match_bits)); + CERROR(" Length %d, offset %d, hdr data "LPX64"\n", + NTOH__u32(PTL_HDR_LENGTH(hdr)), + NTOH__u32(hdr->msg.put.offset), + hdr->msg.put.hdr_data); + break; + + case PTL_MSG_GET: + CERROR(" Ptl index %d, return md "LPX64"."LPX64", " + "match bits "LPX64"\n", + NTOH__u32 
(hdr->msg.get.ptl_index), + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CERROR(" Length %d, src offset %d\n", + NTOH__u32 (hdr->msg.get.sink_length), + NTOH__u32 (hdr->msg.get.src_offset)); + break; + + case PTL_MSG_ACK: + CERROR(" dst md "LPX64"."LPX64", manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + NTOH__u32 (hdr->msg.ack.mlength)); + break; + + case PTL_MSG_REPLY: + CERROR(" dst md "LPX64"."LPX64", length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + NTOH__u32 (PTL_HDR_LENGTH(hdr))); + } + +} /* end of print_hdr() */ + +static int +kqswnal_sendmsg (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + ptl_kiov_t *payload_kiov, + size_t payload_nob) +{ + kqswnal_tx_t *ktx; + int rc; + ptl_nid_t gatewaynid; +#if KQSW_CHECKSUM + int i; + kqsw_csum_t csum; + int sumnob; +#endif + + /* NB, the return code from this procedure is ignored. + * If we can't send, we must still complete with lib_finalize(). + * We'll have to wait for 3.2 to return an error event. + */ + + CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64 + " pid %u\n", payload_nob, payload_niov, nid, pid); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + /* It must be OK to kmap() if required */ + LASSERT (payload_kiov == NULL || !in_interrupt ()); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + if (payload_nob > KQSW_MAXPAYLOAD) { + CERROR ("request exceeds MTU size "LPSZ" (max %u).\n", + payload_nob, KQSW_MAXPAYLOAD); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + + if (kqswnal_nid2elanid (nid) < 0) { /* Can't send direct: find gateway? 
*/ + rc = kpr_lookup (&kqswnal_data.kqn_router, nid, &gatewaynid); + if (rc != 0) { + CERROR("Can't route to "LPX64": router error %d\n", + nid, rc); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + if (kqswnal_nid2elanid (gatewaynid) < 0) { + CERROR("Bad gateway "LPX64" for "LPX64"\n", + gatewaynid, nid); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + nid = gatewaynid; + } + + /* I may not block for a transmit descriptor if I might block the + * receiver, or an interrupt handler. */ + ktx = kqswnal_get_idle_tx(NULL, !(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt())); + if (ktx == NULL) { + kqswnal_cerror_hdr (hdr); + lib_finalize (&kqswnal_lib, private, cookie); + } + + memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */ + +#if KQSW_CHECKSUM + csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr)); + memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum)); + for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) { + if (payload_kiov != NULL) { + ptl_kiov_t *kiov = &payload_kiov[i]; + char *addr = ((char *)kmap (kiov->kiov_page)) + + kiov->kiov_offset; + + csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len)); + sumnob -= kiov->kiov_len; + } else { + struct iovec *iov = &payload_iov[i]; + + csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, kiov->iov_len)); + sumnob -= iov->iov_len; + } + } + memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum)); +#endif + + /* Set up first frag from pre-mapped buffer (it's at least the + * portals header) */ + ktx->ktx_iov[0].Base = ktx->ktx_ebuffer; + ktx->ktx_iov[0].Len = KQSW_HDR_SIZE; + ktx->ktx_niov = 1; + + if (payload_nob > 0) { /* got some payload (something more to do) */ + /* make a single contiguous message? 
*/ + if (payload_nob <= KQSW_TX_MAXCONTIG) { + /* copy payload to ktx_buffer, immediately after hdr */ + if (payload_kiov != NULL) + lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, + payload_niov, payload_kiov, payload_nob); + else + lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, + payload_niov, payload_iov, payload_nob); + /* first frag includes payload */ + ktx->ktx_iov[0].Len += payload_nob; + } else { + if (payload_kiov != NULL) + rc = kqswnal_map_tx_kiov (ktx, payload_nob, + payload_niov, payload_kiov); + else + rc = kqswnal_map_tx_iov (ktx, payload_nob, + payload_niov, payload_iov); + if (rc != 0) { + kqswnal_put_idle_tx (ktx); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + } + } + + ktx->ktx_port = (payload_nob <= KQSW_SMALLPAYLOAD) ? + EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE; + ktx->ktx_nid = nid; + ktx->ktx_state = KTX_SENDING; /* => lib_finalize() on completion */ + ktx->ktx_args[0] = private; + ktx->ktx_args[1] = cookie; + + rc = kqswnal_launch (ktx); + if (rc != 0) { /* failed? 
*/ + CERROR ("Failed to send packet to "LPX64": %d\n", nid, rc); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + + CDEBUG(D_NET, "send to "LPSZ" bytes to "LPX64"\n", payload_nob, nid); + return (0); +} + +static int +kqswnal_send (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + size_t payload_nob) +{ + return (kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid, + payload_niov, payload_iov, NULL, payload_nob)); +} + +static int +kqswnal_send_pages (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + ptl_kiov_t *payload_kiov, + size_t payload_nob) +{ + return (kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid, + payload_niov, NULL, payload_kiov, payload_nob)); +} + +int kqswnal_fwd_copy_contig = 0; + +void +kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + int rc; + kqswnal_tx_t *ktx; + struct iovec *iov = fwd->kprfd_iov; + int niov = fwd->kprfd_niov; + int nob = fwd->kprfd_nob; + ptl_nid_t nid = fwd->kprfd_gateway_nid; + +#if KQSW_CHECKSUM + CERROR ("checksums for forwarded packets not implemented\n"); + LBUG (); +#endif + /* The router wants this NAL to forward a packet */ + CDEBUG (D_NET, "forwarding [%p] to "LPX64", %d frags %d bytes\n", + fwd, nid, niov, nob); + + LASSERT (niov > 0); + + ktx = kqswnal_get_idle_tx (fwd, FALSE); + if (ktx == NULL) /* can't get txd right now */ + return; /* fwd will be scheduled when tx desc freed */ + + if (nid == kqswnal_lib.ni.nid) /* gateway is me */ + nid = fwd->kprfd_target_nid; /* target is final dest */ + + if (kqswnal_nid2elanid (nid) < 0) { + CERROR("Can't forward [%p] to "LPX64": not a peer\n", fwd, nid); + rc = -EHOSTUNREACH; + goto failed; + } + + if (nob > KQSW_NRXMSGBYTES_LARGE) { + CERROR ("Can't forward [%p] to "LPX64 + ": size %d bigger than max 
packet size %ld\n", + fwd, nid, nob, (long)KQSW_NRXMSGBYTES_LARGE); + rc = -EMSGSIZE; + goto failed; + } + + if ((kqswnal_fwd_copy_contig || niov > 1) && + nob <= KQSW_TX_BUFFER_SIZE) + { + /* send from ktx's pre-allocated/mapped contiguous buffer? */ + lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob); + ktx->ktx_iov[0].Base = ktx->ktx_ebuffer; /* already mapped */ + ktx->ktx_iov[0].Len = nob; + ktx->ktx_niov = 1; + } + else + { + /* zero copy */ + ktx->ktx_niov = 0; /* no frags mapped yet */ + rc = kqswnal_map_tx_iov (ktx, nob, niov, iov); + if (rc != 0) + goto failed; + } + + ktx->ktx_port = (nob <= (sizeof (ptl_hdr_t) + KQSW_SMALLPAYLOAD)) ? + EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE; + ktx->ktx_nid = nid; + ktx->ktx_state = KTX_FORWARDING; /* kpr_put_packet() on completion */ + ktx->ktx_args[0] = fwd; + + rc = kqswnal_launch (ktx); + if (rc == 0) + return; + + failed: + LASSERT (rc != 0); + CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc); + + kqswnal_put_idle_tx (ktx); + /* complete now (with failure) */ + kpr_fwd_done (&kqswnal_data.kqn_router, fwd, rc); +} + +void +kqswnal_fwd_callback (void *arg, int error) +{ + kqswnal_rx_t *krx = (kqswnal_rx_t *)arg; + + /* The router has finished forwarding this packet */ + + if (error != 0) + { + ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]); + + CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n", + NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error); + } + + kqswnal_requeue_rx (krx); +} + +void +kqswnal_rx (kqswnal_rx_t *krx) +{ + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address (krx->krx_pages[0]); + ptl_nid_t dest_nid = NTOH__u64 (hdr->dest_nid); + int nob; + int niov; + + if (dest_nid == kqswnal_lib.ni.nid) { /* It's for me :) */ + /* NB krx requeued when lib_parse() calls back kqswnal_recv */ + lib_parse (&kqswnal_lib, hdr, krx); + return; + } + +#if KQSW_CHECKSUM + CERROR ("checksums for forwarded packets not implemented\n"); + LBUG (); +#endif + if 
(kqswnal_nid2elanid (dest_nid) >= 0) /* should have gone direct to peer */ + { + CERROR("dropping packet from "LPX64" for "LPX64 + ": target is peer\n", NTOH__u64(hdr->src_nid), dest_nid); + kqswnal_requeue_rx (krx); + return; + } + + /* NB forwarding may destroy iov; rebuild every time */ + for (nob = krx->krx_nob, niov = 0; nob > 0; nob -= PAGE_SIZE, niov++) + { + LASSERT (niov < krx->krx_npages); + krx->krx_iov[niov].iov_base= page_address(krx->krx_pages[niov]); + krx->krx_iov[niov].iov_len = MIN(PAGE_SIZE, nob); + } + + kpr_fwd_init (&krx->krx_fwd, dest_nid, + krx->krx_nob, niov, krx->krx_iov, + kqswnal_fwd_callback, krx); + + kpr_fwd_start (&kqswnal_data.kqn_router, &krx->krx_fwd); +} + +/* Receive Interrupt Handler: posts to schedulers */ +void +kqswnal_rxhandler(EP_RXD *rxd) +{ + long flags; + int nob = ep_rxd_len (rxd); + int status = ep_rxd_status (rxd); + kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg (rxd); + + CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n", + rxd, krx, nob, status); + + LASSERT (krx != NULL); + + krx->krx_rxd = rxd; + krx->krx_nob = nob; + + /* must receive a whole header to be able to parse */ + if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t)) + { + /* receives complete with failure when receiver is removed */ + if (kqswnal_data.kqn_shuttingdown) + return; + + CERROR("receive status failed with status %d nob %d\n", + ep_rxd_status(rxd), nob); + kqswnal_requeue_rx (krx); + return; + } + + atomic_inc (&kqswnal_packets_received); + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds); + if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) + wake_up (&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); +} + +#if KQSW_CHECKSUM +void +kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) +{ + ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]); + + CERROR ("%s checksum mismatch %p: dnid 
"LPX64", snid "LPX64 + ", dpid %d, spid %d, type %d\n", + ishdr ? "Header" : "Payload", krx, + NTOH__u64(hdr->dest_nid), NTOH__u64(hdr->src_nid) + NTOH__u32(hdr->dest_pid), NTOH__u32(hdr->src_pid), + NTOH__u32(hdr->type)); + + switch (NTOH__u32 (hdr->type)) + { + case PTL_MSG_ACK: + CERROR("ACK: mlen %d dmd "LPX64"."LPX64" match "LPX64 + " len %u\n", + NTOH__u32(hdr->msg.ack.mlength), + hdr->msg.ack.dst_wmd.handle_cookie, + hdr->msg.ack.dst_wmd.handle_idx, + NTOH__u64(hdr->msg.ack.match_bits), + NTOH__u32(hdr->msg.ack.length)); + break; + case PTL_MSG_PUT: + CERROR("PUT: ptl %d amd "LPX64"."LPX64" match "LPX64 + " len %u off %u data "LPX64"\n", + NTOH__u32(hdr->msg.put.ptl_index), + hdr->msg.put.ack_wmd.handle_cookie, + hdr->msg.put.ack_wmd.handle_idx, + NTOH__u64(hdr->msg.put.match_bits), + NTOH__u32(hdr->msg.put.length), + NTOH__u32(hdr->msg.put.offset), + hdr->msg.put.hdr_data); + break; + case PTL_MSG_GET: + CERROR ("GET: <>\n"); + break; + case PTL_MSG_REPLY: + CERROR ("REPLY: <>\n"); + break; + default: + CERROR ("TYPE?: <>\n"); + } +} +#endif + +static int +kqswnal_recvmsg (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + ptl_kiov_t *kiov, + size_t mlen, + size_t rlen) +{ + kqswnal_rx_t *krx = (kqswnal_rx_t *)private; + int page; + char *page_ptr; + int page_nob; + char *iov_ptr; + int iov_nob; + int frag; +#if KQSW_CHECKSUM + kqsw_csum_t senders_csum; + kqsw_csum_t payload_csum = 0; + kqsw_csum_t hdr_csum = kqsw_csum(0, page_address(krx->krx_pages[0]), + sizeof(ptl_hdr_t)); + size_t csum_len = mlen; + int csum_frags = 0; + int csum_nob = 0; + static atomic_t csum_counter; + int csum_verbose = (atomic_read(&csum_counter)%1000001) == 0; + + atomic_inc (&csum_counter); + + memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) + + sizeof (ptl_hdr_t), sizeof (kqsw_csum_t)); + if (senders_csum != hdr_csum) + kqswnal_csum_error (krx, 1); +#endif + CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", 
mlen, rlen); + + /* What was actually received must be >= payload. + * This is an LASSERT, as lib_finalize() doesn't have a completion status. */ + LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen); + LASSERT (mlen <= rlen); + + /* It must be OK to kmap() if required */ + LASSERT (kiov == NULL || !in_interrupt ()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + if (mlen != 0) + { + page = 0; + page_ptr = ((char *) page_address(krx->krx_pages[0])) + + KQSW_HDR_SIZE; + page_nob = PAGE_SIZE - KQSW_HDR_SIZE; + + LASSERT (niov > 0); + if (kiov != NULL) { + iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + iov_nob = kiov->kiov_len; + } else { + iov_ptr = iov->iov_base; + iov_nob = iov->iov_len; + } + + for (;;) + { + /* We expect the iov to exactly match mlen */ + LASSERT (iov_nob <= mlen); + + frag = MIN (page_nob, iov_nob); + memcpy (iov_ptr, page_ptr, frag); +#if KQSW_CHECKSUM + payload_csum = kqsw_csum (payload_csum, iov_ptr, frag); + csum_nob += frag; + csum_frags++; +#endif + mlen -= frag; + if (mlen == 0) + break; + + page_nob -= frag; + if (page_nob != 0) + page_ptr += frag; + else + { + page++; + LASSERT (page < krx->krx_npages); + page_ptr = page_address(krx->krx_pages[page]); + page_nob = PAGE_SIZE; + } + + iov_nob -= frag; + if (iov_nob != 0) + iov_ptr += frag; + else if (kiov != NULL) { + kunmap (kiov->kiov_page); + kiov++; + niov--; + LASSERT (niov > 0); + iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + iov_nob = kiov->kiov_len; + } else { + iov++; + niov--; + LASSERT (niov > 0); + iov_ptr = iov->iov_base; + iov_nob = iov->iov_len; + } + } + + if (kiov != NULL) + kunmap (kiov->kiov_page); + } + +#if KQSW_CHECKSUM + memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) + + sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), sizeof(kqsw_csum_t)); + + if (csum_len != rlen) + CERROR("Unable to checksum data in user's buffer\n"); + else if (senders_csum != payload_csum) + kqswnal_csum_error 
(krx, 0); + + if (csum_verbose) + CERROR("hdr csum %lx, payload_csum %lx, csum_frags %d, " + "csum_nob %d\n", + hdr_csum, payload_csum, csum_frags, csum_nob); +#endif + lib_finalize(nal, private, cookie); + + kqswnal_requeue_rx (krx); + + return (rlen); +} + +static int +kqswnal_recv(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + size_t mlen, + size_t rlen) +{ + return (kqswnal_recvmsg (nal, private, cookie, niov, iov, NULL, mlen, rlen)); +} + +static int +kqswnal_recv_pages (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + ptl_kiov_t *kiov, + size_t mlen, + size_t rlen) +{ + return (kqswnal_recvmsg (nal, private, cookie, niov, NULL, kiov, mlen, rlen)); +} + +int +kqswnal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&kqswnal_data.kqn_nthreads); + return (0); +} + +void +kqswnal_thread_fini (void) +{ + atomic_dec (&kqswnal_data.kqn_nthreads); +} + +int +kqswnal_scheduler (void *arg) +{ + kqswnal_rx_t *krx; + kqswnal_tx_t *ktx; + kpr_fwd_desc_t *fwd; + long flags; + int rc; + int counter = 0; + int did_something; + + kportal_daemonize ("kqswnal_sched"); + kportal_blockallsigs (); + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + while (!kqswnal_data.kqn_shuttingdown) + { + did_something = FALSE; + + if (!list_empty (&kqswnal_data.kqn_readyrxds)) + { + krx = list_entry(kqswnal_data.kqn_readyrxds.next, + kqswnal_rx_t, krx_list); + list_del (&krx->krx_list); + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); + + kqswnal_rx (krx); + + did_something = TRUE; + spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); + } + + if (!list_empty (&kqswnal_data.kqn_delayedtxds)) + { + ktx = list_entry(kqswnal_data.kqn_delayedtxds.next, + kqswnal_tx_t, ktx_list); + list_del (&ktx->ktx_list); + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); + + rc = kqswnal_launch 
(ktx); + if (rc != 0) /* failed: ktx_nid down? */ + { + CERROR("Failed delayed transmit to "LPX64 + ": %d\n", ktx->ktx_nid, rc); + kqswnal_tx_done (ktx, rc); + } + + did_something = TRUE; + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + } + + if (!list_empty (&kqswnal_data.kqn_delayedfwds)) + { + fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, kpr_fwd_desc_t, kprfd_list); + list_del (&fwd->kprfd_list); + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + + kqswnal_fwd_packet (NULL, fwd); + + did_something = TRUE; + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + } + + /* nothing to do or hogging CPU */ + if (!did_something || counter++ == KQSW_RESCHED) { + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); + + counter = 0; + + if (!did_something) { + rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq, + kqswnal_data.kqn_shuttingdown || + !list_empty(&kqswnal_data.kqn_readyrxds) || + !list_empty(&kqswnal_data.kqn_delayedtxds) || + !list_empty(&kqswnal_data.kqn_delayedfwds)); + LASSERT (rc == 0); + } else if (current->need_resched) + schedule (); + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + } + } + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + + kqswnal_thread_fini (); + return (0); +} + +nal_cb_t kqswnal_lib = +{ + nal_data: &kqswnal_data, /* NAL private data */ + cb_send: kqswnal_send, + cb_send_pages: kqswnal_send_pages, + cb_recv: kqswnal_recv, + cb_recv_pages: kqswnal_recv_pages, + cb_read: kqswnal_read, + cb_write: kqswnal_write, + cb_malloc: kqswnal_malloc, + cb_free: kqswnal_free, + cb_printf: kqswnal_printf, + cb_cli: kqswnal_cli, + cb_sti: kqswnal_sti, + cb_dist: kqswnal_dist +}; diff --git a/lnet/klnds/scimaclnd/.cvsignore b/lnet/klnds/scimaclnd/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lnet/klnds/scimaclnd/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lnet/klnds/scimaclnd/Makefile.am 
b/lnet/klnds/scimaclnd/Makefile.am new file mode 100644 index 0000000..6da31f0 --- /dev/null +++ b/lnet/klnds/scimaclnd/Makefile.am @@ -0,0 +1,11 @@ +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = kscimacnal +modulenet_DATA = kscimacnal.o +EXTRA_PROGRAMS = kscimacnal + +DEFS = +kscimacnal_SOURCES = scimacnal.c scimacnal_cb.c scimacnal.h diff --git a/lnet/klnds/scimaclnd/README.scimacnal b/lnet/klnds/scimaclnd/README.scimacnal new file mode 100644 index 0000000..d4c6a49 --- /dev/null +++ b/lnet/klnds/scimaclnd/README.scimacnal @@ -0,0 +1,14 @@ + +scimacnal - A NAL for the Scali ScaMAC midlayer. + +The ScaMAC midlayer is a simplified API to the SCI high performance +interconnect. + +In order to use this NAL you'll need to tune scimac to use larger buffers. +See scimac.conf in this directory for an example. + +Overall performance and stability aren't great, but this can be attributed +to the scimac driver, which apparently is in need of some development. + +TODO: +Routing isn't yet implemented. diff --git a/lnet/klnds/scimaclnd/scimac.conf b/lnet/klnds/scimaclnd/scimac.conf new file mode 100644 index 0000000..bfb6d02 --- /dev/null +++ b/lnet/klnds/scimaclnd/scimac.conf @@ -0,0 +1,35 @@ +# Configuration file for the scimac driver - lustre friendly settings +# + +# The maximal number of message headers to use in the system. +scimac_max_no_hdrs = 32 + +# The maximal number of eager buffers to use in the system. +scimac_max_no_ebufs = 8 + +# The maximal size in bytes of each eager buffer. +scimac_max_ebuf_size = 65536 + +# Enable use of a kernel thread to defer reception of packets. +# Default is to use a tasklet (sw interrupt). +scimac_use_ulevel_recv = 1 + +# The maximal number of packets queued for transfer per path at any one time. +scimac_max_send_queuelen = 2000 + +# The packet retransmit time in milliseconds. 
+# The time elapsed since a packet was attempted sent until the packet is resent. +scimac_pkt_rexmit_time = 200 + +# The packet's maximal retransmit time in milliseconds. +# The total time that a packet will be attempted sent before it is dropped. +scimac_max_rexmit_time = 5000 + +# The lowest valid node identifier in the system. +scimac_min_nodeid_number = 0x100 + +# The largest valid node identifier in the system. +scimac_max_nodeid_number = 0xff00 + +# The incremental nodeid step in the system. +scimac_nodeid_increment = 0x100 diff --git a/lnet/klnds/scimaclnd/scimacnal.c b/lnet/klnds/scimaclnd/scimacnal.c new file mode 100644 index 0000000..1066d69 --- /dev/null +++ b/lnet/klnds/scimaclnd/scimacnal.c @@ -0,0 +1,219 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8:cindent: + * + * Copyright (C) 2003 High Performance Computing Center North (HPC2N) + * Author: Niklas Edmundsson + + * Based on gmnal, which is based on ksocknal and qswnal + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + + +#include "scimacnal.h" + +ptl_handle_ni_t kscimacnal_ni; +nal_t kscimacnal_api; + +kscimacnal_data_t kscimacnal_data; + +kpr_nal_interface_t kscimacnal_router_interface = { + kprni_nalid: SCIMACNAL, + kprni_arg: NULL, + kprni_fwd: kscimacnal_fwd_packet, +}; + + +static int kscimacnal_forward(nal_t *nal, + int id, + void *args, size_t args_len, + void *ret, size_t ret_len) +{ + kscimacnal_data_t *ksci = nal->nal_data; + nal_cb_t *nal_cb = ksci->ksci_cb; + + LASSERT (nal == &kscimacnal_api); + LASSERT (ksci == &kscimacnal_data); + LASSERT (nal_cb == &kscimacnal_lib); + + lib_dispatch(nal_cb, ksci, id, args, ret); /* nal needs ksci */ + return PTL_OK; +} + + +static void kscimacnal_lock(nal_t *nal, unsigned long *flags) +{ + kscimacnal_data_t *ksci = nal->nal_data; + nal_cb_t *nal_cb = ksci->ksci_cb; + + + LASSERT (nal == &kscimacnal_api); + LASSERT (ksci == &kscimacnal_data); + LASSERT (nal_cb == &kscimacnal_lib); + + nal_cb->cb_cli(nal_cb,flags); +} + + +static void kscimacnal_unlock(nal_t *nal, unsigned long *flags) +{ + kscimacnal_data_t *ksci = nal->nal_data; + nal_cb_t *nal_cb = ksci->ksci_cb; + + + LASSERT (nal == &kscimacnal_api); + LASSERT (ksci == &kscimacnal_data); + LASSERT (nal_cb == &kscimacnal_lib); + + nal_cb->cb_sti(nal_cb,flags); +} + + +static int kscimacnal_shutdown(nal_t *nal, int ni) +{ + LASSERT (nal == &kscimacnal_api); + return 0; +} + + +static void kscimacnal_yield( nal_t *nal ) +{ + LASSERT (nal == &kscimacnal_api); + + if (current->need_resched) + schedule(); + return; +} + + +static nal_t *kscimacnal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + int nnids = 512; /* FIXME: Need ScaMac funktion to get #nodes */ + + CDEBUG(D_NET, "calling lib_init with nid 0x%Lx nnids %d\n", kscimacnal_data.ksci_nid, nnids); + lib_init(&kscimacnal_lib, kscimacnal_data.ksci_nid, 0, nnids,ptl_size, ac_size); + return &kscimacnal_api; +} + + +/* Called by kernel at module unload time */ 
+static void __exit +kscimacnal_finalize(void) +{ + /* FIXME: How should the shutdown procedure really look? */ + kscimacnal_data.ksci_shuttingdown=1; + + PORTAL_SYMBOL_UNREGISTER(kscimacnal_ni); + + PtlNIFini(kscimacnal_ni); + lib_fini(&kscimacnal_lib); + + mac_finish(kscimacnal_data.ksci_machandle); + + CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory)); + + return; +} + + +/* Called by kernel at module insertion time */ +static int __init +kscimacnal_initialize(void) +{ + int rc; + unsigned long nid=0; + mac_handle_t *machandle = NULL; + + + CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory)); + + kscimacnal_api.forward = kscimacnal_forward; + kscimacnal_api.shutdown = kscimacnal_shutdown; + kscimacnal_api.yield = kscimacnal_yield; + kscimacnal_api.validate = NULL; /* our api validate is a NOOP */ + kscimacnal_api.lock= kscimacnal_lock; + kscimacnal_api.unlock= kscimacnal_unlock; + kscimacnal_api.nal_data = &kscimacnal_data; + + kscimacnal_lib.nal_data = &kscimacnal_data; + + memset(&kscimacnal_data, 0, sizeof(kscimacnal_data)); + + kscimacnal_data.ksci_cb = &kscimacnal_lib; + + /* We're not using this, but cli/sti callbacks does... ??? 
*/ + spin_lock_init(&kscimacnal_data.ksci_dispatch_lock); + + /* FIXME: We only support one adapter for now */ + machandle = mac_init(0, MAC_SAPID_LUSTRE, kscimacnal_rx, + &kscimacnal_data); + + if(!machandle) { + CERROR("mac_init() failed\n"); + return -1; + } + + kscimacnal_data.ksci_machandle = machandle; + + /* Make sure the scimac MTU is tuned */ + if(mac_get_mtusize(machandle) < SCIMACNAL_MTU) { + CERROR("scimac mtu of %ld smaller than SCIMACNAL MTU of %d\n", + mac_get_mtusize(machandle), SCIMACNAL_MTU); + CERROR("Consult README.scimacnal for more information\n"); + mac_finish(machandle); + return -1; + } + + /* Get the node ID */ + /* mac_get_physaddrlen() is a function instead of define, sigh */ + LASSERT(mac_get_physaddrlen(machandle) <= sizeof(nid)); + if(mac_get_physaddr(machandle, (mac_physaddr_t *) &nid)) { + CERROR("mac_get_physaddr() failed\n"); + mac_finish(machandle); + return -1; + } + nid = ntohl(nid); + kscimacnal_data.ksci_nid = nid; + + + /* Initialize Network Interface */ + /* FIXME: What do the magic numbers mean? Documentation anyone? 
*/ + rc = PtlNIInit(kscimacnal_init, 32, 4, 0, &kscimacnal_ni); + if (rc) { + CERROR("PtlNIInit failed %d\n", rc); + mac_finish(machandle); + return (-ENOMEM); + } + + PORTAL_SYMBOL_REGISTER(kscimacnal_ni); + + /* We're done now, it's OK for the RX callback to do stuff */ + kscimacnal_data.ksci_init = 1; + + return 0; +} + + +MODULE_AUTHOR("Niklas Edmundsson "); +MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.0"); +MODULE_LICENSE("GPL"); + +module_init (kscimacnal_initialize); +module_exit (kscimacnal_finalize); + +EXPORT_SYMBOL(kscimacnal_ni); diff --git a/lnet/klnds/scimaclnd/scimacnal.h b/lnet/klnds/scimaclnd/scimacnal.h new file mode 100644 index 0000000..1ff180e --- /dev/null +++ b/lnet/klnds/scimaclnd/scimacnal.h @@ -0,0 +1,85 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8:cindent: + * + * Copyright (C) 2003 High Performance Computing Center North (HPC2N) + * Author: Niklas Edmundsson + */ + + +#ifndef _SCIMACNAL_H +#define _SCIMACNAL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include /* For PAGE_SIZE */ + +#define DEBUG_SUBSYSTEM S_UNDEFINED + +#include +#include +#include + +#include + +#ifndef MAC_SAPID_LUSTRE +#define MAC_SAPID_LUSTRE MAC_SAPID_TEST1 +#endif /* MAC_SAPID_LUSTRE */ + +#define SCIMACNAL_MTU 65536 +/* FIXME: What is really the MTU of lustre? */ +#if PTL_MD_MAX_IOV*PAGE_SIZE > SCIMACNAL_MTU +#error Max MTU of ScaMAC is 64k, PTL_MD_MAX_IOV*PAGE_SIZE is bigger. 
+#endif + +typedef struct { + mac_handle_t *handle; + mac_mblk_t *msg; + mac_msg_type_t type; + void *userdata; +} kscimacnal_rx_t; + + +typedef struct { + nal_cb_t *ktx_nal; + void *ktx_private; + lib_msg_t *ktx_cookie; + ptl_hdr_t ktx_hdr; +} kscimacnal_tx_t; + + +typedef struct { + char ksci_init; + char ksci_shuttingdown; + ptl_nid_t ksci_nid; + nal_cb_t *ksci_cb; + spinlock_t ksci_dispatch_lock; + mac_handle_t *ksci_machandle; +} kscimacnal_data_t; + +extern kscimacnal_data_t kscimacnal_data; +extern nal_t kscimacnal_api; +extern nal_cb_t kscimacnal_lib; + +void kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +void kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type, void *userdata); + + +#endif /* _SCIMACNAL_H */ diff --git a/lnet/klnds/scimaclnd/scimacnal_cb.c b/lnet/klnds/scimaclnd/scimacnal_cb.c new file mode 100644 index 0000000..7e4a2e8 --- /dev/null +++ b/lnet/klnds/scimaclnd/scimacnal_cb.c @@ -0,0 +1,468 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8:cindent: + * + * Copyright (C) 2003 High Performance Computing Center North (HPC2N) + * Author: Niklas Edmundsson + + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "scimacnal.h" + +static int +kscimacnal_read (nal_cb_t *nal, void *private, + void *dst_addr, user_ptr src_addr, size_t len) +{ + CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + + +static int +kscimacnal_write(nal_cb_t *nal, void *private, + user_ptr dst_addr, void *src_addr, size_t len) +{ + CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + + +static void * +kscimacnal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + return buf; +} + + +static void +kscimacnal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + + +static void +kscimacnal_printf(nal_cb_t *nal, const char *fmt, ...) +{ + va_list ap; + char msg[256]; + + if (portal_debug & D_NET) { + va_start( ap, fmt ); + vsnprintf( msg, sizeof(msg), fmt, ap ); + va_end( ap ); + + printk("CPUId: %d %s",smp_processor_id(), msg); + } +} + + +static void +kscimacnal_cli(nal_cb_t *nal, unsigned long *flags) +{ + kscimacnal_data_t *data= nal->nal_data; + + spin_lock_irqsave(&data->ksci_dispatch_lock,*flags); +} + + +static void +kscimacnal_sti(nal_cb_t *nal, unsigned long *flags) +{ + kscimacnal_data_t *data= nal->nal_data; + + spin_unlock_irqrestore(&data->ksci_dispatch_lock,*flags); +} + + +static int +kscimacnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* FIXME: Network distance has a meaning, but is there no easy + * way to figure it out (depends on routing) */ + + if ( nal->ni.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + + +static +char * get_mac_error(mac_status_t status) +{ + switch(status) { + case MAC_MSG_STAT_OK: + return "MAC_MSG_STAT_OK"; + case MAC_MSG_STAT_FREED: + return "MAC_MSG_STAT_FREED"; + case MAC_MSG_STAT_ABORTED: + return "MAC_MSG_STAT_ABORTED"; + case 
MAC_MSG_STAT_TIMEDOUT: + return "MAC_MSG_STAT_TIMEDOUT"; + case MAC_MSG_STAT_NODEUNREACH: + return "MAC_MSG_STAT_NODEUNREACH"; + case MAC_MSG_STAT_NETDOWN: + return "MAC_MSG_STAT_NETDOWN"; + case MAC_MSG_STAT_RESET: + return "MAC_MSG_STAT_RESET"; + case MAC_MSG_STAT_INITFAILED: + return "MAC_MSG_STAT_INITFAILED"; + case MAC_MSG_STAT_SYNCFAILED: + return "MAC_MSG_STAT_SYNCFAILED"; + case MAC_MSG_STAT_BADPROTO: + return "MAC_MSG_STAT_BADPROTO"; + case MAC_MSG_STAT_NOBUFSPACE: + return "MAC_MSG_STAT_NOBUFSPACE"; + case MAC_MSG_STAT_CONGESTION: + return "MAC_MSG_STAT_CONGESTION"; + case MAC_MSG_STAT_OTHER: + return "MAC_MSG_STAT_OTHER"; + default: + return "Unknown error"; + } +} + + +/* FIXME add routing code here ? */ + +/* Called by ScaMac when transmission is complete (ie. message is released) */ +static void +kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context) +{ + kscimacnal_tx_t *ktx = (kscimacnal_tx_t *)context; + int err=0; + + LASSERT (ktx != NULL); + + /* Euh, there is no feedback when transmission fails?! */ + switch(status) { + case MAC_MSG_STAT_OK: /* normal */ + break; + default: + CERROR("%s (%d):\n", get_mac_error(status), status); + err = -EIO; + break; + } + + lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie); + + PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); +} + + +/* Called by portals when it wants to send a message. + * Since ScaMAC has it's own TX thread we don't bother setting up our own. 
*/ +static int +kscimacnal_send(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + size_t payload_len) +{ + kscimacnal_tx_t *ktx=NULL; + kscimacnal_data_t *ksci = nal->nal_data; + int rc=0; + int buf_len = sizeof(ptl_hdr_t) + payload_len; + mac_mblk_t *msg=NULL, *lastblk, *newblk; + unsigned long physaddr; + + + /* payload_len is a size_t: print it as a long, the same way + * kscimacnal_read/kscimacnal_write print their lengths */ + CDEBUG(D_NET, "sending %ld bytes from %p to nid 0x%Lx niov: %d\n", + (long)payload_len, payload_iov, nid, payload_niov); + + LASSERT(ksci != NULL); + + LASSERT(hdr != NULL); + + /* Do real check if we can send this */ + if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) { + CERROR("kscimacnal:request exceeds TX MTU size (%ld).\n", + mac_get_mtusize(ksci->ksci_machandle)); + return -EINVAL; + } + + + /* save transaction info for later finalize and cleanup */ + PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t))); + if (!ktx) { + return -ENOMEM; + } + + /* *SIGH* hdr is a stack variable in the calling function, so we + * need to copy it to a buffer. Zerocopy magic (or is it just + * deferred memcpy?) is annoying sometimes. */ + memcpy(&ktx->ktx_hdr, hdr, sizeof(ptl_hdr_t)); + + /* First, put the header in the main message mblk */ + msg = mac_alloc_mblk(&ktx->ktx_hdr, sizeof(ptl_hdr_t), + kscimacnal_txrelease, ktx); + if (!msg) { + PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); + return -ENOMEM; + } + mac_put_mblk(msg, sizeof(ptl_hdr_t)); + lastblk=msg; + + /* Allocate additional mblks for each iov as needed. 
+ * Essentially lib_copy_iov2buf with a twist or two */ + while (payload_len > 0) + { + ptl_size_t nob; + + LASSERT (payload_niov > 0); + + nob = MIN (payload_iov->iov_len, payload_len); + + /* We don't need a callback on the additional mblks, since + * all release callbacks seems to be called when the entire + * message has been sent */ + newblk=mac_alloc_mblk(payload_iov->iov_base, nob, NULL, NULL); + if(!newblk) { + mac_free_msg(msg); + PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); + return -ENOMEM; + } + mac_put_mblk(newblk, nob); + mac_link_mblk(lastblk, newblk); + lastblk=newblk; + + payload_len -= nob; + payload_niov--; + payload_iov++; + } + + ktx->ktx_nal = nal; + ktx->ktx_private = private; + ktx->ktx_cookie = cookie; + + CDEBUG(D_NET, "mac_send %d bytes to nid: 0x%Lx\n", buf_len, nid); + + physaddr = htonl(nid); + + if((rc=mac_send(ksci->ksci_machandle, msg, + (mac_physaddr_t *) &physaddr))) { + CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc); + mac_free_msg(msg); + PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); + return rc; + } + + return 0; +} + + +void +kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + CERROR ("forwarding not implemented\n"); +} + + +/* Process a received portals packet */ +/* Called by the ScaMac RX thread when a packet is received */ +void +kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type, + void *userdata) +{ + ptl_hdr_t *hdr = NULL; + kscimacnal_rx_t krx; + mac_size_t size; + kscimacnal_data_t *ksci = userdata; + + LASSERT(ksci != NULL); + + if ( !ksci->ksci_init || ksci->ksci_shuttingdown || + type == MAC_MSG_TYPE_CTRL || type == MAC_MSG_TYPE_OTHER ) { + /* We're not interested in messages not for us, ignore */ + mac_free_msg(msg); + return; + } + + size = mac_msg_size(msg); + + CDEBUG(D_NET,"msg %p type %d, size %ld bytes (%ld mblks)\n", + msg, type, size, mac_msg_mblks(msg)); + + if( size < sizeof( ptl_hdr_t ) ) { + /* XXX what's this for? 
*/ + /* Stay silent while shutting down, but still release msg below: + * every other exit from this function frees it */ + if (!ksci->ksci_shuttingdown) + CERROR("kscimacnal: did not receive complete portal header," + "size= %ld\n", size); + /* Free the message before exiting */ + mac_free_msg(msg); + return; + } + + /* Provide everything we know */ + krx.handle = handle; + krx.msg = msg; + krx.type = type; + krx.userdata = userdata; + + /* mac_msg_next returns the next mblk with unread data */ + hdr = mac_get_mblk(mac_msg_next(msg), sizeof(ptl_hdr_t) ); + + if(!hdr) { + CERROR("kscimacnal: no data block in message %p\n", msg); + mac_free_msg(msg); + return; + } + + if ( hdr->dest_nid == kscimacnal_lib.ni.nid ) { + PROF_START(lib_parse); + /* sets wanted_len, iovs etc and calls our callback */ + lib_parse(&kscimacnal_lib, hdr, &krx); + PROF_FINISH(lib_parse); +#if 0 /* FIXME: Is it possible to detect this? */ + } else if (kgmnal_ispeer(hdr->dest_nid)) { + /* should have gone direct to peer */ + CERROR("dropping packet from 0x%llx to 0x%llx:" + "target is a peer\n", + hdr->src_nid, hdr->dest_nid); + kgmnal_requeue_rx(&krx); +#endif /* if 0 FIXME */ + } else { + /* forward to gateway */ + CERROR("forwarding not implemented, mynid=0x%llx dest=0x%llx\n", + kscimacnal_lib.ni.nid, hdr->dest_nid); + } + + mac_free_msg(msg); + + CDEBUG(D_NET, "msg %p: Done\n", msg); +} + + +/* Called by portals to process a received packet */ +static int kscimacnal_recv(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + size_t mlen, + size_t rlen) +{ + kscimacnal_rx_t *krx = private; + mac_mblk_t *mblk; + void *src; + mac_size_t pkt_len; + ptl_size_t iovused=0; + + LASSERT (krx != NULL); + LASSERT (krx->msg != NULL); + + /* mlen and rlen are size_t: print them as longs */ + CDEBUG(D_NET,"msg %p: mlen=%ld, rlen=%ld, niov=%d\n", + krx->msg, (long)mlen, (long)rlen, niov); + + /* What was actually received must be >= what sender claims to have + * sent. This is an LASSERT, since lib-move doesn't check cb return + * code yet. Also, rlen seems to be negative when mlen==0 so don't + * assert on that. 
+ */ + LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen); + LASSERT (mlen==0 || mlen <= rlen); + + PROF_START(memcpy); + + /* mac_msg_next returns next mblk with unread data (ie. can + * be same mblk */ + while (mlen != 0 && (mblk = mac_msg_next(krx->msg))) { + pkt_len = mac_mblk_len(mblk); + src = mac_get_mblk(mblk, pkt_len); /* Next unread block */ + + CDEBUG(D_NET,"msg %p: mblk: %p pkt_len: %ld src: %p\n", + krx->msg, mblk, pkt_len, src); + + LASSERT(src != NULL); + + /* Essentially lib_copy_buf2iov but with continuation support, + * we "gracefully" thrash the argument vars ;) */ + while (pkt_len > 0) { + ptl_size_t nob; + + LASSERT (niov > 0); + + LASSERT(iovused < iov->iov_len); + + nob = MIN (iov->iov_len-iovused, pkt_len); + CDEBUG(D_NET, "iovbase: %p iovlen: %d src: %p nob: %d " + "iovused: %d\n", + iov->iov_base, iov->iov_len, + src, nob, iovused); + + memcpy (iov->iov_base+iovused, src, nob); + pkt_len -= nob; + src += nob; + + if(nob+iovused < iov->iov_len) { + /* We didn't use all of the iov */ + iovused+=nob; + } + else { + niov--; + iov++; + iovused=0; + } + } + } + PROF_FINISH(memcpy); + + CDEBUG(D_NET, "Calling lib_finalize.\n"); + + PROF_START(lib_finalize); + lib_finalize(nal, private, cookie); + PROF_FINISH(lib_finalize); + + CDEBUG(D_NET, "Done.\n"); + + return rlen; +} + + +nal_cb_t kscimacnal_lib = { + nal_data: &kscimacnal_data, /* NAL private data */ + cb_send: kscimacnal_send, + cb_send_pages: NULL, /* Ignore for now */ + cb_recv: kscimacnal_recv, + cb_recv_pages: NULL, + cb_read: kscimacnal_read, + cb_write: kscimacnal_write, + cb_malloc: kscimacnal_malloc, + cb_free: kscimacnal_free, + cb_printf: kscimacnal_printf, + cb_cli: kscimacnal_cli, + cb_sti: kscimacnal_sti, + cb_dist: kscimacnal_dist +}; diff --git a/lnet/klnds/socklnd/.cvsignore b/lnet/klnds/socklnd/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lnet/klnds/socklnd/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git 
a/lnet/klnds/socklnd/Makefile.am b/lnet/klnds/socklnd/Makefile.am new file mode 100644 index 0000000..437d7fc --- /dev/null +++ b/lnet/klnds/socklnd/Makefile.am @@ -0,0 +1,13 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = ksocknal +modulenet_DATA = ksocknal.o +EXTRA_PROGRAMS = ksocknal + +DEFS = +ksocknal_SOURCES = socknal.c socknal_cb.c socknal.h diff --git a/lnet/klnds/socklnd/Makefile.mk b/lnet/klnds/socklnd/Makefile.mk new file mode 100644 index 0000000..46edf01 --- /dev/null +++ b/lnet/klnds/socklnd/Makefile.mk @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Kernelenv + +obj-y += ksocknal.o +ksocknal-objs := socknal.o socknal_cb.o + diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c new file mode 100644 index 0000000..91d971c --- /dev/null +++ b/lnet/klnds/socklnd/socklnd.c @@ -0,0 +1,860 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socknal.h" + +ptl_handle_ni_t ksocknal_ni; +static nal_t ksocknal_api; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +ksock_nal_data_t ksocknal_data; +#else +static ksock_nal_data_t ksocknal_data; +#endif + +kpr_nal_interface_t ksocknal_router_interface = { + kprni_nalid: SOCKNAL, + kprni_arg: &ksocknal_data, + kprni_fwd: ksocknal_fwd_packet, +}; + + +int +ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len, + void *ret, size_t ret_len) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + + lib_dispatch(nal_cb, k, id, args, ret); /* ksocknal_send needs k */ + return PTL_OK; +} + +int +ksocknal_api_shutdown(nal_t *nal, int ni) +{ + CDEBUG (D_NET, "closing all connections\n"); + + return ksocknal_close_sock(0); /* close all sockets */ +} + +void +ksocknal_api_yield(nal_t *nal) +{ + our_cond_resched(); + return; +} + +void +ksocknal_api_lock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_cli(nal_cb,flags); +} + +void +ksocknal_api_unlock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_sti(nal_cb,flags); +} + +nal_t * +ksocknal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", (ptl_nid_t)0); + lib_init(&ksocknal_lib, (ptl_nid_t)0, 0, 10, ptl_size, ac_size); + return (&ksocknal_api); +} + +/* + * EXTRA functions follow + */ + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define SOCKET_I(inode) (&(inode)->u.socket_i) +#endif +static __inline__ struct socket * +socki_lookup(struct inode *inode) +{ + 
return SOCKET_I(inode); +} + +int +ksocknal_set_mynid(ptl_nid_t nid) +{ + lib_ni_t *ni = &ksocknal_lib.ni; + + /* FIXME: we have to do this because we call lib_init() at module + * insertion time, which is before we have 'mynid' available. lib_init + * sets the NAL's nid, which it uses to tell other nodes where packets + * are coming from. This is not a very graceful solution to this + * problem. */ + + CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", + nid, ni->nid); + + ni->nid = nid; + return (0); +} + +/* Bind 'irq' to 'cpu' by running /bin/sh through call_usermodehelper() to + * write a CPU mask into /proc/irq/<irq>/smp_affinity. Compiles to a no-op + * unless CONFIG_SMP and CPU_AFFINITY are both set. */ +void +ksocknal_bind_irq (unsigned int irq, int cpu) +{ +#if (defined(CONFIG_SMP) && CPU_AFFINITY) + char cmdline[64]; + char *argv[] = {"/bin/sh", + "-c", + cmdline, + NULL}; + char *envp[] = {"HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + + /* smp_affinity is parsed as a hexadecimal bitmask, so the mask must + * be printed with %x: "%d" would turn CPU 4 (mask 16) into 0x16 */ + snprintf (cmdline, sizeof (cmdline), + "echo %x > /proc/irq/%u/smp_affinity", 1U << cpu, irq); + + printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n", + irq, cpu, cmdline); + + /* FIXME: Find a better method of setting IRQ affinity... 
+ */ + + call_usermodehelper (argv[0], argv, envp); +#endif +} + +/* Register the socket behind file descriptor 'fd' as the connection to + * peer 'nid'. Takes a reference on the file, allocates a ksock_conn_t, + * assigns it to a scheduler, binds the NIC's IRQ to that scheduler when + * 'bind_irq' is set and the IRQ is not bound already, then installs the + * ksocknal data_ready/write_space callbacks on the socket. + * Returns 0 on success, -EINVAL if 'fd' cannot be looked up, or -ENOMEM + * if the connection cannot be allocated. */ +int +ksocknal_add_sock (ptl_nid_t nid, int fd, int bind_irq) +{ + unsigned long flags; + ksock_conn_t *conn; + struct file *file = NULL; + struct socket *sock = NULL; + ksock_sched_t *sched = NULL; + unsigned int irq = 0; + struct net_device *dev = NULL; + int ret; + int idx; + ENTRY; + + LASSERT (!in_interrupt()); + + file = fget(fd); + if (file == NULL) + RETURN(-EINVAL); + + ret = -EINVAL; + sock = socki_lookup(file->f_dentry->d_inode); + if (sock == NULL) + GOTO(error, ret); + + ret = -ENOMEM; + PORTAL_ALLOC(conn, sizeof(*conn)); + if (!conn) + GOTO(error, ret); + + sock->sk->allocation = GFP_NOFS; /* don't call into fs for alloc */ + + conn->ksnc_file = file; + conn->ksnc_sock = sock; + conn->ksnc_saved_data_ready = sock->sk->data_ready; + conn->ksnc_saved_write_space = sock->sk->write_space; + conn->ksnc_peernid = nid; + atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for socklist */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + ksocknal_new_packet (conn, 0); + + INIT_LIST_HEAD (&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + +#warning check it is OK to derefence sk->dst_cache->dev like this... 
+ lock_sock (conn->ksnc_sock->sk); + + if (conn->ksnc_sock->sk->dst_cache != NULL) { + dev = conn->ksnc_sock->sk->dst_cache->dev; + if (dev != NULL) { + irq = dev->irq; + if (irq >= NR_IRQS) { + CERROR ("Unexpected IRQ %x\n", irq); + irq = 0; + } + } + } + + release_sock (conn->ksnc_sock->sk); + + write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + + if (irq == 0 || + ksocknal_data.ksnd_irq_info[irq] == SOCKNAL_IRQ_UNASSIGNED) { + /* This is a software NIC, or we haven't associated it with + * a CPU yet */ + + /* Choose the CPU with the fewest connections */ + sched = ksocknal_data.ksnd_schedulers; + for (idx = 1; idx < SOCKNAL_N_SCHED; idx++) + if (sched->kss_nconns > + ksocknal_data.ksnd_schedulers[idx].kss_nconns) + sched = &ksocknal_data.ksnd_schedulers[idx]; + + if (irq != 0) { /* Hardware NIC */ + /* Remember which scheduler we chose */ + idx = sched - ksocknal_data.ksnd_schedulers; + + LASSERT (idx < SOCKNAL_IRQ_SCHED_MASK); + + if (bind_irq) /* remember if we will bind below */ + idx |= SOCKNAL_IRQ_BOUND; + + ksocknal_data.ksnd_irq_info[irq] = idx; + } + } else { + /* This is a hardware NIC, associated with a CPU */ + idx = ksocknal_data.ksnd_irq_info[irq]; + + /* Don't bind again if we've bound already */ + if ((idx & SOCKNAL_IRQ_BOUND) != 0) + bind_irq = 0; + + sched = &ksocknal_data.ksnd_schedulers[idx & SOCKNAL_IRQ_SCHED_MASK]; + } + + sched->kss_nconns++; + conn->ksnc_scheduler = sched; + + list_add(&conn->ksnc_list, &ksocknal_data.ksnd_socklist); + + write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags); + + if (bind_irq && /* irq binding required */ + irq != 0) /* hardware NIC */ + ksocknal_bind_irq (irq, sched - ksocknal_data.ksnd_schedulers); + + /* NOW it's safe to get called back when socket is ready... 
*/ + sock->sk->user_data = conn; + sock->sk->data_ready = ksocknal_data_ready; + sock->sk->write_space = ksocknal_write_space; + + /* ...which I call right now to get things going */ + ksocknal_data_ready (sock->sk, 0); + ksocknal_write_space (sock->sk); + + CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n", + conn, conn->ksnc_peernid); + + /* Can't unload while connection active */ + PORTAL_MODULE_USE; + RETURN(0); + +error: + fput(file); + return (ret); +} + +/* Passing in a zero nid will close all connections */ +int +ksocknal_close_sock(ptl_nid_t nid) +{ + long flags; + ksock_conn_t *conn; + LIST_HEAD (death_row); + struct list_head *tmp; + + LASSERT (!in_interrupt()); + write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + + if (nid == 0) { /* close ALL connections */ + /* insert 'death row' into the socket list... */ + list_add (&death_row, &ksocknal_data.ksnd_socklist); + /* ...extract and reinitialise the socket list itself... */ + list_del_init (&ksocknal_data.ksnd_socklist); + /* ...and voila, death row is the proud owner of all conns */ + } else list_for_each (tmp, &ksocknal_data.ksnd_socklist) { + + conn = list_entry (tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) { + list_del (&conn->ksnc_list); + list_add (&conn->ksnc_list, &death_row); + break; + } + } + + write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags); + + if (nid && list_empty (&death_row)) + return (-ENOENT); + + while (!list_empty (&death_row)) { + conn = list_entry (death_row.next, ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + /* NB I _have_ to restore the callback, rather than storing + * a noop, since the socket could survive past this module + * being unloaded!! */ + conn->ksnc_sock->sk->data_ready = conn->ksnc_saved_data_ready; + conn->ksnc_sock->sk->write_space = conn->ksnc_saved_write_space; + + /* OK; no more callbacks, but they could be in progress now, + * so wait for them to complete... 
*/ + write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + + /* ...however if I get the lock before a callback gets it, + * this will make them noop + */ + conn->ksnc_sock->sk->user_data = NULL; + + /* And drop the scheduler's connection count while I've got + * the exclusive lock */ + conn->ksnc_scheduler->kss_nconns--; + + write_unlock_irqrestore(&ksocknal_data.ksnd_socklist_lock, + flags); + + ksocknal_put_conn (conn); /* drop ref for ksnd_socklist */ + } + + return (0); +} + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +struct tcp_opt *sock2tcp_opt(struct sock *sk) +{ + return &(sk->tp_pinfo.af_tcp); +} +#else +struct tcp_opt *sock2tcp_opt(struct sock *sk) +{ + struct tcp_sock *s = (struct tcp_sock *)sk; + return &s->tcp; +} +#endif + +void +ksocknal_push_conn (ksock_conn_t *conn) +{ + struct sock *sk = conn->ksnc_sock->sk; + struct tcp_opt *tp = sock2tcp_opt(sk); + int nonagle; + int val = 1; + int rc; + mm_segment_t oldmm; + + lock_sock (sk); + nonagle = tp->nonagle; + tp->nonagle = 1; + release_sock (sk); + + oldmm = get_fs (); + set_fs (KERNEL_DS); + + rc = sk->prot->setsockopt (sk, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof (val)); + LASSERT (rc == 0); + + set_fs (oldmm); + + lock_sock (sk); + tp->nonagle = nonagle; + release_sock (sk); +} + +/* Passing in a zero nid pushes all connections */ +int +ksocknal_push_sock (ptl_nid_t nid) +{ + ksock_conn_t *conn; + struct list_head *tmp; + int index; + int i; + + if (nid != 0) { + conn = ksocknal_get_conn (nid); + + if (conn == NULL) + return (-ENOENT); + + ksocknal_push_conn (conn); + ksocknal_put_conn (conn); + + return (0); + } + + /* NB we can't remove connections from the socket list so we have to + * cope with them being removed from under us... 
+ */ + for (index = 0; ; index++) { + read_lock (&ksocknal_data.ksnd_socklist_lock); + + i = 0; + conn = NULL; + + list_for_each (tmp, &ksocknal_data.ksnd_socklist) { + if (i++ == index) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + atomic_inc (&conn->ksnc_refcount); // take a ref + break; + } + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + if (conn == NULL) + break; + + ksocknal_push_conn (conn); + ksocknal_put_conn (conn); + } + + return (0); +} + +ksock_conn_t * +ksocknal_get_conn (ptl_nid_t nid) +{ + struct list_head *tmp; + ksock_conn_t *conn; + + PROF_START(conn_list_walk); + + read_lock (&ksocknal_data.ksnd_socklist_lock); + + list_for_each(tmp, &ksocknal_data.ksnd_socklist) { + + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) { + /* caller is referencing */ + atomic_inc (&conn->ksnc_refcount); + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n", + conn, nid, atomic_read (&conn->ksnc_refcount)); + + PROF_FINISH(conn_list_walk); + return (conn); + } + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", + nid); + PROF_FINISH(conn_list_walk); + return (NULL); +} + +void +ksocknal_close_conn (ksock_conn_t *conn) +{ + CDEBUG (D_NET, "connection [%p] closed \n", conn); + + fput (conn->ksnc_file); + PORTAL_FREE (conn, sizeof (*conn)); + + /* One less connection keeping us hanging on */ + PORTAL_MODULE_UNUSE; +} + +void +_ksocknal_put_conn (ksock_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn); + + /* "But what is the black spot, captain?" I asked. + * "That's a summons, mate..." 
*/ + + LASSERT (atomic_read (&conn->ksnc_refcount) == 0); + LASSERT (conn->ksnc_sock->sk->data_ready != ksocknal_data_ready); + LASSERT (conn->ksnc_sock->sk->write_space != ksocknal_write_space); + LASSERT (conn->ksnc_sock->sk->user_data == NULL); + LASSERT (!conn->ksnc_rx_scheduled); + + if (!in_interrupt()) { + ksocknal_close_conn (conn); + return; + } + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + + list_add (&conn->ksnc_list, &ksocknal_data.ksnd_reaper_list); + wake_up (&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); +} + +int +ksocknal_cmd(struct portal_ioctl_data * data, void * private) +{ + int rc = -EINVAL; + + LASSERT (data != NULL); + + switch(data->ioc_nal_cmd) { + case NAL_CMD_REGISTER_PEER_FD: { + rc = ksocknal_add_sock(data->ioc_nid, data->ioc_fd, + data->ioc_flags); + break; + } + case NAL_CMD_CLOSE_CONNECTION: { + rc = ksocknal_close_sock(data->ioc_nid); + break; + } + case NAL_CMD_REGISTER_MYNID: { + rc = ksocknal_set_mynid (data->ioc_nid); + break; + } + case NAL_CMD_PUSH_CONNECTION: { + rc = ksocknal_push_sock (data->ioc_nid); + break; + } + } + + return rc; +} + +void +ksocknal_free_buffers (void) +{ + if (ksocknal_data.ksnd_fmbs != NULL) { + ksock_fmb_t *fmb = (ksock_fmb_t *)ksocknal_data.ksnd_fmbs; + int i; + int j; + + for (i = 0; + i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); + i++, fmb++) + for (j = 0; j < fmb->fmb_npages; j++) + if (fmb->fmb_pages[j] != NULL) + __free_page (fmb->fmb_pages[j]); + + PORTAL_FREE (ksocknal_data.ksnd_fmbs, + sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS)); + } + + if (ksocknal_data.ksnd_ltxs != NULL) + PORTAL_FREE (ksocknal_data.ksnd_ltxs, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + + SOCKNAL_NNBLK_LTXS)); + + if (ksocknal_data.ksnd_schedulers != NULL) + PORTAL_FREE (ksocknal_data.ksnd_schedulers, + sizeof (ksock_sched_t) * SOCKNAL_N_SCHED); +} + +void __exit +ksocknal_module_fini (void) +{ + int 
i; + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + switch (ksocknal_data.ksnd_init) { + default: + LASSERT (0); + + case SOCKNAL_INIT_ALL: + kportal_nal_unregister(SOCKNAL); + PORTAL_SYMBOL_UNREGISTER (ksocknal_ni); + /* fall through */ + + case SOCKNAL_INIT_PTL: + PtlNIFini(ksocknal_ni); + lib_fini(&ksocknal_lib); + /* fall through */ + + case SOCKNAL_INIT_DATA: + /* Module refcount only gets to zero when all connections + * have been closed so all lists must be empty */ + LASSERT (list_empty (&ksocknal_data.ksnd_socklist)); + LASSERT (list_empty (&ksocknal_data.ksnd_reaper_list)); + LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns)); + LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns)); + + if (ksocknal_data.ksnd_schedulers != NULL) + for (i = 0; i < SOCKNAL_N_SCHED; i++) { + ksock_sched_t *kss = + &ksocknal_data.ksnd_schedulers[i]; + + LASSERT (list_empty (&kss->kss_tx_conns)); + LASSERT (list_empty (&kss->kss_rx_conns)); + LASSERT (kss->kss_nconns == 0); + } + + /* stop router calling me */ + kpr_shutdown (&ksocknal_data.ksnd_router); + + /* flag threads to terminate; wake and wait for them to die */ + ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all (&ksocknal_data.ksnd_reaper_waitq); + + for (i = 0; i < SOCKNAL_N_SCHED; i++) + wake_up_all(&ksocknal_data.ksnd_schedulers[i].kss_waitq); + + while (atomic_read (&ksocknal_data.ksnd_nthreads) != 0) { + CDEBUG (D_NET, "waitinf for %d threads to terminate\n", + atomic_read (&ksocknal_data.ksnd_nthreads)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + + kpr_deregister (&ksocknal_data.ksnd_router); + + ksocknal_free_buffers(); + /* fall through */ + + case SOCKNAL_INIT_NOTHING: + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); +} + + +int __init 
+ksocknal_module_init (void) +{ + int pkmem = atomic_read(&portal_kmemory); + int rc; + int i; + int j; + + /* packet descriptor must fit in a router descriptor's scratchpad */ + LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + + LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + + ksocknal_api.forward = ksocknal_api_forward; + ksocknal_api.shutdown = ksocknal_api_shutdown; + ksocknal_api.yield = ksocknal_api_yield; + ksocknal_api.validate = NULL; /* our api validate is a NOOP */ + ksocknal_api.lock = ksocknal_api_lock; + ksocknal_api.unlock = ksocknal_api_unlock; + ksocknal_api.nal_data = &ksocknal_data; + + ksocknal_lib.nal_data = &ksocknal_data; + + memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ + + INIT_LIST_HEAD(&ksocknal_data.ksnd_socklist); + rwlock_init(&ksocknal_data.ksnd_socklist_lock); + + ksocknal_data.ksnd_nal_cb = &ksocknal_lib; + spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock); + + spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns); + + spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns); + + spin_lock_init(&ksocknal_data.ksnd_idle_ltx_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_nblk_ltx_list); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_ltx_list); + init_waitqueue_head(&ksocknal_data.ksnd_idle_ltx_waitq); + + spin_lock_init (&ksocknal_data.ksnd_reaper_lock); + INIT_LIST_HEAD (&ksocknal_data.ksnd_reaper_list); + init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); + + memset (&ksocknal_data.ksnd_irq_info, SOCKNAL_IRQ_UNASSIGNED, + sizeof (ksocknal_data.ksnd_irq_info)); + + /* flag lists/ptrs/locks initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + + PORTAL_ALLOC(ksocknal_data.ksnd_schedulers, + sizeof(ksock_sched_t) * 
SOCKNAL_N_SCHED); + if (ksocknal_data.ksnd_schedulers == NULL) + RETURN(-ENOMEM); + + for (i = 0; i < SOCKNAL_N_SCHED; i++) { + ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i]; + + spin_lock_init (&kss->kss_lock); + INIT_LIST_HEAD (&kss->kss_rx_conns); + INIT_LIST_HEAD (&kss->kss_tx_conns); +#if SOCKNAL_ZC + INIT_LIST_HEAD (&kss->kss_zctxdone_list); +#endif + init_waitqueue_head (&kss->kss_waitq); + } + + CERROR ("ltx "LPSZ", total "LPSZ"\n", sizeof (ksock_ltx_t), + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + + PORTAL_ALLOC(ksocknal_data.ksnd_ltxs, + sizeof(ksock_ltx_t) * (SOCKNAL_NLTXS +SOCKNAL_NNBLK_LTXS)); + if (ksocknal_data.ksnd_ltxs == NULL) { + ksocknal_module_fini (); + return (-ENOMEM); + } + + /* Deterministic bugs please */ + memset (ksocknal_data.ksnd_ltxs, 0xeb, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + + for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) { + ksock_ltx_t *ltx = &((ksock_ltx_t *)ksocknal_data.ksnd_ltxs)[i]; + + ltx->ltx_idle = i < SOCKNAL_NLTXS ? 
+ &ksocknal_data.ksnd_idle_ltx_list : + &ksocknal_data.ksnd_idle_nblk_ltx_list; + list_add (<x->ltx_tx.tx_list, ltx->ltx_idle); + } + + rc = PtlNIInit(ksocknal_init, 32, 4, 0, &ksocknal_ni); + if (rc != 0) { + CERROR("ksocknal: PtlNIInit failed: error %d\n", rc); + ksocknal_module_fini (); + RETURN (rc); + } + PtlNIDebug(ksocknal_ni, ~0); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_PTL; // flag PtlNIInit() called + + for (i = 0; i < SOCKNAL_N_SCHED; i++) { + rc = ksocknal_thread_start (ksocknal_scheduler, + &ksocknal_data.ksnd_schedulers[i]); + if (rc != 0) { + CERROR("Can't spawn socknal scheduler[%d]: %d\n", + i, rc); + ksocknal_module_fini (); + RETURN (rc); + } + } + + rc = ksocknal_thread_start (ksocknal_reaper, NULL); + if (rc != 0) { + CERROR("Can't spawn socknal reaper: %d\n", rc); + ksocknal_module_fini (); + RETURN (rc); + } + + rc = kpr_register(&ksocknal_data.ksnd_router, + &ksocknal_router_interface); + if (rc != 0) { + CDEBUG(D_NET, "Can't initialise routing interface " + "(rc = %d): not routing\n", rc); + } else { + /* Only allocate forwarding buffers if I'm on a gateway */ + + PORTAL_ALLOC(ksocknal_data.ksnd_fmbs, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS)); + if (ksocknal_data.ksnd_fmbs == NULL) { + ksocknal_module_fini (); + RETURN(-ENOMEM); + } + + /* NULL out buffer pointers etc */ + memset(ksocknal_data.ksnd_fmbs, 0, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS)); + + for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS); i++) { + ksock_fmb_t *fmb = + &((ksock_fmb_t *)ksocknal_data.ksnd_fmbs)[i]; + + if (i < SOCKNAL_SMALL_FWD_NMSGS) { + fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES; + fmb->fmb_pool = &ksocknal_data.ksnd_small_fmp; + } else { + fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES; + fmb->fmb_pool = &ksocknal_data.ksnd_large_fmp; + } + + LASSERT (fmb->fmb_npages > 0); + for (j = 0; j < fmb->fmb_npages; j++) { + fmb->fmb_pages[j] = alloc_page(GFP_KERNEL); + + if 
(fmb->fmb_pages[j] == NULL) { + ksocknal_module_fini (); + return (-ENOMEM); + } + + LASSERT(page_address (fmb->fmb_pages[j]) != + NULL); + } + + list_add(&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + } + } + + rc = kportal_nal_register(SOCKNAL, &ksocknal_cmd, NULL); + if (rc != 0) { + CERROR ("Can't initialise command interface (rc = %d)\n", rc); + ksocknal_module_fini (); + return (rc); + } + + PORTAL_SYMBOL_REGISTER(ksocknal_ni); + + /* flag everything initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + + printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial " + "mem %d)\n", + kpr_routing (&ksocknal_data.ksnd_router) ? + "enabled" : "disabled", pkmem); + + return (0); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01"); +MODULE_LICENSE("GPL"); + +module_init(ksocknal_module_init); +module_exit(ksocknal_module_fini); + +EXPORT_SYMBOL (ksocknal_ni); diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h new file mode 100644 index 0000000..86cdeb0 --- /dev/null +++ b/lnet/klnds/socklnd/socklnd.h @@ -0,0 +1,292 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_PORTAL_ALLOC +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_SOCKNAL + +#include +#include +#include + +#define SOCKNAL_N_SCHED num_online_cpus() /* # socknal schedulers */ + +#if PTL_LARGE_MTU +# define SOCKNAL_MAX_FWD_PAYLOAD (256<<10) /* biggest payload I can forward */ +#else +# define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */ +#endif + +#define SOCKNAL_NLTXS 128 /* # normal transmit messages */ +#define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */ + +#define SOCKNAL_SMALL_FWD_NMSGS 128 /* # small messages I can be forwarding at any time */ +#define SOCKNAL_LARGE_FWD_NMSGS 64 /* # large messages I can be forwarding at any time */ + +#define SOCKNAL_SMALL_FWD_PAGES 1 /* # pages in a small message fwd buffer */ + +#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT) + /* # pages in a large message fwd buffer */ + +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ + +#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10) + +typedef struct /* pool of forwarding buffers */ +{ + spinlock_t fmp_lock; /* serialise */ + struct list_head fmp_idle_fmbs; /* buffers waiting for a connection */ + struct list_head fmp_blocked_conns; /* connections waiting for a buffer */ +} ksock_fmb_pool_t; + + +typedef struct /* per scheduler state */ +{ + spinlock_t kss_lock; /* serialise */ + struct list_head kss_rx_conns; /* conn waiting to be read */ + struct list_head kss_tx_conns; /* conn waiting to be written */ +#if 
SOCKNAL_ZC + struct list_head kss_zctxdone_list; /* completed ZC transmits */ +#endif + wait_queue_head_t kss_waitq; /* where scheduler sleeps */ + int kss_nconns; /* # connections assigned to this scheduler */ +} ksock_sched_t; + +typedef struct { + int ksnd_init; /* initialisation state */ + + struct list_head ksnd_socklist; /* all my connections */ + rwlock_t ksnd_socklist_lock; /* stabilise add/find/remove */ + + nal_cb_t *ksnd_nal_cb; + spinlock_t ksnd_nal_cb_lock; /* lib cli/sti lock */ + + atomic_t ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + ksock_sched_t *ksnd_schedulers; /* scheduler state */ + + kpr_router_t ksnd_router; /* THE router */ + + void *ksnd_fmbs; /* all the pre-allocated FMBs */ + ksock_fmb_pool_t ksnd_small_fmp; /* small message forwarding buffers */ + ksock_fmb_pool_t ksnd_large_fmp; /* large message forwarding buffers */ + + void *ksnd_ltxs; /* all the pre-allocated LTXs */ + spinlock_t ksnd_idle_ltx_lock; /* serialise ltx alloc/free */ + struct list_head ksnd_idle_ltx_list; /* where to get an idle LTX */ + struct list_head ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */ + wait_queue_head_t ksnd_idle_ltx_waitq; /* where to block for an idle LTX */ + + struct list_head ksnd_reaper_list; /* conn waiting to be reaped */ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + spinlock_t ksnd_reaper_lock; /* serialise */ + unsigned char ksnd_irq_info[NR_IRQS]; /* irq->scheduler lookup */ +} ksock_nal_data_t; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_PTL 2 +#define SOCKNAL_INIT_ALL 3 + +#define SOCKNAL_IRQ_BOUND 0x80 /* flag we _did_ bind already */ +#define SOCKNAL_IRQ_SCHED_MASK 0x7f /* we assume < 127 CPUs */ +#define SOCKNAL_IRQ_UNASSIGNED 0xff /* flag unassigned */ + +/* A packet just assembled for transmission is represented by 1 or more + * struct iovec fragments and 0 or more ptl_kiov_t fragments. 
Forwarded + * messages, or messages from an MD with PTL_MD_KIOV _not_ set have 0 + * ptl_kiov_t fragments. Messages from an MD with PTL_MD_KIOV set, have 1 + * struct iovec fragment (the header) and up to PTL_MD_MAX_IOV ptl_kiov_t + * fragments. + * + * On the receive side, initially 1 struct iovec fragment is posted for + * receive (the header). Once the header has been received, if the message + * requires forwarding or will be received into mapped memory, up to + * PTL_MD_MAX_IOV struct iovec fragments describe the target memory. + * Otherwise up to PTL_MD_MAX_IOV ptl_kiov_t fragments are used. + */ + +typedef struct /* transmit packet */ +{ + struct list_head tx_list; /* queue on conn for transmission etc */ + char tx_isfwd; /* forwarding / sourced here */ + int tx_nob; /* # packet bytes */ + int tx_niov; /* # packet iovec frags */ + struct iovec *tx_iov; /* packet iovec frags */ + int tx_nkiov; /* # packet page frags */ + ptl_kiov_t *tx_kiov; /* packet page frags */ +#if SOCKNAL_ZC + ksock_sched_t *tx_sched; /* who to wake on callback */ + zccd_t tx_zccd; /* zero copy callback descriptor */ +#endif +} ksock_tx_t; + +#define KSOCK_ZCCD_2_TX(ptr) list_entry (ptr, ksock_tx_t, tx_zccd) +/* network zero copy callback descriptor embedded in ksock_tx_t */ + +/* space for the tx frag descriptors: hdr is always 1 iovec + * and payload is PTL_MD_MAX of either type. 
*/ +typedef struct +{ + struct iovec hdr; + union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + } payload; +} ksock_txiovspace_t; + +typedef struct /* locally transmitted packet */ +{ + ksock_tx_t ltx_tx; /* send info */ + struct list_head *ltx_idle; /* where to put when idle */ + void *ltx_private; /* lib_finalize() callback arg */ + void *ltx_cookie; /* lib_finalize() callback arg */ + ksock_txiovspace_t ltx_iov_space; /* where to stash frag descriptors */ + ptl_hdr_t ltx_hdr; /* buffer for packet header */ +} ksock_ltx_t; + +#define KSOCK_TX_2_KPR_FWD_DESC(ptr) list_entry ((kprfd_scratch_t *)ptr, kpr_fwd_desc_t, kprfd_scratch) +/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */ + +#define KSOCK_TX_2_KSOCK_LTX(ptr) list_entry (ptr, ksock_ltx_t, ltx_tx) +/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */ + +/* NB list_entry() is used here as convenient macro for calculating a + * pointer to a struct from the address of a member. + */ + +typedef struct /* Kernel portals Socket Forwarding message buffer */ +{ /* (socknal->router) */ + struct list_head fmb_list; /* queue idle */ + kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ + int fmb_npages; /* # pages allocated */ + ksock_fmb_pool_t *fmb_pool; /* owning pool */ + struct page *fmb_pages[SOCKNAL_LARGE_FWD_PAGES]; + struct iovec fmb_iov[SOCKNAL_LARGE_FWD_PAGES]; +} ksock_fmb_t; + +/* space for the rx frag descriptors; we either read a single contiguous + * header, or PTL_MD_MAX_IOV frags of payload of either type. 
*/ +typedef union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; +} ksock_rxiovspace_t; + +#define SOCKNAL_RX_HEADER 1 /* reading header */ +#define SOCKNAL_RX_BODY 2 /* reading body (to deliver here) */ +#define SOCKNAL_RX_BODY_FWD 3 /* reading body (to forward) */ +#define SOCKNAL_RX_SLOP 4 /* skipping body */ +#define SOCKNAL_RX_GET_FMB 5 /* scheduled for forwarding */ +#define SOCKNAL_RX_FMB_SLEEP 6 /* blocked waiting for a fwd desc */ + +typedef struct +{ + struct list_head ksnc_list; /* stash on global socket list */ + struct file *ksnc_file; /* socket filp */ + struct socket *ksnc_sock; /* actual socket */ + void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ + void *ksnc_saved_write_space; /* socket's original write_space() callback */ + ptl_nid_t ksnc_peernid; /* who's on the other end */ + atomic_t ksnc_refcount; /* # users */ + ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ + + /* READER */ + struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + volatile int ksnc_rx_ready; /* data ready to read */ + int ksnc_rx_scheduled; /* being progressed */ + int ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # iovec frags */ + struct iovec *ksnc_rx_iov; /* the iovec frags */ + int ksnc_rx_nkiov; /* # page frags */ + ptl_kiov_t *ksnc_rx_kiov; /* the page frags */ + ksock_rxiovspace_t ksnc_rx_iov_space; /* space for frag descriptors */ + void *ksnc_cookie; /* rx lib_finalize passthru arg */ + ptl_hdr_t ksnc_hdr; /* where I read headers into */ + + /* WRITER */ + struct list_head ksnc_tx_list; /* where I enq waiting for output space */ + struct list_head ksnc_tx_queue; /* packets waiting to be sent */ + volatile int ksnc_tx_ready; /* write space */ + int ksnc_tx_scheduled; /* being progressed */ + +} ksock_conn_t; + +extern int 
ksocknal_add_sock (ptl_nid_t nid, int fd, int client); +extern int ksocknal_close_sock(ptl_nid_t nid); +extern int ksocknal_set_mynid(ptl_nid_t nid); +extern int ksocknal_push_sock(ptl_nid_t nid); +extern ksock_conn_t *ksocknal_get_conn (ptl_nid_t nid); +extern void _ksocknal_put_conn (ksock_conn_t *conn); +extern void ksocknal_close_conn (ksock_conn_t *conn); + +static inline void +ksocknal_put_conn (ksock_conn_t *conn) +{ + CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", + conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount)); + + if (atomic_dec_and_test (&conn->ksnc_refcount)) + _ksocknal_put_conn (conn); +} + +extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg); +extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); +extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +extern int ksocknal_scheduler (void *arg); +extern int ksocknal_reaper (void *arg); +extern void ksocknal_data_ready(struct sock *sk, int n); +extern void ksocknal_write_space(struct sock *sk); + + +extern nal_cb_t ksocknal_lib; +extern ksock_nal_data_t ksocknal_data; diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c new file mode 100644 index 0000000..6147d8a --- /dev/null +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -0,0 +1,1613 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socknal.h" + +atomic_t ksocknal_packets_received; +atomic_t ksocknal_packets_launched; +atomic_t ksocknal_packets_being_sent; + +#if SOCKNAL_ZC +int ksocknal_do_zc = 1; +int ksocknal_zc_min_frag = 2048; +#endif + +/* + * LIB functions follow + * + */ +int +ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, + void *src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev) +{ + CDEBUG(D_NET, LPX64": callback eq %p ev %p\n", + nal->ni.nid, eq, ev); + + if (eq->event_callback != NULL) + eq->event_callback(ev); + + return 0; +} + +void * +ksocknal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + + if (buf != NULL) + memset(buf, 0, len); + + return (buf); +} + +void +ksocknal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + +void +ksocknal_printf(nal_cb_t *nal, const char *fmt, ...) 
+{ + va_list ap; + char msg[256]; + + va_start (ap, fmt); + vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ + va_end (ap); + + msg[sizeof (msg) - 1] = 0; /* ensure terminated */ + + CDEBUG (D_NET, "%s", msg); +} + +void +ksocknal_cli(nal_cb_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *data = nal->nal_data; + + spin_lock(&data->ksnd_nal_cb_lock); +} + +void +ksocknal_sti(nal_cb_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *data; + data = nal->nal_data; + + spin_unlock(&data->ksnd_nal_cb_lock); +} + +int +ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* I would guess that if ksocknal_get_conn(nid) == NULL, + and we're not routing, then 'nid' is very distant :) */ + if ( nal->ni.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +ksock_ltx_t * +ksocknal_get_ltx (int may_block) +{ + long flags; + ksock_ltx_t *ltx = NULL; + + for (;;) { + spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags); + + if (!list_empty (&ksocknal_data.ksnd_idle_ltx_list)) { + ltx = list_entry(ksocknal_data.ksnd_idle_ltx_list.next, + ksock_ltx_t, ltx_tx.tx_list); + list_del (&ltx->ltx_tx.tx_list); + break; + } + + if (!may_block) { + if (!list_empty(&ksocknal_data.ksnd_idle_nblk_ltx_list)) { + ltx = list_entry(ksocknal_data.ksnd_idle_nblk_ltx_list.next, + ksock_ltx_t, ltx_tx.tx_list); + list_del (&ltx->ltx_tx.tx_list); + } + break; + } + + spin_unlock_irqrestore(&ksocknal_data.ksnd_idle_ltx_lock, + flags); + + wait_event (ksocknal_data.ksnd_idle_ltx_waitq, + !list_empty (&ksocknal_data.ksnd_idle_ltx_list)); + } + + spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags); + + return (ltx); +} + +#if SOCKNAL_ZC +struct page * +ksocknal_kvaddr_to_page (unsigned long vaddr) +{ + struct page *page; + + if (vaddr >= VMALLOC_START && + vaddr < VMALLOC_END) + page = vmalloc_to_page ((void *)vaddr); +#if CONFIG_HIGHMEM + else if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) + page = 
vmalloc_to_page ((void *)vaddr); + /* in 2.4 ^ just walks the page tables */ +#endif + else + page = virt_to_page (vaddr); + + if (page == NULL || + !VALID_PAGE (page)) + return (NULL); + + return (page); +} +#endif + +int +ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more) +{ + struct iovec *iov = tx->tx_iov; + int fragsize = iov->iov_len; + unsigned long vaddr = (unsigned long)iov->iov_base; +#if SOCKNAL_ZC + int offset = vaddr & (PAGE_SIZE - 1); + int zcsize = MIN (fragsize, PAGE_SIZE - offset); + struct page *page; +#endif + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only send 1 frag at a time. */ + LASSERT (fragsize <= tx->tx_nob); + LASSERT (tx->tx_niov > 0); + more |= (tx->tx_niov > 1); + +#if SOCKNAL_ZC + if (ksocknal_do_zc && + (sock->sk->route_caps & NETIF_F_SG) && + (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && + zcsize >= ksocknal_zc_min_frag && + (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) { + + CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n", + (void *)vaddr, page, page_address(page), offset, zcsize); + + more |= (zcsize < fragsize); + + rc = tcp_sendpage_zccd(sock, page, offset, zcsize, + more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT, + &tx->tx_zccd); + } else +#endif + { + /* NB don't pass tx's iov; sendmsg may or may not update it */ + struct iovec fragiov = { .iov_base = (void *)vaddr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = more ? 
(MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT + }; + mm_segment_t oldmm = get_fs(); + + set_fs (KERNEL_DS); + rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize); + set_fs (oldmm); + } + + if (rc <= 0) + return (rc); + + tx->tx_nob -= rc; + + if (rc < fragsize) { + /* didn't send whole frag */ + iov->iov_base = (void *)(vaddr + rc); + iov->iov_len = fragsize - rc; + return (-EAGAIN); + } + + /* everything went */ + LASSERT (rc == fragsize); + tx->tx_iov++; + tx->tx_niov--; + return (1); +} + +int +ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more) +{ + ptl_kiov_t *kiov = tx->tx_kiov; + int fragsize = kiov->kiov_len; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only send 1 frag at a time. */ + LASSERT (fragsize <= tx->tx_nob); + LASSERT (offset + fragsize <= PAGE_SIZE); + LASSERT (tx->tx_nkiov > 0); + more |= (tx->tx_nkiov > 1); + +#if SOCKNAL_ZC + if (ksocknal_do_zc && + (sock->sk->route_caps & NETIF_F_SG) && + (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && + fragsize >= ksocknal_zc_min_frag) { + + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, fragsize); + + rc = tcp_sendpage_zccd(sock, page, offset, fragsize, + more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT, + &tx->tx_zccd); + } else +#endif + { + char *addr = ((char *)kmap (page)) + offset; + struct iovec fragiov = {.iov_base = addr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = more ? 
(MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT + }; + mm_segment_t oldmm = get_fs(); + + set_fs (KERNEL_DS); + rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize); + set_fs (oldmm); + kunmap (page); + } + + if (rc <= 0) + return (rc); + + tx->tx_nob -= rc; + + if (rc < fragsize) { + /* didn't send whole frag */ + kiov->kiov_offset = offset + rc; + kiov->kiov_len = fragsize - rc; + return (-EAGAIN); + } + + /* everything went */ + LASSERT (rc == fragsize); + tx->tx_kiov++; + tx->tx_nkiov--; + return (1); +} + +int +ksocknal_sendmsg (struct socket *sock, ksock_tx_t *tx, int more) +{ + int rc; + int sent_some = 0; + ENTRY; + + LASSERT (!in_interrupt()); + + for (;;) { + if (tx->tx_niov != 0) + rc = ksocknal_send_iov (sock, tx, more || tx->tx_nkiov != 0); + else + rc = ksocknal_send_kiov (sock, tx, more); + + /* Interpret a zero rc the same as -EAGAIN (Adaptech TOE) */ + if (rc <= 0) /* error or partial send */ + RETURN ((sent_some || rc == -EAGAIN) ? 0 : rc); + + if (tx->tx_nob == 0) /* sent everything */ + RETURN (0); + + sent_some = 1; + } +} + +int +ksocknal_recv_iov (ksock_conn_t *conn) +{ + struct iovec *iov = conn->ksnc_rx_iov; + int fragsize = iov->iov_len; + unsigned long vaddr = (unsigned long)iov->iov_base; + struct iovec fragiov = { .iov_base = (void *)vaddr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + mm_segment_t oldmm = get_fs(); + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only receive 1 frag at a time. 
*/ + LASSERT (conn->ksnc_rx_niov > 0); + LASSERT (fragsize <= conn->ksnc_rx_nob_wanted); + + set_fs (KERNEL_DS); + rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT); + /* NB this is just a boolean............................^ */ + set_fs (oldmm); + + if (rc <= 0) + return (rc); + + conn->ksnc_rx_nob_wanted -= rc; + conn->ksnc_rx_nob_left -= rc; + + if (rc < fragsize) { + iov->iov_base = (void *)(vaddr + rc); + iov->iov_len = fragsize - rc; + return (-EAGAIN); + } + + LASSERT (rc == fragsize); + conn->ksnc_rx_iov++; + conn->ksnc_rx_niov--; + return (1); +} + +int +ksocknal_recv_kiov (ksock_conn_t *conn) +{ + ptl_kiov_t *kiov = conn->ksnc_rx_kiov; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int fragsize = kiov->kiov_len; + unsigned long vaddr = ((unsigned long)kmap (page)) + offset; + struct iovec fragiov = { .iov_base = (void *)vaddr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + mm_segment_t oldmm = get_fs(); + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only receive 1 frag at a time. 
*/ + LASSERT (fragsize <= conn->ksnc_rx_nob_wanted); + LASSERT (conn->ksnc_rx_nkiov > 0); + LASSERT (offset + fragsize <= PAGE_SIZE); + + set_fs (KERNEL_DS); + rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT); + /* NB this is just a boolean............................^ */ + set_fs (oldmm); + kunmap (page); + + if (rc <= 0) + return (rc); + + conn->ksnc_rx_nob_wanted -= rc; + conn->ksnc_rx_nob_left -= rc; + + if (rc < fragsize) { + kiov->kiov_offset = offset + rc; + kiov->kiov_len = fragsize - rc; + return (-EAGAIN); + } + + LASSERT (rc == fragsize); + conn->ksnc_rx_kiov++; + conn->ksnc_rx_nkiov--; + return (1); +} + +int +ksocknal_recvmsg (ksock_conn_t *conn) +{ + int rc; + int got_some = 0; + ENTRY; + + LASSERT (!in_interrupt ()); + + for (;;) { + LASSERT (conn->ksnc_rx_nob_wanted > 0); + + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov (conn); + else + rc = ksocknal_recv_kiov (conn); + + /* CAVEAT EMPTOR: we return... + * <= 0 for error (0 == EOF) and > 0 for success (unlike sendmsg()) */ + + if (rc <= 0) /* error/EOF or partial receive */ + RETURN ((got_some || rc == -EAGAIN) ? 1 : rc); + + if (conn->ksnc_rx_nob_wanted == 0) + RETURN (1); + + got_some = 0; + } +} + +#if SOCKNAL_ZC +void +ksocknal_zc_callback (zccd_t *zcd) +{ + ksock_tx_t *tx = KSOCK_ZCCD_2_TX(zcd); + ksock_sched_t *sched = tx->tx_sched; + unsigned long flags; + ENTRY; + + /* Schedule tx for cleanup (can't do it now due to lock conflicts) */ + + spin_lock_irqsave (&sched->kss_lock, flags); + + list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list); + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + + spin_unlock_irqrestore (&sched->kss_lock, flags); + EXIT; +} +#endif + +void +ksocknal_tx_done (ksock_tx_t *tx) +{ + long flags; + ksock_ltx_t *ltx; + ENTRY; + + atomic_dec (&ksocknal_packets_being_sent); + + if (tx->tx_isfwd) { /* was a forwarded packet? 
*/ + kpr_fwd_done (&ksocknal_data.ksnd_router, + KSOCK_TX_2_KPR_FWD_DESC (tx), 0); + EXIT; + return; + } + + /* local send */ + ltx = KSOCK_TX_2_KSOCK_LTX (tx); + + lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie); + + spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags); + + list_add_tail (&ltx->ltx_tx.tx_list, ltx->ltx_idle); + + /* normal tx desc => wakeup anyone blocking for one */ + if (ltx->ltx_idle == &ksocknal_data.ksnd_idle_ltx_list && + waitqueue_active (&ksocknal_data.ksnd_idle_ltx_waitq)) + wake_up (&ksocknal_data.ksnd_idle_ltx_waitq); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags); + EXIT; +} + +void +ksocknal_process_transmit (ksock_sched_t *sched, long *irq_flags) +{ + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + + LASSERT (!list_empty (&sched->kss_tx_conns)); + conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list); + list_del (&conn->ksnc_tx_list); + + LASSERT (conn->ksnc_tx_scheduled); + LASSERT (conn->ksnc_tx_ready); + LASSERT (!list_empty (&conn->ksnc_tx_queue)); + tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list); + /* assume transmit will complete now, so dequeue while I've got lock */ + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&sched->kss_lock, *irq_flags); + + LASSERT (tx->tx_nob > 0); + + conn->ksnc_tx_ready = 0;/* write_space may race with me and set ready */ + mb(); /* => clear BEFORE trying to write */ + + rc = ksocknal_sendmsg (conn->ksnc_sock, tx, + !list_empty (&conn->ksnc_tx_queue)); /* more to come? */ + + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc); + + if (rc != 0) { +#warning FIXME: handle socket errors properly + CERROR("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc); + /* kid on for now the whole packet went. + * NB when we handle the error better, we'll still need to + * block for zccd completion. 
+ */ + tx->tx_nob = 0; + } + + if (tx->tx_nob == 0) /* nothing left to send */ + { + /* everything went; assume more can go, so prevent write_space locking */ + conn->ksnc_tx_ready = 1; + + ksocknal_put_conn (conn); /* release packet's ref */ + atomic_inc (&ksocknal_packets_being_sent); +#if SOCKNAL_ZC + if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { + /* zccd skbufs are still in-flight. Release my + * initial ref on zccd, so callback can occur */ + zccd_put (&tx->tx_zccd); + } else +#endif + ksocknal_tx_done (tx); + + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + } else { + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + + /* back onto HEAD of tx_queue */ + list_add (&tx->tx_list, &conn->ksnc_tx_queue); + } + + if (!conn->ksnc_tx_ready || /* no space to write now */ + list_empty (&conn->ksnc_tx_queue)) {/* nothing to write */ + conn->ksnc_tx_scheduled = 0; /* not being scheduled */ + ksocknal_put_conn (conn); /* release scheduler's ref */ + } else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); +} + +void +ksocknal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx) +{ + unsigned long flags; + ksock_sched_t *sched = conn->ksnc_scheduler; + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. + * + * We always expect at least 1 mapped fragment containing the + * complete portals header. 
+ */ + LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) + + lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob); + LASSERT (tx->tx_niov >= 1); + LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t)); + + CDEBUG (D_NET, "type %d, nob %d niov %d nkiov %d\n", + ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, tx->tx_nob, + tx->tx_niov, tx->tx_nkiov); + +#if SOCKNAL_ZC + zccd_init (&tx->tx_zccd, ksocknal_zc_callback); + /* NB this sets 1 ref on zccd, so the callback can only occur + * after I've released this ref */ + tx->tx_sched = sched; +#endif + spin_lock_irqsave (&sched->kss_lock, flags); + + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) { /* not scheduled to send */ + list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */ + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + + atomic_inc (&ksocknal_packets_launched); +} + +ksock_conn_t * +ksocknal_send_target (ptl_nid_t nid) +{ + ptl_nid_t gatewaynid; + ksock_conn_t *conn; + int rc; + + if ((conn = ksocknal_get_conn (nid)) == NULL) { + /* It's not a peer; try to find a gateway */ + rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &gatewaynid); + if (rc != 0) { + CERROR("Can't route to "LPX64": router error %d\n", + nid, rc); + return (NULL); + } + + if ((conn = ksocknal_get_conn (gatewaynid)) == NULL) { + CERROR ("Can't route to "LPX64": gateway "LPX64 + " is not a peer\n", nid, gatewaynid); + return (NULL); + } + } + + return (conn); +} + +ksock_ltx_t * +ksocknal_setup_hdr (nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type) +{ + ksock_ltx_t *ltx; + + /* I may not block for a transmit descriptor if I might block the + * receiver, or an interrupt handler. 
*/ + ltx = ksocknal_get_ltx (!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt ())); + if (ltx == NULL) { + CERROR ("Can't allocate tx desc\n"); + return (NULL); + } + + /* Init local send packet (storage for hdr, finalize() args) */ + ltx->ltx_hdr = *hdr; + ltx->ltx_private = private; + ltx->ltx_cookie = cookie; + + /* Init common ltx_tx */ + ltx->ltx_tx.tx_isfwd = 0; + ltx->ltx_tx.tx_nob = sizeof (*hdr); + + /* We always have 1 mapped frag for the header */ + ltx->ltx_tx.tx_niov = 1; + ltx->ltx_tx.tx_iov = &ltx->ltx_iov_space.hdr; + ltx->ltx_tx.tx_iov[0].iov_base = &ltx->ltx_hdr; + ltx->ltx_tx.tx_iov[0].iov_len = sizeof (ltx->ltx_hdr); + + ltx->ltx_tx.tx_kiov = NULL; + ltx->ltx_tx.tx_nkiov = 0; + + return (ltx); +} + +int +ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, struct iovec *payload_iov, + size_t payload_len) +{ + ksock_ltx_t *ltx; + ksock_conn_t *conn; + + /* NB 'private' is different depending on what we're sending. + * Just ignore it until we can rely on it + * + * Also, the return code from this procedure is ignored. + * If we can't send, we must still complete with lib_finalize(). + * We'll have to wait for 3.2 to return an error event. 
+ */ + + CDEBUG(D_NET, + "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64 + " pid %d\n", payload_len, payload_niov, nid, pid); + + conn = ksocknal_send_target (nid); + if (conn == NULL) { + lib_finalize (&ksocknal_lib, private, cookie); + return (-1); + } + + ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type); + if (ltx == NULL) { + ksocknal_put_conn (conn); + lib_finalize (&ksocknal_lib, private, cookie); + return (-1); + } + + /* append the payload_iovs to the one pointing at the header */ + LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + memcpy (ltx->ltx_tx.tx_iov + 1, payload_iov, + payload_niov * sizeof (*payload_iov)); + ltx->ltx_tx.tx_niov = 1 + payload_niov; + ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len; + + ksocknal_launch_packet (conn, &ltx->ltx_tx); + return (0); +} + +int +ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, ptl_kiov_t *payload_iov, size_t payload_len) +{ + ksock_ltx_t *ltx; + ksock_conn_t *conn; + + /* NB 'private' is different depending on what we're sending. 
+ * Just ignore it until we can rely on it */ + + CDEBUG(D_NET, + "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64" pid %d\n", + payload_len, payload_niov, nid, pid); + + conn = ksocknal_send_target (nid); + if (conn == NULL) + return (-1); + + ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type); + if (ltx == NULL) { + ksocknal_put_conn (conn); + return (-1); + } + + LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + ltx->ltx_tx.tx_kiov = ltx->ltx_iov_space.payload.kiov; + memcpy (ltx->ltx_tx.tx_kiov, payload_iov, + payload_niov * sizeof (*payload_iov)); + ltx->ltx_tx.tx_nkiov = payload_niov; + ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len; + + ksocknal_launch_packet (conn, &ltx->ltx_tx); + return (0); +} + +void +ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + ksock_conn_t *conn; + ptl_nid_t nid = fwd->kprfd_gateway_nid; + ksock_tx_t *tx = (ksock_tx_t *)&fwd->kprfd_scratch; + + CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, + fwd->kprfd_gateway_nid, fwd->kprfd_target_nid); + + /* I'm the gateway; must be the last hop */ + if (nid == ksocknal_lib.ni.nid) + nid = fwd->kprfd_target_nid; + + conn = ksocknal_get_conn (nid); + if (conn == NULL) { + CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid); + kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, -EHOSTUNREACH); + return; + } + + /* This forward has now got a ref on conn */ + + tx->tx_isfwd = 1; /* This is a forwarding packet */ + tx->tx_nob = fwd->kprfd_nob; + tx->tx_niov = fwd->kprfd_niov; + tx->tx_iov = fwd->kprfd_iov; + tx->tx_nkiov = 0; + tx->tx_kiov = NULL; + + ksocknal_launch_packet (conn, tx); +} + +int +ksocknal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&ksocknal_data.ksnd_nthreads); + return (0); +} + +void +ksocknal_thread_fini (void) +{ + atomic_dec (&ksocknal_data.ksnd_nthreads); +} + +void 
+ksocknal_fmb_callback (void *arg, int error) +{ + ksock_fmb_t *fmb = (ksock_fmb_t *)arg; + ksock_fmb_pool_t *fmp = fmb->fmb_pool; + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]); + ksock_conn_t *conn = NULL; + ksock_sched_t *sched; + long flags; + + if (error != 0) + CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n", + NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid), + error); + else + CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n", + NTOH__u64 (hdr->src_nid), NTOH__u64 (hdr->dest_nid)); + + spin_lock_irqsave (&fmp->fmp_lock, flags); + + list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs); + + if (!list_empty (&fmp->fmp_blocked_conns)) { + conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, + ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + } + + spin_unlock_irqrestore (&fmp->fmp_lock, flags); + + if (conn == NULL) + return; + + CDEBUG (D_NET, "Scheduling conn %p\n", conn); + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP); + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; + + sched = conn->ksnc_scheduler; + + spin_lock_irqsave (&sched->kss_lock, flags); + + list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns); + + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + + spin_unlock_irqrestore (&sched->kss_lock, flags); +} + +ksock_fmb_t * +ksocknal_get_idle_fmb (ksock_conn_t *conn) +{ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + long flags; + ksock_fmb_pool_t *pool; + ksock_fmb_t *fmb; + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + LASSERT (ksocknal_data.ksnd_fmbs != NULL); + + if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) + pool = &ksocknal_data.ksnd_small_fmp; + else + pool = &ksocknal_data.ksnd_large_fmp; + + spin_lock_irqsave (&pool->fmp_lock, flags); + + if (!list_empty (&pool->fmp_idle_fmbs)) { + fmb = list_entry(pool->fmp_idle_fmbs.next, + ksock_fmb_t, 
fmb_list); + list_del (&fmb->fmb_list); + spin_unlock_irqrestore (&pool->fmp_lock, flags); + + return (fmb); + } + + /* deschedule until fmb free */ + + conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP; + + list_add_tail (&conn->ksnc_rx_list, + &pool->fmp_blocked_conns); + + spin_unlock_irqrestore (&pool->fmp_lock, flags); + return (NULL); +} + + +int +ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) +{ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid); + int niov; /* at least the header */ + int nob; + + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left); + LASSERT (payload_nob >= 0); + LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE); + LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE); + + /* Got a forwarding buffer; copy the header we just read into the + * forwarding buffer. If there's payload, start reading it + * into the buffer, otherwise the forwarding buffer can be kicked + * off immediately. + * + * NB fmb->fmb_iov spans the WHOLE packet. + * conn->ksnc_rx_iov spans just the payload. 
+ */ + + fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]); + + /* copy header */ + memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); + + if (payload_nob == 0) { /* got complete packet already */ + atomic_inc (&ksocknal_packets_received); + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", + conn, NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid, packet_nob); + + fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t); + + kpr_fwd_init (&fmb->fmb_fwd, dest_nid, + packet_nob, 1, fmb->fmb_iov, + ksocknal_fmb_callback, fmb); + + /* forward it now */ + kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd); + + ksocknal_new_packet (conn, 0); /* on to next packet */ + return (1); + } + + niov = 1; + if (packet_nob <= PAGE_SIZE) { /* whole packet fits in first page */ + fmb->fmb_iov[0].iov_len = packet_nob; + } else { + fmb->fmb_iov[0].iov_len = PAGE_SIZE; + nob = packet_nob - PAGE_SIZE; + + do { + LASSERT (niov < fmb->fmb_npages); + fmb->fmb_iov[niov].iov_base = + page_address (fmb->fmb_pages[niov]); + fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob); + nob -= PAGE_SIZE; + niov++; + } while (nob > 0); + } + + kpr_fwd_init (&fmb->fmb_fwd, dest_nid, + packet_nob, niov, fmb->fmb_iov, + ksocknal_fmb_callback, fmb); + + /* stash router's descriptor ready for call to kpr_fwd_start */ + conn->ksnc_cookie = &fmb->fmb_fwd; + + conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ + + /* payload is desc's iov-ed buffer, but skipping the hdr */ + LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) / + sizeof (struct iovec)); + + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = + (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + + sizeof (ptl_hdr_t)); + conn->ksnc_rx_iov[0].iov_len = + fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t); + + if (niov > 1) + memcpy(&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], + (niov - 1) * sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + + CDEBUG 
(D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn, + NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, payload_nob); + return (0); +} + +void +ksocknal_fwd_parse (ksock_conn_t *conn) +{ + ksock_conn_t *conn2; + ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid); /* byte-swapped once; used for every lookup/log below */ + int body_len = NTOH__u32 (PTL_HDR_LENGTH(&conn->ksnc_hdr)); + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn, + NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid, conn->ksnc_rx_nob_left); + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER); + LASSERT (conn->ksnc_rx_scheduled); + + if (body_len < 0) { /* length corrupt (overflow) */ + CERROR("dropping packet from "LPX64" for "LPX64": packet " + "size %d illegal\n", NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid, body_len); + ksocknal_new_packet (conn, 0); /* on to new packet */ + return; + } + + if (ksocknal_data.ksnd_fmbs == NULL) { /* not forwarding */ + CERROR("dropping packet from "LPX64" for "LPX64": not " + "forwarding\n", NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid); + /* on to new packet (skip this one's body) */ + ksocknal_new_packet (conn, body_len); + return; + } + + if (body_len > SOCKNAL_MAX_FWD_PAYLOAD) { /* too big to forward */ + CERROR ("dropping packet from "LPX64" for "LPX64 + ": packet size %d too big\n", NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid, body_len); + /* on to new packet (skip this one's body) */ + ksocknal_new_packet (conn, body_len); + return; + } + + /* should have gone direct (look the peer up with the byte-swapped NID) */ + conn2 = ksocknal_get_conn (dest_nid); + if (conn2 != NULL) { + CERROR ("dropping packet from "LPX64" for "LPX64 + ": target is a peer\n", NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid); + ksocknal_put_conn (conn2); /* drop ref from get above */ + + /* on to next packet (skip this one's body) */ + ksocknal_new_packet (conn, body_len); + return; + } + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; /* Getting FMB now */ + conn->ksnc_rx_nob_left = body_len; /* stash packet size */ + 
conn->ksnc_rx_nob_wanted = body_len; /* (no slop) */ +} + +int +ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) +{ + static char ksocknal_slop_buffer[4096]; + + int nob; + int niov; + int skipped; + + if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_state = SOCKNAL_RX_HEADER; + conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t); + conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t); + + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr; + conn->ksnc_rx_iov[0].iov_len = sizeof (ptl_hdr_t); + conn->ksnc_rx_niov = 1; + + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + return (1); + } + + /* Set up to skip as much as possible now. If there's more left + * (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + skipped = 0; + niov = 0; + + do { + nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_nob_wanted = skipped; + return (0); +} + +void +ksocknal_process_receive (ksock_sched_t *sched, long *irq_flags) +{ + ksock_conn_t *conn; + ksock_fmb_t *fmb; + int rc; + + /* NB: sched->kss_lock held */ + + LASSERT (!list_empty (&sched->kss_rx_conns)); + conn = list_entry(sched->kss_rx_conns.next, ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + + spin_unlock_irqrestore (&sched->kss_lock, *irq_flags); + + CDEBUG(D_NET, "sched %p conn %p\n", sched, conn); + LASSERT (atomic_read (&conn->ksnc_refcount) > 0); + LASSERT 
(conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_ready); + + /* doesn't need a forwarding buffer */ + if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB) + goto try_read; + + get_fmb: + fmb = ksocknal_get_idle_fmb (conn); + if (fmb == NULL) { /* conn descheduled waiting for idle fmb */ + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + return; + } + + if (ksocknal_init_fmb (conn, fmb)) /* packet forwarded ? */ + goto out; /* come back later for next packet */ + + try_read: + /* NB: sched lock NOT held */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_BODY || + conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + + LASSERT (conn->ksnc_rx_nob_wanted > 0); + + conn->ksnc_rx_ready = 0;/* data ready may race with me and set ready */ + mb(); /* => clear BEFORE trying to read */ + + rc = ksocknal_recvmsg(conn); + + if (rc == 0) + goto out; + if (rc < 0) { +#warning FIXME: handle socket errors properly + CERROR ("Error socknal read %p: %d\n", conn, rc); + goto out; + } + + if (conn->ksnc_rx_nob_wanted != 0) /* short read */ + goto out; /* try again later */ + + /* got all I wanted, assume there's more - prevent data_ready locking */ + conn->ksnc_rx_ready = 1; + + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_HEADER: + /* It's not for me */ + if (conn->ksnc_hdr.type != PTL_MSG_HELLO && + NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) { + ksocknal_fwd_parse (conn); + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_HEADER: /* skipped (zero payload) */ + goto out; /* => come back later */ + case SOCKNAL_RX_SLOP: /* skipping packet's body */ + goto try_read; /* => go read it */ + case SOCKNAL_RX_GET_FMB: /* forwarding */ + goto get_fmb; /* => go get a fwd msg buffer */ + default: + LBUG (); + } + /* Not Reached */ + } + + PROF_START(lib_parse); + /* sets wanted_len, iovs etc */ + lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); + PROF_FINISH(lib_parse); + + if 
(conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */ + conn->ksnc_rx_state = SOCKNAL_RX_BODY; + goto try_read; /* go read the payload */ + } + /* Fall through (completed packet for me) */ + + case SOCKNAL_RX_BODY: + atomic_inc (&ksocknal_packets_received); + /* packet is done now */ + lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie); + /* Fall through */ + + case SOCKNAL_RX_SLOP: + /* starting new packet? */ + if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left)) + goto out; /* come back later */ + goto try_read; /* try to finish reading slop now */ + + case SOCKNAL_RX_BODY_FWD: + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", + conn, NTOH__u64 (conn->ksnc_hdr.src_nid), + NTOH__u64 (conn->ksnc_hdr.dest_nid), + conn->ksnc_rx_nob_left); + + atomic_inc (&ksocknal_packets_received); + + /* ksocknal_init_fmb() put router desc. in conn->ksnc_cookie */ + kpr_fwd_start (&ksocknal_data.ksnd_router, + (kpr_fwd_desc_t *)conn->ksnc_cookie); + + /* no slop in forwarded packets */ + LASSERT (conn->ksnc_rx_nob_left == 0); + + ksocknal_new_packet (conn, 0); /* on to next packet */ + goto out; /* (later) */ + + default: + } + + /* Not Reached */ + LBUG (); + + out: + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + + /* no data there to read? 
*/ + if (!conn->ksnc_rx_ready) { + /* let socket callback schedule again */ + conn->ksnc_rx_scheduled = 0; + ksocknal_put_conn (conn); /* release scheduler's ref */ + } else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns); +} + +int +ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + + LASSERT (mlen <= rlen); + LASSERT (niov <= PTL_MD_MAX_IOV); + + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + memcpy (conn->ksnc_rx_iov, iov, niov * sizeof (*iov)); + + LASSERT (mlen == + lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + return (rlen); +} + +int +ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + + LASSERT (mlen <= rlen); + LASSERT (niov <= PTL_MD_MAX_IOV); + + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_iov = NULL; + conn->ksnc_rx_nkiov = niov; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + memcpy (conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov)); + + LASSERT (mlen == + lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + return (rlen); +} + +int ksocknal_scheduler (void *arg) +{ + ksock_sched_t *sched = (ksock_sched_t *)arg; + unsigned long flags; + int rc; + int nloops = 0; + int id = sched - ksocknal_data.ksnd_schedulers; + char name[16]; +#if (CONFIG_SMP && CPU_AFFINITY) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + int cpu = 
cpu_logical_map(id % num_online_cpus()); +#else +#warning "Take care of architecure specific logical APIC map" + int cpu = 1; /* Have to change later. */ +#endif /* LINUX_VERSION_CODE */ + + set_cpus_allowed (current, 1 << cpu); + id = cpu; +#endif /* CONFIG_SMP && CPU_AFFINITY */ + + snprintf (name, sizeof (name),"ksocknald[%d]", id); + kportal_daemonize (name); + kportal_blockallsigs (); + + spin_lock_irqsave (&sched->kss_lock, flags); + + while (!ksocknal_data.ksnd_shuttingdown) { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty (&sched->kss_rx_conns)) { + did_something = 1; + /* drops & regains kss_lock */ + ksocknal_process_receive (sched, &flags); + } + + if (!list_empty (&sched->kss_tx_conns)) { + did_something = 1; + /* drops and regains kss_lock */ + ksocknal_process_transmit (sched, &flags); + } +#if SOCKNAL_ZC + if (!list_empty (&sched->kss_zctxdone_list)) { + ksock_tx_t *tx = + list_entry(sched->kss_zctxdone_list.next, + ksock_tx_t, tx_list); + did_something = 1; + + list_del (&tx->tx_list); + spin_unlock_irqrestore (&sched->kss_lock, flags); + + ksocknal_tx_done (tx); + + spin_lock_irqsave (&sched->kss_lock, flags); + } +#endif + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? 
*/ + spin_unlock_irqrestore (&sched->kss_lock, flags); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ +#if SOCKNAL_ZC + rc = wait_event_interruptible (sched->kss_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty(&sched->kss_rx_conns) || + !list_empty(&sched->kss_tx_conns) || + !list_empty(&sched->kss_zctxdone_list)); +#else + rc = wait_event_interruptible (sched->kss_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty(&sched->kss_rx_conns) || + !list_empty(&sched->kss_tx_conns)); +#endif + LASSERT (rc == 0); + } else + our_cond_resched(); + + spin_lock_irqsave (&sched->kss_lock, flags); + } + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + ksocknal_thread_fini (); + return (0); +} + +void +ksocknal_data_ready (struct sock *sk, int n) +{ + unsigned long flags; + ksock_conn_t *conn; + ksock_sched_t *sched; + ENTRY; + + /* interleave correctly with closing sockets... */ + read_lock (&ksocknal_data.ksnd_socklist_lock); + + conn = sk->user_data; + if (conn == NULL) { /* raced with ksocknal_close_sock */ + LASSERT (sk->data_ready != &ksocknal_data_ready); + sk->data_ready (sk, n); + } else if (!conn->ksnc_rx_ready) { /* new news */ + /* Set ASAP in case of concurrent calls to me */ + conn->ksnc_rx_ready = 1; + + sched = conn->ksnc_scheduler; + + spin_lock_irqsave (&sched->kss_lock, flags); + + /* Set again (process_receive may have cleared while I blocked for the lock) */ + conn->ksnc_rx_ready = 1; + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + EXIT; +} + +void +ksocknal_write_space (struct sock *sk) +{ + unsigned long flags; + ksock_conn_t *conn; + 
ksock_sched_t *sched; + + /* interleave correctly with closing sockets... */ + read_lock (&ksocknal_data.ksnd_socklist_lock); + + conn = sk->user_data; + + CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", + sk, tcp_wspace(sk), SOCKNAL_TX_LOW_WATER(sk), conn, + (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ? + " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? + " scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? + " empty" : " queued")); + + if (conn == NULL) { /* raced with ksocknal_close_sock */ + LASSERT (sk->write_space != &ksocknal_write_space); + sk->write_space (sk); + } else if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */ + clear_bit (SOCK_NOSPACE, &sk->socket->flags); + + if (!conn->ksnc_tx_ready) { /* new news */ + /* Set ASAP in case of concurrent calls to me */ + conn->ksnc_tx_ready = 1; + + sched = conn->ksnc_scheduler; + + spin_lock_irqsave (&sched->kss_lock, flags); + + /* Set again (process_transmit may have + cleared while I blocked for the lock) */ + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && // not being progressed + !list_empty(&conn->ksnc_tx_queue)){//packets to send + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + } + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); +} + +int +ksocknal_reaper (void *arg) +{ + unsigned long flags; + ksock_conn_t *conn; + int rc; + + kportal_daemonize ("ksocknal_reaper"); + kportal_blockallsigs (); + + while (!ksocknal_data.ksnd_shuttingdown) { + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + + if (list_empty (&ksocknal_data.ksnd_reaper_list)) { + conn = NULL; + } else { + conn = list_entry 
(ksocknal_data.ksnd_reaper_list.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + } + + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); + + if (conn != NULL) + ksocknal_close_conn (conn); + else { + rc = wait_event_interruptible (ksocknal_data.ksnd_reaper_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty(&ksocknal_data.ksnd_reaper_list)); + LASSERT (rc == 0); + } + } + + ksocknal_thread_fini (); + return (0); +} + +nal_cb_t ksocknal_lib = { + nal_data: &ksocknal_data, /* NAL private data */ + cb_send: ksocknal_send, + cb_send_pages: ksocknal_send_pages, + cb_recv: ksocknal_recv, + cb_recv_pages: ksocknal_recv_pages, + cb_read: ksocknal_read, + cb_write: ksocknal_write, + cb_callback: ksocknal_callback, + cb_malloc: ksocknal_malloc, + cb_free: ksocknal_free, + cb_printf: ksocknal_printf, + cb_cli: ksocknal_cli, + cb_sti: ksocknal_sti, + cb_dist: ksocknal_dist +}; diff --git a/lnet/klnds/toelnd/.cvsignore b/lnet/klnds/toelnd/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lnet/klnds/toelnd/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lnet/klnds/toelnd/Makefile.am b/lnet/klnds/toelnd/Makefile.am new file mode 100644 index 0000000..9bfff64 --- /dev/null +++ b/lnet/klnds/toelnd/Makefile.am @@ -0,0 +1,13 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = ktoenal +modulenet_DATA = ktoenal.o +EXTRA_PROGRAMS = ktoenal + +DEFS = +ktoenal_SOURCES = toenal.c toenal_cb.c toenal.h diff --git a/lnet/klnds/toelnd/toenal.c b/lnet/klnds/toelnd/toenal.c new file mode 100644 index 0000000..1f5dc38 --- /dev/null +++ b/lnet/klnds/toelnd/toenal.c @@ -0,0 +1,629 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. 
+ * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * Author: Kedar Sovani + * Author: Amey Inamdar + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#include +#include "toenal.h" + +ptl_handle_ni_t ktoenal_ni; +static nal_t ktoenal_api; +static ksock_nal_data_t ktoenal_data; + +/* +ksocknal_interface_t ktoenal_interface = { + ksni_add_sock: ktoenal_add_sock, + ksni_close_sock: ktoenal_close_sock, + ksni_set_mynid: ktoenal_set_mynid, +}; +*/ + +kpr_nal_interface_t ktoenal_router_interface = { + kprni_nalid: TOENAL, + kprni_arg: &ktoenal_data, + kprni_fwd: ktoenal_fwd_packet, +}; + + +int +ktoenal_api_forward(nal_t *nal, int id, void *args, size_t args_len, + void *ret, size_t ret_len) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + + lib_dispatch(nal_cb, k, id, args, ret); /* ktoenal_send needs k */ + return PTL_OK; +} + +int +ktoenal_api_shutdown(nal_t *nal, int ni) +{ + CDEBUG (D_NET, "closing all connections\n"); + + return ktoenal_close_sock(0); /* close all sockets */ +} + +void +ktoenal_api_yield(nal_t *nal) +{ + our_cond_resched(); + return; +} + +void +ktoenal_api_lock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + 
nal_cb->cb_cli(nal_cb,flags); +} + +void +ktoenal_api_unlock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_sti(nal_cb,flags); +} + +nal_t * +ktoenal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", + ktoenal_data.ksnd_mynid); + lib_init(&ktoenal_lib, ktoenal_data.ksnd_mynid, 0, 10, ptl_size, + ac_size); + return (&ktoenal_api); +} + +/* + * EXTRA functions follow + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define SOCKET_I(inode) (&(inode)->u.socket_i) +#endif +static __inline__ struct socket * +socki_lookup(struct inode *inode) +{ + return SOCKET_I(inode); +} + +int +ktoenal_set_mynid(ptl_nid_t nid) +{ + lib_ni_t *ni = &ktoenal_lib.ni; + + /* FIXME: we have to do this because we call lib_init() at module + * insertion time, which is before we have 'mynid' available. lib_init + * sets the NAL's nid, which it uses to tell other nodes where packets + * are coming from. This is not a very graceful solution to this + * problem. 
*/ + + CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", nid, ni->nid); + + ktoenal_data.ksnd_mynid = nid; + ni->nid = nid; + return (0); +} + +int +ktoenal_add_sock (ptl_nid_t nid, int fd) +{ + unsigned long flags; + ksock_conn_t *conn; + struct file *file = NULL; + struct socket *sock = NULL; + int ret; + ENTRY; + + file = fget(fd); + if (file == NULL) + RETURN(-EINVAL); + + ret = -EINVAL; + sock = socki_lookup(file->f_dentry->d_inode); + if (sock == NULL) + GOTO(error, ret); + + ret = -ENOMEM; + PORTAL_ALLOC(conn, sizeof(*conn)); + if (!conn) + GOTO(error, ret); + + memset (conn, 0, sizeof (conn)); /* zero for consistency */ + file->f_flags |= O_NONBLOCK; /* Does this have any conflicts */ + conn->ksnc_file = file; + conn->ksnc_sock = sock; + conn->ksnc_peernid = nid; + atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for socklist */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + ktoenal_new_packet (conn, 0); + + INIT_LIST_HEAD (&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + + LASSERT (!in_interrupt()); + write_lock_irqsave (&ktoenal_data.ksnd_socklist_lock, flags); + + list_add(&conn->ksnc_list, &ktoenal_data.ksnd_socklist); + write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags); + + ktoenal_data_ready(conn); + ktoenal_write_space(conn); + + ktoenal_data.ksnd_slistchange = 1; + wake_up_process(ktoenal_data.ksnd_pollthread_tsk); + /* Schedule pollthread so that it will poll + * for newly created socket + */ + + + CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n", + conn, conn->ksnc_peernid); + + /* Can't unload while connection active */ + PORTAL_MODULE_USE; + RETURN(0); + +error: + fput(file); + return (ret); +} + +/* Passing in a zero nid will close all connections */ +int +ktoenal_close_sock(ptl_nid_t nid) +{ + long flags; + ksock_conn_t *conn; + LIST_HEAD (death_row); + struct list_head *tmp; + + LASSERT (!in_interrupt()); + write_lock_irqsave 
(&ktoenal_data.ksnd_socklist_lock, flags); + + if (nid == 0) /* close ALL connections */ + { + /* insert 'death row' into the socket list... */ + list_add (&death_row, &ktoenal_data.ksnd_socklist); + /* ...extract and reinitialise the socket list itself... */ + list_del_init (&ktoenal_data.ksnd_socklist); + /* ...and voila, death row is the proud owner of all conns */ + } else list_for_each (tmp, &ktoenal_data.ksnd_socklist) { + + conn = list_entry (tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) + { + list_del (&conn->ksnc_list); + list_add (&conn->ksnc_list, &death_row); + break; + } + } + + + write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags); + + if (list_empty (&death_row)) + return (-ENOENT); + + do { + conn = list_entry (death_row.next, ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + ktoenal_put_conn (conn); /* drop ref for ksnd_socklist */ + } while (!list_empty (&death_row)); + + ktoenal_data.ksnd_slistchange = 1; + wake_up_process(ktoenal_data.ksnd_pollthread_tsk); + + return (0); +} + + +ksock_conn_t * +ktoenal_get_conn (ptl_nid_t nid) +{ + struct list_head *tmp; + ksock_conn_t *conn; + + PROF_START(conn_list_walk); + + read_lock (&ktoenal_data.ksnd_socklist_lock); + + list_for_each(tmp, &ktoenal_data.ksnd_socklist) { + + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) + { + /* caller is referencing */ + atomic_inc (&conn->ksnc_refcount); + + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n", + conn, nid, atomic_read (&conn->ksnc_refcount)); + + PROF_FINISH(conn_list_walk); + return (conn); + } + } + + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", nid); + PROF_FINISH(conn_list_walk); + return (NULL); +} + +void +ktoenal_close_conn (ksock_conn_t *conn) +{ + CDEBUG (D_NET, "connection [%p] closed \n", conn); + + fput (conn->ksnc_file); + PORTAL_FREE (conn, 
sizeof (*conn)); + /* One less connection keeping us hanging on */ + PORTAL_MODULE_UNUSE; +} + +void +_ktoenal_put_conn (ksock_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn); + + /* "But what is the black spot, captain?" I asked. + * "That's a summons, mate..." */ + + LASSERT (atomic_read (&conn->ksnc_refcount) == 0); + LASSERT (!conn->ksnc_rx_scheduled); + + if (!in_interrupt()) + { + ktoenal_close_conn (conn); + return; + } + + spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags); + + list_add (&conn->ksnc_list, &ktoenal_data.ksnd_reaper_list); + wake_up (&ktoenal_data.ksnd_reaper_waitq); + + spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags); +} + +void +ktoenal_free_buffers (void) +{ + if (ktoenal_data.ksnd_fmbs != NULL) + { + ksock_fmb_t *fmb = (ksock_fmb_t *)ktoenal_data.ksnd_fmbs; + int i; + int j; + + for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++, fmb++) + for (j = 0; j < fmb->fmb_npages; j++) + if (fmb->fmb_pages[j] != NULL) + __free_page (fmb->fmb_pages[j]); + + PORTAL_FREE (ktoenal_data.ksnd_fmbs, + sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS)); + } + + if (ktoenal_data.ksnd_ltxs != NULL) + PORTAL_FREE (ktoenal_data.ksnd_ltxs, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); +} + +int +ktoenal_cmd(struct portal_ioctl_data * data, void * private) +{ + int rc = -EINVAL; + + LASSERT (data != NULL); + + switch(data->ioc_nal_cmd) { + case NAL_CMD_REGISTER_PEER_FD: { + rc = ktoenal_add_sock(data->ioc_nid, data->ioc_fd); + break; + } + case NAL_CMD_CLOSE_CONNECTION: { + rc = ktoenal_close_sock(data->ioc_nid); + break; + } + case NAL_CMD_REGISTER_MYNID: { + rc = ktoenal_set_mynid (data->ioc_nid); + break; + } + } + + return rc; +} + + +void __exit +ktoenal_module_fini (void) +{ + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + switch (ktoenal_data.ksnd_init) + { + default: + 
LASSERT (0); + + case SOCKNAL_INIT_ALL: + kportal_nal_unregister(TOENAL); + PORTAL_SYMBOL_UNREGISTER (ktoenal_ni); + /* fall through */ + + case SOCKNAL_INIT_PTL: + PtlNIFini(ktoenal_ni); + lib_fini(&ktoenal_lib); + /* fall through */ + + case SOCKNAL_INIT_DATA: + /* Module refcount only gets to zero when all connections + * have been closed so all lists must be empty */ + LASSERT (list_empty (&ktoenal_data.ksnd_socklist)); + LASSERT (list_empty (&ktoenal_data.ksnd_reaper_list)); + LASSERT (list_empty (&ktoenal_data.ksnd_rx_conns)); + LASSERT (list_empty (&ktoenal_data.ksnd_tx_conns)); + LASSERT (list_empty (&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns)); + LASSERT (list_empty (&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns)); + + kpr_shutdown (&ktoenal_data.ksnd_router); /* stop router calling me */ + + /* flag threads to terminate; wake and wait for them to die */ + ktoenal_data.ksnd_shuttingdown = 1; + wake_up_all (&ktoenal_data.ksnd_reaper_waitq); + wake_up_all (&ktoenal_data.ksnd_sched_waitq); + wake_up_process(ktoenal_data.ksnd_pollthread_tsk); + + while (atomic_read (&ktoenal_data.ksnd_nthreads) != 0) + { + CDEBUG (D_NET, "waitinf for %d threads to terminate\n", + atomic_read (&ktoenal_data.ksnd_nthreads)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + + kpr_deregister (&ktoenal_data.ksnd_router); + + ktoenal_free_buffers(); + /* fall through */ + + case SOCKNAL_INIT_NOTHING: + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); +} + +int __init +ktoenal_module_init (void) +{ + int pkmem = atomic_read(&portal_kmemory); + int rc; + int i; + int j; + + /* packet descriptor must fit in a router descriptor's scratchpad */ + LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + + LASSERT (ktoenal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + + ktoenal_api.forward = 
ktoenal_api_forward; + ktoenal_api.shutdown = ktoenal_api_shutdown; + ktoenal_api.yield = ktoenal_api_yield; + ktoenal_api.validate = NULL; /* our api validate is a NOOP */ + ktoenal_api.lock = ktoenal_api_lock; + ktoenal_api.unlock = ktoenal_api_unlock; + ktoenal_api.nal_data = &ktoenal_data; + + ktoenal_lib.nal_data = &ktoenal_data; + + memset (&ktoenal_data, 0, sizeof (ktoenal_data)); /* zero pointers */ + + INIT_LIST_HEAD(&ktoenal_data.ksnd_socklist); + rwlock_init(&ktoenal_data.ksnd_socklist_lock); + + ktoenal_data.ksnd_nal_cb = &ktoenal_lib; + spin_lock_init (&ktoenal_data.ksnd_nal_cb_lock); + + spin_lock_init (&ktoenal_data.ksnd_sched_lock); + + init_waitqueue_head (&ktoenal_data.ksnd_sched_waitq); + + INIT_LIST_HEAD (&ktoenal_data.ksnd_rx_conns); + INIT_LIST_HEAD (&ktoenal_data.ksnd_tx_conns); + + INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns); + INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns); + + INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_nblk_ltx_list); + INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_ltx_list); + init_waitqueue_head(&ktoenal_data.ksnd_idle_ltx_waitq); + + INIT_LIST_HEAD (&ktoenal_data.ksnd_reaper_list); + init_waitqueue_head(&ktoenal_data.ksnd_reaper_waitq); + spin_lock_init (&ktoenal_data.ksnd_reaper_lock); + + ktoenal_data.ksnd_init = SOCKNAL_INIT_DATA; /* flag lists/ptrs/locks initialised */ + + PORTAL_ALLOC(ktoenal_data.ksnd_fmbs, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS)); + if (ktoenal_data.ksnd_fmbs == NULL) + RETURN(-ENOMEM); + + /* NULL out buffer pointers etc */ + memset(ktoenal_data.ksnd_fmbs, 0, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS)); + + for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++) + { + ksock_fmb_t *fmb = &((ksock_fmb_t *)ktoenal_data.ksnd_fmbs)[i]; + + if (i < 
SOCKNAL_SMALL_FWD_NMSGS) + { + fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES; + fmb->fmb_pool = &ktoenal_data.ksnd_small_fmp; + } + else + { + fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES; + fmb->fmb_pool = &ktoenal_data.ksnd_large_fmp; + } + + LASSERT (fmb->fmb_npages > 0); + for (j = 0; j < fmb->fmb_npages; j++) + { + fmb->fmb_pages[j] = alloc_page(GFP_KERNEL); + + if (fmb->fmb_pages[j] == NULL) + { + ktoenal_module_fini (); + return (-ENOMEM); + } + + LASSERT (page_address (fmb->fmb_pages[j]) != NULL); + } + + list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + } + + PORTAL_ALLOC(ktoenal_data.ksnd_ltxs, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + if (ktoenal_data.ksnd_ltxs == NULL) + { + ktoenal_module_fini (); + return (-ENOMEM); + } + + /* Deterministic bugs please */ + memset (ktoenal_data.ksnd_ltxs, 0xeb, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + + for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) + { + ksock_ltx_t *ltx = &((ksock_ltx_t *)ktoenal_data.ksnd_ltxs)[i]; + + ltx->ltx_idle = i < SOCKNAL_NLTXS ? 
&ktoenal_data.ksnd_idle_ltx_list : + &ktoenal_data.ksnd_idle_nblk_ltx_list; + list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle); + } + + rc = PtlNIInit(ktoenal_init, 32, 4, 0, &ktoenal_ni); + if (rc != 0) + { + CERROR("ktoenal: PtlNIInit failed: error %d\n", rc); + ktoenal_module_fini (); + RETURN (rc); + } + PtlNIDebug(ktoenal_ni, ~0); + + ktoenal_data.ksnd_init = SOCKNAL_INIT_PTL; /* flag PtlNIInit() called */ + + ktoenal_data.ksnd_slistchange = 1; + for (i = 0; i < TOENAL_N_SCHED; i++) + { + rc = ktoenal_thread_start (ktoenal_scheduler, NULL); + if (rc != 0) + { + CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc); + ktoenal_module_fini (); + RETURN (rc); + } + } + + rc = ktoenal_thread_start (ktoenal_reaper, NULL); + if (rc != 0) + { + CERROR("Can't spawn socknal reaper: %d\n", rc); + ktoenal_module_fini (); + RETURN (rc); + } + + rc = ktoenal_thread_start (ktoenal_pollthread, NULL); + if (rc != 0) + { + CERROR("Can't spawn socknal pollthread: %d\n", rc); + ktoenal_module_fini (); + RETURN (rc); + } + + rc = kpr_register(&ktoenal_data.ksnd_router, + &ktoenal_router_interface); + if (rc != 0) + CDEBUG (D_NET, "Can't initialise routing interface (rc = %d): not routing\n", rc); + + rc = kportal_nal_register(TOENAL, &ktoenal_cmd, NULL); + if (rc != 0) + CDEBUG(D_NET, "Can't initialise command interface (rc = %d)\n", + rc); + + PORTAL_SYMBOL_REGISTER(ktoenal_ni); + + /* flag everything initialised */ + ktoenal_data.ksnd_init = SOCKNAL_INIT_ALL; + + printk(KERN_INFO"Routing TOE NAL loaded (Routing %s, initial mem %d)\n", + kpr_routing(&ktoenal_data.ksnd_router) ? "enabled" : "disabled", + pkmem); + + return (0); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. 
"); +MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01"); +MODULE_LICENSE("GPL"); + +module_init(ktoenal_module_init); +module_exit(ktoenal_module_fini); + +EXPORT_SYMBOL (ktoenal_ni); diff --git a/lnet/klnds/toelnd/toenal.h b/lnet/klnds/toelnd/toenal.h new file mode 100644 index 0000000..f793d3b --- /dev/null +++ b/lnet/klnds/toelnd/toenal.h @@ -0,0 +1,236 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * Author: Kedar Sovani + * Author: Amey Inamdar + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#define DEBUG_PORTAL_ALLOC +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_SOCKNAL + +#include +#include +#include + +#define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */ + +#define SOCKNAL_NLTXS 128 /* # normal transmit messages */ +#define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */ + +#define SOCKNAL_SMALL_FWD_NMSGS 128 /* # small messages I can be forwarding at any time */ +#define SOCKNAL_LARGE_FWD_NMSGS 32 /* # large messages I can be forwarding at any time */ + +#define SOCKNAL_SMALL_FWD_PAGES 1 /* # pages in a small message fwd buffer */ + +#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT) + /* # pages in a large message fwd buffer */ + +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ + +#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10) + +#define TOENAL_N_SCHED 1 + +typedef struct /* pool of forwarding buffers */ +{ + struct list_head fmp_idle_fmbs; /* buffers waiting for a connection */ + struct list_head fmp_blocked_conns; /* connections waiting for a buffer */ +} ksock_fmb_pool_t; + +typedef struct { + int ksnd_init; /* initialisation state */ + + struct list_head ksnd_socklist; /* all my connections */ + rwlock_t ksnd_socklist_lock; /* stabilise add/find/remove */ + + + ptl_nid_t ksnd_mynid; + nal_cb_t *ksnd_nal_cb; + spinlock_t ksnd_nal_cb_lock; /* lib cli/sti lock */ + + atomic_t ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + + kpr_router_t ksnd_router; /* THE router */ + + spinlock_t ksnd_sched_lock; /* serialise packet scheduling */ + wait_queue_head_t ksnd_sched_waitq; /* where scheduler(s) wait */ + + struct list_head ksnd_rx_conns; /* conn waiting to 
be read */ + struct list_head ksnd_tx_conns; /* conn waiting to be written */ + + void *ksnd_fmbs; /* all the pre-allocated FMBs */ + ksock_fmb_pool_t ksnd_small_fmp; /* small message forwarding buffers */ + ksock_fmb_pool_t ksnd_large_fmp; /* large message forwarding buffers */ + + void *ksnd_ltxs; /* all the pre-allocated LTXs */ + struct list_head ksnd_idle_ltx_list; /* where to get an idle LTX */ + struct list_head ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */ + wait_queue_head_t ksnd_idle_ltx_waitq; /* where to block for an idle LTX */ + + struct list_head ksnd_reaper_list; /* conn waiting to be reaped */ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + spinlock_t ksnd_reaper_lock; /* serialise */ + + struct task_struct *ksnd_pollthread_tsk;/* task_struct for the poll thread */ + poll_table ksnd_pwait; /* poll wait table for the socket */ + int ksnd_slistchange; /* informs the pollthread that + * the socklist has changed */ +} ksock_nal_data_t; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_PTL 2 +#define SOCKNAL_INIT_ALL 3 + +typedef struct /* transmit packet */ +{ + struct list_head tx_list; /* queue on conn for transmission etc */ + char tx_isfwd; /* forwarding / sourced here */ + int tx_nob; /* # packet bytes */ + int tx_niov; /* # packet frags */ + struct iovec *tx_iov; /* packet frags */ +} ksock_tx_t; + +typedef struct /* locally transmitted packet */ +{ + ksock_tx_t ltx_tx; /* send info */ + struct list_head *ltx_idle; /* where to put when idle */ + void *ltx_private; /* lib_finalize() callback arg */ + void *ltx_cookie; /* lib_finalize() callback arg */ + struct iovec ltx_iov[1 + PTL_MD_MAX_IOV]; /* msg frags */ + ptl_hdr_t ltx_hdr; /* buffer for packet header */ +} ksock_ltx_t; + +#define KSOCK_TX_2_KPR_FWD_DESC(ptr) list_entry (ptr, kpr_fwd_desc_t, kprfd_scratch) +/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */ + +#define 
KSOCK_TX_2_KSOCK_LTX(ptr) list_entry (ptr, ksock_ltx_t, ltx_tx) +/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */ + +/* NB list_entry() is used here as a convenient macro for calculating a + * pointer to a struct from the address of a member. + */ + +typedef struct /* Kernel portals Socket Forwarding message buffer */ +{ /* (socknal->router) */ + struct list_head fmb_list; /* queue idle */ + kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ + int fmb_npages; /* # pages allocated */ + ksock_fmb_pool_t *fmb_pool; /* owning pool */ + struct page *fmb_pages[SOCKNAL_LARGE_FWD_PAGES]; + struct iovec fmb_iov[SOCKNAL_LARGE_FWD_PAGES]; +} ksock_fmb_t; + +#define SOCKNAL_RX_HEADER 1 /* reading header */ +#define SOCKNAL_RX_BODY 2 /* reading body (to deliver here) */ +#define SOCKNAL_RX_BODY_FWD 3 /* reading body (to forward) */ +#define SOCKNAL_RX_SLOP 4 /* skipping body */ +#define SOCKNAL_RX_GET_FMB 5 /* scheduled for forwarding */ +#define SOCKNAL_RX_FMB_SLEEP 6 /* blocked waiting for a fwd desc */ + +typedef struct +{ + struct list_head ksnc_list; /* stash on global socket list */ + struct file *ksnc_file; /* socket filp */ + struct socket *ksnc_sock; /* socket */ + ptl_nid_t ksnc_peernid; /* who's on the other end */ + atomic_t ksnc_refcount; /* # users */ + + /* READER */ + struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + unsigned long ksnc_rx_ready; /* data ready to read */ + int ksnc_rx_scheduled; /* being progressed */ + int ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # frags */ + struct iovec ksnc_rx_iov[1 + PTL_MD_MAX_IOV]; /* the frags */ + + void *ksnc_cookie; /* rx lib_finalize passthru arg */ + ptl_hdr_t ksnc_hdr; /* where I read headers into */ + + /* WRITER */ + struct list_head ksnc_tx_list; /* where I enq waiting for output space */ + struct list_head ksnc_tx_queue; /* 
packets waiting to be sent */ + unsigned long ksnc_tx_ready; /* write space */ + int ksnc_tx_scheduled; /* being progressed */ + +} ksock_conn_t; + +extern int ktoenal_add_sock (ptl_nid_t nid, int fd); +extern int ktoenal_close_sock(ptl_nid_t nid); +extern int ktoenal_set_mynid(ptl_nid_t nid); +extern int ktoenal_push_sock(ptl_nid_t nid); +extern ksock_conn_t *ktoenal_get_conn (ptl_nid_t nid); +extern void _ktoenal_put_conn (ksock_conn_t *conn); +extern void ktoenal_close_conn (ksock_conn_t *conn); + +static inline void +ktoenal_put_conn (ksock_conn_t *conn) +{ + CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", + conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount)); + + if (atomic_dec_and_test (&conn->ksnc_refcount)) + _ktoenal_put_conn (conn); +} + +extern int ktoenal_thread_start (int (*fn)(void *arg), void *arg); +extern int ktoenal_new_packet (ksock_conn_t *conn, int skip); +extern void ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +extern int ktoenal_scheduler (void *arg); +extern int ktoenal_reaper (void *arg); +extern int ktoenal_pollthread (void *arg); +extern void ktoenal_data_ready(ksock_conn_t *conn); +extern void ktoenal_write_space(ksock_conn_t *conn); + + +extern nal_cb_t ktoenal_lib; +extern ksock_nal_data_t ktoenal_data; diff --git a/lnet/klnds/toelnd/toenal_cb.c b/lnet/klnds/toelnd/toenal_cb.c new file mode 100644 index 0000000..ec37f6f --- /dev/null +++ b/lnet/klnds/toelnd/toenal_cb.c @@ -0,0 +1,1219 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. 
Braam + * Author: Phil Schwan + * Author: Eric Barton + * Author: Kedar Sovani + * Author: Amey Inamdar + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include "toenal.h" + +atomic_t ktoenal_packets_received; +long ktoenal_packets_launched; +long ktoenal_packets_transmitted; + +/* + * LIB functions follow + * + */ +int +ktoenal_read(nal_cb_t *nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ktoenal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, + void *src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ktoenal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev) +{ + CDEBUG(D_NET, LPX64": callback eq %p ev %p\n", + nal->ni.nid, eq, ev); + + if (eq->event_callback != NULL) + eq->event_callback(ev); + + return 0; +} + +void * +ktoenal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + + if (buf != NULL) + memset(buf, 0, len); + + return (buf); +} + +void +ktoenal_free(nal_cb_t *nal, void *buf, size_t len) +{ + 
PORTAL_FREE(buf, len);
+}
+
+void
+ktoenal_printf(nal_cb_t *nal, const char *fmt, ...)
+{
+        va_list   ap;
+        char      msg[256];
+
+        va_start (ap, fmt);
+        vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */
+        va_end (ap);
+
+        msg[sizeof (msg) - 1] = 0;              /* ensure terminated */
+
+        CDEBUG (D_NET, "%s", msg);
+}
+
+void
+ktoenal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data = nal->nal_data;
+
+        spin_lock(&data->ksnd_nal_cb_lock);
+}
+
+void
+ktoenal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        ksock_nal_data_t *data;
+        data = nal->nal_data;
+
+        spin_unlock(&data->ksnd_nal_cb_lock);
+}
+
+int
+ktoenal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        /* I would guess that if ktoenal_get_conn(nid) == NULL,
+           and we're not routing, then 'nid' is very distant :) */
+        if ( nal->ni.nid == nid ) {
+                *dist = 0;
+        } else {
+                *dist = 1;
+        }
+
+        return 0;
+}
+
+/*
+ * Take a local transmit descriptor off one of the idle lists.
+ *
+ * The normal idle list is tried first.  If it is empty and 'may_block' is
+ * zero, the list reserved for non-blocking callers is tried instead, and
+ * NULL is returned when that one is empty as well.  If 'may_block' is
+ * non-zero the caller sleeps on ksnd_idle_ltx_waitq until the normal list
+ * is non-empty and then retries with the lock held.
+ */
+ksock_ltx_t *
+ktoenal_get_ltx (int may_block)
+{
+        unsigned long flags;            /* IRQ state saved by spin_lock_irqsave() */
+        ksock_ltx_t *ltx = NULL;
+
+        for (;;)
+        {
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags);
+
+                if (!list_empty (&ktoenal_data.ksnd_idle_ltx_list))
+                {
+                        ltx = list_entry (ktoenal_data.ksnd_idle_ltx_list.next, ksock_ltx_t, ltx_tx.tx_list);
+                        list_del (&ltx->ltx_tx.tx_list);
+                        break;
+                }
+
+                if (!may_block)
+                {
+                        if (!list_empty (&ktoenal_data.ksnd_idle_nblk_ltx_list))
+                        {
+                                ltx = list_entry (ktoenal_data.ksnd_idle_nblk_ltx_list.next,
+                                                  ksock_ltx_t, ltx_tx.tx_list);
+                                list_del (&ltx->ltx_tx.tx_list);
+                        }
+                        break;
+                }
+
+                /* drop the lock before sleeping; the list is re-checked under it */
+                spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+
+                wait_event (ktoenal_data.ksnd_idle_ltx_waitq,
+                            !list_empty (&ktoenal_data.ksnd_idle_ltx_list));
+        }
+
+        /* both 'break's above leave the loop with the lock still held */
+        spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags);
+
+        return (ltx);
+}
+
+int
+ktoenal_sendmsg (struct file *sock, struct iovec *iov, int niov, int nob, int flags)
+{
+        /* NB This procedure "consumes" iov (actually we do, tcp_sendmsg doesn't)
+         */
+        mm_segment_t oldmm;
+        int rc;
+
+        LASSERT (niov > 0);
+        LASSERT (nob >
0); + + oldmm = get_fs(); + set_fs (KERNEL_DS); + +#ifdef PORTAL_DEBUG + { + int total_nob; + int i; + + for (i = total_nob = 0; i < niov; i++) + total_nob += iov[i].iov_len; + + LASSERT (nob == total_nob); + } +#endif + LASSERT (!in_interrupt()); + + rc = sock->f_op->writev(sock, iov, niov, NULL); + + set_fs (oldmm); + + if (rc > 0) /* sent something? */ + { + nob = rc; /* consume iov */ + for (;;) + { + LASSERT (niov > 0); + + if (iov->iov_len >= nob) + { + iov->iov_len -= nob; + iov->iov_base = (void *)(((unsigned long)iov->iov_base) + nob); + break; + } + nob -= iov->iov_len; + iov->iov_len = 0; + iov++; + niov--; + } + } + + return (rc); +} + +int +ktoenal_recvmsg(struct file *sock, struct iovec *iov, int niov, int toread) +{ + /* NB This procedure "consumes" iov (actually tcp_recvmsg does) + */ + mm_segment_t oldmm; + int ret, i, len = 0, origlen = 0; + + PROF_START(our_recvmsg); + for(i = 0; i < niov; i++) { + len += iov[i].iov_len; + if(len >= toread) + break; + } + + if(len >= toread) { + origlen = iov[i].iov_len; + iov[i].iov_len -= (len - toread); + } + else { /* i == niov */ + i = niov - 1; + } + + oldmm = get_fs(); + set_fs(KERNEL_DS); + + ret = sock->f_op->readv(sock, iov, i + 1, NULL); + + set_fs(oldmm); + + if(origlen) + iov[i].iov_len = origlen; + + PROF_FINISH(our_recvmsg); + return ret; +} + +void +ktoenal_process_transmit (ksock_conn_t *conn, long *irq_flags) +{ + ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list); + int rc; + + LASSERT (conn->ksnc_tx_scheduled); + LASSERT (conn->ksnc_tx_ready); + LASSERT (!list_empty (&conn->ksnc_tx_queue)); + + /* assume transmit will complete now, so dequeue while I've got the lock */ + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + LASSERT (tx->tx_nob > 0); + + conn->ksnc_tx_ready = 0; /* write_space may race with me and set ready */ + mb(); /* => clear BEFORE trying to write */ + + rc = ktoenal_sendmsg (conn->ksnc_file, + 
tx->tx_iov, tx->tx_niov, tx->tx_nob,
+                              list_empty (&conn->ksnc_tx_queue) ?
+                              MSG_DONTWAIT : (MSG_DONTWAIT | MSG_MORE));
+
+        CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc);
+
+        if (rc < 0)                             /* error */
+        {
+                if (rc == -EAGAIN)              /* socket full => */
+                        rc = 0;                 /* nothing sent */
+                else
+                {
+#warning FIXME: handle socket errors properly
+                        CERROR ("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc);
+                        rc = tx->tx_nob;        /* kid on for now whole packet went */
+                }
+        }
+
+        if (rc == tx->tx_nob)                   /* everything went */
+        {
+                conn->ksnc_tx_ready = 1;        /* assume more can go (ASAP) */
+                ktoenal_put_conn (conn);        /* release packet's ref */
+
+                if (tx->tx_isfwd)               /* was a forwarded packet? */
+                {
+                        kpr_fwd_done (&ktoenal_data.ksnd_router,
+                                      KSOCK_TX_2_KPR_FWD_DESC (tx), 0);
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+                }
+                else                            /* local send */
+                {
+                        ksock_ltx_t *ltx = KSOCK_TX_2_KSOCK_LTX (tx);
+
+                        lib_finalize (&ktoenal_lib, ltx->ltx_private, ltx->ltx_cookie);
+
+                        spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+                        /* put the descriptor back on the list recorded in ltx_idle */
+                        list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle);
+
+                        /* normal tx desc => wakeup anyone blocking for one */
+                        if (ltx->ltx_idle == &ktoenal_data.ksnd_idle_ltx_list &&
+                            waitqueue_active (&ktoenal_data.ksnd_idle_ltx_waitq))
+                                wake_up (&ktoenal_data.ksnd_idle_ltx_waitq);
+                }
+                ktoenal_packets_transmitted++;
+        }
+        else
+        {
+                tx->tx_nob -= rc;
+
+                spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags);
+
+                /* back onto HEAD of tx_queue */
+                list_add (&tx->tx_list, &conn->ksnc_tx_queue);
+        }
+
+        /* every path above has re-taken ksnd_sched_lock by this point */
+        if (!conn->ksnc_tx_ready ||             /* no space to write now */
+            list_empty (&conn->ksnc_tx_queue))  /* nothing to write */
+        {
+                conn->ksnc_tx_scheduled = 0;    /* not being scheduled */
+                ktoenal_put_conn (conn);        /* release scheduler's ref */
+        }
+        else                                    /* let scheduler call me again */
+                list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns);
+}
+
+void
+ktoenal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        unsigned long flags;            /* IRQ state saved by spin_lock_irqsave() */
+        int           nob
= tx->tx_nob; + struct iovec *iov = tx->tx_iov; + int niov = 1; + + LASSERT (nob >= sizeof (ptl_hdr_t)); + + /* Truncate iov to exactly match total packet length + * since socket sendmsg pays no attention to requested length. + */ + for (;;) + { + LASSERT (niov <= tx->tx_niov); + LASSERT (iov->iov_len >= 0); + + if (iov->iov_len >= nob) + { + iov->iov_len = nob; + break; + } + nob -= iov->iov_len; + iov++; + niov++; + } + tx->tx_niov = niov; + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) /* not scheduled to send */ + { + list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns); + conn->ksnc_tx_scheduled = 1; + atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */ + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + + ktoenal_packets_launched++; + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); +} + +int +ktoenal_send(nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, struct iovec *payload_iov, size_t payload_len) +{ + ptl_nid_t gatewaynid; + ksock_conn_t *conn; + ksock_ltx_t *ltx; + int rc; + int i; + + /* By this point, as it happens, we have absolutely no idea what + * 'private' is. It might be ksock_nal_data or it might be ksock_conn. + * Ha ha, isn't that a funny joke? + * + * FIXME: this is not the right way to fix this; the right way is to + * always pass in the same kind of structure. This is hard right now. + * To revisit this issue, set a breakpoint in here and watch for when + * it's called from lib_finalize. I think this occurs when we send a + * packet as a side-effect of another packet, such as when an ACK has + * been requested. -phil */ + + CDEBUG(D_NET, "sending %d bytes from [%d](%p,%d)... 
to nid: " + LPX64" pid %d\n", (int)payload_len, payload_niov, + payload_niov > 0 ? payload_iov[0].iov_base : NULL, + (int)(payload_niov > 0 ? payload_iov[0].iov_len : 0), nid, pid); + + if ((conn = ktoenal_get_conn (nid)) == NULL) + { + /* It's not a peer; try to find a gateway */ + rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, &gatewaynid); + if (rc != 0) + { + CERROR ("Can't route to "LPX64": router error %d\n", nid, rc); + return (-1); + } + + if ((conn = ktoenal_get_conn (gatewaynid)) == NULL) + { + CERROR ("Can't route to "LPX64": gateway "LPX64" is not a peer\n", + nid, gatewaynid); + return (-1); + } + } + + /* This transmit has now got a ref on conn */ + + /* I may not block for a transmit descriptor if I might block the + * receiver, or an interrupt handler. */ + ltx = ktoenal_get_ltx (!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt ())); + if (ltx == NULL) + { + CERROR ("Can't allocate tx desc\n"); + ktoenal_put_conn (conn); + return (-1); + } + + /* Init common (to sends and forwards) packet part */ + ltx->ltx_tx.tx_isfwd = 0; + ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len; + ltx->ltx_tx.tx_niov = 1 + payload_niov; + ltx->ltx_tx.tx_iov = ltx->ltx_iov; + + /* Init local send packet (storage for hdr, finalize() args, iov) */ + ltx->ltx_hdr = *hdr; + ltx->ltx_private = private; + ltx->ltx_cookie = cookie; + + ltx->ltx_iov[0].iov_base = <x->ltx_hdr; + ltx->ltx_iov[0].iov_len = sizeof (ltx->ltx_hdr); + + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + for (i = 0; i < payload_niov; i++) + { + ltx->ltx_iov[1 + i].iov_base = payload_iov[i].iov_base; + ltx->ltx_iov[1 + i].iov_len = payload_iov[i].iov_len; + } + + ktoenal_launch_packet (conn, <x->ltx_tx); + return (0); +} + +void +ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + ksock_conn_t *conn; + ptl_nid_t nid = fwd->kprfd_gateway_nid; + ksock_tx_t *tx = (ksock_tx_t *)&fwd->kprfd_scratch; + + CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, + 
fwd->kprfd_gateway_nid, fwd->kprfd_target_nid); + + if (nid == ktoenal_lib.ni.nid) /* I'm the gateway; must be the last hop */ + nid = fwd->kprfd_target_nid; + + conn = ktoenal_get_conn (nid); + if (conn == NULL) + { + CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid); + kpr_fwd_done (&ktoenal_data.ksnd_router, fwd, -EHOSTUNREACH); + return; + } + + /* This forward has now got a ref on conn */ + + tx->tx_isfwd = 1; /* This is a forwarding packet */ + tx->tx_nob = fwd->kprfd_nob; + tx->tx_niov = fwd->kprfd_niov; + tx->tx_iov = fwd->kprfd_iov; + + ktoenal_launch_packet (conn, tx); +} + +int +ktoenal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&ktoenal_data.ksnd_nthreads); + return (0); +} + +void +ktoenal_thread_fini (void) +{ + atomic_dec (&ktoenal_data.ksnd_nthreads); +} + +void +ktoenal_fmb_callback (void *arg, int error) +{ + ksock_fmb_t *fmb = (ksock_fmb_t *)arg; + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]); + ksock_conn_t *conn; + long flags; + + CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": %d\n", + hdr->src_nid, hdr->dest_nid, error); + + if (error != 0) + CERROR ("Failed to route packet from "LPX64" to "LPX64": %d\n", + hdr->src_nid, hdr->dest_nid, error); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + + if (!list_empty (&fmb->fmb_pool->fmp_blocked_conns)) + { + conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + + CDEBUG (D_NET, "Scheduling conn %p\n", conn); + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP); + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; + list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns); + + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + + 
spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); +} + +ksock_fmb_t * +ktoenal_get_idle_fmb (ksock_conn_t *conn) +{ + /* NB called with sched lock held */ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + ksock_fmb_pool_t *pool; + ksock_fmb_t *fmb; + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + + if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) + pool = &ktoenal_data.ksnd_small_fmp; + else + pool = &ktoenal_data.ksnd_large_fmp; + + if (!list_empty (&pool->fmp_idle_fmbs)) + { + fmb = list_entry (pool->fmp_idle_fmbs.next, ksock_fmb_t, fmb_list); + list_del (&fmb->fmb_list); + return (fmb); + } + + /* deschedule until fmb free */ + + conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP; + + list_add_tail (&conn->ksnc_rx_list, + &pool->fmp_blocked_conns); + return (NULL); +} + + +int +ktoenal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) +{ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + int niov; /* at least the header */ + int nob; + + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left); + LASSERT (payload_nob >= 0); + LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE); + LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE); + + /* Got a forwarding buffer; copy the header we just read into the + * forwarding buffer. If there's payload start reading reading it + * into the buffer, otherwise the forwarding buffer can be kicked + * off immediately. + * + * NB fmb->fmb_iov spans the WHOLE packet. + * conn->ksnc_rx_iov spans just the payload. 
+ */ + + fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]); + + memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); /* copy header */ + + if (payload_nob == 0) /* got complete packet already */ + { + atomic_inc (&ktoenal_packets_received); + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, packet_nob); + + fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t); + + kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, + packet_nob, 1, fmb->fmb_iov, + ktoenal_fmb_callback, fmb); + + kpr_fwd_start (&ktoenal_data.ksnd_router, &fmb->fmb_fwd); /* forward it now */ + + ktoenal_new_packet (conn, 0); /* on to next packet */ + return (1); + } + + niov = 1; + if (packet_nob <= PAGE_SIZE) /* whole packet fits in first page */ + fmb->fmb_iov[0].iov_len = packet_nob; + else + { + fmb->fmb_iov[0].iov_len = PAGE_SIZE; + nob = packet_nob - PAGE_SIZE; + + do + { + LASSERT (niov < fmb->fmb_npages); + fmb->fmb_iov[niov].iov_base = page_address (fmb->fmb_pages[niov]); + fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob); + nob -= PAGE_SIZE; + niov++; + } while (nob > 0); + } + + kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, + packet_nob, niov, fmb->fmb_iov, + ktoenal_fmb_callback, fmb); + + /* stash router's descriptor ready for call to kpr_fwd_start */ + conn->ksnc_cookie = &fmb->fmb_fwd; + + conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ + + /* payload is desc's iov-ed buffer, but skipping the hdr */ + LASSERT (niov <= sizeof (conn->ksnc_rx_iov) / sizeof (conn->ksnc_rx_iov[0])); + + conn->ksnc_rx_iov[0].iov_base = (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + sizeof (ptl_hdr_t)); + conn->ksnc_rx_iov[0].iov_len = fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t); + + if (niov > 1) + memcpy (&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], (niov - 1) * sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", 
conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, payload_nob); + return (0); +} + +void +ktoenal_fwd_parse (ksock_conn_t *conn) +{ + ksock_conn_t *conn2; + int body_len; + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left); + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER); + LASSERT (conn->ksnc_rx_scheduled); + + switch (conn->ksnc_hdr.type) + { + case PTL_MSG_GET: + case PTL_MSG_ACK: + body_len = 0; + break; + case PTL_MSG_PUT: + body_len = conn->ksnc_hdr.msg.put.length; + break; + case PTL_MSG_REPLY: + body_len = conn->ksnc_hdr.msg.reply.length; + break; + default: + /* Unrecognised packet type */ + CERROR ("Unrecognised packet type %d from "LPX64" for "LPX64"\n", + conn->ksnc_hdr.type, conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid); + /* Ignore this header and go back to reading a new packet. */ + ktoenal_new_packet (conn, 0); + return; + } + + if (body_len < 0) /* length corrupt */ + { + CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d illegal\n", + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len); + ktoenal_new_packet (conn, 0); /* on to new packet */ + return; + } + + if (body_len > SOCKNAL_MAX_FWD_PAYLOAD) /* too big to forward */ + { + CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d too big\n", + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len); + ktoenal_new_packet (conn, body_len); /* on to new packet (skip this one's body) */ + return; + } + + conn2 = ktoenal_get_conn (conn->ksnc_hdr.dest_nid); /* should have gone direct */ + if (conn2 != NULL) + { + CERROR ("dropping packet from "LPX64" for "LPX64": target is a peer\n", + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid); + ktoenal_put_conn (conn2); /* drop ref from get above */ + + ktoenal_new_packet (conn, body_len); /* on to next packet (skip this one's body) */ + return; + } + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; /* Getting FMB now */ 
+ conn->ksnc_rx_nob_left = body_len; /* stash packet size */ + conn->ksnc_rx_nob_wanted = body_len; /* (no slop) */ +} + +int +ktoenal_new_packet (ksock_conn_t *conn, int nob_to_skip) +{ + static char ktoenal_slop_buffer[4096]; + + int nob; + int niov; + int skipped; + + if (nob_to_skip == 0) /* right at next packet boundary now */ + { + conn->ksnc_rx_state = SOCKNAL_RX_HEADER; + conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t); + conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t); + + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr; + conn->ksnc_rx_iov[0].iov_len = sizeof (ptl_hdr_t); + conn->ksnc_rx_niov = 1; + return (1); + } + + /* set up to skip as much a possible now */ + /* if there's more left (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + skipped = 0; + niov = 0; + + do + { + nob = MIN (nob_to_skip, sizeof (ktoenal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ktoenal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof (conn->ksnc_rx_iov)/sizeof (conn->ksnc_rx_iov[0])); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_nob_wanted = skipped; + return (0); +} + +void +ktoenal_process_receive (ksock_conn_t *conn, long *irq_flags) +{ + ksock_fmb_t *fmb; + int len; + LASSERT (atomic_read (&conn->ksnc_refcount) > 0); + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_ready); + + /* NB: sched lock held */ + CDEBUG(D_NET, "conn %p\n", conn); + + if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB) /* doesn't need a forwarding buffer */ + { + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags); + goto try_read; + } + + get_fmb: + /* NB: sched lock held */ + fmb = ktoenal_get_idle_fmb (conn); + if (fmb == NULL) /* conn descheduled waiting for idle fmb */ + return; + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, 
*irq_flags); + + if (ktoenal_init_fmb (conn, fmb)) /* packet forwarded ? */ + goto out; /* come back later for next packet */ + + try_read: + /* NB: sched lock NOT held */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_BODY || + conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + + LASSERT (conn->ksnc_rx_niov > 0); + LASSERT (conn->ksnc_rx_nob_wanted > 0); + + conn->ksnc_rx_ready = 0; /* data ready may race with me and set ready */ + mb(); /* => clear BEFORE trying to read */ + + /* NB ktoenal_recvmsg "consumes" the iov passed to it */ + len = ktoenal_recvmsg(conn->ksnc_file, + conn->ksnc_rx_iov, conn->ksnc_rx_niov, + conn->ksnc_rx_nob_wanted); + CDEBUG (D_NET, "%p read(%d) %d\n", conn, conn->ksnc_rx_nob_wanted, len); + + if (len <= 0) /* nothing ready (EAGAIN) or EOF or error */ + { + if (len != -EAGAIN && /* ! nothing to read now */ + len != 0) /* ! nothing to read ever */ + { +#warning FIXME: handle socket errors properly + CERROR ("Error socknal read(%d) %p: %d\n", + conn->ksnc_rx_nob_wanted, conn, len); + } + goto out; /* come back when there's data ready */ + } + + LASSERT (len <= conn->ksnc_rx_nob_wanted); + conn->ksnc_rx_nob_wanted -= len; + conn->ksnc_rx_nob_left -= len; + + if (conn->ksnc_rx_nob_wanted != 0) /* short read */ + goto out; /* try again later */ + + conn->ksnc_rx_ready = 1; /* assume there's more to be had */ + + switch (conn->ksnc_rx_state) + { + case SOCKNAL_RX_HEADER: + if (conn->ksnc_hdr.dest_nid != ktoenal_lib.ni.nid) /* It's not for me */ + { + ktoenal_fwd_parse (conn); + switch (conn->ksnc_rx_state) + { + case SOCKNAL_RX_HEADER: /* skipped this packet (zero payload) */ + goto out; /* => come back later */ + case SOCKNAL_RX_SLOP: /* skipping this packet's body */ + goto try_read; /* => go read it */ + case SOCKNAL_RX_GET_FMB: /* forwarding */ + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + goto get_fmb; /* => go get a fwd msg buffer */ 
+ default: + } + /* Not Reached */ + LBUG (); + } + + PROF_START(lib_parse); + lib_parse(&ktoenal_lib, &conn->ksnc_hdr, conn); /* sets wanted_len, iovs etc */ + PROF_FINISH(lib_parse); + + if (conn->ksnc_rx_nob_wanted != 0) /* need to get some payload? */ + { + conn->ksnc_rx_state = SOCKNAL_RX_BODY; + goto try_read; /* go read the payload */ + } + /* Fall through (completed packet for me) */ + + case SOCKNAL_RX_BODY: + atomic_inc (&ktoenal_packets_received); + lib_finalize(&ktoenal_lib, NULL, conn->ksnc_cookie); /* packet is done now */ + /* Fall through */ + + case SOCKNAL_RX_SLOP: + if (ktoenal_new_packet (conn, conn->ksnc_rx_nob_left)) /* starting new packet? */ + goto out; /* come back later */ + goto try_read; /* try to finish reading slop now */ + + case SOCKNAL_RX_BODY_FWD: + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left); + + atomic_inc (&ktoenal_packets_received); + + /* ktoenal_init_fmb() stashed router descriptor in conn->ksnc_cookie */ + kpr_fwd_start (&ktoenal_data.ksnd_router, (kpr_fwd_desc_t *)conn->ksnc_cookie); + + LASSERT (conn->ksnc_rx_nob_left == 0); /* no slop in forwarded packets */ + + ktoenal_new_packet (conn, 0); /* on to next packet */ + goto out; /* (later) */ + + default: + } + + /* Not Reached */ + LBUG (); + + out: + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + if (!conn->ksnc_rx_ready) /* no data there to read? 
*/ + { + conn->ksnc_rx_scheduled = 0; /* let socket callback schedule again */ + ktoenal_put_conn (conn); /* release scheduler's ref */ + } + else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns); +} + +int +ktoenal_recv(nal_cb_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + int i; + + conn->ksnc_cookie = msg; + + LASSERT (niov <= PTL_MD_MAX_IOV); + for (i = 0; i < niov; i++) + { + conn->ksnc_rx_iov[i].iov_len = iov[i].iov_len; + conn->ksnc_rx_iov[i].iov_base = iov[i].iov_base; + } + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + return (rlen); +} + +int +ktoenal_scheduler (void *arg) +{ + unsigned long flags; + ksock_conn_t *conn; + int rc; + int nloops = 0; + + kportal_daemonize ("ktoenal_sched"); + kportal_blockallsigs (); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + while (!ktoenal_data.ksnd_shuttingdown) + { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty (&ktoenal_data.ksnd_rx_conns)) + { + did_something = 1; + conn = list_entry (ktoenal_data.ksnd_rx_conns.next, + ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + + ktoenal_process_receive (conn, &flags); /* drops & regains ksnd_sched_lock */ + } + + if (!list_empty (&ktoenal_data.ksnd_tx_conns)) + { + did_something = 1; + conn = list_entry (ktoenal_data.ksnd_tx_conns.next, + ksock_conn_t, ksnc_tx_list); + + list_del (&conn->ksnc_tx_list); + ktoenal_process_transmit (conn, &flags); /* drops and regains ksnd_sched_lock */ + } + + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) /* hogging CPU? 
*/ + { + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ + rc = wait_event_interruptible (ktoenal_data.ksnd_sched_waitq, + ktoenal_data.ksnd_shuttingdown || + !list_empty (&ktoenal_data.ksnd_rx_conns) || + !list_empty (&ktoenal_data.ksnd_tx_conns)); + LASSERT (rc == 0); + } else + our_cond_resched(); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + } + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + ktoenal_thread_fini (); + return (0); +} + + +int +ktoenal_reaper (void *arg) +{ + unsigned long flags; + ksock_conn_t *conn; + int rc; + + kportal_daemonize ("ktoenal_reaper"); + kportal_blockallsigs (); + + while (!ktoenal_data.ksnd_shuttingdown) + { + spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags); + + if (list_empty (&ktoenal_data.ksnd_reaper_list)) + conn = NULL; + else + { + conn = list_entry (ktoenal_data.ksnd_reaper_list.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags); + + if (conn != NULL) + ktoenal_close_conn (conn); + else { + rc = wait_event_interruptible (ktoenal_data.ksnd_reaper_waitq, + ktoenal_data.ksnd_shuttingdown || + !list_empty(&ktoenal_data.ksnd_reaper_list)); + LASSERT (rc == 0); + } + } + + ktoenal_thread_fini (); + return (0); +} + +#define POLLREAD (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI) +#define POLLWRITE (POLLOUT | POLLWRNORM | POLLWRBAND) + +int +ktoenal_pollthread(void *arg) +{ + unsigned int mask; + struct list_head *tmp; + ksock_conn_t *conn; + + /* Save the task struct for waking it up */ + ktoenal_data.ksnd_pollthread_tsk = current; + + kportal_daemonize ("ktoenal_pollthread"); + kportal_blockallsigs (); + + poll_initwait(&ktoenal_data.ksnd_pwait); + + while(!ktoenal_data.ksnd_shuttingdown) { + + set_current_state(TASK_INTERRUPTIBLE); + + read_lock (&ktoenal_data.ksnd_socklist_lock); + list_for_each(tmp, 
&ktoenal_data.ksnd_socklist) { + + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + atomic_inc(&conn->ksnc_refcount); + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + mask = conn->ksnc_file->f_op->poll(conn->ksnc_file, + ktoenal_data.ksnd_slistchange ? + &ktoenal_data.ksnd_pwait : NULL); + + if(mask & POLLREAD) { + ktoenal_data_ready(conn); + + } + if (mask & POLLWRITE) { + ktoenal_write_space(conn); + + } + if (mask & (POLLERR | POLLHUP)) { + /* Do error processing */ + } + + read_lock (&ktoenal_data.ksnd_socklist_lock); + if(atomic_dec_and_test(&conn->ksnc_refcount)) + _ktoenal_put_conn(conn); + } + ktoenal_data.ksnd_slistchange = 0; + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + schedule_timeout(MAX_SCHEDULE_TIMEOUT); + if(ktoenal_data.ksnd_slistchange) { + poll_freewait(&ktoenal_data.ksnd_pwait); + poll_initwait(&ktoenal_data.ksnd_pwait); + } + } + poll_freewait(&ktoenal_data.ksnd_pwait); + ktoenal_thread_fini(); + return (0); +} + +void +ktoenal_data_ready (ksock_conn_t *conn) +{ + unsigned long flags; + ENTRY; + + if (!test_and_set_bit (0, &conn->ksnc_rx_ready)) { + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail (&conn->ksnc_rx_list, + &ktoenal_data.ksnd_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + /* This is done to avoid the effects of a sequence + * of events in which the rx_ready is lost + */ + conn->ksnc_rx_ready=1; + + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + } + + EXIT; +} + +void +ktoenal_write_space (ksock_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "conn %p%s%s%s\n", + conn, + (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ? " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? 
" scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? " empty" : " queued")); + + + if (!test_and_set_bit (0, &conn->ksnc_tx_ready)) { + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + if (!list_empty (&conn->ksnc_tx_queue) && /* packets to send */ + !conn->ksnc_tx_scheduled) { /* not being progressed */ + + list_add_tail (&conn->ksnc_tx_list, + &ktoenal_data.ksnd_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + } +} + +nal_cb_t ktoenal_lib = { + nal_data: &ktoenal_data, /* NAL private data */ + cb_send: ktoenal_send, + cb_recv: ktoenal_recv, + cb_read: ktoenal_read, + cb_write: ktoenal_write, + cb_callback: ktoenal_callback, + cb_malloc: ktoenal_malloc, + cb_free: ktoenal_free, + cb_printf: ktoenal_printf, + cb_cli: ktoenal_cli, + cb_sti: ktoenal_sti, + cb_dist: ktoenal_dist +}; diff --git a/lnet/libcfs/.cvsignore b/lnet/libcfs/.cvsignore new file mode 100644 index 0000000..67d1a3d --- /dev/null +++ b/lnet/libcfs/.cvsignore @@ -0,0 +1,4 @@ +.deps +Makefile +Makefile.in +link-stamp diff --git a/lnet/libcfs/Makefile.am b/lnet/libcfs/Makefile.am new file mode 100644 index 0000000..20d7fbd --- /dev/null +++ b/lnet/libcfs/Makefile.am @@ -0,0 +1,29 @@ +# Copyright (C) 2001, 2002 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
# Copyright (C) 2001 Cluster File Systems, Inc.
#
# This code is issued under the GNU General Public License.
# See the file COPYING in this distribution

include fs/lustre/portals/Kernelenv

obj-y += libcfs.o
# kbuild assembles the composite object <name>.o from the list <name>-objs,
# so the list has to be spelled "libcfs-objs" to match libcfs.o above
# (it was "licfs-objs", which kbuild silently ignores).
libcfs-objs := module.o proc.o debug.o
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +# define DEBUG_SUBSYSTEM S_PORTALS + +#include + +#define DEBUG_OVERFLOW 1024 +static char *debug_buf = NULL; +static unsigned long debug_size = 0; +static atomic_t debug_off_a = ATOMIC_INIT(0); +static int debug_wrapped; +wait_queue_head_t debug_ctlwq; +#define DAEMON_SND_SIZE (64 << 10) + +/* + * used by the daemon to keep track the offset into debug_buffer for the next + * write to the file. Usually, the daemon is to write out buffer + * from debug_daemon_next_write upto debug_off + * variable usage + * Reader - portals_debug_msg() + * Writer - portals_debug_daemon() + * portals_debug_daemon_start() during daemon init time + * portals_debug_daemon_continue() to reset to debug_off + * portals_debug_clear_buffer() reset to debug_off for clear + * Note that *_start(), *_continue() & *clear_buffer() should serialized; + */ +static atomic_t debug_daemon_next_write; + +/* + * A debug_daemon can be in following states + * stopped - stopped state means there is no debug_daemon running. 
+ * accordingly, it must be in paused state + * a daemon is in !stopped && !paused state after + * "lctl debug_daemon start" creates debug_daemon successfully + * Variable Usage + * Reader - portals_debug_daemon() + * portals_debug_set_daemon() routines + * Writer - portals_debug_set_daemon() routines + * portals_debug_daemon() on IO error + * paused - a debug_daemon state is changed from !paused into paused + * when "lctl debug_daemon pause" is issued + * "lctl debug_daemon continue" gets a daemon into !paused mode + * Reader - portals_debug_set_daemon() routines + * portals_debug_msg() + * Writer - portals_debug_set_daemon() on init + * portals_debug_daemon() + * + * Daemon state diagram. + * (stopped, paused) + * | <-- debug_daemon start + * V + * (!stopped, !paused) + * | <-- debug_daemon pause + * V + * (!stopped, paused) + * | <-- debug_daemon continue + * V + * (!stopped, !paused) + * | <-- debug_daemon stop + * V + * (stopped, paused) + * Overlapped - this is the state when CDEBUG is too fast for the daemon to + * write out the debug_buffer. That is, debug_off is about to + * overlap debug_daemon_next_write; + * Reader - portals_debug_msg() + * Writer - portals_debug_msg() + */ + +/* + * Description of Trace Daemon Synchronization + * + * Three categories of code synchronize with each other + * 1. lctl, portals_debug_set_daemon(), the user debug control code, + * as well as portals_debug_clear_buffer() + * 2. CDEBUG, portals_debug_msg(), the debug put messages routine + * 3. Daemon, portals_debug_daemon(), to write out debug log file + * + * + * Three different controls for synchronization + * + * 1. debug_daemon_semaphore + * The usage of this semaphore is to serialize multiple lctl controls + * in manipulating debug daemon state. 
The semaphore serves as the + * gatekeeper to allow only one user control thread, at any given time, + * to access debug daemon state and keeps the other user control requests + * in wait state until the current control request is serviced. + * + * 2. wait_queue_head_t lctl (paired with lctl_event flag) + * Lctl event is the event between portals_debug_set_daemon() and + * portals_debug_daemon(). Lctl is an indicator for portals_debug_daemon() + * to flush data out to file. portals_debug_daemon() is to use lctl event + * as signal channel to wake up portals_debug_set_daemon() once the flush + * operation is done. + * + * Producer : + * portals_debug_daemon() uses it to wake up + * portals_debug_set_daemon(), pause and stop, routines + * Consumer : + * portals_debug_set_daemon(), stop and pause operations, + * wait and sleep on the event + * + * 3. wait_queue_head_t daemon (paired with daemon_event flag) + * This is an event channel to wake up portals_debug_daemon. Daemon + * wakes up to run whenever there is an event posted. Daemon handles + * 2 types of operations: 1. Writes data out to debug file, 2. Flushes + * file and terminates based on lctl event. + * File operation - + * Daemon is normally in a sleep state. + * Daemon is woken up through daemon event whenever CDEBUG is + * putting data over any 64K boundary. + * File flush and termination - + * On portals_debug_daemon_stop/pause() operations, lctl control + * is to wake up daemon through daemon event. + * + * We can't use sleep_on() and wake_up() to replace daemon event because + * portals_debug_daemon() must catch the wakeup operation posted by + * portals_debug_daemon_stop/pause(). Otherwise, stop and pause may + * get stuck in lctl wait event. + * + * Producer : + * a. portals_debug_daemon_pause() and portals_debug_daemon_stop() + * use the event to wake up portals_debug_daemon() + * b. 
portals_debug_msg() uses the event to wake up + * portals_debug_daemon() whenever the data output is acrossing + * a 64K bytes boundary. + * Consumer : + * portals_debug_daemon() wakes up upon daemon event. + * + * Sequence for portals_debug_daemon_stop() operation + * + * _Portals_debug_daemon_stop()_ _Daemon_ + * Wait_event(daemon) or running + * Paused = 1; + * Wakeup_event (daemon) + * Wait_event(lctl) + * Set force_flush flag if lctlevnt + * Flush data + * Wakeup_event (lctl) + * Wait_event(daemon) + * Stopped = 1; + * Wakeup_event (daemon) + * Wait_event(lctl) + * Exit daemon loop if (Stopped) + * Wakeup_event (lctl) + * Exit + * Return to user application + * + * + * _Portals_debug_msg()_ _Daemon_ + * Wait_event(daemon) or running + * If (WriteStart<64Kjournal_info; + current->journal_info = NULL; + sprintf(debug_file_name, "%s.%ld", debug_file_path, CURRENT_TIME); + file = filp_open(debug_file_name, O_CREAT|O_TRUNC|O_RDWR, 0644); + + if (!file || IS_ERR(file)) { + CERROR("cannot open %s for dumping: %ld\n", debug_file_name, + PTR_ERR(file)); + GOTO(out, PTR_ERR(file)); + } else { + printk(KERN_ALERT "dumping log to %s ... 
writing ...\n", + debug_file_name); + } + + debug_off = atomic_read(&debug_off_a); + oldfs = get_fs(); + set_fs(get_ds()); + if (debug_wrapped) { + rc = file->f_op->write(file, debug_buf + debug_off + 1, + debug_size-debug_off-1, &file->f_pos); + rc += file->f_op->write(file, debug_buf, debug_off + 1, + &file->f_pos); + } else { + rc = file->f_op->write(file, debug_buf, debug_off,&file->f_pos); + } + printk("wrote %d bytes\n", rc); + set_fs(oldfs); + + rc = file->f_op->fsync(file, file->f_dentry, 1); + if (rc) + CERROR("sync returns %d\n", rc); + filp_close(file, 0); +out: + current->journal_info = journal_info; + wake_up(&debug_ctlwq); + return 0; +} + +int portals_debug_daemon(void *arg) +{ + struct file *file; + void *journal_info; + mm_segment_t oldfs; + unsigned long force_flush = 0; + unsigned long size, off, flags; + int rc; + + kportal_daemonize("ldebug_daemon"); + reparent_to_init(); + journal_info = current->journal_info; + current->journal_info = NULL; + + file = filp_open(debug_daemon_file_path, + O_CREAT|O_TRUNC|O_RDWR|O_LARGEFILE, 0644); + + if (!file || IS_ERR(file)) { + CERROR("cannot open %s for logging", debug_daemon_file_path); + GOTO(out1, PTR_ERR(file)); + } else { + printk(KERN_ALERT "daemon dumping log to %s ... writing ...\n", + debug_daemon_file_path); + } + + debug_daemon_state.overlapped = 0; + debug_daemon_state.stopped = 0; + + spin_lock_irqsave(&portals_debug_lock, flags); + off = atomic_read(&debug_off_a) + 1; + if (debug_wrapped) + off = (off >= debug_size)? 0 : off; + else + off = 0; + atomic_set(&debug_daemon_next_write, off); + atomic_set(&debug_daemon_state.paused, 0); + spin_unlock_irqrestore(&portals_debug_lock, flags); + + oldfs = get_fs(); + set_fs(KERNEL_DS); + while (1) { + unsigned long ending; + unsigned long start, tail; + long delta; + + debug_daemon_state.daemon_event = 0; + + ending = atomic_read(&debug_off_a); + start = atomic_read(&debug_daemon_next_write); + + /* check if paused is imposed by lctl ? 
*/ + force_flush = !debug_daemon_state.lctl_event; + + delta = ending - start; + tail = debug_size - start; + size = (delta >= 0) ? delta : tail; + while (size && (force_flush || (delta < 0) || + (size >= DAEMON_SND_SIZE))) { + if (daemon_file_size_limit) { + int ssize = daemon_file_size_limit - file->f_pos; + if (size > ssize) + size = ssize; + } + + rc = file->f_op->write(file, debug_buf+start, + size, &file->f_pos); + if (rc < 0) { + printk(KERN_ALERT + "Debug_daemon write error %d\n", rc); + goto out; + } + start += rc; + delta = ending - start; + tail = debug_size - start; + if (tail == 0) + start = 0; + if (delta >= 0) + size = delta; + else + size = (tail == 0) ? ending : tail; + if (daemon_file_size_limit == file->f_pos) { + // file wrapped around + file->f_pos = 0; + } + } + atomic_set(&debug_daemon_next_write, start); + if (force_flush) { + rc = file->f_op->fsync(file, file->f_dentry, 1); + if (rc < 0) { + printk(KERN_ALERT + "Debug_daemon sync error %d\n", rc); + goto out; + } + if (debug_daemon_state.stopped) + break; + debug_daemon_state.lctl_event = 1; + wake_up(&debug_daemon_state.lctl); + } + wait_event(debug_daemon_state.daemon, + debug_daemon_state.daemon_event); + } +out: + atomic_set(&debug_daemon_state.paused, 1); + debug_daemon_state.stopped = 1; + set_fs(oldfs); + filp_close(file, 0); + current->journal_info = journal_info; +out1: + debug_daemon_state.lctl_event = 1; + wake_up(&debug_daemon_state.lctl); + return 0; +} + +void portals_debug_print(void) +{ + unsigned long dumplen = 64 * 1024; + char *start1, *start2; + char *end1, *end2; + unsigned long debug_off = atomic_read(&debug_off_a); + + start1 = debug_buf + debug_off - dumplen; + if (start1 < debug_buf) { + start1 += debug_size; + end1 = debug_buf + debug_size - 1; + start2 = debug_buf; + end2 = debug_buf + debug_off; + } else { + end1 = debug_buf + debug_off; + start2 = debug_buf + debug_off; + end2 = debug_buf + debug_off; + } + + while (start1 < end1) { + int count = MIN(1024, end1 
- start1); + printk("%*s", count, start1); + start1 += 1024; + } + while (start2 < end2) { + int count = MIN(1024, end2 - start2); + printk("%*s", count, start2); + start2 += 1024; + } +} + +void portals_debug_dumplog(void) +{ + int rc; + ENTRY; + + init_waitqueue_head(&debug_ctlwq); + + rc = kernel_thread(portals_do_debug_dumplog, + NULL, CLONE_VM | CLONE_FS | CLONE_FILES); + if (rc < 0) { + printk(KERN_ERR "cannot start dump thread\n"); + return; + } + sleep_on(&debug_ctlwq); +} + +int portals_debug_daemon_start(char *file, unsigned int size) +{ + int rc; + + if (!debug_daemon_state.stopped) + return -EALREADY; + + if (file != NULL) + strncpy(debug_daemon_file_path, file, 1024); + + init_waitqueue_head(&debug_daemon_state.lctl); + init_waitqueue_head(&debug_daemon_state.daemon); + + daemon_file_size_limit = size << 20; + + debug_daemon_state.lctl_event = 0; + rc = kernel_thread(portals_debug_daemon, NULL, 0); + if (rc < 0) { + printk(KERN_ERR "cannot start debug daemon thread\n"); + strncpy(debug_daemon_file_path, "\0", 1); + return rc; + } + wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event); + return 0; +} + +int portals_debug_daemon_pause(void) +{ + if (atomic_read(&debug_daemon_state.paused)) + return -EALREADY; + + atomic_set(&debug_daemon_state.paused, 1); + debug_daemon_state.lctl_event = 0; + debug_daemon_state.daemon_event = 1; + wake_up(&debug_daemon_state.daemon); + wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event); + return 0; +} + +int portals_debug_daemon_continue(void) +{ + if (!atomic_read(&debug_daemon_state.paused)) + return -EINVAL; + if (debug_daemon_state.stopped) + return -EINVAL; + + debug_daemon_state.overlapped = 0; + atomic_set(&debug_daemon_next_write, atomic_read(&debug_off_a)); + atomic_set(&debug_daemon_state.paused, 0); + return 0; +} + +int portals_debug_daemon_stop(void) +{ + if (debug_daemon_state.stopped) + return -EALREADY; + + if (!atomic_read(&debug_daemon_state.paused)) + 
portals_debug_daemon_pause(); + + debug_daemon_state.lctl_event = 0; + debug_daemon_state.stopped = 1; + + debug_daemon_state.daemon_event = 1; + wake_up(&debug_daemon_state.daemon); + wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event); + + debug_daemon_file_path[0] = '\0'; + return 0; +} + +int portals_debug_set_daemon(unsigned int cmd, unsigned int length, + char *filename, unsigned int size) +{ + int rc = -EINVAL; + + down(&debug_daemon_semaphore); + switch (cmd) { + case DEBUG_DAEMON_START: + if (length && (filename[length -1] != '\0')) { + CERROR("Invalid filename for debug_daemon\n"); + rc = -EINVAL; + break; + } + rc = portals_debug_daemon_start(filename, size); + break; + case DEBUG_DAEMON_STOP: + rc = portals_debug_daemon_stop(); + break; + case DEBUG_DAEMON_PAUSE: + rc = portals_debug_daemon_pause(); + break; + case DEBUG_DAEMON_CONTINUE: + rc = portals_debug_daemon_continue(); + break; + default: + CERROR("unknown set_daemon cmd\n"); + } + up(&debug_daemon_semaphore); + return rc; +} + +static int panic_dumplog(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + if (handled_panic) + return 0; + else + handled_panic = 1; + + if (in_interrupt()) { + portals_debug_print(); + return 0; + } + + while (current->lock_depth >= 0) + unlock_kernel(); + portals_debug_dumplog(); + return 0; +} + +static struct notifier_block lustre_panic_notifier = { + notifier_call : panic_dumplog, + next : NULL, + priority : 10000 +}; + +int portals_debug_init(unsigned long bufsize) +{ + unsigned long debug_off = atomic_read(&debug_off_a); + if (debug_buf != NULL) + return -EALREADY; + + atomic_set(&debug_daemon_state.paused, 1); + debug_daemon_state.stopped = 1; + + debug_buf = vmalloc(bufsize + DEBUG_OVERFLOW); + if (debug_buf == NULL) + return -ENOMEM; + memset(debug_buf, 0, debug_size); + debug_wrapped = 0; + + printk(KERN_INFO "Portals: allocated %lu byte debug buffer at %p.\n", + bufsize, debug_buf); + atomic_set(&debug_off_a, 
debug_off); + notifier_chain_register(&panic_notifier_list, &lustre_panic_notifier); + debug_size = bufsize; + + return 0; +} + +int portals_debug_cleanup(void) +{ + notifier_chain_unregister(&panic_notifier_list, &lustre_panic_notifier); + if (debug_buf == NULL) + return -EINVAL; + + down(&debug_daemon_semaphore); + portals_debug_daemon_stop(); + + vfree(debug_buf); + atomic_set(&debug_off_a, 0); + up(&debug_daemon_semaphore); + + return 0; +} + +int portals_debug_clear_buffer(void) +{ + unsigned long flags; + unsigned long state; + + if (debug_buf == NULL) + return -EINVAL; + + down(&debug_daemon_semaphore); + state = atomic_read(&debug_daemon_state.paused); + if (!state) + portals_debug_daemon_pause(); + spin_lock_irqsave(&portals_debug_lock, flags); + atomic_set(&debug_off_a, 0); + debug_wrapped = 0; + atomic_set(&debug_daemon_next_write, 0); + debug_daemon_state.overlapped = 0; + spin_unlock_irqrestore(&portals_debug_lock, flags); + + if (!state) + atomic_set(&debug_daemon_state.paused, 0); + up(&debug_daemon_semaphore); + + return 0; +} + +/* Debug markers, although printed by S_PORTALS + * should not be be marked as such. 
+ */ +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_UNDEFINED +int portals_debug_mark_buffer(char *text) +{ + if (debug_buf == NULL) + return -EINVAL; + + CDEBUG(0, "*******************************************************************************\n"); + CDEBUG(0, "DEBUG MARKER: %s\n", text); + CDEBUG(0, "*******************************************************************************\n"); + + return 0; +} +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_PORTALS + +__s32 portals_debug_copy_to_user(char *buf, unsigned long len) +{ + int rc; + unsigned long debug_off; + unsigned long flags; + + if (len < debug_size) + return -ENOSPC; + + debug_off = atomic_read(&debug_off_a); + spin_lock_irqsave(&portals_debug_lock, flags); + if (debug_wrapped) { + /* All of this juggling with the 1s is to keep the trailing nul + * (which falls at debug_buf + debug_off) at the end of what we + * copy into user space */ + copy_to_user(buf, debug_buf + debug_off + 1, + debug_size - debug_off - 1); + copy_to_user(buf + debug_size - debug_off - 1, + debug_buf, debug_off + 1); + rc = debug_size; + } else { + copy_to_user(buf, debug_buf, debug_off); + rc = debug_off; + } + spin_unlock_irqrestore(&portals_debug_lock, flags); + + return rc; +} + +/* FIXME: I'm not very smart; someone smarter should make this better. */ +void +portals_debug_msg (int subsys, int mask, char *file, char *fn, int line, + unsigned long stack, const char *format, ...) +{ + va_list ap; + unsigned long flags; + int max_nob; + int prefix_nob; + int msg_nob; + struct timeval tv; + unsigned long base_offset; + unsigned long debug_off; + + if (debug_buf == NULL) { + printk("portals_debug_msg: debug_buf is NULL!\n"); + return; + } + + spin_lock_irqsave(&portals_debug_lock, flags); + debug_off = atomic_read(&debug_off_a); + if (!atomic_read(&debug_daemon_state.paused)) { + unsigned long available; + long delta; + long v = atomic_read(&debug_daemon_next_write); + + delta = debug_off - v; + available = (delta>=0) ? 
debug_size-delta : -delta; + // Check if we still have enough debug buffer for CDEBUG + if (available < DAEMON_SND_SIZE) { + /* Drop CDEBUG packets until enough debug_buffer is + * available */ + if (debug_daemon_state.overlapped) + goto out; + /* If this is the first time, leave a marker in the + * output */ + debug_daemon_state.overlapped = 1; + ap = NULL; + format = "DEBUG MARKER: Debug buffer overlapped\n"; + } else /* More space just became available */ + debug_daemon_state.overlapped = 0; + } + + max_nob = debug_size - debug_off + DEBUG_OVERFLOW; + if (max_nob <= 0) { + spin_unlock_irqrestore(&portals_debug_lock, flags); + printk("logic error in portals_debug_msg: <0 bytes to write\n"); + return; + } + + /* NB since we pass a non-zero sized buffer (at least) on the first + * print, we can be assured that by the end of all the snprinting, + * we _do_ have a terminated buffer, even if our message got truncated. + */ + + do_gettimeofday(&tv); + + prefix_nob = snprintf(debug_buf + debug_off, max_nob, + "%02x:%06x:%d:%lu.%06lu ", + subsys >> 24, mask, smp_processor_id(), + tv.tv_sec, tv.tv_usec); + max_nob -= prefix_nob; + +#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) + msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, + "(%s:%d:%s() %d | %d+%lu): ", + file, line, fn, current->pid, + current->thread.extern_pid, stack); +#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, + "(%s:%d:%s() %d | %d+%lu): ", + file, line, fn, current->pid, + current->thread.mode.tt.extern_pid, stack); +#else + msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, + "(%s:%d:%s() %d+%lu): ", + file, line, fn, current->pid, stack); +#endif + max_nob -= msg_nob; + + va_start(ap, format); + msg_nob += vsnprintf(debug_buf + debug_off + prefix_nob + msg_nob, + max_nob, format, ap); + max_nob -= msg_nob; + va_end(ap); + + /* Print to console, while msg is 
contiguous in debug_buf */ + /* NB safely terminated see above */ + if ((mask & D_EMERG) != 0) + printk(KERN_EMERG "%s", debug_buf + debug_off + prefix_nob); + if ((mask & D_ERROR) != 0) + printk(KERN_ERR "%s", debug_buf + debug_off + prefix_nob); + else if (portal_printk) + printk("<%d>%s", portal_printk, debug_buf+debug_off+prefix_nob); + base_offset = debug_off & 0xFFFF; + + debug_off += prefix_nob + msg_nob; + if (debug_off > debug_size) { + memcpy(debug_buf, debug_buf + debug_size, + debug_off - debug_size + 1); + debug_off -= debug_size; + debug_wrapped = 1; + } + + atomic_set(&debug_off_a, debug_off); + if (!atomic_read(&debug_daemon_state.paused) && + ((base_offset+prefix_nob+msg_nob) >= DAEMON_SND_SIZE)) { + debug_daemon_state.daemon_event = 1; + wake_up(&debug_daemon_state.daemon); + } +out: + spin_unlock_irqrestore(&portals_debug_lock, flags); +} + +void portals_debug_set_level(unsigned int debug_level) +{ + printk("Setting portals debug level to %08x\n", debug_level); + portal_debug = debug_level; +} + +void portals_run_lbug_upcall(char * file, char *fn, int line) +{ + char *argv[6]; + char *envp[3]; + char buf[32]; + int rc; + + ENTRY; + snprintf (buf, sizeof buf, "%d", line); + + argv[0] = portals_upcall; + argv[1] = "LBUG"; + argv[2] = file; + argv[3] = fn; + argv[4] = buf; + argv[5] = NULL; + + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + rc = call_usermodehelper(argv[0], argv, envp); + if (rc < 0) { + CERROR("Error invoking lbug upcall %s %s %s %s %s: %d; check " + "/proc/sys/portals/upcall\n", + argv[0], argv[1], argv[2], argv[3], argv[4], rc); + + } else { + CERROR("Invoked upcall %s %s %s %s %s\n", + argv[0], argv[1], argv[2], argv[3], argv[4]); + } +} + + +EXPORT_SYMBOL(portals_debug_dumplog); +EXPORT_SYMBOL(portals_debug_msg); +EXPORT_SYMBOL(portals_debug_set_level); +EXPORT_SYMBOL(portals_run_lbug_upcall); diff --git a/lnet/libcfs/module.c b/lnet/libcfs/module.c new file mode 100644 index 
0000000..5e3fcb5 --- /dev/null +++ b/lnet/libcfs/module.c @@ -0,0 +1,574 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_PORTALS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define PORTAL_MINOR 240 + +extern void (kping_client)(struct portal_ioctl_data *); + +struct nal_cmd_handler { + nal_cmd_handler_t nch_handler; + void * nch_private; +}; + +static struct nal_cmd_handler nal_cmd[NAL_MAX_NR + 1]; +struct semaphore nal_cmd_sem; + +#ifdef PORTAL_DEBUG +void +kportal_assertion_failed (char *expr, char *file, char *func, int line) +{ + portals_debug_msg(0, D_EMERG, file, func, line, CDEBUG_STACK(), + "ASSERTION(%s) failed\n", expr); + LBUG_WITH_LOC(file, func, line); +} +#endif + +void +kportal_daemonize (char *str) +{ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63)) + daemonize(str); +#else + daemonize(); + snprintf (current->comm, sizeof (current->comm), "%s", 
str); +#endif +} + +void +kportal_blockallsigs () +{ + unsigned long flags; + + SIGNAL_MASK_LOCK(current, flags); + sigfillset(¤t->blocked); + RECALC_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, flags); +} + +/* called when opening /dev/device */ +static int kportal_psdev_open(struct inode * inode, struct file * file) +{ + ENTRY; + + if (!inode) + RETURN(-EINVAL); + PORTAL_MODULE_USE; + RETURN(0); +} + +/* called when closing /dev/device */ +static int kportal_psdev_release(struct inode * inode, struct file * file) +{ + ENTRY; + + if (!inode) + RETURN(-EINVAL); + + PORTAL_MODULE_UNUSE; + RETURN(0); +} + +static inline void freedata(void *data, int len) +{ + PORTAL_FREE(data, len); +} + +static int +kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, + ptl_nid_t hi_nid) +{ + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET (kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_add_route (gateway_nalid, gateway_nid, lo_nid, hi_nid); + + PORTAL_SYMBOL_PUT(kpr_control_interface); + return (rc); +} + +static int +kportal_del_route(ptl_nid_t target) +{ + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_del_route (target); + + PORTAL_SYMBOL_PUT(kpr_control_interface); + return (rc); +} + +static int +kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp, + ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp) +{ + int gateway_nalid; + ptl_nid_t gateway_nid; + ptl_nid_t lo_nid; + ptl_nid_t hi_nid; + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET(kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, &lo_nid, + &hi_nid); + + if (rc == 0) { + CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64"\n", + index, gateway_nalid, 
gateway_nid, lo_nid, hi_nid); + + *gateway_nalidp = (__u32)gateway_nalid; + *gateway_nidp = (__u32)gateway_nid; + *lo_nidp = (__u32)lo_nid; + *hi_nidp = (__u32)hi_nid; + } + + PORTAL_SYMBOL_PUT (kpr_control_interface); + return (rc); +} + +static int +kportal_nal_cmd(int nal, struct portal_ioctl_data *data) +{ + int rc = -EINVAL; + + ENTRY; + + down(&nal_cmd_sem); + if (nal > 0 && nal <= NAL_MAX_NR && nal_cmd[nal].nch_handler) { + CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, data->ioc_nal_cmd); + rc = nal_cmd[nal].nch_handler(data, nal_cmd[nal].nch_private); + } + up(&nal_cmd_sem); + RETURN(rc); +} + +ptl_handle_ni_t * +kportal_get_ni (int nal) +{ + + switch (nal) + { + case QSWNAL: + return (PORTAL_SYMBOL_GET(kqswnal_ni)); + case SOCKNAL: + return (PORTAL_SYMBOL_GET(ksocknal_ni)); + case TOENAL: + return (PORTAL_SYMBOL_GET(ktoenal_ni)); + case GMNAL: + return (PORTAL_SYMBOL_GET(kgmnal_ni)); + case TCPNAL: + /* userspace NAL */ + return (NULL); + case SCIMACNAL: + return (PORTAL_SYMBOL_GET(kscimacnal_ni)); + default: + /* A warning to a naive caller */ + CERROR ("unknown nal: %d\n", nal); + return (NULL); + } +} + +void +kportal_put_ni (int nal) +{ + + switch (nal) + { + case QSWNAL: + PORTAL_SYMBOL_PUT(kqswnal_ni); + break; + case SOCKNAL: + PORTAL_SYMBOL_PUT(ksocknal_ni); + break; + case TOENAL: + PORTAL_SYMBOL_PUT(ktoenal_ni); + break; + case GMNAL: + PORTAL_SYMBOL_PUT(kgmnal_ni); + break; + case TCPNAL: + /* A lesson to a malicious caller */ + LBUG (); + case SCIMACNAL: + PORTAL_SYMBOL_PUT(kscimacnal_ni); + break; + default: + CERROR ("unknown nal: %d\n", nal); + } +} + +int +kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private) +{ + int rc = 0; + + CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler); + + if (nal > 0 && nal <= NAL_MAX_NR) { + down(&nal_cmd_sem); + if (nal_cmd[nal].nch_handler != NULL) + rc = -EBUSY; + else { + nal_cmd[nal].nch_handler = handler; + nal_cmd[nal].nch_private = private; + } + 
up(&nal_cmd_sem); + } + return rc; +} + +int +kportal_nal_unregister(int nal) +{ + int rc = 0; + + CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal); + + if (nal > 0 && nal <= NAL_MAX_NR) { + down(&nal_cmd_sem); + nal_cmd[nal].nch_handler = NULL; + nal_cmd[nal].nch_private = NULL; + up(&nal_cmd_sem); + } + return rc; +} + + +static int kportal_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int err = 0; + char buf[1024]; + struct portal_ioctl_data *data; + + ENTRY; + + if ( _IOC_TYPE(cmd) != IOC_PORTAL_TYPE || + _IOC_NR(cmd) < IOC_PORTAL_MIN_NR || + _IOC_NR(cmd) > IOC_PORTAL_MAX_NR ) { + CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", + _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); + RETURN(-EINVAL); + } + + if (portal_ioctl_getdata(buf, buf + 800, (void *)arg)) { + CERROR("PORTALS ioctl: data error\n"); + RETURN(-EINVAL); + } + + data = (struct portal_ioctl_data *)buf; + + switch (cmd) { + case IOC_PORTAL_SET_DAEMON: + RETURN (portals_debug_set_daemon ( + (unsigned int) data->ioc_count, + (unsigned int) data->ioc_inllen1, + (char *) data->ioc_inlbuf1, + (unsigned int) data->ioc_misc)); + case IOC_PORTAL_GET_DEBUG: { + __s32 size = portals_debug_copy_to_user(data->ioc_pbuf1, + data->ioc_plen1); + + if (size < 0) + RETURN(size); + + data->ioc_size = size; + err = copy_to_user((char *)arg, data, sizeof(*data)); + RETURN(err); + } + case IOC_PORTAL_CLEAR_DEBUG: + portals_debug_clear_buffer(); + RETURN(0); + case IOC_PORTAL_PANIC: + if (!capable (CAP_SYS_BOOT)) + RETURN (-EPERM); + panic("debugctl-invoked panic"); + RETURN(0); + case IOC_PORTAL_MARK_DEBUG: + if (data->ioc_inlbuf1 == NULL || + data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') + RETURN(-EINVAL); + portals_debug_mark_buffer(data->ioc_inlbuf1); + RETURN(0); + case IOC_PORTAL_PING: { + void (*ping)(struct portal_ioctl_data *); + + CDEBUG(D_IOCTL, "doing %d pings to nid "LPU64"\n", + data->ioc_count, data->ioc_nid); + ping = PORTAL_SYMBOL_GET(kping_client); 
+ if (!ping) + CERROR("PORTAL_SYMBOL_GET failed\n"); + else { + ping(data); + PORTAL_SYMBOL_PUT(kping_client); + } + RETURN(0); + } + + case IOC_PORTAL_ADD_ROUTE: + CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n", + data->ioc_nal, data->ioc_nid, data->ioc_nid2, + data->ioc_nid3); + err = kportal_add_route(data->ioc_nal, data->ioc_nid, + MIN (data->ioc_nid2, data->ioc_nid3), + MAX (data->ioc_nid2, data->ioc_nid3)); + break; + + case IOC_PORTAL_DEL_ROUTE: + CDEBUG (D_IOCTL, "Removing route to "LPU64"\n", data->ioc_nid); + err = kportal_del_route (data->ioc_nid); + break; + + case IOC_PORTAL_GET_ROUTE: + CDEBUG (D_IOCTL, "Getting route [%d]\n", data->ioc_count); + err = kportal_get_route(data->ioc_count, &data->ioc_nal, + &data->ioc_nid, &data->ioc_nid2, + &data->ioc_nid3); + if (err == 0) + if (copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + + case IOC_PORTAL_GET_NID: { + const ptl_handle_ni_t *nip; + ptl_process_id_t pid; + + CDEBUG (D_IOCTL, "Getting nid [%d]\n", data->ioc_nal); + + nip = kportal_get_ni (data->ioc_nal); + if (nip == NULL) + RETURN (-EINVAL); + + err = PtlGetId (*nip, &pid); + LASSERT (err == PTL_OK); + kportal_put_ni (data->ioc_nal); + + data->ioc_nid = pid.nid; + if (copy_to_user ((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + } + + case IOC_PORTAL_NAL_CMD: + CDEBUG (D_IOCTL, "nal command nal %d cmd %d\n", data->ioc_nal, + data->ioc_nal_cmd); + err = kportal_nal_cmd(data->ioc_nal, data); + if (err == 0) + if (copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + + case IOC_PORTAL_FAIL_NID: { + const ptl_handle_ni_t *nip; + + CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n", + data->ioc_nal, data->ioc_nid, data->ioc_count); + + nip = kportal_get_ni (data->ioc_nal); + if (nip == NULL) + return (-EINVAL); + + err = PtlFailNid (*nip, data->ioc_nid, data->ioc_count); + break; + } + + default: + err = -EINVAL; + break; + } + + RETURN(err); +} + + +static struct 
file_operations portalsdev_fops = { + ioctl: kportal_ioctl, + open: kportal_psdev_open, + release: kportal_psdev_release +}; + + +static struct miscdevice portal_dev = { + PORTAL_MINOR, + "portals", + &portalsdev_fops +}; + +extern int insert_proc(void); +extern void remove_proc(void); +MODULE_AUTHOR("Peter J. Braam "); +MODULE_DESCRIPTION("Portals v3.1"); +MODULE_LICENSE("GPL"); + +static int init_kportals_module(void) +{ + int rc; + + rc = portals_debug_init(5 * 1024 * 1024); + if (rc < 0) { + printk(KERN_ERR "portals_debug_init: %d\n", rc); + return (rc); + } + + sema_init(&nal_cmd_sem, 1); + + rc = misc_register(&portal_dev); + if (rc) { + CERROR("misc_register: error %d\n", rc); + goto cleanup_debug; + } + + rc = PtlInit(); + if (rc) { + CERROR("PtlInit: error %d\n", rc); + goto cleanup_deregister; + } + + rc = insert_proc(); + if (rc) { + CERROR("insert_proc: error %d\n", rc); + goto cleanup_fini; + } + + CDEBUG (D_OTHER, "portals setup OK\n"); + return (0); + + cleanup_fini: + PtlFini(); + cleanup_deregister: + misc_deregister(&portal_dev); + cleanup_debug: + portals_debug_cleanup(); + return rc; +} + +static void exit_kportals_module(void) +{ + int rc; + + remove_proc(); + PtlFini(); + + CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", + atomic_read(&portal_kmemory)); + + + rc = misc_deregister(&portal_dev); + if (rc) + CERROR("misc_deregister error %d\n", rc); + + if (atomic_read(&portal_kmemory) != 0) + CERROR("Portals memory leaked: %d bytes\n", + atomic_read(&portal_kmemory)); + + rc = portals_debug_cleanup(); + if (rc) + printk(KERN_ERR "portals_debug_cleanup: %d\n", rc); +} + +EXPORT_SYMBOL(lib_dispatch); +EXPORT_SYMBOL(PtlMEAttach); +EXPORT_SYMBOL(PtlMEInsert); +EXPORT_SYMBOL(PtlMEUnlink); +EXPORT_SYMBOL(PtlEQAlloc); +EXPORT_SYMBOL(PtlMDAttach); +EXPORT_SYMBOL(PtlMDUnlink); +EXPORT_SYMBOL(PtlNIInit); +EXPORT_SYMBOL(PtlNIFini); +EXPORT_SYMBOL(PtlNIDebug); +EXPORT_SYMBOL(PtlInit); +EXPORT_SYMBOL(PtlFini); +EXPORT_SYMBOL(PtlPut); 
+EXPORT_SYMBOL(PtlGet); +EXPORT_SYMBOL(ptl_err_str); +EXPORT_SYMBOL(portal_subsystem_debug); +EXPORT_SYMBOL(portal_debug); +EXPORT_SYMBOL(portal_stack); +EXPORT_SYMBOL(portal_printk); +EXPORT_SYMBOL(PtlEQWait); +EXPORT_SYMBOL(PtlEQFree); +EXPORT_SYMBOL(PtlEQGet); +EXPORT_SYMBOL(PtlGetId); +EXPORT_SYMBOL(PtlMDBind); +EXPORT_SYMBOL(lib_iov_nob); +EXPORT_SYMBOL(lib_copy_iov2buf); +EXPORT_SYMBOL(lib_copy_buf2iov); +EXPORT_SYMBOL(lib_kiov_nob); +EXPORT_SYMBOL(lib_copy_kiov2buf); +EXPORT_SYMBOL(lib_copy_buf2kiov); +EXPORT_SYMBOL(lib_finalize); +EXPORT_SYMBOL(lib_parse); +EXPORT_SYMBOL(lib_init); +EXPORT_SYMBOL(lib_fini); +EXPORT_SYMBOL(portal_kmemory); +EXPORT_SYMBOL(kportal_daemonize); +EXPORT_SYMBOL(kportal_blockallsigs); +EXPORT_SYMBOL(kportal_nal_register); +EXPORT_SYMBOL(kportal_nal_unregister); +EXPORT_SYMBOL(kportal_assertion_failed); +EXPORT_SYMBOL(dispatch_name); +EXPORT_SYMBOL(kportal_get_ni); +EXPORT_SYMBOL(kportal_put_ni); + +module_init(init_kportals_module); +module_exit (exit_kportals_module); diff --git a/lnet/libcfs/proc.c b/lnet/libcfs/proc.c new file mode 100644 index 0000000..2fa739a --- /dev/null +++ b/lnet/libcfs/proc.c @@ -0,0 +1,290 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +# define DEBUG_SUBSYSTEM S_PORTALS + +#include +#include + +static struct ctl_table_header *portals_table_header = NULL; +extern char debug_file_path[1024]; +extern char debug_daemon_file_path[1024]; +extern char portals_upcall[1024]; + +#define PSDEV_PORTALS (0x100) +#define PSDEV_DEBUG 1 /* control debugging */ +#define PSDEV_SUBSYSTEM_DEBUG 2 /* control debugging */ +#define PSDEV_PRINTK 3 /* force all errors to console */ +#define PSDEV_DEBUG_PATH 4 /* crashdump log location */ +#define PSDEV_DEBUG_DUMP_PATH 5 /* crashdump tracelog location */ +#define PSDEV_PORTALS_UPCALL 6 /* User mode upcall script */ + +#define PORTALS_PRIMARY_CTLCNT 6 +static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = { + {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL, + &proc_dointvec}, + {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &portal_subsystem_debug, + sizeof(int), 0644, NULL, &proc_dointvec}, + {PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL, + &proc_dointvec}, + {PSDEV_DEBUG_PATH, "debug_path", debug_file_path, + sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string}, + {PSDEV_DEBUG_DUMP_PATH, "debug_daemon_path", debug_daemon_file_path, + sizeof(debug_daemon_file_path), 0644, NULL, &proc_dostring, + &sysctl_string}, + {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall, + sizeof(portals_upcall), 0644, NULL, &proc_dostring, + &sysctl_string}, + {0} +}; + +static struct ctl_table top_table[2] = { + {PSDEV_PORTALS, "portals", NULL, 0, 0555, portals_table}, + {0} +}; + + +#ifdef 
PORTALS_PROFILING +/* + * profiling stuff. we do this statically for now 'cause its simple, + * but we could do some tricks with elf sections to have this array + * automatically built. + */ +#define def_prof(FOO) [PROF__##FOO] = {#FOO, 0, } + +struct prof_ent prof_ents[] = { + def_prof(our_recvmsg), + def_prof(our_sendmsg), + def_prof(socknal_recv), + def_prof(lib_parse), + def_prof(conn_list_walk), + def_prof(memcpy), + def_prof(lib_finalize), + def_prof(pingcli_time), + def_prof(gmnal_send), + def_prof(gmnal_recv), +}; + +EXPORT_SYMBOL(prof_ents); + +/* + * this function is as crazy as the proc filling api + * requires. + * + * buffer: page allocated for us to scribble in. the + * data returned to the user will be taken from here. + * *start: address of the pointer that will tell the + * caller where in buffer the data the user wants is. + * ppos: offset in the entire /proc file that the user + * currently wants. + * wanted: the amount of data the user wants. + * + * while going, 'curpos' is the offset in the entire + * file where we currently are. We only actually + * start filling buffer when we get to a place in + * the file that the user cares about. + * + * we take care to only sprintf when the user cares because + * we're holding a lock while we do this. + * + * we're smart and know that we generate fixed size lines. + * we only start writing to the buffer when the user cares. + * This is unpredictable because we don't snapshot the + * list between calls that are filling in a file from + * the list. The list could change mid read and the + * output will look very weird indeed. oh well. 
+ */ + +static int prof_read_proc(char *buffer, char **start, off_t ppos, int wanted, + int *eof, void *data) +{ + int len = 0, i; + int curpos; + char *header = "Interval Cycles_per (Starts Finishes Total)\n"; + int header_len = strlen(header); + char *format = "%-15s %.12Ld (%.12d %.12d %.12Ld)"; + int line_len = (15 + 1 + 12 + 2 + 12 + 1 + 12 + 1 + 12 + 1); + + *start = buffer; + + if (ppos < header_len) { + int diff = MIN(header_len, wanted); + memcpy(buffer, header + ppos, diff); + len += diff; + ppos += diff; + } + + if (len >= wanted) + goto out; + + curpos = header_len; + + for ( i = 0; i < MAX_PROFS ; i++) { + int copied; + struct prof_ent *pe = &prof_ents[i]; + long long cycles_per; + /* + * find the part of the array that the buffer wants + */ + if (ppos >= (curpos + line_len)) { + curpos += line_len; + continue; + } + /* the clever caller split a line */ + if (ppos > curpos) { + *start = buffer + (ppos - curpos); + } + + if (pe->finishes == 0) + cycles_per = 0; + else + { + cycles_per = pe->total_cycles; + do_div (cycles_per, pe->finishes); + } + + copied = sprintf(buffer + len, format, pe->str, cycles_per, + pe->starts, pe->finishes, pe->total_cycles); + + len += copied; + + /* pad to line len, -1 for \n */ + if ((copied < line_len-1)) { + int diff = (line_len-1) - copied; + memset(buffer + len, ' ', diff); + len += diff; + copied += diff; + } + + buffer[len++]= '\n'; + + /* bail if we have enough */ + if (((buffer + len) - *start) >= wanted) + break; + + curpos += line_len; + } + + /* lameness */ + if (i == MAX_PROFS) + *eof = 1; + out: + + return MIN(((buffer + len) - *start), wanted); +} + +/* + * all kids love /proc :/ + */ +static unsigned char basedir[]="net/portals"; +#endif /* PORTALS_PROFILING */ + +int insert_proc(void) +{ +#if PORTALS_PROFILING + unsigned char dir[128]; + struct proc_dir_entry *ent; + + if (ARRAY_SIZE(prof_ents) != MAX_PROFS) { + CERROR("profiling enum and array are out of sync.\n"); + return -1; + } + + /* + * This is 
pretty lame. assuming that failure just + * means that they already existed. + */ + strcat(dir, basedir); + create_proc_entry(dir, S_IFDIR, 0); + + strcat(dir, "/cycles"); + ent = create_proc_entry(dir, 0, 0); + if (!ent) { + CERROR("couldn't register %s?\n", dir); + return -1; + } + + ent->data = NULL; + ent->read_proc = prof_read_proc; +#endif /* PORTALS_PROFILING */ + +#ifdef CONFIG_SYSCTL + if (!portals_table_header) + portals_table_header = register_sysctl_table(top_table, 0); +#endif + + return 0; +} + +void remove_proc(void) +{ +#if PORTALS_PROFILING + unsigned char dir[128]; + int end; + + dir[0]='\0'; + strcat(dir, basedir); + + end = strlen(dir); + + strcat(dir, "/cycles"); + remove_proc_entry(dir,0); + + dir[end] = '\0'; + remove_proc_entry(dir,0); +#endif /* PORTALS_PROFILING */ + +#ifdef CONFIG_SYSCTL + if (portals_table_header) + unregister_sysctl_table(portals_table_header); + portals_table_header = NULL; +#endif +} diff --git a/lnet/lnet/.cvsignore b/lnet/lnet/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lnet/lnet/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lnet/lnet/Makefile.am b/lnet/lnet/Makefile.am new file mode 100644 index 0000000..8c03749 --- /dev/null +++ b/lnet/lnet/Makefile.am @@ -0,0 +1,10 @@ +# Copyright (C) 2002 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + + +CPPFLAGS= +INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include +lib_LIBRARIES= libportals.a +libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c diff --git a/lnet/lnet/Makefile.mk b/lnet/lnet/Makefile.mk new file mode 100644 index 0000000..5627ef7 --- /dev/null +++ b/lnet/lnet/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. 
+# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../Kernelenv + +obj-y += portals.o +portals-objs := lib-dispatch.o lib-eq.o lib-init.o lib-md.o lib-me.o lib-move.o lib-msg.o lib-ni.o lib-not-impl.o lib-pid.o api-eq.o api-errno.o api-init.o api-md.o api-me.o api-ni.o api-wrap.o diff --git a/lnet/lnet/api-eq.c b/lnet/lnet/api-eq.c new file mode 100644 index 0000000..e066619 --- /dev/null +++ b/lnet/lnet/api-eq.c @@ -0,0 +1,158 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-eq.c + * User-level event queue management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include + +int ptl_eq_init(void) +{ + /* Nothing to do anymore... */ + return PTL_OK; +} + +void ptl_eq_fini(void) +{ + /* Nothing to do anymore... */ +} + +int ptl_eq_ni_init(nal_t * nal) +{ + /* Nothing to do anymore... */ + return PTL_OK; +} + +void ptl_eq_ni_fini(nal_t * nal) +{ + /* Nothing to do anymore... 
*/ +} + +int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev) +{ + ptl_eq_t *eq; + int rc, new_index; + unsigned long flags; + ptl_event_t *new_event; + nal_t *nal; + ENTRY; + + if (!ptl_init) + RETURN(PTL_NOINIT); + + nal = ptl_hndl2nal(&eventq); + if (!nal) + RETURN(PTL_INV_EQ); + + eq = ptl_handle2usereq(&eventq); + nal->lock(nal, &flags); + + /* size must be a power of 2 to handle a wrapped sequence # */ + LASSERT (eq->size != 0 && + eq->size == LOWEST_BIT_SET (eq->size)); + + new_index = eq->sequence & (eq->size - 1); + new_event = &eq->base[new_index]; + CDEBUG(D_INFO, "new_event: %p, sequence: %lu, eq->size: %u\n", + new_event, eq->sequence, eq->size); + if (PTL_SEQ_GT (eq->sequence, new_event->sequence)) { + nal->unlock(nal, &flags); + RETURN(PTL_EQ_EMPTY); + } + + *ev = *new_event; + + /* Set the unlinked_me interface number if there is one to pass + * back, since the NAL hasn't a clue what it is and therefore can't + * set it. */ + if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE)) + ev->unlinked_me.nal_idx = eventq.nal_idx; + + /* ensure event is delivered correctly despite possible + races with lib_finalize */ + if (eq->sequence != new_event->sequence) { + CERROR("DROPPING EVENT: eq seq %lu ev seq %lu\n", + eq->sequence, new_event->sequence); + rc = PTL_EQ_DROPPED; + } else { + rc = PTL_OK; + } + + eq->sequence = new_event->sequence + 1; + nal->unlock(nal, &flags); + RETURN(rc); +} + + +int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out) +{ + int rc; + + /* PtlEQGet does the handle checking */ + while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) { + nal_t *nal = ptl_hndl2nal(&eventq_in); + + if (nal->yield) + nal->yield(nal); + } + + return rc; +} + +#ifndef __KERNEL__ +static jmp_buf eq_jumpbuf; + +static void eq_timeout(int signal) +{ + longjmp(eq_jumpbuf, -1); +} + +int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out, + int timeout) +{ + static void (*prev) (int); + static int left_over; + time_t 
time_at_start; + int rc; + + if (setjmp(eq_jumpbuf)) { + signal(SIGALRM, prev); + alarm(left_over - timeout); + return PTL_EQ_EMPTY; + } + + left_over = alarm(timeout); + prev = signal(SIGALRM, eq_timeout); + time_at_start = time(NULL); + if (left_over < timeout) + alarm(left_over); + + rc = PtlEQWait(eventq_in, event_out); + + signal(SIGALRM, prev); + alarm(left_over); /* Should compute how long we waited */ + + return rc; +} + +#endif + diff --git a/lnet/lnet/api-errno.c b/lnet/lnet/api-errno.c new file mode 100644 index 0000000..026c93b --- /dev/null +++ b/lnet/lnet/api-errno.c @@ -0,0 +1,55 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-errno.c + * Instantiate the string table of errors + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + */ + +/* If you change these, you must update the number table in portals/errno.h */ +const char *ptl_err_str[] = { + "PTL_OK", + "PTL_SEGV", + + "PTL_NOSPACE", + "PTL_INUSE", + "PTL_VAL_FAILED", + + "PTL_NAL_FAILED", + "PTL_NOINIT", + "PTL_INIT_DUP", + "PTL_INIT_INV", + "PTL_AC_INV_INDEX", + + "PTL_INV_ASIZE", + "PTL_INV_HANDLE", + "PTL_INV_MD", + "PTL_INV_ME", + "PTL_INV_NI", +/* If you change these, you must update the number table in portals/errno.h */ + "PTL_ILL_MD", + "PTL_INV_PROC", + "PTL_INV_PSIZE", + "PTL_INV_PTINDEX", + "PTL_INV_REG", + + "PTL_INV_SR_INDX", + "PTL_ML_TOOLONG", + "PTL_ADDR_UNKNOWN", + "PTL_INV_EQ", + "PTL_EQ_DROPPED", + + "PTL_EQ_EMPTY", + "PTL_NOUPDATE", + "PTL_FAIL", + "PTL_NOT_IMPLEMENTED", + "PTL_NO_ACK", + + "PTL_IOV_TOO_MANY", + "PTL_IOV_TOO_SMALL", + + "PTL_EQ_INUSE", + "PTL_MD_INUSE" +}; +/* If you change these, you must update the number table in portals/errno.h */ diff --git a/lnet/lnet/api-init.c b/lnet/lnet/api-init.c new file mode 100644 index 0000000..e59c922 --- /dev/null +++ b/lnet/lnet/api-init.c @@ -0,0 +1,71 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * 
vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-init.c + * Initialization and global data for the p30 user side library + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include + +int ptl_init; +unsigned int portal_subsystem_debug = 0xfff7e3ff; +unsigned int portal_debug = ~0; +unsigned int portal_printk; +unsigned int portal_stack; + +#ifdef __KERNEL__ +atomic_t portal_kmemory = ATOMIC_INIT(0); +#endif + +int __p30_initialized; +int __p30_myr_initialized; +int __p30_ip_initialized; +ptl_handle_ni_t __myr_ni_handle; +ptl_handle_ni_t __ip_ni_handle; + +int __p30_myr_timeout = 10; +int __p30_ip_timeout; + +int PtlInit(void) +{ + + if (ptl_init) + return PTL_OK; + + ptl_ni_init(); + ptl_me_init(); + ptl_eq_init(); + ptl_init = 1; + __p30_initialized = 1; + + return PTL_OK; +} + + +void PtlFini(void) +{ + + /* Reverse order of initialization */ + ptl_eq_fini(); + ptl_me_fini(); + ptl_ni_fini(); + ptl_init = 0; +} diff --git a/lnet/lnet/api-me.c b/lnet/lnet/api-me.c new file mode 100644 index 0000000..e724e58 --- /dev/null +++ b/lnet/lnet/api-me.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-me.c + * Match 
Entry local operations. + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include + +int ptl_me_init(void) +{ + return PTL_OK; +} +void ptl_me_fini(void) +{ /* Nothing to do */ +} +int ptl_me_ni_init(nal_t * nal) +{ + return PTL_OK; +} + +void ptl_me_ni_fini(nal_t * nal) +{ /* Nothing to do... */ +} diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c new file mode 100644 index 0000000..b2e069e --- /dev/null +++ b/lnet/lnet/api-ni.c @@ -0,0 +1,197 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-ni.c + * Network Interface code + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include + +/* Put some magic in the NI handle so uninitialised/zeroed handles are easy + * to spot */ +#define NI_HANDLE_MAGIC 0xebc0de00 +#define NI_HANDLE_MASK 0x000000ff +#define MAX_NIS 8 +static nal_t *ptl_interfaces[MAX_NIS]; +int ptl_num_interfaces = 0; + +nal_t *ptl_hndl2nal(ptl_handle_any_t *handle) +{ + unsigned int idx = handle->nal_idx; + + /* XXX we really rely on the caller NOT racing with interface + * setup/teardown. That ensures her NI handle can't get + * invalidated out from under her (or worse, swapped for a + * completely different interface!) */ + + if (((idx ^ NI_HANDLE_MAGIC) & ~NI_HANDLE_MASK) != 0) + return NULL; + + idx &= NI_HANDLE_MASK; + if (idx < MAX_NIS) + return ptl_interfaces[idx]; + + return NULL; +} + +int ptl_ni_init(void) +{ + int i; + + LASSERT (MAX_NIS <= (NI_HANDLE_MASK + 1)); + + for (i = 0; i < MAX_NIS; i++) + ptl_interfaces[i] = NULL; + + return PTL_OK; +} + +void ptl_ni_fini(void) +{ + int i; + + for (i = 0; i < MAX_NIS; i++) { + nal_t *nal = ptl_interfaces[i]; + if (!nal) + continue; + + if (nal->shutdown) + nal->shutdown(nal, i); + } +} + +#ifdef __KERNEL__ +DECLARE_MUTEX(ptl_ni_init_mutex); + +static void ptl_ni_init_mutex_enter (void) +{ + down (&ptl_ni_init_mutex); +} + +static void ptl_ni_init_mutex_exit (void) +{ + up (&ptl_ni_init_mutex); +} + +#else +static void ptl_ni_init_mutex_enter (void) +{ +} + +static void ptl_ni_init_mutex_exit (void) +{ +} + +#endif + +int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, ptl_pid_t requested_pid, + ptl_handle_ni_t * handle) +{ + nal_t *nal; + int i; + + if (!ptl_init) + return PTL_NOINIT; + + ptl_ni_init_mutex_enter (); + + nal = interface(ptl_num_interfaces, ptl_size, 
acl_size, requested_pid); + + if (!nal) { + ptl_ni_init_mutex_exit (); + return PTL_NAL_FAILED; + } + + for (i = 0; i < ptl_num_interfaces; i++) { + if (ptl_interfaces[i] == nal) { + nal->refct++; + handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | i; + fprintf(stderr, "Returning existing NAL (%d)\n", i); + ptl_ni_init_mutex_exit (); + return PTL_OK; + } + } + nal->refct = 1; + + if (ptl_num_interfaces >= MAX_NIS) { + if (nal->shutdown) + nal->shutdown (nal, ptl_num_interfaces); + ptl_ni_init_mutex_exit (); + return PTL_NOSPACE; + } + + handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | ptl_num_interfaces; + ptl_interfaces[ptl_num_interfaces++] = nal; + + ptl_eq_ni_init(nal); + ptl_me_ni_init(nal); + + ptl_ni_init_mutex_exit (); + return PTL_OK; +} + + +int PtlNIFini(ptl_handle_ni_t ni) +{ + nal_t *nal; + int idx; + int rc; + + if (!ptl_init) + return PTL_NOINIT; + + ptl_ni_init_mutex_enter (); + + nal = ptl_hndl2nal (&ni); + if (nal == NULL) { + ptl_ni_init_mutex_exit (); + return PTL_INV_HANDLE; + } + + idx = ni.nal_idx & NI_HANDLE_MASK; + + nal->refct--; + if (nal->refct > 0) { + ptl_ni_init_mutex_exit (); + return PTL_OK; + } + + ptl_me_ni_fini(nal); + ptl_eq_ni_fini(nal); + + rc = PTL_OK; + if (nal->shutdown) + rc = nal->shutdown(nal, idx); + + ptl_interfaces[idx] = NULL; + ptl_num_interfaces--; + + ptl_ni_init_mutex_exit (); + return rc; +} + +int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * ni_out) +{ + *ni_out = handle_in; + + return PTL_OK; +} diff --git a/lnet/lnet/api-wrap.c b/lnet/lnet/api-wrap.c new file mode 100644 index 0000000..e54707f --- /dev/null +++ b/lnet/lnet/api-wrap.c @@ -0,0 +1,599 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-wrap.c + * User-level wrappers that dispatch across the protection boundaries + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. 
+ * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +# define DEBUG_SUBSYSTEM S_PORTALS +#include + +static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf, + int argsize, void *retbuf, int retsize) +{ + nal_t *nal; + + if (!ptl_init) { + fprintf(stderr, "PtlGetId: Not initialized\n"); + return PTL_NOINIT; + } + + nal = ptl_hndl2nal(&any_h); + if (!nal) + return PTL_INV_HANDLE; + + nal->forward(nal, cmd, argbuf, argsize, retbuf, retsize); + + return PTL_OK; +} + +int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id) +{ + PtlGetId_in args; + PtlGetId_out ret; + int rc; + + args.handle_in = ni_handle; + + rc = do_forward(ni_handle, PTL_GETID, &args, sizeof(args), &ret, + sizeof(ret)); + if (rc != PTL_OK) + return rc; + + if (id) + *id = ret.id_out; + + return ret.rc; +} + +int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold) +{ + PtlFailNid_in args; + PtlFailNid_out ret; + int rc; + + args.interface = interface; + args.nid = nid; + args.threshold = threshold; + + rc = do_forward (interface, PTL_FAILNID, + &args, sizeof(args), &ret, sizeof (ret)); + + return ((rc != PTL_OK) ? 
rc : ret.rc); +} + +int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, + ptl_sr_value_t * status_out) +{ + PtlNIStatus_in args; + PtlNIStatus_out ret; + int rc; + + args.interface_in = interface_in; + args.register_in = register_in; + + rc = do_forward(interface_in, PTL_NISTATUS, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + if (status_out) + *status_out = ret.status_out; + + return ret.rc; +} + +int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, + unsigned long *distance_out) +{ + PtlNIDist_in args; + PtlNIDist_out ret; + int rc; + + args.interface_in = interface_in; + args.process_in = process_in; + + rc = do_forward(interface_in, PTL_NIDIST, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + if (distance_out) + *distance_out = ret.distance_out; + + return ret.rc; +} + + + +unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in) +{ + PtlNIDebug_in args; + PtlNIDebug_out ret; + int rc; + + args.mask_in = mask_in; + + rc = do_forward(ni, PTL_NIDEBUG, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + return ret.rc; +} + +int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in, + ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in, + ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in, + ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out) +{ + PtlMEAttach_in args; + PtlMEAttach_out ret; + int rc; + + args.interface_in = interface_in; + args.index_in = index_in; + args.match_id_in = match_id_in; + args.match_bits_in = match_bits_in; + args.ignore_bits_in = ignore_bits_in; + args.unlink_in = unlink_in; + args.position_in = pos_in; + + rc = do_forward(interface_in, PTL_MEATTACH, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + if (handle_out) { + handle_out->nal_idx = interface_in.nal_idx; + handle_out->cookie = ret.handle_out.cookie; + } + + return ret.rc; +} 
+ +int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, + ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in, + ptl_unlink_t unlink_in, ptl_ins_pos_t position_in, + ptl_handle_me_t * handle_out) +{ + PtlMEInsert_in args; + PtlMEInsert_out ret; + int rc; + + args.current_in = current_in; + args.match_id_in = match_id_in; + args.match_bits_in = match_bits_in; + args.ignore_bits_in = ignore_bits_in; + args.unlink_in = unlink_in; + args.position_in = position_in; + + rc = do_forward(current_in, PTL_MEINSERT, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc; + + if (handle_out) { + handle_out->nal_idx = current_in.nal_idx; + handle_out->cookie = ret.handle_out.cookie; + } + return ret.rc; +} + +int PtlMEUnlink(ptl_handle_me_t current_in) +{ + PtlMEUnlink_in args; + PtlMEUnlink_out ret; + int rc; + + args.current_in = current_in; + args.unlink_in = PTL_RETAIN; + + rc = do_forward(current_in, PTL_MEUNLINK, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc; + + return ret.rc; +} + +int PtlTblDump(ptl_handle_ni_t ni, int index_in) +{ + PtlTblDump_in args; + PtlTblDump_out ret; + int rc; + + args.index_in = index_in; + + rc = do_forward(ni, PTL_TBLDUMP, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + return ret.rc; +} + +int PtlMEDump(ptl_handle_me_t current_in) +{ + PtlMEDump_in args; + PtlMEDump_out ret; + int rc; + + args.current_in = current_in; + + rc = do_forward(current_in, PTL_MEDUMP, &args, sizeof(args), &ret, + sizeof(ret)); + + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? 
PTL_INV_ME : rc; + + return ret.rc; +} + +/* + * Ask the NAL that owns 'current_in' to validate the memory described by + * 'md_in' before the MD is forwarded to the library. + * + * Returns PTL_NOINIT if the API has not been initialised, PTL_INV_HANDLE + * if the handle does not resolve to a NAL, PTL_SEGV if the NAL's validate + * hook rejects any region, and 0 otherwise (including when the NAL has no + * validate hook). + */ +static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in) +{ + nal_t *nal; + int rc; + int i; + + if (!ptl_init) { + fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n"); + return PTL_NOINIT; + } + + nal = ptl_hndl2nal(&current_in); + if (!nal) + return PTL_INV_HANDLE; + + if (nal->validate != NULL) /* nal->validate not a NOOP */ + { + if ((md_in.options & PTL_MD_IOV) == 0) /* contiguous */ + { + rc = nal->validate (nal, md_in.start, md_in.length); + if (rc) + return (PTL_SEGV); + } + else + { + /* discontiguous: 'start' is an iovec array of 'niov' entries */ + struct iovec *iov = (struct iovec *)md_in.start; + + for (i = 0; i < md_in.niov; i++, iov++) + { + rc = nal->validate (nal, iov->iov_base, iov->iov_len); + if (rc) + return (PTL_SEGV); + } + } + } + + return 0; +} + +static ptl_handle_eq_t md2eq (ptl_md_t *md) +{ + if (PtlHandleEqual (md->eventq, PTL_EQ_NONE)) + return (PTL_EQ_NONE); + + return (ptl_handle2usereq (&md->eventq)->cb_eq_handle); +} + + +int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in, + ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out) +{ + PtlMDAttach_in args; + PtlMDAttach_out ret; + int rc; + + rc = validate_md(me_in, md_in); + if (rc == PTL_OK) { + args.eq_in = md2eq(&md_in); + args.me_in = me_in; + args.md_in = md_in; + args.unlink_in = unlink_in; + + rc = do_forward(me_in, PTL_MDATTACH, + &args, sizeof(args), &ret, sizeof(ret)); + } + + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? 
PTL_INV_ME : rc; + + if (handle_out) { + handle_out->nal_idx = me_in.nal_idx; + handle_out->cookie = ret.handle_out.cookie; + } + return ret.rc; +} + + + +int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, + ptl_handle_md_t * handle_out) +{ + PtlMDBind_in args; + PtlMDBind_out ret; + int rc; + + rc = validate_md(ni_in, md_in); + if (rc != PTL_OK) + return rc; + + args.eq_in = md2eq(&md_in); + args.ni_in = ni_in; + args.md_in = md_in; + + rc = do_forward(ni_in, PTL_MDBIND, + &args, sizeof(args), &ret, sizeof(ret)); + + if (rc != PTL_OK) + return rc; + + if (handle_out) { + handle_out->nal_idx = ni_in.nal_idx; + handle_out->cookie = ret.handle_out.cookie; + } + return ret.rc; +} + +int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout, + ptl_md_t *new_inout, ptl_handle_eq_t testq_in) +{ + PtlMDUpdate_internal_in args; + PtlMDUpdate_internal_out ret; + int rc; + + args.md_in = md_in; + + if (old_inout) { + args.old_inout = *old_inout; + args.old_inout_valid = 1; + } else + args.old_inout_valid = 0; + + if (new_inout) { + rc = validate_md (md_in, *new_inout); + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc; + args.new_inout = *new_inout; + args.new_inout_valid = 1; + } else + args.new_inout_valid = 0; + + if (PtlHandleEqual (testq_in, PTL_EQ_NONE)) { + args.testq_in = PTL_EQ_NONE; + args.sequence_in = -1; + } else { + ptl_eq_t *eq = ptl_handle2usereq (&testq_in); + + args.testq_in = eq->cb_eq_handle; + args.sequence_in = eq->sequence; + } + + rc = do_forward(md_in, PTL_MDUPDATE, &args, sizeof(args), &ret, + sizeof(ret)); + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc; + + if (old_inout) + *old_inout = ret.old_inout; + + return ret.rc; +} + +int PtlMDUnlink(ptl_handle_md_t md_in) +{ + PtlMDUnlink_in args; + PtlMDUnlink_out ret; + int rc; + + args.md_in = md_in; + rc = do_forward(md_in, PTL_MDUNLINK, &args, sizeof(args), &ret, + sizeof(ret)); + if (rc != PTL_OK) + return (rc == PTL_INV_HANDLE) ? 
PTL_INV_MD : rc; + + return ret.rc; +} + +/* + * Allocate an event queue of at least 'count' entries (rounded up to a + * power of 2) and register its event buffer with the library. + * + * Returns PTL_NOINIT if the API is uninitialised, PTL_INV_HANDLE if + * 'interface' names no NAL, PTL_VAL_FAILED if 'count' is 0 or overflows + * on roundup, PTL_NOSPACE on allocation failure, or the error reported by + * the NAL's validate hook or by forwarding. On success *handle_out + * identifies the new EQ and PTL_OK is returned. + */ +int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count, + int (*callback) (ptl_event_t * event), + ptl_handle_eq_t * handle_out) +{ + ptl_eq_t *eq = NULL; + ptl_event_t *ev = NULL; + PtlEQAlloc_in args; + PtlEQAlloc_out ret; + int rc, i; + nal_t *nal; + + if (!ptl_init) + return PTL_NOINIT; + + nal = ptl_hndl2nal (&interface); + if (nal == NULL) + return PTL_INV_HANDLE; + + if (count != LOWEST_BIT_SET(count)) { /* not a power of 2 already */ + do { /* knock off all but the top bit... */ + count &= ~LOWEST_BIT_SET (count); + } while (count != LOWEST_BIT_SET(count)); + + count <<= 1; /* ...and round up */ + } + + if (count == 0) /* catch bad parameter / overflow on roundup */ + return (PTL_VAL_FAILED); + + PORTAL_ALLOC(ev, count * sizeof(ptl_event_t)); + if (!ev) + return PTL_NOSPACE; + + /* Allocate the user-side EQ before forwarding: failing this allocation + * after the library had registered 'ev' would free a buffer the + * library still references and leak the library-side EQ. + */ + PORTAL_ALLOC(eq, sizeof(*eq)); + if (!eq) { + rc = PTL_NOSPACE; + goto fail; + } + + for (i = 0; i < count; i++) + ev[i].sequence = 0; + + if (nal->validate != NULL) { + rc = nal->validate(nal, ev, count * sizeof(ptl_event_t)); + if (rc != PTL_OK) + goto fail; + } + + args.ni_in = interface; + args.count_in = count; + args.base_in = ev; + args.len_in = count * sizeof(*ev); + args.callback_in = callback; + + rc = do_forward(interface, PTL_EQALLOC, &args, sizeof(args), &ret, + sizeof(ret)); + if (rc != PTL_OK) + goto fail; + if (ret.rc) + GOTO(fail, rc = ret.rc); + + eq->sequence = 1; + eq->size = count; + eq->base = ev; + + /* EQ handles are a little weird. PtlEQGet() just looks at the + * queued events in shared memory. It doesn't want to do_forward() + * at all, so the cookie in the EQ handle we pass out of here is + * simply a pointer to the event queue we just set up. We stash + * the handle returned by do_forward(), so we can pass it back via + * do_forward() when we need to. 
*/ + + eq->cb_eq_handle.nal_idx = interface.nal_idx; + eq->cb_eq_handle.cookie = ret.handle_out.cookie; + + handle_out->nal_idx = interface.nal_idx; + handle_out->cookie = (__u64)((unsigned long)eq); + return PTL_OK; + +fail: + /* nothing has been registered with the library on any path here */ + if (eq != NULL) { + PORTAL_FREE(eq, sizeof(*eq)); + } + PORTAL_FREE(ev, count * sizeof(ptl_event_t)); + return rc; +} + +/* + * Release an event queue. The user-side queue and its event buffer are + * freed only after the library reports PTL_OK for the PTL_EQFREE call. + * If the call could not be forwarded, or the library returned an error in + * ret.rc, that error is returned and the memory is left intact because the + * library may still reference the event buffer. + */ +int PtlEQFree(ptl_handle_eq_t eventq) +{ + PtlEQFree_in args; + PtlEQFree_out ret; + ptl_eq_t *eq; + int rc; + + eq = ptl_handle2usereq (&eventq); + args.eventq_in = eq->cb_eq_handle; + + rc = do_forward(eq->cb_eq_handle, PTL_EQFREE, &args, + sizeof(args), &ret, sizeof(ret)); + if (rc != PTL_OK) + return rc; + if (ret.rc != PTL_OK) + return ret.rc; + + PORTAL_FREE(eq->base, eq->size * sizeof(ptl_event_t)); + PORTAL_FREE(eq, sizeof(*eq)); + + return PTL_OK; +} + +int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, + ptl_process_id_t match_id_in, ptl_pt_index_t portal_in) +{ + PtlACEntry_in args; + PtlACEntry_out ret; + int rc; + + /* + * Copy arguments into the argument block to + * hand to the forwarding object + */ + args.ni_in = ni_in; + args.index_in = index_in; + args.match_id_in = match_id_in; + args.portal_in = portal_in; + + rc = do_forward(ni_in, PTL_ACENTRY, &args, sizeof(args), &ret, + sizeof(ret)); + + return (rc != PTL_OK) ? rc : ret.rc; +} + +int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, + ptl_process_id_t target_in, ptl_pt_index_t portal_in, + ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in, + ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in) +{ + PtlPut_in args; + PtlPut_out ret; + int rc; + + /* + * Copy arguments into the argument block to + * hand to the forwarding object + */ + args.md_in = md_in; + args.ack_req_in = ack_req_in; + args.target_in = target_in; + args.portal_in = portal_in; + args.cookie_in = cookie_in; + args.match_bits_in = match_bits_in; + args.offset_in = offset_in; + args.hdr_data_in = hdr_data_in; + + rc = do_forward(md_in, PTL_PUT, &args, sizeof(args), &ret, sizeof(ret)); + + return (rc != PTL_OK) ? 
rc : ret.rc; +} + +int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, + ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in, + ptl_match_bits_t match_bits_in, ptl_size_t offset_in) +{ + PtlGet_in args; + PtlGet_out ret; + int rc; + + /* + * Copy arguments into the argument block to + * hand to the forwarding object + */ + args.md_in = md_in; + args.target_in = target_in; + args.portal_in = portal_in; + args.cookie_in = cookie_in; + args.match_bits_in = match_bits_in; + args.offset_in = offset_in; + + rc = do_forward(md_in, PTL_GET, &args, sizeof(args), &ret, sizeof(ret)); + + return (rc != PTL_OK) ? rc : ret.rc; +} diff --git a/lnet/lnet/lib-dispatch.c b/lnet/lnet/lib-dispatch.c new file mode 100644 index 0000000..13036c7 --- /dev/null +++ b/lnet/lnet/lib-dispatch.c @@ -0,0 +1,80 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-dispatch.c + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_PORTALS +#include +#include + +typedef struct { + int (*fun) (nal_cb_t * nal, void *private, void *in, void *out); + char *name; +} dispatch_table_t; + +static dispatch_table_t dispatch_table[] = { + [PTL_GETID] {do_PtlGetId, "PtlGetId"}, + [PTL_NISTATUS] {do_PtlNIStatus, "PtlNIStatus"}, + [PTL_NIDIST] {do_PtlNIDist, "PtlNIDist"}, + [PTL_NIDEBUG] {do_PtlNIDebug, "PtlNIDebug"}, + [PTL_MEATTACH] {do_PtlMEAttach, "PtlMEAttach"}, + [PTL_MEINSERT] {do_PtlMEInsert, "PtlMEInsert"}, + [PTL_MEUNLINK] {do_PtlMEUnlink, "PtlMEUnlink"}, + [PTL_TBLDUMP] {do_PtlTblDump, "PtlTblDump"}, + [PTL_MEDUMP] {do_PtlMEDump, "PtlMEDump"}, + [PTL_MDATTACH] {do_PtlMDAttach, "PtlMDAttach"}, + [PTL_MDBIND] {do_PtlMDBind, "PtlMDBind"}, + [PTL_MDUPDATE] {do_PtlMDUpdate_internal, "PtlMDUpdate_internal"}, + [PTL_MDUNLINK] {do_PtlMDUnlink, "PtlMDUnlink"}, + [PTL_EQALLOC] {do_PtlEQAlloc_internal, "PtlEQAlloc_internal"}, + [PTL_EQFREE] {do_PtlEQFree_internal, "PtlEQFree_internal"}, + [PTL_PUT] {do_PtlPut, "PtlPut"}, + [PTL_GET] {do_PtlGet, "PtlGet"}, + [PTL_FAILNID] {do_PtlFailNid, "PtlFailNid"}, + /* */ {0, ""} +}; + +/* + * This really should be elsewhere, but lib-p30/dispatch.c is + * an automatically generated file. 
+ */ +void lib_dispatch(nal_cb_t * nal, void *private, int index, void *arg_block, + void *ret_block) +{ + lib_ni_t *ni = &nal->ni; + + /* Reject anything outside the table itself as well as outside + * LIB_MAX_DISPATCH, and any slot with no handler installed. + */ + if (index < 0 || index > LIB_MAX_DISPATCH || + index >= (int)(sizeof (dispatch_table) / sizeof (dispatch_table[0])) || + !dispatch_table[index].fun) { + CDEBUG(D_NET, LPU64": Invalid API call %d\n", ni->nid, index); + return; + } + + CDEBUG(D_NET, LPU64": API call %s (%d)\n", ni->nid, + dispatch_table[index].name, index); + + dispatch_table[index].fun(nal, private, arg_block, ret_block); +} + +/* + * Return the name recorded in dispatch_table for API call 'index', or + * NULL when 'index' lies outside the table. + */ +char *dispatch_name(int index) +{ + if (index < 0 || + index >= (int)(sizeof (dispatch_table) / sizeof (dispatch_table[0]))) + return (NULL); + + return dispatch_table[index].name; +} diff --git a/lnet/lnet/lib-eq.c b/lnet/lnet/lib-eq.c new file mode 100644 index 0000000..ce343c1 --- /dev/null +++ b/lnet/lnet/lib-eq.c @@ -0,0 +1,128 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-eq.c + * Library level Event queue management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_PORTALS +#include +#include + +int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *v_args, + void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t ni_in + * ptl_size_t count_in + * void * base_in + * + * Outgoing: + * ptl_handle_eq_t * handle_out + */ + + PtlEQAlloc_in *args = v_args; + PtlEQAlloc_out *ret = v_ret; + + lib_eq_t *eq; + unsigned long flags; + + /* api should have rounded up; an empty queue is never valid */ + if (args->count_in == 0 || + args->count_in != LOWEST_BIT_SET (args->count_in)) + return ret->rc = PTL_VAL_FAILED; + + eq = lib_eq_alloc (nal); + if (eq == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + if (nal->cb_map != NULL) { + /* let the NAL map the caller's event buffer */ + struct iovec iov = { + .iov_base = args->base_in, + .iov_len = args->count_in * sizeof (ptl_event_t) }; + + ret->rc = nal->cb_map (nal, 1, &iov, &eq->eq_addrkey); + if (ret->rc != PTL_OK) { + lib_eq_free (nal, eq); + + state_unlock (nal, &flags); + return (ret->rc); + } + } + + eq->sequence = 1; + eq->base = args->base_in; + eq->size = args->count_in; + eq->eq_refcount = 0; + eq->event_callback = args->callback_in; + + lib_initialise_handle (nal, &eq->eq_lh, PTL_COOKIE_TYPE_EQ); + list_add (&eq->eq_list, &nal->ni.ni_active_eqs); + + state_unlock(nal, &flags); + + ptl_eq2handle(&ret->handle_out, eq); + return (ret->rc = PTL_OK); +} + +int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *v_args, + void *v_ret) +{ + /* + * Incoming: + * ptl_handle_eq_t eventq_in + * + * Outgoing: + */ + + PtlEQFree_in *args = v_args; + PtlEQFree_out *ret = v_ret; + lib_eq_t *eq; + unsigned long flags; /* same type do_PtlEQAlloc_internal passes to state_lock */ + + state_lock (nal, &flags); + + eq = ptl_handle2eq(&args->eventq_in, nal); + if (eq == NULL) { + ret->rc = PTL_INV_EQ; + } else if (eq->eq_refcount != 0) { + ret->rc = PTL_EQ_INUSE; + } else { + if (nal->cb_unmap != NULL) { + struct iovec iov = { + .iov_base = eq->base, + .iov_len = eq->size * sizeof (ptl_event_t) }; + + nal->cb_unmap(nal, 1, &iov, &eq->eq_addrkey); + } + + lib_invalidate_handle (nal, &eq->eq_lh); + 
list_del (&eq->eq_list); + lib_eq_free (nal, eq); + ret->rc = PTL_OK; + } + + state_unlock (nal, &flags); + + return (ret->rc); +} diff --git a/lnet/lnet/lib-init.c b/lnet/lnet/lib-init.c new file mode 100644 index 0000000..99c4d32 --- /dev/null +++ b/lnet/lnet/lib-init.c @@ -0,0 +1,474 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-init.c + * Start up the internal library and clear all structures + * Called by the NAL when it initializes. Safe to call multiple times. + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +# define DEBUG_SUBSYSTEM S_PORTALS +#include + +#ifdef __KERNEL__ +# include /* for memset() */ +# include +# ifdef KERNEL_ADDR_CACHE +# include +# endif +#else +# include +# include +#endif + +#ifdef PTL_USE_SLAB_CACHE +static int ptl_slab_users; + +kmem_cache_t *ptl_md_slab; +kmem_cache_t *ptl_msg_slab; +kmem_cache_t *ptl_me_slab; +kmem_cache_t *ptl_eq_slab; + +atomic_t md_in_use_count; +atomic_t msg_in_use_count; +atomic_t me_in_use_count; +atomic_t eq_in_use_count; + +/* NB zeroing in ctor and on freeing ensures items that + * kmem_cache_validate() OK, but haven't been initialised + * as an MD/ME/EQ can't have valid handles + */ +static void +ptl_md_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags) +{ + memset (obj, 0, sizeof (lib_md_t)); +} + +static void +ptl_me_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags) +{ + memset (obj, 0, sizeof (lib_me_t)); +} + +static void +ptl_eq_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags) +{ + memset (obj, 0, sizeof (lib_eq_t)); +} + +int +kportal_descriptor_setup (nal_cb_t *nal) +{ + /* NB on failure caller must still call kportal_descriptor_cleanup */ + /* ****** */ + + /* We'll have 1 set of slabs for ALL the nals :) */ + + if (ptl_slab_users++) + return 0; + + ptl_md_slab = kmem_cache_create("portals_MD", + sizeof(lib_md_t), 0, + SLAB_HWCACHE_ALIGN, + ptl_md_slab_ctor, NULL); + if (!ptl_md_slab) { + CERROR("couldn't allocate ptl_md_t slab"); + RETURN (PTL_NOSPACE); + } + + /* NB no ctor for msgs; they don't need handle verification */ + ptl_msg_slab = kmem_cache_create("portals_MSG", + sizeof(lib_msg_t), 0, + SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!ptl_msg_slab) { + CERROR("couldn't allocate ptl_msg_t slab"); + RETURN (PTL_NOSPACE); + } + + ptl_me_slab = kmem_cache_create("portals_ME", + sizeof(lib_me_t), 0, + SLAB_HWCACHE_ALIGN, + ptl_me_slab_ctor, NULL); + if (!ptl_me_slab) { + CERROR("couldn't allocate ptl_me_t slab"); + RETURN (PTL_NOSPACE); + } + + ptl_eq_slab = 
kmem_cache_create("portals_EQ", + sizeof(lib_eq_t), 0, + SLAB_HWCACHE_ALIGN, + ptl_eq_slab_ctor, NULL); + if (!ptl_eq_slab) { + CERROR("couldn't allocate ptl_eq_t slab"); + RETURN (PTL_NOSPACE); + } + + RETURN(PTL_OK); +} + +void +kportal_descriptor_cleanup (nal_cb_t *nal) +{ + if (--ptl_slab_users != 0) + return; + + LASSERT (atomic_read (&md_in_use_count) == 0); + LASSERT (atomic_read (&me_in_use_count) == 0); + LASSERT (atomic_read (&eq_in_use_count) == 0); + LASSERT (atomic_read (&msg_in_use_count) == 0); + + if (ptl_md_slab != NULL) + kmem_cache_destroy(ptl_md_slab); + if (ptl_msg_slab != NULL) + kmem_cache_destroy(ptl_msg_slab); + if (ptl_me_slab != NULL) + kmem_cache_destroy(ptl_me_slab); + if (ptl_eq_slab != NULL) + kmem_cache_destroy(ptl_eq_slab); +} +#else + +int +lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size) +{ + char *space; + + LASSERT (n > 0); + + size += offsetof (lib_freeobj_t, fo_contents); + + space = nal->cb_malloc (nal, n * size); + if (space == NULL) + return (PTL_NOSPACE); + + INIT_LIST_HEAD (&fl->fl_list); + fl->fl_objs = space; + fl->fl_nobjs = n; + fl->fl_objsize = size; + + do + { + memset (space, 0, size); + list_add ((struct list_head *)space, &fl->fl_list); + space += size; + } while (--n != 0); + + return (PTL_OK); +} + +/* + * Release the memory behind a freelist and zero its descriptor, so that a + * repeated call sees fl_nobjs == 0 and returns immediately. Every object + * must already be back on the list (asserted). + */ +void +lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl) +{ + struct list_head *el; + int count; + + if (fl->fl_nobjs == 0) + return; + + count = 0; + for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next) + count++; + + LASSERT (count == fl->fl_nobjs); + + nal->cb_free (nal, fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); + /* clear the whole descriptor; sizeof (fl) is only the pointer's size */ + memset (fl, 0, sizeof (*fl)); +} + +int +kportal_descriptor_setup (nal_cb_t *nal) +{ + /* NB on failure caller must still call kportal_descriptor_cleanup */ + /* ****** */ + int rc; + + memset (&nal->ni.ni_free_mes, 0, sizeof (nal->ni.ni_free_mes)); + memset (&nal->ni.ni_free_msgs, 0, sizeof (nal->ni.ni_free_msgs)); + memset (&nal->ni.ni_free_mds, 0, sizeof 
(nal->ni.ni_free_mds)); + memset (&nal->ni.ni_free_eqs, 0, sizeof (nal->ni.ni_free_eqs)); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_mes, + MAX_MES, sizeof (lib_me_t)); + if (rc != PTL_OK) + return (rc); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_msgs, + MAX_MSGS, sizeof (lib_msg_t)); + if (rc != PTL_OK) + return (rc); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_mds, + MAX_MDS, sizeof (lib_md_t)); + if (rc != PTL_OK) + return (rc); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_eqs, + MAX_EQS, sizeof (lib_eq_t)); + return (rc); +} + +void +kportal_descriptor_cleanup (nal_cb_t *nal) +{ + lib_freelist_fini (nal, &nal->ni.ni_free_mes); + lib_freelist_fini (nal, &nal->ni.ni_free_msgs); + lib_freelist_fini (nal, &nal->ni.ni_free_mds); + lib_freelist_fini (nal, &nal->ni.ni_free_eqs); +} + +#endif + +__u64 +lib_create_interface_cookie (nal_cb_t *nal) +{ + /* NB the interface cookie in wire handles guards against delayed + * replies and ACKs appearing valid in a new instance of the same + * interface. Initialisation time, even if it's only implemented + * to millisecond resolution is probably easily good enough. 
*/ + struct timeval tv; + __u64 cookie; +#ifndef __KERNEL__ + int rc = gettimeofday (&tv, NULL); + LASSERT (rc == 0); +#else + do_gettimeofday(&tv); +#endif + cookie = tv.tv_sec; + cookie *= 1000000; + cookie += tv.tv_usec; + return (cookie); +} + +int +lib_setup_handle_hash (nal_cb_t *nal) +{ + lib_ni_t *ni = &nal->ni; + int i; + + /* Arbitrary choice of hash table size */ +#ifdef __KERNEL__ + ni->ni_lh_hash_size = PAGE_SIZE / sizeof (struct list_head); +#else + ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4; +#endif + ni->ni_lh_hash_table = + (struct list_head *)nal->cb_malloc (nal, ni->ni_lh_hash_size + * sizeof (struct list_head)); + if (ni->ni_lh_hash_table == NULL) + return (PTL_NOSPACE); + + for (i = 0; i < ni->ni_lh_hash_size; i++) + INIT_LIST_HEAD (&ni->ni_lh_hash_table[i]); + + ni->ni_next_object_cookie = PTL_COOKIE_TYPES; + + return (PTL_OK); +} + +/* + * Free the handle hash table, if one is allocated, and forget the pointer + * so that calling this again is a no-op. + */ +void +lib_cleanup_handle_hash (nal_cb_t *nal) +{ + lib_ni_t *ni = &nal->ni; + + if (ni->ni_lh_hash_table == NULL) + return; + + nal->cb_free (nal, ni->ni_lh_hash_table, + ni->ni_lh_hash_size * sizeof (struct list_head)); + /* lib_init()'s error path and lib_fini() both call this; without the + * reset a later failed lib_init() would free the stale pointer again. + */ + ni->ni_lh_hash_table = NULL; +} + +lib_handle_t * +lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type) +{ + /* ALWAYS called with statelock held */ + lib_ni_t *ni = &nal->ni; + struct list_head *list; + struct list_head *el; + unsigned int hash; + + if ((cookie & (PTL_COOKIE_TYPES - 1)) != type) + return (NULL); + + hash = ((unsigned int)cookie) % ni->ni_lh_hash_size; + list = &ni->ni_lh_hash_table[hash]; + + list_for_each (el, list) { + lib_handle_t *lh = list_entry (el, lib_handle_t, lh_hash_chain); + + if (lh->lh_cookie == cookie) + return (lh); + } + + return (NULL); +} + +void +lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type) +{ + /* ALWAYS called with statelock held */ + lib_ni_t *ni = &nal->ni; + unsigned int hash; + + LASSERT (type >= 0 && type < PTL_COOKIE_TYPES); + lh->lh_cookie = ni->ni_next_object_cookie | type; + ni->ni_next_object_cookie += PTL_COOKIE_TYPES; + + hash = 
((unsigned int)lh->lh_cookie) % ni->ni_lh_hash_size; + list_add (&lh->lh_hash_chain, &ni->ni_lh_hash_table[hash]); +} + +void +lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh) +{ + list_del (&lh->lh_hash_chain); +} + +int +lib_init(nal_cb_t * nal, ptl_nid_t nid, ptl_pid_t pid, int gsize, + ptl_pt_index_t ptl_size, ptl_ac_index_t acl_size) +{ + int rc = PTL_OK; + lib_ni_t *ni = &nal->ni; + int i; + ENTRY; + + /* NB serialised in PtlNIInit() */ + + if (ni->refcnt != 0) { /* already initialised */ + ni->refcnt++; + goto out; + } + + lib_assert_wire_constants (); + + /* + * Allocate the portal table for this interface + * and all per-interface objects. + */ + memset(&ni->counters, 0, sizeof(lib_counters_t)); + + rc = kportal_descriptor_setup (nal); + if (rc != PTL_OK) + goto out; + + INIT_LIST_HEAD (&ni->ni_active_msgs); + INIT_LIST_HEAD (&ni->ni_active_mds); + INIT_LIST_HEAD (&ni->ni_active_eqs); + + INIT_LIST_HEAD (&ni->ni_test_peers); + + ni->ni_interface_cookie = lib_create_interface_cookie (nal); + ni->ni_next_object_cookie = 0; + rc = lib_setup_handle_hash (nal); + if (rc != PTL_OK) + goto out; + + ni->nid = nid; + ni->pid = pid; + + ni->num_nodes = gsize; + ni->tbl.size = ptl_size; + + ni->tbl.tbl = nal->cb_malloc(nal, sizeof(struct list_head) * ptl_size); + if (ni->tbl.tbl == NULL) { + rc = PTL_NOSPACE; + goto out; + } + + for (i = 0; i < ptl_size; i++) + INIT_LIST_HEAD(&(ni->tbl.tbl[i])); + + ni->debug = PTL_DEBUG_NONE; + ni->up = 1; + ni->refcnt++; + + out: + if (rc != PTL_OK) { + lib_cleanup_handle_hash (nal); + kportal_descriptor_cleanup (nal); + } + + RETURN (rc); +} + +int +lib_fini(nal_cb_t * nal) +{ + lib_ni_t *ni = &nal->ni; + int idx; + + ni->refcnt--; + + if (ni->refcnt != 0) + goto out; + + /* NB no stat_lock() since this is the last reference. 
The NAL + * should have shut down already, so it should be safe to unlink + * and free all descriptors, even those that appear committed to a + * network op (eg MD with non-zero pending count) + */ + + for (idx = 0; idx < ni->tbl.size; idx++) + while (!list_empty (&ni->tbl.tbl[idx])) { + lib_me_t *me = list_entry (ni->tbl.tbl[idx].next, + lib_me_t, me_list); + + CERROR ("Active me %p on exit\n", me); + list_del (&me->me_list); + lib_me_free (nal, me); + } + + while (!list_empty (&ni->ni_active_mds)) { + lib_md_t *md = list_entry (ni->ni_active_mds.next, + lib_md_t, md_list); + + CERROR ("Active md %p on exit\n", md); + list_del (&md->md_list); + lib_md_free (nal, md); + } + + while (!list_empty (&ni->ni_active_eqs)) { + lib_eq_t *eq = list_entry (ni->ni_active_eqs.next, + lib_eq_t, eq_list); + + CERROR ("Active eq %p on exit\n", eq); + list_del (&eq->eq_list); + lib_eq_free (nal, eq); + } + + while (!list_empty (&ni->ni_active_msgs)) { + lib_msg_t *msg = list_entry (ni->ni_active_msgs.next, + lib_msg_t, msg_list); + + CERROR ("Active msg %p on exit\n", msg); + list_del (&msg->msg_list); + lib_msg_free (nal, msg); + } + + nal->cb_free(nal, ni->tbl.tbl, sizeof(struct list_head) * ni->tbl.size); + ni->up = 0; + + lib_cleanup_handle_hash (nal); + kportal_descriptor_cleanup (nal); + + out: + return (PTL_OK); +} diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c new file mode 100644 index 0000000..a79e2be --- /dev/null +++ b/lnet/lnet/lib-md.c @@ -0,0 +1,412 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-md.c + * Memory Descriptor management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. 
+ * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __KERNEL__ +# include +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include +#endif + +#include +#include + +/* + * must be called with state lock held + */ +void lib_md_unlink(nal_cb_t * nal, lib_md_t * md) +{ + lib_me_t *me = md->me; + + if (md->pending != 0) { + CDEBUG(D_NET, "Queueing unlink of md %p\n", md); + md->md_flags |= PTL_MD_FLAG_UNLINK; + return; + } + + CDEBUG(D_NET, "Unlinking md %p\n", md); + + if ((md->options & PTL_MD_KIOV) != 0) { + if (nal->cb_unmap_pages != NULL) + nal->cb_unmap_pages (nal, md->md_niov, md->md_iov.kiov, + &md->md_addrkey); + } else if (nal->cb_unmap != NULL) + nal->cb_unmap (nal, md->md_niov, md->md_iov.iov, + &md->md_addrkey); + + if (me) { + me->md = NULL; + if (me->unlink == PTL_UNLINK) + lib_me_unlink(nal, me); + } + + if (md->eq != NULL) + { + md->eq->eq_refcount--; + LASSERT (md->eq->eq_refcount >= 0); + } + + lib_invalidate_handle (nal, &md->md_lh); + list_del (&md->md_list); + lib_md_free(nal, md); +} + +/* must be called with state lock held */ +static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, + ptl_md_t *md, ptl_handle_eq_t *eqh, int unlink) +{ + const int max_size_opts = PTL_MD_AUTO_UNLINK | + PTL_MD_MAX_SIZE; + lib_eq_t *eq = NULL; + 
int rc; + int i; + + /* NB we are passes an allocated, but uninitialised/active md. + * if we return success, caller may lib_md_unlink() it. + * otherwise caller may only lib_md_free() it. + */ + + if (!PtlHandleEqual (*eqh, PTL_EQ_NONE)) { + eq = ptl_handle2eq(eqh, nal); + if (eq == NULL) + return PTL_INV_EQ; + } + + if ((md->options & PTL_MD_IOV) != 0 && /* discontiguous MD */ + md->niov > PTL_MD_MAX_IOV) /* too many fragments */ + return PTL_IOV_TOO_MANY; + + if ((md->options & max_size_opts) != 0 && /* max size used */ + (md->max_size < 0 || md->max_size > md->length)) // illegal max_size + return PTL_INV_MD; + + new->me = NULL; + new->start = md->start; + new->length = md->length; + new->offset = 0; + new->max_size = md->max_size; + new->unlink = unlink; + new->options = md->options; + new->user_ptr = md->user_ptr; + new->eq = eq; + new->threshold = md->threshold; + new->pending = 0; + new->md_flags = 0; + + if ((md->options & PTL_MD_IOV) != 0) { + int total_length = 0; + + if ((md->options & PTL_MD_KIOV) != 0) /* Can't specify both */ + return PTL_INV_MD; + + new->md_niov = md->niov; + + if (nal->cb_read (nal, private, new->md_iov.iov, md->start, + md->niov * sizeof (new->md_iov.iov[0]))) + return PTL_SEGV; + + for (i = 0; i < new->md_niov; i++) { + /* We take the base address on trust */ + if (new->md_iov.iov[i].iov_len <= 0) /* invalid length */ + return PTL_VAL_FAILED; + + total_length += new->md_iov.iov[i].iov_len; + } + + if (md->length > total_length) + return PTL_IOV_TOO_SMALL; + + if (nal->cb_map != NULL) { + rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, + &new->md_addrkey); + if (rc != PTL_OK) + return (rc); + } + } else if ((md->options & PTL_MD_KIOV) != 0) { +#ifndef __KERNEL__ + return PTL_INV_MD; +#else + int total_length = 0; + + /* Trap attempt to use paged I/O if unsupported early. 
*/ + if (nal->cb_send_pages == NULL || + nal->cb_recv_pages == NULL) + return PTL_INV_MD; + + new->md_niov = md->niov; + + if (nal->cb_read (nal, private, new->md_iov.kiov, md->start, + md->niov * sizeof (new->md_iov.kiov[0]))) + return PTL_SEGV; + + for (i = 0; i < new->md_niov; i++) { + /* We take the page pointer on trust */ + if (new->md_iov.kiov[i].kiov_offset + + new->md_iov.kiov[i].kiov_len > PAGE_SIZE ) + return PTL_VAL_FAILED; /* invalid length */ + + total_length += new->md_iov.kiov[i].kiov_len; + } + + if (md->length > total_length) + return PTL_IOV_TOO_SMALL; + + if (nal->cb_map_pages != NULL) { + rc = nal->cb_map_pages (nal, new->md_niov, new->md_iov.kiov, + &new->md_addrkey); + if (rc != PTL_OK) + return (rc); + } +#endif + } else { /* contiguous */ + new->md_niov = 1; + new->md_iov.iov[0].iov_base = md->start; + new->md_iov.iov[0].iov_len = md->length; + + if (nal->cb_map != NULL) { + rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, + &new->md_addrkey); + if (rc != PTL_OK) + return (rc); + } + } + + if (eq != NULL) + eq->eq_refcount++; + + /* It's good; let handle2md succeed and add to active mds */ + lib_initialise_handle (nal, &new->md_lh, PTL_COOKIE_TYPE_MD); + list_add (&new->md_list, &nal->ni.ni_active_mds); + + return PTL_OK; +} + +/* must be called with state lock held */ +void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md, ptl_md_t * new) +{ + /* NB this doesn't copy out all the iov entries so when a + * discontiguous MD is copied out, the target gets to know the + * original iov pointer (in start) and the number of entries it had + * and that's all. + */ + new->start = md->start; + new->length = md->length; + new->threshold = md->threshold; + new->max_size = md->max_size; + new->options = md->options; + new->user_ptr = md->user_ptr; + ptl_eq2handle(&new->eventq, md->eq); + new->niov = ((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0) ? 
0 : md->md_niov; +} + +int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_me_t current_in + * ptl_md_t md_in + * ptl_unlink_t unlink_in + * + * Outgoing: + * ptl_handle_md_t * handle_out + */ + + PtlMDAttach_in *args = v_args; + PtlMDAttach_out *ret = v_ret; + lib_me_t *me; + lib_md_t *md; + unsigned long flags; + + md = lib_md_alloc (nal); + if (md == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + me = ptl_handle2me(&args->me_in, nal); + if (me == NULL) { + ret->rc = PTL_INV_ME; + } else if (me->md != NULL) { + ret->rc = PTL_INUSE; + } else { + ret->rc = lib_md_build(nal, md, private, &args->md_in, + &args->eq_in, args->unlink_in); + + if (ret->rc == PTL_OK) { + me->md = md; + md->me = me; + + ptl_md2handle(&ret->handle_out, md); + + state_unlock (nal, &flags); + return (PTL_OK); + } + } + + lib_md_free (nal, md); + + state_unlock (nal, &flags); + return (ret->rc); +} + +int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t ni_in + * ptl_md_t md_in + * + * Outgoing: + * ptl_handle_md_t * handle_out + */ + + PtlMDBind_in *args = v_args; + PtlMDBind_out *ret = v_ret; + lib_md_t *md; + unsigned long flags; + + md = lib_md_alloc (nal); + if (md == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + ret->rc = lib_md_build(nal, md, private, + &args->md_in, &args->eq_in, PTL_UNLINK); + + if (ret->rc == PTL_OK) { + ptl_md2handle(&ret->handle_out, md); + + state_unlock(nal, &flags); + return (PTL_OK); + } + + lib_md_free (nal, md); + + state_unlock(nal, &flags); + return (ret->rc); +} + +int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMDUnlink_in *args = v_args; + PtlMDUnlink_out *ret = v_ret; + + lib_md_t *md; + unsigned long flags; + + state_lock(nal, &flags); + + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL) { + ret->rc = PTL_INV_MD; + } else if 
(md->pending != 0) { /* being filled/spilled */ + ret->rc = PTL_MD_INUSE; + } else { + /* Callers attempting to unlink a busy MD which will get + * unlinked once the net op completes should see INUSE, + * before completion and INV_MD thereafter. LASSERT we've + * got that right... */ + LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0); + + lib_md_deconstruct(nal, md, &ret->status_out); + lib_md_unlink(nal, md); + ret->rc = PTL_OK; + } + + state_unlock(nal, &flags); + + return (ret->rc); +} + +int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args, + void *v_ret) +{ + /* + * Incoming: + * ptl_handle_md_t md_in + * ptl_md_t * old_inout + * ptl_md_t * new_inout + * ptl_handle_eq_t testq_in + * ptl_seq_t sequence_in + * + * Outgoing: + * ptl_md_t * old_inout + * ptl_md_t * new_inout + */ + PtlMDUpdate_internal_in *args = v_args; + PtlMDUpdate_internal_out *ret = v_ret; + lib_md_t *md; + lib_eq_t *test_eq = NULL; + ptl_md_t *new = &args->new_inout; + unsigned long flags; + + state_lock(nal, &flags); + + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL) { + ret->rc = PTL_INV_MD; + goto out; + } + + if (args->old_inout_valid) + lib_md_deconstruct(nal, md, &ret->old_inout); + + if (!args->new_inout_valid) { + ret->rc = PTL_OK; + goto out; + } + + if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) { + test_eq = ptl_handle2eq(&args->testq_in, nal); + if (test_eq == NULL) { + ret->rc = PTL_INV_EQ; + goto out; + } + } + + if (md->pending != 0) { + ret->rc = PTL_NOUPDATE; + goto out; + } + + if (test_eq == NULL || + test_eq->sequence == args->sequence_in) { + lib_me_t *me = md->me; + +#warning this does not track eq refcounts properly + + ret->rc = lib_md_build(nal, md, private, + new, &new->eventq, md->unlink); + + md->me = me; + } else { + ret->rc = PTL_NOUPDATE; + } + + out: + state_unlock(nal, &flags); + return (ret->rc); +} diff --git a/lnet/lnet/lib-me.c b/lnet/lnet/lib-me.c new file mode 100644 index 0000000..bd1af5b --- /dev/null +++ 
b/lnet/lnet/lib-me.c @@ -0,0 +1,227 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-me.c + * Match Entry management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef __KERNEL__ +# include +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include +#endif + +#include +#include + +static void lib_me_dump(nal_cb_t * nal, lib_me_t * me); + +int do_PtlMEAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEAttach_in *args = v_args; + PtlMEAttach_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + lib_ptl_t *tbl = &ni->tbl; + unsigned long flags; + lib_me_t *me; + + if (args->index_in < 0 || args->index_in >= tbl->size) + return ret->rc = PTL_INV_PTINDEX; + + /* Should check for valid matchid, but not yet */ + if (0) + return ret->rc = PTL_INV_PROC; + + me = lib_me_alloc (nal); + if (me == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + me->match_id = args->match_id_in; + me->match_bits = args->match_bits_in; + me->ignore_bits = args->ignore_bits_in; + me->unlink = args->unlink_in; + me->md = NULL; + + lib_initialise_handle (nal, &me->me_lh, PTL_COOKIE_TYPE_ME); + + if (args->position_in == PTL_INS_AFTER) + list_add_tail(&me->me_list, &(tbl->tbl[args->index_in])); + else + list_add(&me->me_list, &(tbl->tbl[args->index_in])); + + ptl_me2handle(&ret->handle_out, me); + + state_unlock(nal, &flags); + + return ret->rc = PTL_OK; +} + +int do_PtlMEInsert(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEInsert_in *args = v_args; + PtlMEInsert_out *ret = v_ret; + unsigned long flags; + lib_me_t *me; + lib_me_t *new; + + new = lib_me_alloc (nal); + if (new == NULL) + return (ret->rc = PTL_NOSPACE); + + /* Should check for valid matchid, but not yet */ + + state_lock(nal, &flags); + + me = ptl_handle2me(&args->current_in, nal); + if (me == NULL) { + lib_me_free (nal, new); + + state_unlock (nal, &flags); + return (ret->rc = PTL_INV_ME); + } + + new->match_id = args->match_id_in; + new->match_bits = args->match_bits_in; + new->ignore_bits = args->ignore_bits_in; + new->unlink = args->unlink_in; + new->md = NULL; + + lib_initialise_handle (nal, &new->me_lh, 
PTL_COOKIE_TYPE_ME);
+
+        /* list_add() links the new entry immediately after its second
+         * argument and list_add_tail() links it immediately before, so
+         * PTL_INS_AFTER has to use list_add() for 'new' to follow 'me'
+         * in the match list. */
+        if (args->position_in == PTL_INS_AFTER)
+                list_add(&new->me_list, &me->me_list);
+        else
+                list_add_tail(&new->me_list, &me->me_list);
+
+        ptl_me2handle(&ret->handle_out, new);
+
+        state_unlock(nal, &flags);
+
+        return ret->rc = PTL_OK;
+}
+
+/* Unlink the match entry named by args->current_in.
+ * Sets and returns ret->rc: PTL_INV_ME if the handle does not resolve,
+ * PTL_OK otherwise. */
+int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEUnlink_in *args = v_args;
+        PtlMEUnlink_out *ret = v_ret;
+        unsigned long flags;
+        lib_me_t *me;
+
+        state_lock(nal, &flags);
+
+        me = ptl_handle2me(&args->current_in, nal);
+        if (me == NULL) {
+                ret->rc = PTL_INV_ME;
+        } else {
+                lib_me_unlink(nal, me);
+                ret->rc = PTL_OK;
+        }
+
+        state_unlock(nal, &flags);
+
+        return (ret->rc);
+}
+
+/* call with state_lock please
+ *
+ * Removes 'me' from its match list, detaches and unlinks any MD attached
+ * to it, invalidates the ME handle and frees the ME. */
+void lib_me_unlink(nal_cb_t *nal, lib_me_t *me)
+{
+        lib_ni_t *ni = &nal->ni;
+
+        /* NOTE(review): the handle computed here is never used. */
+        if (ni->debug & PTL_DEBUG_UNLINK) {
+                ptl_handle_any_t handle;
+                ptl_me2handle(&handle, me);
+        }
+
+        list_del (&me->me_list);
+
+        if (me->md) {
+                /* Clear the MD's back pointer before unlinking it. */
+                me->md->me = NULL;
+                lib_md_unlink(nal, me->md);
+        }
+
+        lib_invalidate_handle (nal, &me->me_lh);
+        lib_me_free(nal, me);
+}
+
+/* Print every match entry on portal table index args->index_in via
+ * nal->cb_printf.  Sets and returns ret->rc: PTL_INV_PTINDEX if the index
+ * is outside [0, tbl->size), PTL_OK otherwise. */
+int do_PtlTblDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlTblDump_in *args = v_args;
+        PtlTblDump_out *ret = v_ret;
+        lib_ptl_t *tbl = &nal->ni.tbl;
+        ptl_handle_any_t handle;
+        struct list_head *tmp;
+        unsigned long flags;
+
+        if (args->index_in < 0 || args->index_in >= tbl->size)
+                return ret->rc = PTL_INV_PTINDEX;
+
+        nal->cb_printf(nal, "Portal table index %d\n", args->index_in);
+
+        state_lock(nal, &flags);
+        list_for_each(tmp, &(tbl->tbl[args->index_in])) {
+                lib_me_t *me = list_entry(tmp, lib_me_t, me_list);
+                ptl_me2handle(&handle, me);
+                lib_me_dump(nal, me);
+        }
+        state_unlock(nal, &flags);
+
+        return ret->rc = PTL_OK;
+}
+
+int do_PtlMEDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret)
+{
+        PtlMEDump_in *args = v_args;
+        PtlMEDump_out *ret = v_ret;
+        lib_me_t *me;
+        unsigned long flags;
+
+        state_lock(nal, &flags);
+
+        me = 
ptl_handle2me(&args->current_in, nal); + if (me == NULL) { + ret->rc = PTL_INV_ME; + } else { + lib_me_dump(nal, me); + ret->rc = PTL_OK; + } + + state_unlock(nal, &flags); + + return ret->rc; +} + +static void lib_me_dump(nal_cb_t * nal, lib_me_t * me) +{ + nal->cb_printf(nal, "Match Entry %p ("LPX64")\n", me, + me->me_lh.lh_cookie); + + nal->cb_printf(nal, "\tMatch/Ignore\t= %016lx / %016lx\n", + me->match_bits, me->ignore_bits); + + nal->cb_printf(nal, "\tMD\t= %p\n", me->md); + nal->cb_printf(nal, "\tprev\t= %p\n", + list_entry(me->me_list.prev, lib_me_t, me_list)); + nal->cb_printf(nal, "\tnext\t= %p\n", + list_entry(me->me_list.next, lib_me_t, me_list)); +} diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c new file mode 100644 index 0000000..fde4f16 --- /dev/null +++ b/lnet/lnet/lib-move.c @@ -0,0 +1,1379 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-move.c + * Data movement routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __KERNEL__ +# include +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include +#endif +#include +#include +#include + +/* + * Right now it does not check access control lists. 
+ * + * We only support one MD per ME, which is how the Portals 3.1 spec is written. + * All previous complication is removed. + */ + +static lib_me_t * +lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid, + ptl_pid_t src_pid, ptl_size_t rlength, ptl_size_t roffset, + ptl_match_bits_t match_bits, ptl_size_t *mlength_out, + ptl_size_t *offset_out, int *unlink_out) +{ + lib_ni_t *ni = &nal->ni; + struct list_head *match_list = &ni->tbl.tbl[index]; + struct list_head *tmp; + lib_me_t *me; + lib_md_t *md; + ptl_size_t mlength; + ptl_size_t offset; + + ENTRY; + + CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d " + "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits); + + if (index < 0 || index >= ni->tbl.size) { + CERROR("Invalid portal %d not in [0-%d]\n", + index, ni->tbl.size); + goto failed; + } + + list_for_each (tmp, match_list) { + me = list_entry(tmp, lib_me_t, me_list); + md = me->md; + + /* ME attached but MD not attached yet */ + if (md == NULL) + continue; + + LASSERT (me == md->me); + + /* MD deactivated */ + if (md->threshold == 0) + continue; + + /* mismatched MD op */ + if ((md->options & op_mask) == 0) + continue; + + /* mismatched ME nid/pid? */ + if (me->match_id.nid != PTL_NID_ANY && + me->match_id.nid != src_nid) + continue; + + if (me->match_id.pid != PTL_PID_ANY && + me->match_id.pid != src_pid) + continue; + + /* mismatched ME matchbits? */ + if (((me->match_bits ^ match_bits) & ~me->ignore_bits) != 0) + continue; + + /* Hurrah! This _is_ a match; check it out... 
*/ + + if ((md->options & PTL_MD_MANAGE_REMOTE) == 0) + offset = md->offset; + else + offset = roffset; + + mlength = md->length - offset; + if ((md->options & PTL_MD_MAX_SIZE) != 0 && + mlength > md->max_size) + mlength = md->max_size; + + if (rlength <= mlength) { /* fits in allowed space */ + mlength = rlength; + } else if ((md->options & PTL_MD_TRUNCATE) == 0) { + /* this packet _really_ is too big */ + CERROR("Matching packet %d too big: %d left, " + "%d allowed\n", rlength, md->length - offset, + mlength); + goto failed; + } + + md->offset = offset + mlength; + + *offset_out = offset; + *mlength_out = mlength; + *unlink_out = ((md->options & PTL_MD_AUTO_UNLINK) != 0 && + md->offset >= (md->length - md->max_size)); + RETURN (me); + } + + failed: + CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64 + " offset %d length %d: no match\n", + ni->nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT", + src_nid, src_pid, index, match_bits, roffset, rlength); + RETURN(NULL); +} + +int do_PtlFailNid (nal_cb_t *nal, void *private, void *v_args, void *v_ret) +{ + PtlFailNid_in *args = v_args; + PtlFailNid_out *ret = v_ret; + lib_test_peer_t *tp; + unsigned long flags; + struct list_head *el; + struct list_head *next; + struct list_head cull; + + if (args->threshold != 0) { + /* Adding a new entry */ + tp = (lib_test_peer_t *)nal->cb_malloc (nal, sizeof (*tp)); + if (tp == NULL) + return (ret->rc = PTL_FAIL); + + tp->tp_nid = args->nid; + tp->tp_threshold = args->threshold; + + state_lock (nal, &flags); + list_add (&tp->tp_list, &nal->ni.ni_test_peers); + state_unlock (nal, &flags); + return (ret->rc = PTL_OK); + } + + /* removing entries */ + INIT_LIST_HEAD (&cull); + + state_lock (nal, &flags); + + list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + tp = list_entry (el, lib_test_peer_t, tp_list); + + if (tp->tp_threshold == 0 || /* needs culling anyway */ + args->nid == PTL_NID_ANY || /* removing all entries */ + tp->tp_nid == args->nid) /* matched this 
one */ + { + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + } + + state_unlock (nal, &flags); + + while (!list_empty (&cull)) { + tp = list_entry (cull.next, lib_test_peer_t, tp_list); + + list_del (&tp->tp_list); + nal->cb_free (nal, tp, sizeof (*tp)); + } + return (ret->rc = PTL_OK); +} + +static int +fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) +{ + lib_test_peer_t *tp; + struct list_head *el; + struct list_head *next; + unsigned long flags; + struct list_head cull; + int fail = 0; + + INIT_LIST_HEAD (&cull); + + state_lock (nal, &flags); + + list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + tp = list_entry (el, lib_test_peer_t, tp_list); + + if (tp->tp_threshold == 0) { + /* zombie entry */ + if (outgoing) { + /* only cull zombies on outgoing tests, + * since we may be at interrupt priority on + * incoming messages. */ + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + continue; + } + + if (tp->tp_nid == PTL_NID_ANY || /* fail every peer */ + nid == tp->tp_nid) { /* fail this peer */ + fail = 1; + + if (tp->tp_threshold != PTL_MD_THRESH_INF) { + tp->tp_threshold--; + if (outgoing && + tp->tp_threshold == 0) { + /* see above */ + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + } + break; + } + } + + state_unlock (nal, &flags); + + while (!list_empty (&cull)) { + tp = list_entry (cull.next, lib_test_peer_t, tp_list); + list_del (&tp->tp_list); + + nal->cb_free (nal, tp, sizeof (*tp)); + } + + return (fail); +} + +ptl_size_t +lib_iov_nob (int niov, struct iovec *iov) +{ + ptl_size_t nob = 0; + + while (niov-- > 0) + nob += (iov++)->iov_len; + + return (nob); +} + +void +lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len) +{ + ptl_size_t nob; + + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (iov->iov_len, len); + memcpy (dest, iov->iov_base, nob); + + len -= nob; + dest += nob; + niov--; + iov++; + } +} + +void +lib_copy_buf2iov (int niov, struct iovec *iov, char 
*src, ptl_size_t len) +{ + ptl_size_t nob; + + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (iov->iov_len, len); + memcpy (iov->iov_base, src, nob); + + len -= nob; + src += nob; + niov--; + iov++; + } +} + +static int +lib_extract_iov (struct iovec *dst, lib_md_t *md, + ptl_size_t offset, ptl_size_t len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + int src_niov = md->md_niov; + struct iovec *src = md->md_iov.iov; + ptl_size_t frag_len; + int dst_niov; + + LASSERT (len >= 0); + LASSERT (offset >= 0); + LASSERT (offset + len <= md->length); + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->iov_len) { /* skip initial frags */ + offset -= src->iov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + dst_niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT (dst_niov <= PTL_MD_MAX_IOV); + + frag_len = src->iov_len - offset; + dst->iov_base = ((char *)src->iov_base) + offset; + + if (len <= frag_len) { + dst->iov_len = len; + return (dst_niov); + } + + dst->iov_len = frag_len; + + len -= frag_len; + dst++; + src++; + dst_niov++; + src_niov--; + offset = 0; + } +} + +#ifndef __KERNEL__ +ptl_size_t +lib_kiov_nob (int niov, ptl_kiov_t *kiov) +{ + LASSERT (0); + return (0); +} + +void +lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) +{ + LASSERT (0); +} + +void +lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *dest, ptl_size_t len) +{ + LASSERT (0); +} + +static int +lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, + ptl_size_t offset, ptl_size_t len) +{ + LASSERT (0); +} + +#else + +ptl_size_t +lib_kiov_nob (int niov, ptl_kiov_t *kiov) +{ + ptl_size_t nob = 0; + + while (niov-- > 0) + nob += (kiov++)->kiov_len; + + return (nob); +} + +void +lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) +{ + ptl_size_t 
nob; + char *addr; + + LASSERT (!in_interrupt ()); + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (kiov->kiov_len, len); + + addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + memcpy (dest, addr, nob); + kunmap (kiov->kiov_page); + + len -= nob; + dest += nob; + niov--; + kiov++; + } +} + +void +lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len) +{ + ptl_size_t nob; + char *addr; + + LASSERT (!in_interrupt ()); + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (kiov->kiov_len, len); + + addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + memcpy (addr, src, nob); + kunmap (kiov->kiov_page); + + len -= nob; + src += nob; + niov--; + kiov++; + } +} + +static int +lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, + ptl_size_t offset, ptl_size_t len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + int src_niov = md->md_niov; + ptl_kiov_t *src = md->md_iov.kiov; + ptl_size_t frag_len; + int dst_niov; + + LASSERT (len >= 0); + LASSERT (offset >= 0); + LASSERT (offset + len <= md->length); + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->kiov_len) { /* skip initial frags */ + offset -= src->kiov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + dst_niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT (dst_niov <= PTL_MD_MAX_IOV); + + frag_len = src->kiov_len - offset; + dst->kiov_page = src->kiov_page; + dst->kiov_offset = src->kiov_offset + offset; + + if (len <= frag_len) { + dst->kiov_len = len; + LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + return (dst_niov); + } + + dst->kiov_len = frag_len; + LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + + len -= frag_len; + dst++; + src++; + dst_niov++; + src_niov--; + offset = 0; + } +} +#endif + +void +lib_recv (nal_cb_t *nal, void *private, 
lib_msg_t *msg, lib_md_t *md, + ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen) +{ + int niov; + + if (mlen == 0) + nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen); + else if ((md->options & PTL_MD_KIOV) == 0) { + niov = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen); + nal->cb_recv (nal, private, msg, + niov, msg->msg_iov.iov, mlen, rlen); + } else { + niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, mlen); + nal->cb_recv_pages (nal, private, msg, + niov, msg->msg_iov.kiov, mlen, rlen); + } +} + +int +lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + lib_md_t *md, ptl_size_t offset, ptl_size_t len) +{ + int niov; + + if (len == 0) + return (nal->cb_send (nal, private, msg, + hdr, type, nid, pid, + 0, NULL, 0)); + + if ((md->options & PTL_MD_KIOV) == 0) { + niov = lib_extract_iov (msg->msg_iov.iov, md, offset, len); + return (nal->cb_send (nal, private, msg, + hdr, type, nid, pid, + niov, msg->msg_iov.iov, len)); + } + + niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, len); + return (nal->cb_send_pages (nal, private, msg, + hdr, type, nid, pid, + niov, msg->msg_iov.kiov, len)); +} + +static lib_msg_t * +get_new_msg (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called holding the state_lock */ + lib_counters_t *counters = &nal->ni.counters; + lib_msg_t *msg = lib_msg_alloc (nal); + + if (msg == NULL) + return (NULL); + + memset (msg, 0, sizeof (*msg)); + + msg->send_ack = 0; + + msg->md = md; + msg->ev.arrival_time = get_cycles(); + md->pending++; + if (md->threshold != PTL_MD_THRESH_INF) { + LASSERT (md->threshold > 0); + md->threshold--; + } + + counters->msgs_alloc++; + if (counters->msgs_alloc > counters->msgs_max) + counters->msgs_max = counters->msgs_alloc; + + list_add (&msg->msg_list, &nal->ni.ni_active_msgs); + + return (msg); +} + + +/* + * Incoming messages have a ptl_msg_t object associated with them + * by the library. 
This object encapsulates the state of the + * message and allows the NAL to do non-blocking receives or sends + * of long messages. + * + */ +static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + ptl_size_t mlength = 0; + ptl_size_t offset = 0; + int unlink = 0; + lib_me_t *me; + lib_md_t *md; + lib_msg_t *msg; + unsigned long flags; + + /* Convert put fields to host byte order */ + hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits); + hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index); + hdr->msg.put.offset = NTOH__u32 (hdr->msg.put.offset); + + state_lock(nal, &flags); + + me = lib_find_me(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT, + hdr->src_nid, hdr->src_pid, + PTL_HDR_LENGTH (hdr), hdr->msg.put.offset, + hdr->msg.put.match_bits, + &mlength, &offset, &unlink); + if (me == NULL) + goto drop; + + md = me->md; + CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d " + "into md "LPX64" [%d] + %d\n", hdr->msg.put.ptl_index, + hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), + md->md_lh.lh_cookie, md->md_niov, offset); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n", + ni->nid, hdr->src_nid); + goto drop; + } + + if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) && + !(md->options & PTL_MD_ACK_DISABLE)) { + msg->send_ack = 1; + msg->ack_wmd = hdr->msg.put.ack_wmd; + msg->nid = hdr->src_nid; + msg->pid = hdr->src_pid; + msg->ev.match_bits = hdr->msg.put.match_bits; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_PUT; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.portal = hdr->msg.put.ptl_index; + msg->ev.match_bits = hdr->msg.put.match_bits; + msg->ev.rlength = PTL_HDR_LENGTH(hdr); + msg->ev.mlength = mlength; + msg->ev.offset = offset; + msg->ev.hdr_data = hdr->msg.put.hdr_data; + + /* NB if this match has exhausted the MD, we can't be sure + * that this 
event will the the last one associated with + * this MD in the event queue (another message already + * matching this ME/MD could end up being last). So we + * remember the ME handle anyway and check again when we're + * allocating our slot in the event queue. + */ + ptl_me2handle (&msg->ev.unlinked_me, me); + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.recv_count++; + ni->counters.recv_length += mlength; + + /* only unlink after MD's pending count has been bumped + * in get_new_msg() otherwise lib_me_unlink() will nuke it */ + if (unlink) { + md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED; + lib_me_unlink (nal, me); + } + + state_unlock(nal, &flags); + + lib_recv (nal, private, msg, md, offset, mlength, PTL_HDR_LENGTH (hdr)); + return 0; + + drop: + nal->ni.counters.drop_count++; + nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr); + state_unlock (nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static int parse_get(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + ptl_size_t mlength = 0; + ptl_size_t offset = 0; + int unlink = 0; + lib_me_t *me; + lib_md_t *md; + lib_msg_t *msg; + ptl_hdr_t reply; + unsigned long flags; + int rc; + + /* Convert get fields to host byte order */ + hdr->msg.get.match_bits = NTOH__u64 (hdr->msg.get.match_bits); + hdr->msg.get.ptl_index = NTOH__u32 (hdr->msg.get.ptl_index); + hdr->msg.get.sink_length = NTOH__u32 (hdr->msg.get.sink_length); + hdr->msg.get.src_offset = NTOH__u32 (hdr->msg.get.src_offset); + + /* compatibility check until field is deleted */ + if (hdr->msg.get.return_offset != 0) + CERROR("Unexpected non-zero get.return_offset %x from " + LPU64"\n", hdr->msg.get.return_offset, hdr->src_nid); + + state_lock(nal, &flags); + + me = lib_find_me(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET, + hdr->src_nid, hdr->src_pid, + hdr->msg.get.sink_length, hdr->msg.get.src_offset, + hdr->msg.get.match_bits, + &mlength, &offset, 
&unlink); + if (me == NULL) + goto drop; + + md = me->md; + CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d " + "from md "LPX64" [%d] + %d\n", hdr->msg.get.ptl_index, + hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), + md->md_lh.lh_cookie, md->md_niov, offset); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n", + ni->nid, hdr->src_nid); + goto drop; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_GET; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.portal = hdr->msg.get.ptl_index; + msg->ev.match_bits = hdr->msg.get.match_bits; + msg->ev.rlength = PTL_HDR_LENGTH(hdr); + msg->ev.mlength = mlength; + msg->ev.offset = offset; + msg->ev.hdr_data = 0; + + /* NB if this match has exhausted the MD, we can't be sure + * that this event will the the last one associated with + * this MD in the event queue (another message already + * matching this ME/MD could end up being last). So we + * remember the ME handle anyway and check again when we're + * allocating our slot in the event queue. 
+ */ + ptl_me2handle (&msg->ev.unlinked_me, me); + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.send_count++; + ni->counters.send_length += mlength; + + /* only unlink after MD's refcount has been bumped + * in get_new_msg() otherwise lib_me_unlink() will nuke it */ + if (unlink) { + md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED; + lib_me_unlink (nal, me); + } + + state_unlock(nal, &flags); + + memset (&reply, 0, sizeof (reply)); + reply.type = HTON__u32 (PTL_MSG_REPLY); + reply.dest_nid = HTON__u64 (hdr->src_nid); + reply.src_nid = HTON__u64 (ni->nid); + reply.dest_pid = HTON__u32 (hdr->src_pid); + reply.src_pid = HTON__u32 (ni->pid); + PTL_HDR_LENGTH(&reply) = HTON__u32 (mlength); + + reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd; + + rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, + hdr->src_nid, hdr->src_pid, md, offset, mlength); + if (rc != 0) { + CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n", + ni->nid, hdr->src_nid); + state_lock (nal, &flags); + goto drop; + } + + /* Complete the incoming message */ + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (rc); + drop: + ni->counters.drop_count++; + ni->counters.drop_length += hdr->msg.get.sink_length; + state_unlock(nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + int rlength; + int length; + lib_msg_t *msg; + unsigned long flags; + + /* compatibility check until field is deleted */ + if (hdr->msg.reply.dst_offset != 0) + CERROR("Unexpected non-zero reply.dst_offset %x from "LPU64"\n", + hdr->msg.reply.dst_offset, hdr->src_nid); + + state_lock(nal, &flags); + + /* NB handles only looked up by creator (no flips) */ + md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal); + if (md == NULL || md->threshold == 0) { + CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD 
"LPX64"."LPX64"\n", + ni->nid, hdr->src_nid, + md == NULL ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + goto drop; + } + + LASSERT (md->offset == 0); + + length = rlength = PTL_HDR_LENGTH(hdr); + + if (length > md->length) { + if ((md->options & PTL_MD_TRUNCATE) == 0) { + CERROR (LPU64": Dropping REPLY from "LPU64 + " length %d for MD "LPX64" would overflow (%d)\n", + ni->nid, hdr->src_nid, length, + hdr->msg.reply.dst_wmd.wh_object_cookie, + md->length); + goto drop; + } + length = md->length; + } + + CDEBUG(D_NET, "Reply from "LPU64" of length %d/%d into md "LPX64"\n", + hdr->src_nid, length, rlength, + hdr->msg.reply.dst_wmd.wh_object_cookie); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping REPLY from "LPU64": can't " + "allocate msg\n", ni->nid, hdr->src_nid); + goto drop; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_REPLY; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.rlength = rlength; + msg->ev.mlength = length; + msg->ev.offset = 0; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.recv_count++; + ni->counters.recv_length += length; + + state_unlock(nal, &flags); + + lib_recv (nal, private, msg, md, 0, length, rlength); + return 0; + + drop: + nal->ni.counters.drop_count++; + nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr); + state_unlock (nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + lib_msg_t *msg = NULL; + unsigned long flags; + + /* Convert ack fields to host byte order */ + hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = NTOH__u32 (hdr->msg.ack.mlength); + + state_lock(nal, &flags); + + /* NB handles only looked up by creator (no flips) */ + md = 
ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal); + if (md == NULL || md->threshold == 0) { + CDEBUG(D_INFO, LPU64": Dropping ACK from "LPU64" to %s MD " + LPX64"."LPX64"\n", ni->nid, hdr->src_nid, + (md == NULL) ? "invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); + goto drop; + } + + CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n", + ni->nid, hdr->src_nid, + hdr->msg.ack.dst_wmd.wh_object_cookie); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n", + ni->nid, hdr->src_nid); + goto drop; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_ACK; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.mlength = hdr->msg.ack.mlength; + msg->ev.match_bits = hdr->msg.ack.match_bits; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.recv_count++; + state_unlock(nal, &flags); + lib_recv (nal, private, msg, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return 0; + + drop: + nal->ni.counters.drop_count++; + state_unlock (nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static char * +hdr_type_string (ptl_hdr_t *hdr) +{ + switch (hdr->type) { + case PTL_MSG_ACK: + return ("ACK"); + case PTL_MSG_PUT: + return ("PUT"); + case PTL_MSG_GET: + return ("GET"); + case PTL_MSG_REPLY: + return ("REPLY"); + case PTL_MSG_HELLO: + return ("HELLO"); + default: + return (""); + } +} + +void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr) +{ + char *type_str = hdr_type_string (hdr); + + nal->cb_printf(nal, "P3 Header at %p of type %s\n", hdr, type_str); + nal->cb_printf(nal, " From nid/pid %Lu/%Lu", hdr->src_nid, + hdr->src_pid); + nal->cb_printf(nal, " To nid/pid %Lu/%Lu\n", hdr->dest_nid, + hdr->dest_pid); + + switch (hdr->type) { + default: + break; + + case PTL_MSG_PUT: + nal->cb_printf(nal, + " Ptl index %d, ack md "LPX64"."LPX64", " + "match bits "LPX64"\n", + 
hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + nal->cb_printf(nal, + " Length %d, offset %d, hdr data "LPX64"\n", + PTL_HDR_LENGTH(hdr), hdr->msg.put.offset, + hdr->msg.put.hdr_data); + break; + + case PTL_MSG_GET: + nal->cb_printf(nal, + " Ptl index %d, return md "LPX64"."LPX64", " + "match bits "LPX64"\n", hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + nal->cb_printf(nal, + " Length %d, src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); + break; + + case PTL_MSG_ACK: + nal->cb_printf(nal, " dst md "LPX64"."LPX64", " + "manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); + break; + + case PTL_MSG_REPLY: + nal->cb_printf(nal, " dst md "LPX64"."LPX64", " + "length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + PTL_HDR_LENGTH(hdr)); + } + +} /* end of print_hdr() */ + + +int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + unsigned long flags; + + /* NB static check; optimizer will elide this if it's right */ + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == + offsetof (ptl_hdr_t, msg.put.length)); + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == + offsetof (ptl_hdr_t, msg.get.length)); + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == + offsetof (ptl_hdr_t, msg.reply.length)); + + /* convert common fields to host byte order */ + hdr->dest_nid = NTOH__u64 (hdr->dest_nid); + hdr->src_nid = NTOH__u64 (hdr->src_nid); + hdr->dest_pid = NTOH__u32 (hdr->dest_pid); + hdr->src_pid = NTOH__u32 (hdr->src_pid); + hdr->type = NTOH__u32 (hdr->type); + PTL_HDR_LENGTH(hdr) = NTOH__u32 (PTL_HDR_LENGTH(hdr)); +#if 0 + nal->cb_printf(nal, "%d: lib_parse: nal=%p hdr=%p type=%d\n", + nal->ni.nid, nal, hdr, hdr->type); + 
print_hdr(nal, hdr); +#endif + if (hdr->type == PTL_MSG_HELLO) { + /* dest_nid is really ptl_magicversion_t */ + ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid; + + CERROR (LPU64": Dropping unexpected HELLO message: " + "magic %d, version %d.%d from "LPD64"\n", + nal->ni.nid, mv->magic, + mv->version_major, mv->version_minor, + hdr->src_nid); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (-1); + } + + if (hdr->dest_nid != nal->ni.nid) { + CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64 + " (not me)\n", nal->ni.nid, hdr_type_string (hdr), + hdr->src_nid, hdr->dest_nid); + + state_lock (nal, &flags); + nal->ni.counters.drop_count++; + nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr); + state_unlock (nal, &flags); + + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (-1); + } + + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + fail_peer (nal, hdr->src_nid, 0)) /* shall we now? */ + { + CERROR(LPU64": Dropping incoming %s from "LPU64 + ": simulated failure\n", + nal->ni.nid, hdr_type_string (hdr), + hdr->src_nid); + return (-1); + } + + switch (hdr->type) { + case PTL_MSG_ACK: + return (parse_ack(nal, hdr, private)); + case PTL_MSG_PUT: + return (parse_put(nal, hdr, private)); + break; + case PTL_MSG_GET: + return (parse_get(nal, hdr, private)); + break; + case PTL_MSG_REPLY: + return (parse_reply(nal, hdr, private)); + break; + default: + CERROR(LPU64": Dropping message from "LPU64 + ": Bad type=0x%x\n", nal->ni.nid, hdr->src_nid, + hdr->type); + + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (-1); + } +} + + +int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_md_t md_in + * ptl_ack_req_t ack_req_in + * ptl_process_id_t target_in + * ptl_pt_index_t portal_in + * ptl_ac_index_t cookie_in + * ptl_match_bits_t match_bits_in + * ptl_size_t offset_in + * + * Outgoing: + */ + + 
PtlPut_in *args = v_args; + PtlPut_out *ret = v_ret; + ptl_hdr_t hdr; + + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + lib_msg_t *msg = NULL; + ptl_process_id_t *id = &args->target_in; + unsigned long flags; + + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + fail_peer (nal, id->nid, 1)) /* shall we now? */ + { + CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n", + nal->ni.nid, id->nid); + return (ret->rc = PTL_INV_PROC); + } + + ret->rc = PTL_OK; + state_lock(nal, &flags); + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL || !md->threshold) { + state_unlock(nal, &flags); + return ret->rc = PTL_INV_MD; + } + + CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid, + (unsigned long)id->pid); + + memset (&hdr, 0, sizeof (hdr)); + hdr.type = HTON__u32 (PTL_MSG_PUT); + hdr.dest_nid = HTON__u64 (id->nid); + hdr.src_nid = HTON__u64 (ni->nid); + hdr.dest_pid = HTON__u32 (id->pid); + hdr.src_pid = HTON__u32 (ni->pid); + PTL_HDR_LENGTH(&hdr) = HTON__u32 (md->length); + + /* NB handles only looked up by creator (no flips) */ + if (args->ack_req_in == PTL_ACK_REQ) { + hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie; + hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie; + } else { + hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE; + } + + hdr.msg.put.match_bits = HTON__u64 (args->match_bits_in); + hdr.msg.put.ptl_index = HTON__u32 (args->portal_in); + hdr.msg.put.offset = HTON__u32 (args->offset_in); + hdr.msg.put.hdr_data = args->hdr_data_in; + + ni->counters.send_count++; + ni->counters.send_length += md->length; + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR("BAD: could not allocate msg!\n"); + state_unlock(nal, &flags); + return ret->rc = PTL_NOSPACE; + } + + /* + * If this memory descriptor has an event queue associated with + * it we need to allocate a message state object and record the + * information about this operation that will be recorded into + * event queue once the message 
has been completed. + * + * NB. We're now committed to the GET, since we just marked the MD + * busy. Callers who observe this (by getting PTL_MD_INUSE from + * PtlMDUnlink()) expect a completion event to tell them when the + * MD becomes idle. + */ + if (md->eq) { + msg->ev.type = PTL_EVENT_SENT; + msg->ev.initiator.nid = ni->nid; + msg->ev.initiator.pid = ni->pid; + msg->ev.portal = args->portal_in; + msg->ev.match_bits = args->match_bits_in; + msg->ev.rlength = md->length; + msg->ev.mlength = md->length; + msg->ev.offset = args->offset_in; + msg->ev.hdr_data = args->hdr_data_in; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + state_unlock(nal, &flags); + + lib_send (nal, private, msg, &hdr, PTL_MSG_PUT, + id->nid, id->pid, md, 0, md->length); + + return ret->rc = PTL_OK; +} + + +int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_md_t md_in + * ptl_process_id_t target_in + * ptl_pt_index_t portal_in + * ptl_ac_index_t cookie_in + * ptl_match_bits_t match_bits_in + * ptl_size_t offset_in + * + * Outgoing: + */ + + PtlGet_in *args = v_args; + PtlGet_out *ret = v_ret; + ptl_hdr_t hdr; + lib_msg_t *msg = NULL; + lib_ni_t *ni = &nal->ni; + ptl_process_id_t *id = &args->target_in; + lib_md_t *md; + unsigned long flags; + + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + fail_peer (nal, id->nid, 1)) /* shall we now? 
*/ + { + CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n", + nal->ni.nid, id->nid); + return (ret->rc = PTL_INV_PROC); + } + + state_lock(nal, &flags); + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL || !md->threshold) { + state_unlock(nal, &flags); + return ret->rc = PTL_INV_MD; + } + + LASSERT (md->offset == 0); + + CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid, + (unsigned long)id->pid); + + memset (&hdr, 0, sizeof (hdr)); + hdr.type = HTON__u32 (PTL_MSG_GET); + hdr.dest_nid = HTON__u64 (id->nid); + hdr.src_nid = HTON__u64 (ni->nid); + hdr.dest_pid = HTON__u32 (id->pid); + hdr.src_pid = HTON__u32 (ni->pid); + PTL_HDR_LENGTH(&hdr) = 0; + + /* NB handles only looked up by creator (no flips) */ + hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie; + hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie; + + hdr.msg.get.match_bits = HTON__u64 (args->match_bits_in); + hdr.msg.get.ptl_index = HTON__u32 (args->portal_in); + hdr.msg.get.src_offset = HTON__u32 (args->offset_in); + hdr.msg.get.sink_length = HTON__u32 (md->length); + + ni->counters.send_count++; + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR("do_PtlGet: BAD - could not allocate cookie!\n"); + state_unlock(nal, &flags); + return ret->rc = PTL_NOSPACE; + } + + /* + * If this memory descriptor has an event queue associated with + * it we must allocate a message state object that will record + * the information to be filled in once the message has been + * completed. More information is in the do_PtlPut() comments. + * + * NB. We're now committed to the GET, since we just marked the MD + * busy. Callers who observe this (by getting PTL_MD_INUSE from + * PtlMDUnlink()) expect a completion event to tell them when the + * MD becomes idle. 
+ */ + if (md->eq) { + msg->ev.type = PTL_EVENT_SENT; + msg->ev.initiator.nid = ni->nid; + msg->ev.initiator.pid = ni->pid; + msg->ev.portal = args->portal_in; + msg->ev.match_bits = args->match_bits_in; + msg->ev.rlength = md->length; + msg->ev.mlength = md->length; + msg->ev.offset = args->offset_in; + msg->ev.hdr_data = 0; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + state_unlock(nal, &flags); + + lib_send (nal, private, msg, &hdr, PTL_MSG_GET, + id->nid, id->pid, NULL, 0, 0); + + return ret->rc = PTL_OK; +} + +void lib_assert_wire_constants (void) +{ + /* Wire protocol assertions generated by 'wirecheck' */ + + /* Constants... */ + LASSERT (PORTALS_PROTO_MAGIC == 0xeebc0ded); + LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0); + LASSERT (PORTALS_PROTO_VERSION_MINOR == 1); + LASSERT (PTL_MSG_ACK == 0); + LASSERT (PTL_MSG_PUT == 1); + LASSERT (PTL_MSG_GET == 2); + LASSERT (PTL_MSG_REPLY == 3); + LASSERT (PTL_MSG_HELLO == 4); + + /* Checks for struct ptl_handle_wire_t */ + LASSERT (sizeof (ptl_handle_wire_t) == 16); + LASSERT (offsetof (ptl_handle_wire_t, wh_interface_cookie) == 0); + LASSERT (sizeof (((ptl_handle_wire_t *)0)->wh_interface_cookie) == 8); + LASSERT (offsetof (ptl_handle_wire_t, wh_object_cookie) == 8); + LASSERT (sizeof (((ptl_handle_wire_t *)0)->wh_object_cookie) == 8); + + /* Checks for struct ptl_magicversion_t */ + LASSERT (sizeof (ptl_magicversion_t) == 8); + LASSERT (offsetof (ptl_magicversion_t, magic) == 0); + LASSERT (sizeof (((ptl_magicversion_t *)0)->magic) == 4); + LASSERT (offsetof (ptl_magicversion_t, version_major) == 4); + LASSERT (sizeof (((ptl_magicversion_t *)0)->version_major) == 2); + LASSERT (offsetof (ptl_magicversion_t, version_minor) == 6); + LASSERT (sizeof (((ptl_magicversion_t *)0)->version_minor) == 2); + + /* Checks for struct ptl_hdr_t */ + LASSERT (sizeof (ptl_hdr_t) == 72); + LASSERT (offsetof (ptl_hdr_t, dest_nid) == 0); + LASSERT (sizeof (((ptl_hdr_t *)0)->dest_nid) == 8); + LASSERT (offsetof (ptl_hdr_t, 
src_nid) == 8); + LASSERT (sizeof (((ptl_hdr_t *)0)->src_nid) == 8); + LASSERT (offsetof (ptl_hdr_t, dest_pid) == 16); + LASSERT (sizeof (((ptl_hdr_t *)0)->dest_pid) == 4); + LASSERT (offsetof (ptl_hdr_t, src_pid) == 20); + LASSERT (sizeof (((ptl_hdr_t *)0)->src_pid) == 4); + LASSERT (offsetof (ptl_hdr_t, type) == 24); + LASSERT (sizeof (((ptl_hdr_t *)0)->type) == 4); + + /* Ack */ + LASSERT (offsetof (ptl_hdr_t, msg.ack.mlength) == 28); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.mlength) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.ack.dst_wmd) == 32); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.dst_wmd) == 16); + LASSERT (offsetof (ptl_hdr_t, msg.ack.match_bits) == 48); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.match_bits) == 8); + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == 56); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.length) == 4); + + /* Put */ + LASSERT (offsetof (ptl_hdr_t, msg.put.ptl_index) == 28); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.ptl_index) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.put.ack_wmd) == 32); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.ack_wmd) == 16); + LASSERT (offsetof (ptl_hdr_t, msg.put.match_bits) == 48); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.match_bits) == 8); + LASSERT (offsetof (ptl_hdr_t, msg.put.length) == 56); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.length) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.put.offset) == 60); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.offset) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.put.hdr_data) == 64); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.hdr_data) == 8); + + /* Get */ + LASSERT (offsetof (ptl_hdr_t, msg.get.ptl_index) == 28); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.ptl_index) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.get.return_wmd) == 32); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.return_wmd) == 16); + LASSERT (offsetof (ptl_hdr_t, msg.get.match_bits) == 48); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.match_bits) == 8); + LASSERT (offsetof 
(ptl_hdr_t, msg.get.length) == 56); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.length) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.get.src_offset) == 60); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.src_offset) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.get.return_offset) == 64); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.return_offset) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.get.sink_length) == 68); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.sink_length) == 4); + + /* Reply */ + LASSERT (offsetof (ptl_hdr_t, msg.reply.dst_wmd) == 32); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.reply.dst_wmd) == 16); + LASSERT (offsetof (ptl_hdr_t, msg.reply.dst_offset) == 48); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.reply.dst_offset) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.reply.length) == 56); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.reply.length) == 4); +} diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c new file mode 100644 index 0000000..f10892c --- /dev/null +++ b/lnet/lnet/lib-msg.c @@ -0,0 +1,163 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-msg.c + * Message decoding, parsing and finalizing routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __KERNEL__ +# include +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include +#endif + +#include + +int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg) +{ + lib_md_t *md; + lib_eq_t *eq; + int rc; + unsigned long flags; + + /* ni went down while processing this message */ + if (nal->ni.up == 0) { + return -1; + } + + if (msg == NULL) + return 0; + + rc = 0; + if (msg->send_ack) { + ptl_hdr_t ack; + + LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd)); + + memset (&ack, 0, sizeof (ack)); + ack.type = HTON__u32 (PTL_MSG_ACK); + ack.dest_nid = HTON__u64 (msg->nid); + ack.src_nid = HTON__u64 (nal->ni.nid); + ack.dest_pid = HTON__u32 (msg->pid); + ack.src_pid = HTON__u32 (nal->ni.pid); + PTL_HDR_LENGTH(&ack) = 0; + + ack.msg.ack.dst_wmd = msg->ack_wmd; + ack.msg.ack.match_bits = msg->ev.match_bits; + ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength); + + rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK, + msg->nid, msg->pid, NULL, 0, 0); + } + + md = msg->md; + LASSERT (md->pending > 0); /* I've not dropped my ref yet */ + eq = md->eq; + + state_lock(nal, &flags); + + if (eq != NULL) { + ptl_event_t *ev = &msg->ev; + ptl_event_t *eq_slot; + + /* I have to hold the lock while I bump the sequence number + * and copy the event into the queue. If not, and I was + * interrupted after bumping the sequence number, other + * events could fill the queue, including the slot I just + * allocated to this event. On resuming, I would overwrite + * a more 'recent' event with old event state, and + * processes taking events off the queue would not detect + * overflow correctly. 
+ */ + + ev->sequence = eq->sequence++;/* Allocate the next queue slot */ + + /* size must be a power of 2 to handle a wrapped sequence # */ + LASSERT (eq->size != 0 && + eq->size == LOWEST_BIT_SET (eq->size)); + eq_slot = eq->base + (ev->sequence & (eq->size - 1)); + + /* Invalidate unlinked_me unless this is the last + * event for an auto-unlinked MD. Note that if md was + * auto-unlinked, md->pending can only decrease + */ + if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */ + md->pending != 1) /* not last ref */ + ev->unlinked_me = PTL_HANDLE_NONE; + + /* Copy the event into the allocated slot, ensuring all the + * rest of the event's contents have been copied _before_ + * the sequence number gets updated. A processes 'getting' + * an event waits on the next queue slot's sequence to be + * 'new'. When it is, _all_ other event fields had better + * be consistent. I assert 'sequence' is the last member, + * so I only need a 2 stage copy. + */ + LASSERT(sizeof (ptl_event_t) == + offsetof(ptl_event_t, sequence) + sizeof(ev->sequence)); + + rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev, + offsetof (ptl_event_t, sequence)); + LASSERT (rc == 0); + +#ifdef __KERNEL__ + barrier(); +#endif + /* Updating the sequence number is what makes the event 'new' */ + + /* cb_write is not necessarily atomic, so this could + cause a race with PtlEQGet */ + rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence, + (void *)&ev->sequence,sizeof (ev->sequence)); + LASSERT (rc == 0); + +#ifdef __KERNEL__ + barrier(); +#endif + + /* I must also ensure that (a) callbacks are made in the + * same order as the events land in the queue, and (b) the + * callback occurs before the event can be removed from the + * queue, so I can't drop the lock during the callback. 
*/ + if (nal->cb_callback != NULL) + nal->cb_callback(nal, private, eq, ev); + else if (eq->event_callback != NULL) + (void)((eq->event_callback) (ev)); + } + + LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || + (md->md_flags & PTL_MD_FLAG_UNLINK) != 0); + + md->pending--; + if (md->pending == 0 && /* no more outstanding operations on this md */ + (md->threshold == 0 || /* done its business */ + (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */ + lib_md_unlink(nal, md); + + list_del (&msg->msg_list); + nal->ni.counters.msgs_alloc--; + lib_msg_free(nal, msg); + + state_unlock(nal, &flags); + + return rc; +} diff --git a/lnet/lnet/lib-ni.c b/lnet/lnet/lib-ni.c new file mode 100644 index 0000000..aa30329 --- /dev/null +++ b/lnet/lnet/lib-ni.c @@ -0,0 +1,128 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-ni.c + * Network status registers and distance functions. + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_PORTALS +#include +#include + +#define MAX_DIST 18446744073709551615UL + +int do_PtlNIDebug(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlNIDebug_in *args = v_args; + PtlNIDebug_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + + ret->rc = ni->debug; + ni->debug = args->mask_in; + + return 0; +} + +int do_PtlNIStatus(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t interface_in + * ptl_sr_index_t register_in + * + * Outgoing: + * ptl_sr_value_t * status_out + */ + + PtlNIStatus_in *args = v_args; + PtlNIStatus_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + lib_counters_t *count = &ni->counters; + + if (!args) + return ret->rc = PTL_SEGV; + + ret->rc = PTL_OK; + ret->status_out = 0; + + /* + * I hate this sort of code.... Hash tables, offset lists? + * Treat the counters as an array of ints? + */ + if (args->register_in == PTL_SR_DROP_COUNT) + ret->status_out = count->drop_count; + + else if (args->register_in == PTL_SR_DROP_LENGTH) + ret->status_out = count->drop_length; + + else if (args->register_in == PTL_SR_RECV_COUNT) + ret->status_out = count->recv_count; + + else if (args->register_in == PTL_SR_RECV_LENGTH) + ret->status_out = count->recv_length; + + else if (args->register_in == PTL_SR_SEND_COUNT) + ret->status_out = count->send_count; + + else if (args->register_in == PTL_SR_SEND_LENGTH) + ret->status_out = count->send_length; + + else if (args->register_in == PTL_SR_MSGS_MAX) + ret->status_out = count->msgs_max; + else + ret->rc = PTL_INV_SR_INDX; + + return ret->rc; +} + + +int do_PtlNIDist(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t interface_in + * ptl_process_id_t process_in + + * + * Outgoing: + * unsigned long * distance_out + + */ + + PtlNIDist_in *args = v_args; + PtlNIDist_out *ret = v_ret; + + unsigned long dist; + ptl_process_id_t id_in = args->process_in; + ptl_nid_t nid; + int rc; + + 
nid = id_in.nid; + + if ((rc = nal->cb_dist(nal, nid, &dist)) != 0) { + ret->distance_out = (unsigned long) MAX_DIST; + return PTL_INV_PROC; + } + + ret->distance_out = dist; + + return ret->rc = PTL_OK; +} diff --git a/lnet/lnet/lib-pid.c b/lnet/lnet/lib-pid.c new file mode 100644 index 0000000..12eebb5 --- /dev/null +++ b/lnet/lnet/lib-pid.c @@ -0,0 +1,58 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-pid.c + * + * Process identification routines + * Copyright (C) 2001-2003 Cluster File Systems, Inc. + * Copyright (C) 2001-2003 Cluster File Systems, Inc. + * + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* This should be removed. 
The NAL should have the PID information */ +#define DEBUG_SUBSYSTEM S_PORTALS + +#if defined (__KERNEL__) +# include +extern int getpid(void); +#else +# include +# include +#endif +#include +#include + +int do_PtlGetId(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t handle_in + * + * Outgoing: + * ptl_process_id_t * id_out + * ptl_id_t * gsize_out + */ + + PtlGetId_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + + ret->id_out.nid = ni->nid; + ret->id_out.pid = ni->pid; + + return ret->rc = PTL_OK; +} diff --git a/lnet/packaging/.cvsignore b/lnet/packaging/.cvsignore new file mode 100644 index 0000000..fd1d56a --- /dev/null +++ b/lnet/packaging/.cvsignore @@ -0,0 +1,8 @@ +Makefile +Makefile.in +aclocal.m4 +config.log +config.status +config.cache +configure +portals.spec diff --git a/lnet/packaging/Makefile.am b/lnet/packaging/Makefile.am new file mode 100644 index 0000000..126bc69 --- /dev/null +++ b/lnet/packaging/Makefile.am @@ -0,0 +1,6 @@ +# Copyright (C) 2002 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +EXTRA_DIST = portals.spec \ No newline at end of file diff --git a/lnet/packaging/portals.spec.in b/lnet/packaging/portals.spec.in new file mode 100644 index 0000000..e196b3f --- /dev/null +++ b/lnet/packaging/portals.spec.in @@ -0,0 +1,116 @@ +%define kversion @RELEASE@ +%define linuxdir @LINUX@ +%define version HEAD + +Summary: Sandia Portals Message Passing - utilities +Name: portals +Version: %{version} +Release: 0210101748uml +Copyright: LGPL +Group: Utilities/System +BuildRoot: /var/tmp/portals-%{version}-root +Source: http://sandiaportals.org/portals-%{version}.tar.gz + +%description +Sandia Portals message passing package. Contains kernel modules, libraries and utilities. 
+ +%package -n portals-modules +Summary: Kernel modules and NAL's for portals +Group: Development/Kernel + +%description -n portals-modules +Object-Based Disk storage drivers for Linux %{kversion}. + +%package -n portals-source +Summary: Portals kernel source for rebuilding with other kernels +Group: Development/Kernel + +%description -n portals-source +Portals kernel source for rebuilding with other kernels + +%prep +%setup -n portals-%{version} + +%build +rm -rf $RPM_BUILD_ROOT + +# Create the pristine source directory. +srcdir=$RPM_BUILD_ROOT/usr/src/portals-%{version} +mkdir -p $srcdir +find . -name CVS -prune -o -print | cpio -ap $srcdir + +# Set an explicit path to our Linux tree, if we can. +conf_flag= +linuxdir=%{linuxdir} +test -d $linuxdir && conf_flag=--with-linux=$linuxdir +./configure $conf_flag +make + +%install +make install prefix=$RPM_BUILD_ROOT + +%ifarch alpha +# this hurts me + conf_flag= + linuxdir=%{linuxdir} + test -d $linuxdir && conf_flag=--with-linux=$linuxdir + make clean + ./configure --enable-rtscts-myrinet $conf_flag + make + cp linux/rtscts/rtscts.o $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/net/portals/rtscts_myrinet.o + cp user/myrinet_utils/mcpload $RPM_BUILD_ROOT/usr/sbin/mcpload +%endif + + +%files +%attr(-, root, root) %doc COPYING +%attr(-, root, root) /usr/sbin/acceptor +%attr(-, root, root) /usr/sbin/ptlctl +%attr(-, root, root) /usr/sbin/debugctl +%ifarch alpha +%attr(-, root, root) /usr/sbin/mcpload +%endif +%attr(-, root, root) /lib/libmyrnal.a +%attr(-, root, root) /lib/libptlapi.a +%attr(-, root, root) /lib/libptlctl.a +%attr(-, root, root) /lib/libprocbridge.a +%attr(-, root, root) /lib/libptllib.a +%attr(-, root, root) /lib/libtcpnal.a +%attr(-, root, root) /lib/libtcpnalutil.a +%attr(-, root, root) /usr/include/portals/*.h +%attr(-, root, root) /usr/include/portals/base/*.h +%attr(-, root, root) /usr/include/linux/*.h + +%files -n portals-modules +%attr(-, root, root) %doc COPYING +%attr(-, root, root) 
/lib/modules/%{kversion}/kernel/net/portals/portals.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptlrouter.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptrxtx.o +%ifarch alpha +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/p3mod.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/rtscts.o +%endif +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/*nal.o + +%files -n portals-source +%attr(-, root, root) /usr/src/portals-%{version} + +%post +if [ ! -e /dev/portals ]; then + mknod /dev/portals c 10 240 +fi +depmod -ae || exit 0 + +grep -q portals /etc/modules.conf || \ + echo 'alias char-major-10-240 portals' >> /etc/modules.conf + +grep -q '/dev/portals' /etc/modules.conf || \ + echo 'alias /dev/portals portals' >> /etc/modules.conf + +%postun +depmod -ae || exit 0 + +%clean +#rm -rf $RPM_BUILD_ROOT + +# end of file diff --git a/lnet/router/.cvsignore b/lnet/router/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lnet/router/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lnet/router/Makefile.am b/lnet/router/Makefile.am new file mode 100644 index 0000000..1c8087b --- /dev/null +++ b/lnet/router/Makefile.am @@ -0,0 +1,16 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../Rules.linux + +MODULE = kptlrouter +modulenet_DATA = kptlrouter.o +EXTRA_PROGRAMS = kptlrouter + + +#CFLAGS:= @KCFLAGS@ +#CPPFLAGS:=@KCPPFLAGS@ +DEFS = +kptlrouter_SOURCES = router.c proc.c router.h diff --git a/lnet/router/Makefile.mk b/lnet/router/Makefile.mk new file mode 100644 index 0000000..64bd09b --- /dev/null +++ b/lnet/router/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include ../Kernelenv + +obj-y += kptlrouter.o +kptlrouter-objs := router.o proc.o diff --git a/lnet/router/proc.c b/lnet/router/proc.c new file mode 100644 index 0000000..dd65b34 --- /dev/null +++ b/lnet/router/proc.c @@ -0,0 +1,78 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "router.h" + +#define KPR_PROC_ROUTER "sys/portals/router" + +int +kpr_proc_read (char *page, char **start, off_t off, int count, int *eof, void *data) +{ + unsigned long long bytes = kpr_fwd_bytes; + unsigned long packets = kpr_fwd_packets; + unsigned long errors = kpr_fwd_errors; + unsigned int qdepth = atomic_read (&kpr_queue_depth); + int len; + + *eof = 1; + if (off != 0) + return (0); + + len = sprintf (page, "%Ld %ld %ld %d\n", bytes, packets, errors, qdepth); + + *start = page; + return (len); +} + +int +kpr_proc_write (struct file *file, const char *ubuffer, unsigned long count, void *data) +{ + /* Ignore what we've been asked to write, and just zero the stats counters */ + kpr_fwd_bytes = 0; + kpr_fwd_packets = 0; + kpr_fwd_errors = 0; + + return (count); +} + +void +kpr_proc_init(void) +{ + struct proc_dir_entry *entry = create_proc_entry (KPR_PROC_ROUTER, S_IFREG | S_IRUGO | S_IWUSR, NULL); + + if (entry == NULL) + { + CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTER); + return; + } + + entry->data = NULL; + entry->read_proc = kpr_proc_read; + entry->write_proc = kpr_proc_write; +} + +void +kpr_proc_fini(void) +{ + remove_proc_entry(KPR_PROC_ROUTER, 0); +} diff --git a/lnet/router/router.c b/lnet/router/router.c new file mode 100644 index 0000000..6074c3c --- /dev/null +++ b/lnet/router/router.c @@ -0,0 +1,449 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "router.h" + +struct list_head kpr_routes; +struct list_head kpr_nals; + +unsigned long long kpr_fwd_bytes; +unsigned long kpr_fwd_packets; +unsigned long kpr_fwd_errors; +atomic_t kpr_queue_depth; + +/* Mostly the tables are read-only (thread and interrupt context) + * + * Once in a blue moon we register/deregister NALs and add/remove routing + * entries (thread context only)... */ +rwlock_t kpr_rwlock; + +kpr_router_interface_t kpr_router_interface = { + kprri_register: kpr_register_nal, + kprri_lookup: kpr_lookup_target, + kprri_fwd_start: kpr_forward_packet, + kprri_fwd_done: kpr_complete_packet, + kprri_shutdown: kpr_shutdown_nal, + kprri_deregister: kpr_deregister_nal, +}; + +kpr_control_interface_t kpr_control_interface = { + kprci_add_route: kpr_add_route, + kprci_del_route: kpr_del_route, + kprci_get_route: kpr_get_route, +}; + +int +kpr_register_nal (kpr_nal_interface_t *nalif, void **argp) +{ + long flags; + struct list_head *e; + kpr_nal_entry_t *ne; + + CDEBUG (D_OTHER, "Registering NAL %d\n", nalif->kprni_nalid); + + PORTAL_ALLOC (ne, sizeof (*ne)); + if (ne == NULL) + return (-ENOMEM); + + memset (ne, 0, sizeof (*ne)); + memcpy ((void *)&ne->kpne_interface, (void *)nalif, sizeof (*nalif)); + + LASSERT (!in_interrupt()); + write_lock_irqsave (&kpr_rwlock, flags); + + for (e = kpr_nals.next; e != &kpr_nals; e = e->next) + { + kpr_nal_entry_t *ne2 = list_entry (e, kpr_nal_entry_t, kpne_list); + + if (ne2->kpne_interface.kprni_nalid == ne->kpne_interface.kprni_nalid) + { + write_unlock_irqrestore 
(&kpr_rwlock, flags); + + CERROR ("Attempt to register same NAL %d twice\n", ne->kpne_interface.kprni_nalid); + + PORTAL_FREE (ne, sizeof (*ne)); + return (-EEXIST); + } + } + + list_add (&ne->kpne_list, &kpr_nals); + + write_unlock_irqrestore (&kpr_rwlock, flags); + + *argp = ne; + PORTAL_MODULE_USE; + return (0); +} + +void +kpr_shutdown_nal (void *arg) +{ + long flags; + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + + CDEBUG (D_OTHER, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid); + + LASSERT (!ne->kpne_shutdown); + LASSERT (!in_interrupt()); + + write_lock_irqsave (&kpr_rwlock, flags); /* locking a bit spurious... */ + ne->kpne_shutdown = 1; + write_unlock_irqrestore (&kpr_rwlock, flags); /* except it's a memory barrier */ + + while (atomic_read (&ne->kpne_refcount) != 0) + { + CDEBUG (D_NET, "Waiting for refcount on NAL %d to reach zero (%d)\n", + ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount)); + + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } +} + +void +kpr_deregister_nal (void *arg) +{ + long flags; + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + + CDEBUG (D_OTHER, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid); + + LASSERT (ne->kpne_shutdown); /* caller must have issued shutdown already */ + LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */ + LASSERT (!in_interrupt()); + + write_lock_irqsave (&kpr_rwlock, flags); + + list_del (&ne->kpne_list); + + write_unlock_irqrestore (&kpr_rwlock, flags); + + PORTAL_FREE (ne, sizeof (*ne)); + PORTAL_MODULE_UNUSE; +} + + +int +kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp) +{ + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + struct list_head *e; + int rc = -ENOENT; + + CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d\n", target_nid, ne->kpne_interface.kprni_nalid); + + if (ne->kpne_shutdown) /* caller is shutting down */ + return (-ENOENT); + + read_lock (&kpr_rwlock); + + /* Search routes for one that has a 
gateway to target_nid on the callers network */ + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) + { + kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + + if (re->kpre_lo_nid > target_nid || + re->kpre_hi_nid < target_nid) + continue; + + /* found table entry */ + + if (re->kpre_gateway_nalid != ne->kpne_interface.kprni_nalid) /* different NAL */ + rc = -EHOSTUNREACH; + else + { + rc = 0; + *gateway_nidp = re->kpre_gateway_nid; + } + break; + } + + read_unlock (&kpr_rwlock); + + CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d: %d ("LPX64")\n", + target_nid, ne->kpne_interface.kprni_nalid, rc, + (rc == 0) ? *gateway_nidp : (ptl_nid_t)0); + return (rc); +} + +void +kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg; + ptl_nid_t target_nid = fwd->kprfd_target_nid; + int nob = fwd->kprfd_nob; + struct list_head *e; + + CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid); + + LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */ + LASSERT (nob == lib_iov_nob (fwd->kprfd_niov, fwd->kprfd_iov)); + + atomic_inc (&kpr_queue_depth); + atomic_inc (&src_ne->kpne_refcount); /* source nal is busy until fwd completes */ + + kpr_fwd_packets++; /* (loose) stats accounting */ + kpr_fwd_bytes += nob; + + if (src_ne->kpne_shutdown) /* caller is shutting down */ + goto out; + + fwd->kprfd_router_arg = src_ne; /* stash caller's nal entry */ + + read_lock (&kpr_rwlock); + + /* Search routes for one that has a gateway to target_nid NOT on the caller's network */ + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) + { + kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + + if (re->kpre_lo_nid > target_nid || /* no match */ + re->kpre_hi_nid < target_nid) + continue; + + CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: match "LPX64" on NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid, 
+ re->kpre_gateway_nid, re->kpre_gateway_nalid); + + if (re->kpre_gateway_nalid == src_ne->kpne_interface.kprni_nalid) + break; /* don't route to same NAL */ + + /* Search for gateway's NAL's entry */ + + for (e = kpr_nals.next; e != &kpr_nals; e = e->next) + { + kpr_nal_entry_t *dst_ne = list_entry (e, kpr_nal_entry_t, kpne_list); + + if (re->kpre_gateway_nalid != dst_ne->kpne_interface.kprni_nalid) /* no match */ + continue; + + if (dst_ne->kpne_shutdown) /* don't route if NAL is shutting down */ + break; + + fwd->kprfd_gateway_nid = re->kpre_gateway_nid; + atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */ + + read_unlock (&kpr_rwlock); + + CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: "LPX64" on NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid, + fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid); + + dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd); + return; + } + break; + } + + read_unlock (&kpr_rwlock); + out: + kpr_fwd_errors++; + + CDEBUG (D_OTHER, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid); + + /* Can't find anywhere to forward to */ + (fwd->kprfd_callback)(fwd->kprfd_callback_arg, -EHOSTUNREACH); + + atomic_dec (&kpr_queue_depth); + atomic_dec (&src_ne->kpne_refcount); +} + +void +kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error) +{ + kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg; + kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg; + + CDEBUG (D_OTHER, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd, + src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error); + + atomic_dec (&dst_ne->kpne_refcount); /* CAVEAT EMPTOR dst_ne can disappear now!!! 
*/ + + (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error); + + CDEBUG (D_OTHER, "complete(2) [%p] from NAL %d: %d\n", fwd, + src_ne->kpne_interface.kprni_nalid, error); + + atomic_dec (&kpr_queue_depth); + atomic_dec (&src_ne->kpne_refcount); /* CAVEAT EMPTOR src_ne can disappear now!!! */ +} + +int +kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, + ptl_nid_t hi_nid) +{ + long flags; + struct list_head *e; + kpr_route_entry_t *re; + + CDEBUG(D_OTHER, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n", + gateway_nalid, gateway_nid, lo_nid, hi_nid); + + LASSERT(lo_nid <= hi_nid); + + PORTAL_ALLOC (re, sizeof (*re)); + if (re == NULL) + return (-ENOMEM); + + re->kpre_gateway_nalid = gateway_nalid; + re->kpre_gateway_nid = gateway_nid; + re->kpre_lo_nid = lo_nid; + re->kpre_hi_nid = hi_nid; + + LASSERT(!in_interrupt()); + write_lock_irqsave (&kpr_rwlock, flags); + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { + kpr_route_entry_t *re2 = list_entry(e, kpr_route_entry_t, + kpre_list); + + if (re->kpre_lo_nid > re2->kpre_hi_nid || + re->kpre_hi_nid < re2->kpre_lo_nid) + continue; + + CERROR ("Attempt to add duplicate routes ["LPX64" - "LPX64"]" + "to ["LPX64" - "LPX64"]\n", + re->kpre_lo_nid, re->kpre_hi_nid, + re2->kpre_lo_nid, re2->kpre_hi_nid); + + write_unlock_irqrestore (&kpr_rwlock, flags); + + PORTAL_FREE (re, sizeof (*re)); + return (-EINVAL); + } + + list_add (&re->kpre_list, &kpr_routes); + + write_unlock_irqrestore (&kpr_rwlock, flags); + return (0); +} + +int +kpr_del_route (ptl_nid_t nid) +{ + long flags; + struct list_head *e; + + CDEBUG(D_OTHER, "Del route "LPX64"\n", nid); + + LASSERT(!in_interrupt()); + write_lock_irqsave(&kpr_rwlock, flags); + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + kpre_list); + + if (re->kpre_lo_nid > nid || re->kpre_hi_nid < nid) + continue; + + list_del (&re->kpre_list); + write_unlock_irqrestore(&kpr_rwlock, 
flags); + + PORTAL_FREE(re, sizeof (*re)); + return (0); + } + + write_unlock_irqrestore(&kpr_rwlock, flags); + return (-ENOENT); +} + +int +kpr_get_route(int idx, int *gateway_nalid, ptl_nid_t *gateway_nid, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid) +{ + struct list_head *e; + + read_lock(&kpr_rwlock); + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + kpre_list); + + if (idx-- == 0) { + *gateway_nalid = re->kpre_gateway_nalid; + *gateway_nid = re->kpre_gateway_nid; + *lo_nid = re->kpre_lo_nid; + *hi_nid = re->kpre_hi_nid; + + read_unlock(&kpr_rwlock); + return (0); + } + } + + read_unlock (&kpr_rwlock); + return (-ENOENT); +} + +static void __exit +kpr_finalise (void) +{ + LASSERT (list_empty (&kpr_nals)); + + while (!list_empty (&kpr_routes)) { + kpr_route_entry_t *re = list_entry(kpr_routes.next, + kpr_route_entry_t, + kpre_list); + + list_del(&re->kpre_list); + PORTAL_FREE(re, sizeof (*re)); + } + + kpr_proc_fini(); + + PORTAL_SYMBOL_UNREGISTER(kpr_router_interface); + PORTAL_SYMBOL_UNREGISTER(kpr_control_interface); + + CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n", + atomic_read(&portal_kmemory)); +} + +static int __init +kpr_initialise (void) +{ + CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n", + atomic_read(&portal_kmemory)); + + rwlock_init(&kpr_rwlock); + INIT_LIST_HEAD(&kpr_routes); + INIT_LIST_HEAD(&kpr_nals); + + kpr_proc_init(); + + PORTAL_SYMBOL_REGISTER(kpr_router_interface); + PORTAL_SYMBOL_REGISTER(kpr_control_interface); + return (0); +} + +MODULE_AUTHOR("Eric Barton"); +MODULE_DESCRIPTION("Kernel Portals Router v0.01"); +MODULE_LICENSE("GPL"); + +module_init (kpr_initialise); +module_exit (kpr_finalise); + +EXPORT_SYMBOL (kpr_control_interface); +EXPORT_SYMBOL (kpr_router_interface); diff --git a/lnet/router/router.h b/lnet/router/router.h new file mode 100644 index 0000000..b8c3bec --- /dev/null +++ b/lnet/router/router.h @@ -0,0 +1,81 @@ +/* -*- mode: c; c-basic-offset: 
8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef _KPTLROUTER_H +#define _KPTLROUTER_H +#define EXPORT_SYMTAB + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_PTLROUTER + +#include +#include +#include + +typedef struct +{ + struct list_head kpne_list; + kpr_nal_interface_t kpne_interface; + atomic_t kpne_refcount; + int kpne_shutdown; +} kpr_nal_entry_t; + +typedef struct +{ + struct list_head kpre_list; + int kpre_gateway_nalid; + ptl_nid_t kpre_gateway_nid; + ptl_nid_t kpre_lo_nid; + ptl_nid_t kpre_hi_nid; +} kpr_route_entry_t; + +extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp); +extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp); +extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd); +extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error); +extern void kpr_shutdown_nal (void *arg); +extern void kpr_deregister_nal (void *arg); + +extern void kpr_proc_init (void); +extern void kpr_proc_fini (void); + +extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid); 
+extern int kpr_del_route (ptl_nid_t nid); +extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid); + +extern unsigned long long kpr_fwd_bytes; +extern unsigned long kpr_fwd_packets; +extern unsigned long kpr_fwd_errors; +extern atomic_t kpr_queue_depth; + +#endif /* _KPLROUTER_H */ diff --git a/lnet/tests/.cvsignore b/lnet/tests/.cvsignore new file mode 100644 index 0000000..051d1bd --- /dev/null +++ b/lnet/tests/.cvsignore @@ -0,0 +1,3 @@ +Makefile +Makefile.in +.deps diff --git a/lnet/tests/Makefile.am b/lnet/tests/Makefile.am new file mode 100644 index 0000000..7b47ae0 --- /dev/null +++ b/lnet/tests/Makefile.am @@ -0,0 +1,23 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../Rules.linux + +LDFLAGS = -m "`$(LD) --help | awk '/supported emulations/ {print $$4}'`" -r +LINK = $(LD) $(LDFLAGS) -o $@ +DEFS = +LIBS = +MODULE = $(basename) +EXTRA_DIST = startserver.sh startclient.sh stopserver.sh stopclient.sh + +noinst_PROGRAMS = pingsrv.o pingcli.o spingsrv.o spingcli.o + +pingsrv_o_SOURCES = ping_srv.c ping.h + +pingcli_o_SOURCES = ping_cli.c ping.h + +spingsrv_o_SOURCES = sping_srv.c ping.h + +spingcli_o_SOURCES = sping_cli.c ping.h diff --git a/lnet/tests/ping.h b/lnet/tests/ping.h new file mode 100644 index 0000000..f07444b --- /dev/null +++ b/lnet/tests/ping.h @@ -0,0 +1,80 @@ +#ifndef _KPING_INCLUDED +#define _KPING_INCLUDED + +#include + + +#define PTL_PING_IN_SIZE 256 // n packets per buffer +#define PTL_PING_IN_BUFFERS 2 // n fallback buffers + +#define PTL_PING_CLIENT 4 +#define PTL_PING_SERVER 5 + +#define PING_HEADER_MAGIC 0xDEADBEEF +#define PING_BULK_MAGIC 0xCAFEBABE + +#define PING_HEAD_BITS 0x00000001 +#define PING_BULK_BITS 0x00000002 +#define PING_IGNORE_BITS 0xFFFFFFFC + +#define PTL_PING_ACK 0x01 +#define PTL_PING_VERBOSE 0x02 +#define PTL_PING_VERIFY 0x04 +#define 
PTL_PING_PREALLOC 0x08 + + +#define NEXT_PRIMARY_BUFFER(index) \ + (((index + 1) >= PTL_PING_IN_BUFFERS) ? 0 : (index + 1)) + +#define PDEBUG(str, err) \ + CERROR ("%s: error=%s (%d)\n", str, ptl_err_str[err], err) + + +/* Ping data to be passed via the ioctl to kernel space */ + +#if __KERNEL__ + + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) +#include +#else +#include +#endif +struct pingsrv_data { + + ptl_handle_ni_t ni; + ptl_handle_me_t me; + ptl_handle_eq_t eq; + void *in_buf; + ptl_process_id_t my_id; + ptl_process_id_t id_local; + ptl_md_t mdin; + ptl_md_t mdout; + ptl_handle_md_t mdin_h; + ptl_handle_md_t mdout_h; + ptl_event_t evnt; + struct task_struct *tsk; +}; /* struct pingsrv_data */ + +struct pingcli_data { + + struct portal_ioctl_data *args; + ptl_handle_me_t me; + ptl_handle_eq_t eq; + char *inbuf; + char *outbuf; + ptl_process_id_t myid; + ptl_process_id_t id_local; + ptl_process_id_t id_remote; + ptl_md_t md_in_head; + ptl_md_t md_out_head; + ptl_handle_md_t md_in_head_h; + ptl_handle_md_t md_out_head_h; + ptl_event_t ev; + struct task_struct *tsk; +}; /* struct pingcli_data */ + + +#endif /* __KERNEL__ */ + +#endif /* _KPING_INCLUDED */ diff --git a/lnet/tests/ping_cli.c b/lnet/tests/ping_cli.c new file mode 100644 index 0000000..389ffbb --- /dev/null +++ b/lnet/tests/ping_cli.c @@ -0,0 +1,300 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf + * Kedar Sovani (kedar@calsoftinc.com) + * Amey Inamdar (amey@calsoftinc.com) + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_PINGER + +#include +#include +#include +#include +#include +#include +#include "ping.h" +/* int portal_debug = D_PING_CLI; */ + + +#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval)) + +#define MAX_TIME 100000 + +/* This should be enclosed in a structure */ + +static struct pingcli_data *client = NULL; + +static int count = 0; + +static void +pingcli_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (client->md_out_head_h))) + PDEBUG ("PtlMDUnlink", rc); + case 2: + if ((rc = PtlMDUnlink (client->md_in_head_h))) + PDEBUG ("PtlMDUnlink", rc); + + /* Free the event queue */ + if ((rc = PtlEQFree (client->eq))) + PDEBUG ("PtlEQFree", rc); + + if ((rc = PtlMEUnlink (client->me))) + PDEBUG ("PtlMEUnlink", rc); + case 3: + kportal_put_ni (client->args->ioc_nal); + + case 4: + /* Free our buffers */ + + if (client != NULL) + PORTAL_FREE (client, + sizeof(struct pingcli_data)); + } + + + CDEBUG (D_OTHER, "ping client released resources\n"); +} /* pingcli_shutdown() */ + +static int pingcli_callback(ptl_event_t *ev) +{ + int i, magic; + i = *(int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned)); + magic = *(int *)(ev->mem_desc.start + ev->offset); + + if(magic != 0xcafebabe) { + printk ("Unexpected response \n"); + return 1; + } + + if((i == count) || !count) + wake_up_process (client->tsk); + else + printk ("Received response after timeout for %d\n",i); + return 1; +} + + +static struct pingcli_data * +pingcli_start(struct portal_ioctl_data *args) +{ + ptl_handle_ni_t *nip; + unsigned ping_head_magic = PING_HEADER_MAGIC; + unsigned ping_bulk_magic = PING_BULK_MAGIC; + int rc; + struct timeval tv1, tv2; + client->tsk = current; + client->args = args; + CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64", \ + nal %d, size %u, count: %u, timeout: %u\n", + args->ioc_nid, args->ioc_nal, args->ioc_size, + args->ioc_count, args->ioc_timeout); + + + PORTAL_ALLOC (client->outbuf, STDSIZE + args->ioc_size) ; + if (client->outbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return (NULL); + } + + PORTAL_ALLOC (client->inbuf, + (args->ioc_size + STDSIZE) * args->ioc_count); + if (client->inbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return 
(NULL); + } + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (args->ioc_nal)) == NULL) + { + CERROR ("NAL %d not loaded\n", args->ioc_nal); + pingcli_shutdown (4); + return (NULL); + } + + /* Based on the initialization aquire our unique portal ID. */ + if ((rc = PtlGetId (*nip, &client->myid))) + { + CERROR ("PtlGetId error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Setup the local match entries */ + client->id_local.nid = PTL_NID_ANY; + client->id_local.pid = PTL_PID_ANY; + + /* Setup the remote match entries */ + client->id_remote.nid = args->ioc_nid; + client->id_remote.pid = 0; + + if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT, + client->id_local, 0, ~0, PTL_RETAIN, + PTL_INS_AFTER, &client->me))) + { + CERROR ("PtlMEAttach error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Allocate the event queue for this network interface */ + if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq))) + { + CERROR ("PtlEQAlloc error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + count = args->ioc_count; + + client->md_in_head.start = client->inbuf; + client->md_in_head.length = (args->ioc_size + STDSIZE) + * count; + client->md_in_head.threshold = PTL_MD_THRESH_INF; + client->md_in_head.options = PTL_MD_OP_PUT; + client->md_in_head.user_ptr = NULL; + client->md_in_head.eventq = client->eq; + memset (client->inbuf, 0, (args->ioc_size + STDSIZE) * count); + + /* Attach the incoming buffer */ + if ((rc = PtlMDAttach (client->me, client->md_in_head, + PTL_UNLINK, &client->md_in_head_h))) { + CERROR ("PtlMDAttach error %d\n", rc); + pingcli_shutdown (1); + return (NULL); + } + /* Setup the outgoing ping header */ + client->md_out_head.start = client->outbuf; + client->md_out_head.length = STDSIZE + args->ioc_size; + client->md_out_head.threshold = args->ioc_count; + client->md_out_head.options = PTL_MD_OP_PUT; + client->md_out_head.user_ptr = NULL; + client->md_out_head.eventq = 
PTL_EQ_NONE; + + memcpy (client->outbuf, &ping_head_magic, sizeof(ping_bulk_magic)); + + count = 0; + + /* Bind the outgoing ping header */ + if ((rc=PtlMDBind (*nip, client->md_out_head, + &client->md_out_head_h))) { + CERROR ("PtlMDBind error %d\n", rc); + pingcli_shutdown (1); + return NULL; + } + while ((args->ioc_count - count)) { + memcpy (client->outbuf + sizeof(unsigned), + &(count), sizeof(unsigned)); + /* Put the ping packet */ + do_gettimeofday (&tv1); + + memcpy(client->outbuf+sizeof(unsigned)+sizeof(unsigned),&tv1, + sizeof(struct timeval)); + + if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ, + client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) { + PDEBUG ("PtlPut (header)", rc); + pingcli_shutdown (1); + return NULL; + } + printk ("sent msg no %d", count); + + set_current_state (TASK_INTERRUPTIBLE); + rc = schedule_timeout (20 * args->ioc_timeout); + if (rc == 0) { + printk (" :: timeout .....\n"); + } else { + do_gettimeofday (&tv2); + printk(" :: Reply in %u usec\n", + (unsigned)((tv2.tv_sec - tv1.tv_sec) + * 1000000 + (tv2.tv_usec - tv1.tv_usec))); + } + count++; + } + + if (client->outbuf != NULL) + PORTAL_FREE (client->outbuf, STDSIZE + args->ioc_size); + + if (client->inbuf != NULL) + PORTAL_FREE (client->inbuf, + (args->ioc_size + STDSIZE) * args->ioc_count); + + pingcli_shutdown (2); + + /* Success! 
*/ + return NULL; +} /* pingcli_setup() */ + + + +/* called by the portals_ioctl for ping requests */ +static int kping_client(struct portal_ioctl_data *args) +{ + PORTAL_ALLOC (client, sizeof(struct pingcli_data)); + if (client == NULL) + { + CERROR ("Unable to allocate client structure\n"); + return (0); + } + memset (client, 0, sizeof(struct pingcli_data)); + pingcli_start (args); + + return 0; +} /* kping_client() */ + + +static int __init pingcli_init(void) +{ + PORTAL_SYMBOL_REGISTER(kping_client); + return 0; +} /* pingcli_init() */ + + +static void __exit pingcli_cleanup(void) +{ + PORTAL_SYMBOL_UNREGISTER (kping_client); +} /* pingcli_cleanup() */ + + +MODULE_AUTHOR("Brian Behlendorf (LLNL)"); +MODULE_DESCRIPTION("A simple kernel space ping client for portals testing"); +MODULE_LICENSE("GPL"); + +module_init(pingcli_init); +module_exit(pingcli_cleanup); + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +EXPORT_SYMBOL (kping_client); +#endif diff --git a/lnet/tests/ping_srv.c b/lnet/tests/ping_srv.c new file mode 100644 index 0000000..1037d09 --- /dev/null +++ b/lnet/tests/ping_srv.c @@ -0,0 +1,308 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf + * Amey Inamdar + * Kedar Sovani + * + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_PINGER + +#include +#include +#include "ping.h" + +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#include +#else +#include +#endif +#include +#include + +#include +#include + +#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval)) +#define MAXSIZE (16*1024*1024) + +static unsigned ping_head_magic; +static unsigned ping_bulk_magic; +static int nal = 0; // Your NAL, +static unsigned long packets_valid = 0; // Valid packets +static int running = 1; +atomic_t pkt; + +static struct pingsrv_data *server=NULL; // Our ping server + +static void *pingsrv_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (server->mdin_h))) + PDEBUG ("PtlMDUnlink (out head buffer)", rc); + case 2: + /* Free the event queue */ + if ((rc = PtlEQFree (server->eq))) + PDEBUG ("PtlEQFree", rc); + + /* Unlink the client portal from the ME list */ + if ((rc = PtlMEUnlink (server->me))) + PDEBUG ("PtlMEUnlink", rc); + + case 3: + kportal_put_ni (nal); + + case 4: + + case 5: + if (server->in_buf != NULL) + PORTAL_FREE (server->in_buf, MAXSIZE); + + if (server != NULL) + PORTAL_FREE (server, + sizeof (struct pingsrv_data)); + + } + + CDEBUG (D_OTHER, "ping sever resources released\n"); + return NULL; +} /* pingsrv_shutdown() */ + + +int pingsrv_thread(void *arg) +{ + int rc; + unsigned long magic; + unsigned long ping_bulk_magic = 0xcafebabe; + + kportal_daemonize ("pingsrv"); + server->tsk = current; + + while (running) { + set_current_state (TASK_INTERRUPTIBLE); + if (atomic_read (&pkt) == 0) { + schedule_timeout (MAX_SCHEDULE_TIMEOUT); + continue; + } + + magic = *((int *)(server->evnt.mem_desc.start + + server->evnt.offset)); + + + if(magic != 0xdeadbeef) { + printk("Unexpected Packet to the server\n"); + + } + memcpy (server->in_buf, &ping_bulk_magic, sizeof(ping_bulk_magic)); + + server->mdout.length = server->evnt.rlength; + server->mdout.start = server->in_buf; + server->mdout.threshold = 1; + server->mdout.options = PTL_MD_OP_PUT; + server->mdout.user_ptr = NULL; + server->mdout.eventq = PTL_EQ_NONE; + + /* Bind the outgoing buffer */ + if ((rc = PtlMDBind (server->ni, server->mdout, + &server->mdout_h))) { + PDEBUG ("PtlMDBind", rc); + pingsrv_shutdown (1); + return 1; + } + + + server->mdin.start = server->in_buf; + server->mdin.length = MAXSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + + if ((rc = PtlMDAttach (server->me, server->mdin, + PTL_UNLINK, &server->mdin_h))) { + 
PDEBUG ("PtlMDAttach (bulk)", rc); + CDEBUG (D_OTHER, "ping server resources allocated\n"); + } + + if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ, + server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0))) + PDEBUG ("PtlPut", rc); + + atomic_dec (&pkt); + + } + pingsrv_shutdown (1); + running = 1; + return 0; +} + +static int pingsrv_packet(ptl_event_t *ev) +{ + atomic_inc (&pkt); + wake_up_process (server->tsk); + return 1; +} /* pingsrv_head() */ + +static int pingsrv_callback(ptl_event_t *ev) +{ + + if (ev == NULL) { + CERROR ("null in callback, ev=%p\n", ev); + return 0; + } + server->evnt = *ev; + + printk ("received ping from nid "LPX64" " + "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n", + ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, + *((int *)(ev->mem_desc.start + ev->offset)), + *((int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned))), + *((int *)(ev->mem_desc.start + ev->offset + 2 * + sizeof(unsigned)))); + + packets_valid++; + + return pingsrv_packet(ev); + +} /* pingsrv_callback() */ + + +static struct pingsrv_data *pingsrv_setup(void) +{ + ptl_handle_ni_t *nip; + int rc; + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (nal)) == NULL) { + CDEBUG (D_OTHER, "NAL %d not loaded\n", nal); + return pingsrv_shutdown (4); + } + + server->ni= *nip; + + /* Based on the initialization aquire our unique portal ID. 
*/ + if ((rc = PtlGetId (server->ni, &server->my_id))) { + PDEBUG ("PtlGetId", rc); + return pingsrv_shutdown (2); + } + + server->id_local.nid = PTL_NID_ANY; + server->id_local.pid = PTL_PID_ANY; + + /* Attach a match entries for header packets */ + if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER, + server->id_local,0, ~0, + PTL_RETAIN, PTL_INS_AFTER, &server->me))) { + PDEBUG ("PtlMEAttach", rc); + return pingsrv_shutdown (2); + } + + + if ((rc = PtlEQAlloc (server->ni, 1024, pingsrv_callback, + &server->eq))) { + PDEBUG ("PtlEQAlloc (callback)", rc); + return pingsrv_shutdown (2); + } + + PORTAL_ALLOC (server->in_buf, MAXSIZE); + if(!server->in_buf){ + CDEBUG (D_OTHER,"Allocation error\n"); + return pingsrv_shutdown(2); + } + + /* Setup the incoming buffer */ + server->mdin.start = server->in_buf; + server->mdin.length = MAXSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + memset (server->in_buf, 0, STDSIZE); + + if ((rc = PtlMDAttach (server->me, server->mdin, + PTL_UNLINK, &server->mdin_h))) { + PDEBUG ("PtlMDAttach (bulk)", rc); + CDEBUG (D_OTHER, "ping server resources allocated\n"); + } + + /* Success! 
*/ + return server; +} /* pingsrv_setup() */ + +static int pingsrv_start(void) +{ + /* Setup our server */ + if (!pingsrv_setup()) { + CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n"); + return -ENOMEM; + } + kernel_thread (pingsrv_thread,NULL,0); + return 0; +} /* pingsrv_start() */ + + + +static int __init pingsrv_init(void) +{ + ping_head_magic = PING_HEADER_MAGIC; + ping_bulk_magic = PING_BULK_MAGIC; + PORTAL_ALLOC (server, sizeof(struct pingsrv_data)); + return pingsrv_start (); +} /* pingsrv_init() */ + + +static void __exit pingsrv_cleanup(void) +{ + remove_proc_entry ("net/pingsrv", NULL); + + running = 0; + wake_up_process (server->tsk); + while (running != 1) { + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + +} /* pingsrv_cleanup() */ + + +MODULE_PARM(nal, "i"); +MODULE_PARM_DESC(nal, "Use the specified NAL " + "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)"); + +MODULE_AUTHOR("Brian Behlendorf (LLNL)"); +MODULE_DESCRIPTION("A kernel space ping server for portals testing"); +MODULE_LICENSE("GPL"); + +module_init(pingsrv_init); +module_exit(pingsrv_cleanup); diff --git a/lnet/tests/sping_cli.c b/lnet/tests/sping_cli.c new file mode 100644 index 0000000..4cef08b --- /dev/null +++ b/lnet/tests/sping_cli.c @@ -0,0 +1,276 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf + * Kedar Sovani (kedar@calsoftinc.com) + * Amey Inamdar (amey@calsoftinc.com) + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +/* This is a striped down version of pinger. It follows a single + * request-response protocol. Doesn't do Bulk data pinging. Also doesn't + * send multiple packets in a single ioctl. + */ + + +#define DEBUG_SUBSYSTEM S_PINGER + +#include +#include +#include +#include +#include +#include +#include "ping.h" +/* int portal_debug = D_PING_CLI; */ + + +#define STDSIZE (sizeof(int) + sizeof(int) + 4) /* The data is 4 bytes + assumed */ + +/* This should be enclosed in a structure */ + +static struct pingcli_data *client = NULL; + +static int count = 0; + +static void +pingcli_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (client->md_out_head_h))) + PDEBUG ("PtlMDUnlink", rc); + case 2: + /* Free the event queue */ + if ((rc = PtlEQFree (client->eq))) + PDEBUG ("PtlEQFree", rc); + + if ((rc = PtlMEUnlink (client->me))) + PDEBUG ("PtlMEUnlink", rc); + case 3: + kportal_put_ni (client->args->ioc_nal); + + case 4: + /* Free our buffers */ + if (client->outbuf != NULL) + PORTAL_FREE (client->outbuf, STDSIZE); + + if (client->inbuf != NULL) + PORTAL_FREE (client->inbuf, STDSIZE); + + + if (client != NULL) + PORTAL_FREE (client, + sizeof(struct pingcli_data)); + } + + + CDEBUG (D_OTHER, "ping client released resources\n"); +} /* pingcli_shutdown() */ + +static int pingcli_callback(ptl_event_t *ev) +{ + wake_up_process (client->tsk); + return 1; +} + + +static struct pingcli_data * +pingcli_start(struct portal_ioctl_data *args) +{ + const ptl_handle_ni_t *nip; + unsigned ping_head_magic = PING_HEADER_MAGIC; + int rc; + + client->tsk = current; + client->args = args; + + CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64", \ + nal %d, size %u, count: %u, timeout: %u\n", + args->ioc_nid, args->ioc_nal, args->ioc_size, + args->ioc_count, args->ioc_timeout); + + + PORTAL_ALLOC (client->outbuf, STDSIZE) ; + if (client->outbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return (NULL); + } + + PORTAL_ALLOC (client->inbuf, STDSIZE); + + if (client->inbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return (NULL); + } + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (args->ioc_nal)) == NULL) + { + CERROR ("NAL %d not loaded.\n", args->ioc_nal); + pingcli_shutdown (4); + return (NULL); + } + + /* Based on the initialization aquire our unique portal ID. 
*/ + if ((rc = PtlGetId (*nip, &client->myid))) + { + CERROR ("PtlGetId error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Setup the local match entries */ + client->id_local.nid = PTL_NID_ANY; + client->id_local.pid = PTL_PID_ANY; + + /* Setup the remote match entries */ + client->id_remote.nid = args->ioc_nid; + client->id_remote.pid = 0; + + if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT, + client->id_local, 0, ~0, PTL_RETAIN, + PTL_INS_AFTER, &client->me))) + { + CERROR ("PtlMEAttach error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Allocate the event queue for this network interface */ + if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq))) + { + CERROR ("PtlEQAlloc error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + + client->md_in_head.start = client->inbuf; + client->md_in_head.length = STDSIZE; + client->md_in_head.threshold = 1; + client->md_in_head.options = PTL_MD_OP_PUT; + client->md_in_head.user_ptr = NULL; + client->md_in_head.eventq = client->eq; + memset (client->inbuf, 0, STDSIZE); + + /* Attach the incoming buffer */ + if ((rc = PtlMDAttach (client->me, client->md_in_head, + PTL_UNLINK, &client->md_in_head_h))) { + CERROR ("PtlMDAttach error %d\n", rc); + pingcli_shutdown (1); + return (NULL); + } + + /* Setup the outgoing ping header */ + client->md_out_head.start = client->outbuf; + client->md_out_head.length = STDSIZE; + client->md_out_head.threshold = 1; + client->md_out_head.options = PTL_MD_OP_PUT; + client->md_out_head.user_ptr = NULL; + client->md_out_head.eventq = PTL_EQ_NONE; + + memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic)); + + /* Bind the outgoing ping header */ + if ((rc=PtlMDBind (*nip, client->md_out_head, + &client->md_out_head_h))) { + CERROR ("PtlMDBind error %d\n", rc); + pingcli_shutdown (1); + return (NULL); + } + /* Put the ping packet */ + if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ, + client->id_remote, PTL_PING_SERVER, 0, 0, 0, 
0))) { + PDEBUG ("PtlPut (header)", rc); + pingcli_shutdown (1); + return NULL; + } + + count = 0; + set_current_state (TASK_INTERRUPTIBLE); + rc = schedule_timeout (20 * args->ioc_timeout); + if (rc == 0) { + printk (" Time out on the server\n"); + pingcli_shutdown (2); + return NULL; + } else + printk("Received respose from the server \n"); + + + pingcli_shutdown (2); + + /* Success! */ + return NULL; +} /* pingcli_setup() */ + + + +/* called by the portals_ioctl for ping requests */ +static int kping_client(struct portal_ioctl_data *args) +{ + + PORTAL_ALLOC (client, sizeof(struct pingcli_data)); + memset (client, 0, sizeof(struct pingcli_data)); + if (client == NULL) + { + CERROR ("Unable to allocate client structure\n"); + return (0); + } + pingcli_start (args); + + return 0; +} /* kping_client() */ + + +static int __init pingcli_init(void) +{ + PORTAL_SYMBOL_REGISTER(kping_client); + return 0; +} /* pingcli_init() */ + + +static void __exit pingcli_cleanup(void) +{ + PORTAL_SYMBOL_UNREGISTER (kping_client); +} /* pingcli_cleanup() */ + + +MODULE_AUTHOR("Brian Behlendorf (LLNL)"); +MODULE_DESCRIPTION("A simple kernel space ping client for portals testing"); +MODULE_LICENSE("GPL"); + +module_init(pingcli_init); +module_exit(pingcli_cleanup); + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +EXPORT_SYMBOL (kping_client); +#endif diff --git a/lnet/tests/sping_srv.c b/lnet/tests/sping_srv.c new file mode 100644 index 0000000..a18ea35 --- /dev/null +++ b/lnet/tests/sping_srv.c @@ -0,0 +1,295 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf + * Amey Inamdar + * Kedar Sovani + * + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by 
the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* This is a striped down version of pinger. It follows a single + * request-response protocol. Doesn't do Bulk data pinging. Also doesn't + * send multiple packets in a single ioctl. + */ + +#define DEBUG_SUBSYSTEM S_PINGER + +#include +#include +#include "ping.h" + +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#include +#else +#include +#endif +#include +#include + +#include +#include + +#define STDSIZE (sizeof(int) + sizeof(int) + 4) + +static int nal = 0; // Your NAL, +static unsigned long packets_valid = 0; // Valid packets +static int running = 1; +atomic_t pkt; + +static struct pingsrv_data *server=NULL; // Our ping server + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#endif + +static void *pingsrv_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (server->mdin_h))) + PDEBUG ("PtlMDUnlink (out head buffer)", rc); + case 2: + /* Free the event queue */ + if ((rc = PtlEQFree (server->eq))) + PDEBUG ("PtlEQFree", rc); + + /* Unlink the client portal from the ME list */ + if ((rc = PtlMEUnlink (server->me))) + PDEBUG ("PtlMEUnlink", rc); + + case 3: + kportal_put_ni (nal); + + case 4: + + if (server->in_buf != NULL) + PORTAL_FREE (server->in_buf, STDSIZE); + + if (server != NULL) + PORTAL_FREE (server, + sizeof (struct pingsrv_data)); + + } + + CDEBUG (D_OTHER, "ping sever resources released\n"); + return NULL; +} /* pingsrv_shutdown() */ + + +int pingsrv_thread(void *arg) +{ + int rc; + + kportal_daemonize ("pingsrv"); + server->tsk = current; + + while (running) { + set_current_state (TASK_INTERRUPTIBLE); + if (atomic_read (&pkt) == 0) { + schedule_timeout (MAX_SCHEDULE_TIMEOUT); + continue; + } + + server->mdout.start = server->in_buf; + server->mdout.length = STDSIZE; + server->mdout.threshold = 1; + server->mdout.options = PTL_MD_OP_PUT; + server->mdout.user_ptr = NULL; + server->mdout.eventq = PTL_EQ_NONE; + + /* Bind the outgoing buffer */ + if ((rc = PtlMDBind (server->ni, server->mdout, + &server->mdout_h))) { + PDEBUG ("PtlMDBind", rc); + pingsrv_shutdown (1); + return 1; + } + + + server->mdin.start = server->in_buf; + server->mdin.length = STDSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + + if ((rc = PtlMDAttach (server->me, server->mdin, + PTL_UNLINK, &server->mdin_h))) { + PDEBUG ("PtlMDAttach (bulk)", rc); + CDEBUG (D_OTHER, "ping server resources allocated\n"); + } + + if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ, + server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0))) + PDEBUG ("PtlPut", rc); + + atomic_dec (&pkt); + + } + pingsrv_shutdown (1); + running = 1; + return 0; +} + +static 
int pingsrv_packet(ptl_event_t *ev) +{ + atomic_inc (&pkt); + wake_up_process (server->tsk); + return 1; +} /* pingsrv_head() */ + +static int pingsrv_callback(ptl_event_t *ev) +{ + + if (ev == NULL) { + CERROR ("null in callback, ev=%p\n", ev); + return 0; + } + server->evnt = *ev; + + printk ("received ping from nid "LPX64" " + "(off=%u rlen=%u mlen=%u head=%x)\n", + ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, + *((int *)(ev->mem_desc.start + ev->offset))); + + packets_valid++; + + return pingsrv_packet(ev); + +} /* pingsrv_callback() */ + + +static struct pingsrv_data *pingsrv_setup(void) +{ + ptl_handle_ni_t *nip; + int rc; + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (nal)) == NULL) { + CDEBUG (D_OTHER, "Nal %d not loaded.\n", nal); + return pingsrv_shutdown (4); + } + + server->ni= *nip; + + /* Based on the initialization aquire our unique portal ID. */ + if ((rc = PtlGetId (server->ni, &server->my_id))) { + PDEBUG ("PtlGetId", rc); + return pingsrv_shutdown (2); + } + + server->id_local.nid = PTL_NID_ANY; + server->id_local.pid = PTL_PID_ANY; + + /* Attach a match entries for header packets */ + if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER, + server->id_local,0, ~0, + PTL_RETAIN, PTL_INS_AFTER, &server->me))) { + PDEBUG ("PtlMEAttach", rc); + return pingsrv_shutdown (2); + } + + + if ((rc = PtlEQAlloc (server->ni, 64, pingsrv_callback, + &server->eq))) { + PDEBUG ("PtlEQAlloc (callback)", rc); + return pingsrv_shutdown (2); + } + + PORTAL_ALLOC (server->in_buf, STDSIZE); + if(!server->in_buf){ + CDEBUG (D_OTHER,"Allocation error\n"); + return pingsrv_shutdown(2); + } + + /* Setup the incoming buffer */ + server->mdin.start = server->in_buf; + server->mdin.length = STDSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + memset (server->in_buf, 0, STDSIZE); + + if ((rc = PtlMDAttach (server->me, 
server->mdin, + PTL_UNLINK, &server->mdin_h))) { + PDEBUG ("PtlMDAttach (bulk)", rc); + CDEBUG (D_OTHER, "ping server resources allocated\n"); + } + + /* Success! */ + return server; +} /* pingsrv_setup() */ + +static int pingsrv_start(void) +{ + /* Setup our server */ + if (!pingsrv_setup()) { + CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n"); + return -ENOMEM; + } + kernel_thread (pingsrv_thread,NULL,0); + return 0; +} /* pingsrv_start() */ + + + +static int __init pingsrv_init(void) +{ + PORTAL_ALLOC (server, sizeof(struct pingsrv_data)); + return pingsrv_start (); +} /* pingsrv_init() */ + + +static void __exit pingsrv_cleanup(void) +{ + remove_proc_entry ("net/pingsrv", NULL); + + running = 0; + wake_up_process (server->tsk); + while (running != 1) { + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + +} /* pingsrv_cleanup() */ + + +MODULE_PARM(nal, "i"); +MODULE_PARM_DESC(nal, "Use the specified NAL " + "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)"); + +MODULE_AUTHOR("Brian Behlendorf (LLNL)"); +MODULE_DESCRIPTION("A kernel space ping server for portals testing"); +MODULE_LICENSE("GPL"); + +module_init(pingsrv_init); +module_exit(pingsrv_cleanup); diff --git a/lnet/tests/startclient.sh b/lnet/tests/startclient.sh new file mode 100644 index 0000000..c9b7c16 --- /dev/null +++ b/lnet/tests/startclient.sh @@ -0,0 +1,37 @@ +#!/bin/sh + +SIMPLE=${SIMPLE:-0} + +if [ $SIMPLE -eq 0 ]; then + PING=pingcli.o +else + PING=spingcli.o +fi + +case "$1" in + toe) + /sbin/insmod ../oslib/portals.o + /sbin/insmod ../toenal/ktoenal.o + /sbin/insmod ./$PING + echo ktoenal > /tmp/nal + ;; + + tcp) + /sbin/insmod ../oslib/portals.o + /sbin/insmod ../socknal/ksocknal.o + /sbin/insmod ./$PING + echo ksocknal > /tmp/nal + ;; + + elan) + /sbin/insmod ../oslib/portals.o + /sbin/insmod ../qswnal/kqswnal.o + /sbin/insmod ./$PING + echo kqswnal > /tmp/nal + ;; + + *) + echo "Usage : ${0} < tcp | toe | elan >" + exit 1; +esac +exit 0; diff --git 
a/lnet/tests/startserver.sh b/lnet/tests/startserver.sh
new file mode 100644
index 0000000..942300e
--- /dev/null
+++ b/lnet/tests/startserver.sh
@@ -0,0 +1,38 @@
#!/bin/sh
# Load portals, the NAL selected by $1 (toe | tcp | elan) and the ping
# server module, then start the acceptor on port 9999.
# SIMPLE=0 (default) loads pingsrv.o, any other value loads spingsrv.o.
# The name of the loaded NAL module is recorded in /tmp/nal for
# stopserver.sh.

SIMPLE=${SIMPLE:-0}

if [ $SIMPLE -eq 0 ]; then
        PING=pingsrv.o
else
        PING=spingsrv.o
fi

# The nal= values follow the module's own parameter description:
# 6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal.
case "$1" in
        toe)
                /sbin/insmod ../oslib/portals.o
                /sbin/insmod ../toenal/ktoenal.o
                /sbin/insmod ./$PING nal=4
                echo ktoenal > /tmp/nal
        ;;

        tcp)
                /sbin/insmod ../oslib/portals.o
                /sbin/insmod ../socknal/ksocknal.o
                /sbin/insmod ./$PING nal=2
                echo ksocknal > /tmp/nal
        ;;

        elan)
                /sbin/insmod ../oslib/portals.o
                /sbin/insmod ../qswnal/kqswnal.o
                # kqswnal is NAL 1; this used to pass nal=4 (toenal)
                /sbin/insmod ./$PING nal=1
                echo kqswnal > /tmp/nal
        ;;

        *)
                echo "Usage : ${0} < tcp | toe | elan >"
                exit 1;
esac
../utils/acceptor 9999&
exit 0;
diff --git a/lnet/tests/stopclient.sh b/lnet/tests/stopclient.sh
new file mode 100644
index 0000000..f7e3aa1
--- /dev/null
+++ b/lnet/tests/stopclient.sh
@@ -0,0 +1,14 @@
#!/bin/sh
# Unload what startclient.sh loaded.  SIMPLE must select the same module
# as in startclient.sh: 0 (default) -> pingcli, anything else -> spingcli.
# (The mapping and the default used to be inverted here, so an explicit
# SIMPLE=0 or SIMPLE=1 tried to remove the module that was not loaded.)

SIMPLE=${SIMPLE:-0}

if [ $SIMPLE -eq 0 ]; then
        PING=pingcli
else
        PING=spingcli
fi

rmmod $PING
NAL=`cat /tmp/nal`;
rmmod $NAL
rmmod portals
diff --git a/lnet/tests/stopserver.sh b/lnet/tests/stopserver.sh
new file mode 100644
index 0000000..3e81831
--- /dev/null
+++ b/lnet/tests/stopserver.sh
@@ -0,0 +1,16 @@
#!/bin/sh
# Unload what startserver.sh loaded and stop the acceptor.  SIMPLE must
# select the same module as in startserver.sh: 0 (default) -> pingsrv,
# anything else -> spingsrv.

SIMPLE=${SIMPLE:-0}

if [ $SIMPLE -eq 0 ]; then
        PING=pingsrv
else
        PING=spingsrv
fi

rmmod $PING
NAL=`cat /tmp/nal`;
rmmod $NAL
killall -9 acceptor
rm -f /var/run/acceptor-9999.pid
rmmod portals
diff --git a/lnet/ulnds/.cvsignore b/lnet/ulnds/.cvsignore
new file mode 100644
index 0000000..e995588
--- /dev/null
+++ b/lnet/ulnds/.cvsignore
@@ -0,0 +1,3 @@
.deps
Makefile
Makefile.in
diff --git a/lnet/ulnds/Makefile.am b/lnet/ulnds/Makefile.am
new file mode 100644
index 0000000..dc427b0
--- /dev/null
+++ b/lnet/ulnds/Makefile.am
@@ -0,0 +1,5 @@
CPPFLAGS=
INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
lib_LIBRARIES =
libtcpnal.a +pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h +libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h diff --git a/lnet/ulnds/README b/lnet/ulnds/README new file mode 100644 index 0000000..6cb93d9 --- /dev/null +++ b/lnet/ulnds/README @@ -0,0 +1,53 @@ +This library implements two NAL interfaces, both running over IP. +The first, tcpnal, creates TCP connections between participating +processes in order to transport the portals requests. The second, +ernal, provides a simple transport protocol which runs over +UDP datagrams. + +The interface functions return both of these values in host order for +convenience and readability. However, this means that addresses +exchanged in messages between hosts of different orderings will not +function properly. + +Both NALs use the same support functions in order to schedule events +and communicate with the generic portals implementation. + + ------------------------- + | api | + |_______________________| + | lib | + |_______________________| + | ernal | |tcpnal | + |--------| |----------| + | udpsock| |connection| + |-----------------------| + | timer/select | + ------------------------- + + + These NALs use the framework from fdnal of a pipe between the api +and library sides. This is wrapped up in the select on the library +side, and blocks on the api side. Performance could be greatly +improved by collapsing this artificial barrier, by using shared +memory queues, or by wiring the api layer directly to the library. + + +nid is defined as the low order 24-bits of the IP address of the +physical node left shifted by 8 plus a virtual node number of 0 +through 255 (really only 239). The virtual node number of a tcpnal +application should be specified using the environment variable +PTL_VIRTNODE.
pid is now a completely arbitrary number in the +range of 0 to 255. The IP interface used can be overridden by +specifying the appropriate hostid by setting the PTL_HOSTID +environment variable. The value can be either dotted decimal +(n.n.n.n) or hex starting with "0x". +TCPNAL: + As the NAL needs to try to send to a particular nid/pid pair, it + will open up connections on demand. Because the port associated with + the connecting socket is different from the bound port, two + connections will normally be established between a pair of peers, with + data flowing from the anonymous connect (active) port to the advertised + or well-known bound (passive) port of each peer. + + Should the connection fail to open, an error is reported to the + library component, which causes the api request to fail. diff --git a/lnet/ulnds/address.c b/lnet/ulnds/address.c new file mode 100644 index 0000000..b422c3f --- /dev/null +++ b/lnet/ulnds/address.c @@ -0,0 +1,146 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* address.c: + * this file provides functions to aquire the IP address of the node + * and translate them into a NID/PID pair which supports a static + * mapping of virtual nodes into the port range of an IP socket. +*/ + +#include +#include +#include +#include +#include +#include +#include + + +/* Function: get_node_id + * Returns: a 32 bit id for this node, actually a big-endian IP address + * + * get_node_id() determines the host name and uses the resolver to + * find out its ip address. This is fairly fragile and inflexible, but + * explicitly asking about interfaces and their addresses is very + * complicated and nonportable. + */ +static unsigned int get_node_id(void) +{ + char buffer[255]; + unsigned int x; + struct hostent *he; + char * host_envp; + + if (!(host_envp = getenv("PTL_HOSTID"))) + { + gethostname(buffer,sizeof(buffer)); + he=gethostbyname(buffer); + if (he) + x=*(unsigned int *)he->h_addr_list[0]; + else + x = 0; + return(ntohl(x)); + } + else + { + if (host_envp[1] != 'x') + { + int a, b, c, d; + sscanf(host_envp, "%d.%d.%d.%d", &a, &b, &c, &d); + return ((a<<24) | (b<<16) | (c<<8) | d); + } + else + { + long long hostid = strtoll(host_envp, 0, 0); + return((unsigned int) hostid); + } + } +} + + +/* Function: set_address + * Arugments: t: a procnal structure to populate with the request + * + * set_address performs the bit manipulations to set the nid, pid, and + * iptop8 fields of the procnal structures. 
+ * + * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY + */ + +#ifdef DIRECT_IP_MODE +void set_address(bridge t,ptl_pid_t pidrequest) +{ + int port; + if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0; + else port=pidrequest; + t->nal_cb->ni.nid=get_node_id(); + t->nal_cb->ni.pid=port; +} +#else + +void set_address(bridge t,ptl_pid_t pidrequest) +{ + int virtnode, in_addr, port; + ptl_pid_t pid; + + /* get and remember my node id*/ + if (!getenv("PTL_VIRTNODE")) + virtnode = 0; + else + { + int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT + >> PNAL_VNODE_SHIFT); + virtnode = atoi(getenv("PTL_VIRTNODE")); + if (virtnode > maxvnode) + { + fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n", + virtnode, maxvnode); + return; + } + } + + in_addr = get_node_id(); + + t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */ + t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) + << PNAL_VNODE_SHIFT) + + virtnode; + + pid=pidrequest; + /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */ +#ifdef notyet + if (pid==(unsigned short)PTL_PID_ANY) port = 0; +#endif + if (pid==(unsigned short)PTL_PID_ANY) + { + fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n"); + return; + } + else if (pid > PNAL_PID_MASK) + { + fprintf(stderr, "portal pid of %d is too large - max %d\n", + pid, PNAL_PID_MASK); + return; + } + else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT; + t->nal_cb->ni.pid=pid; +} +#endif diff --git a/lnet/ulnds/bridge.h b/lnet/ulnds/bridge.h new file mode 100644 index 0000000..0b4940f --- /dev/null +++ b/lnet/ulnds/bridge.h @@ -0,0 +1,29 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#include + +typedef struct bridge { + int alive; + nal_cb_t *nal_cb; + void *lower; + void *local; + void (*shutdown)(struct bridge *); + /* this doesn't really belong here */ + unsigned char iptop8; +} *bridge; + + +nal_t *bridge_init(ptl_interface_t nal, + ptl_pid_t pid_request, + ptl_ni_limits_t *desired, + ptl_ni_limits_t *actual, + int *rc); + +typedef int (*nal_initialize)(bridge); +extern nal_initialize nal_table[PTL_IFACE_MAX]; diff --git a/lnet/ulnds/connection.c b/lnet/ulnds/connection.c new file mode 100644 index 0000000..310e899 --- /dev/null +++ b/lnet/ulnds/connection.c @@ -0,0 +1,294 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* connection.c: + This file provides a simple stateful connection manager which + builds tcp connections on demand and leaves them open for + future use. 
It also provides the machinery to allow peers + to connect to it +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* global variable: acceptor port */ +unsigned short tcpnal_acceptor_port = 988; + + +/* Function: compare_connection + * Arguments: connection c: a connection in the hash table + * ptl_process_id_t: an id to verify agains + * Returns: 1 if the connection is the one requested, 0 otherwise + * + * compare_connection() tests for collisions in the hash table + */ +static int compare_connection(void *arg1, void *arg2) +{ + connection c = arg1; + unsigned int * id = arg2; + return((c->ip==id[0]) && (c->port==id[1])); +} + + +/* Function: connection_key + * Arguments: ptl_process_id_t id: an id to hash + * Returns: a not-particularily-well-distributed hash + * of the id + */ +static unsigned int connection_key(unsigned int *id) +{ + return(id[0]^id[1]); +} + + +/* Function: remove_connection + * Arguments: c: the connection to remove + */ +void remove_connection(void *arg) +{ + connection c = arg; + unsigned int id[2]; + + id[0]=c->ip; + id[1]=c->port; + hash_table_remove(c->m->connections,id); + close(c->fd); + free(c); +} + + +/* Function: read_connection: + * Arguments: c: the connection to read from + * dest: the buffer to read into + * len: the number of bytes to read + * Returns: success as 1, or failure as 0 + * + * read_connection() reads data from the connection, continuing + * to read partial results until the request is satisfied or + * it errors. TODO: this read should be covered by signal protection. 
+ */ +int read_connection(connection c, + unsigned char *dest, + int len) +{ + int offset=0,rc; + + if (len){ + do { + if((rc=syscall(SYS_read, c->fd, dest+offset, len-offset))<=0){ + if (errno==EINTR) { + rc=0; + } else { + remove_connection(c); + return(0); + } + } + offset+=rc; + } while (offsetm->handler)(c->m->handler_arg,c)); +} + + +/* Function: allocate_connection + * Arguments: t: tcpnal the allocation is occuring in the context of + * dest: portal endpoint address for this connection + * fd: open file descriptor for the socket + * Returns: an allocated connection structure + * + * just encompasses the action common to active and passive + * connections of allocation and placement in the global table + */ +static connection allocate_connection(manager m, + unsigned int ip, + unsigned short port, + int fd) +{ + connection c=malloc(sizeof(struct connection)); + unsigned int id[2]; + c->m=m; + c->fd=fd; + c->ip=ip; + c->port=port; + id[0]=ip; + id[1]=port; + register_io_handler(fd,READ_HANDLER,connection_input,c); + hash_table_insert(m->connections,c,id); + return(c); +} + + +/* Function: new_connection + * Arguments: t: opaque argument holding the tcpname + * Returns: 1 in order to reregister for new connection requests + * + * called when the bound service socket recieves + * a new connection request, it always accepts and + * installs a new connection + */ +static int new_connection(void *z) +{ + manager m=z; + struct sockaddr_in s; + int len=sizeof(struct sockaddr_in); + int fd=accept(m->bound,(struct sockaddr *)&s,&len); + unsigned int nid=*((unsigned int *)&s.sin_addr); + /* cfs specific hack */ + //unsigned short pid=s.sin_port; + allocate_connection(m,htonl(nid),0/*pid*/,fd); + return(1); +} + + +/* Function: force_tcp_connection + * Arguments: t: tcpnal + * dest: portals endpoint for the connection + * Returns: an allocated connection structure, either + * a pre-existing one, or a new connection + */ +connection force_tcp_connection(manager m, + 
unsigned int ip, + unsigned short port) +{ + connection c; + struct sockaddr_in addr; + unsigned int id[2]; + + port = tcpnal_acceptor_port; + + id[0]=ip; + id[1]=port; + + if (!(c=hash_table_find(m->connections,id))){ + int fd; + + bzero((char *) &addr, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(ip); + addr.sin_port = htons(port); + + if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("tcpnal socket failed"); + exit(-1); + } + if (connect(fd, + (struct sockaddr *)&addr, + sizeof(struct sockaddr_in))) + { + perror("tcpnal connect"); + return(0); + } + return(allocate_connection(m,ip,port,fd)); + } + return(c); +} + + +/* Function: bind_socket + * Arguments: t: the nal state for this interface + * port: the port to attempt to bind to + * Returns: 1 on success, or 0 on error + * + * bind_socket() attempts to allocate and bind a socket to the requested + * port, or dynamically assign one from the kernel should the port be + * zero. Sets the bound and bound_handler elements of m. + * + * TODO: The port should be an explicitly sized type. 
+ */ +static int bind_socket(manager m,unsigned short port) +{ + struct sockaddr_in addr; + int alen=sizeof(struct sockaddr_in); + + if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0) + return(0); + + bzero((char *) &addr, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = 0; + addr.sin_port = port; + + if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){ + perror ("tcpnal bind"); + return(0); + } + + getsockname(m->bound,(struct sockaddr *)&addr, &alen); + + m->bound_handler=register_io_handler(m->bound,READ_HANDLER, + new_connection,m); + listen(m->bound,5); + m->port=addr.sin_port; + return(1); +} + + +/* Function: shutdown_connections + * Arguments: m: the manager structure + * + * close all connections and reclaim resources + */ +void shutdown_connections(manager m) +{ + close(m->bound); + remove_io_handler(m->bound_handler); + hash_destroy_table(m->connections,remove_connection); + free(m); +} + + +/* Function: init_connections + * Arguments: t: the nal state for this interface + * port: the port to attempt to bind to + * Returns: a newly allocated manager structure, or + * zero if the fixed port could not be bound + */ +manager init_connections(unsigned short pid, + int (*input)(void *, void *), + void *a) +{ + manager m=(manager)malloc(sizeof(struct manager)); + m->connections=hash_create_table(compare_connection,connection_key); + m->handler=input; + m->handler_arg=a; + if (bind_socket(m,pid)) return(m); + free(m); + return(0); +} diff --git a/lnet/ulnds/connection.h b/lnet/ulnds/connection.h new file mode 100644 index 0000000..6f57287 --- /dev/null +++ b/lnet/ulnds/connection.h @@ -0,0 +1,32 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#include + +typedef struct manager { + table connections; + int bound; + io_handler bound_handler; + int (*handler)(void *, void *); + void *handler_arg; + unsigned short port; +} *manager; + + +typedef struct connection { + unsigned int ip; + unsigned short port; + int fd; + manager m; +} *connection; + +connection force_tcp_connection(manager m, unsigned int ip, unsigned int short); +manager init_connections(unsigned short, int (*f)(void *, void *), void *); +void remove_connection(void *arg); +void shutdown_connections(manager m); +int read_connection(connection c, unsigned char *dest, int len); diff --git a/lnet/ulnds/debug.c b/lnet/ulnds/debug.c new file mode 100644 index 0000000..529bb2d --- /dev/null +++ b/lnet/ulnds/debug.c @@ -0,0 +1,119 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Phil Schwan + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include + +int smp_processor_id = 1; +char debug_file_path[1024] = "/tmp/lustre-log"; +char debug_file_name[1024]; +FILE *debug_file_fd; + +int portals_do_debug_dumplog(void *arg) +{ + printf("Look in %s\n", debug_file_name); + return 0; +} + + +void portals_debug_print(void) +{ + return; +} + + +void portals_debug_dumplog(void) +{ + printf("Look in %s\n", debug_file_name); + return; +} + + +int portals_debug_init(unsigned long bufsize) +{ + debug_file_fd = stdout; + return 0; +} + +int portals_debug_cleanup(void) +{ + return 0; //close(portals_debug_fd); +} + +int portals_debug_clear_buffer(void) +{ + return 0; +} + +int portals_debug_mark_buffer(char *text) +{ + + fprintf(debug_file_fd, "*******************************************************************************\n"); + fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text); + fprintf(debug_file_fd, "*******************************************************************************\n"); + + return 0; +} + +int portals_debug_copy_to_user(char *buf, unsigned long len) +{ + return 0; +} + +/* FIXME: I'm not very smart; someone smarter should make this better. */ +void +portals_debug_msg (int subsys, int mask, char *file, char *fn, int line, + const char *format, ...) +{ + va_list ap; + unsigned long flags; + struct timeval tv; + int nob; + + + /* NB since we pass a non-zero sized buffer (at least) on the first + * print, we can be assured that by the end of all the snprinting, + * we _do_ have a terminated buffer, even if our message got truncated. 
+ */ + + gettimeofday(&tv, NULL); + + nob += fprintf(debug_file_fd, + "%02x:%06x:%d:%lu.%06lu ", + subsys >> 24, mask, smp_processor_id, + tv.tv_sec, tv.tv_usec); + + nob += fprintf(debug_file_fd, + "(%s:%d:%s() %d+%ld): ", + file, line, fn, 0, + 8192 - ((unsigned long)&flags & 8191UL)); + + va_start (ap, format); + nob += fprintf(debug_file_fd, format, ap); + va_end (ap); + + +} + diff --git a/lnet/ulnds/dispatch.h b/lnet/ulnds/dispatch.h new file mode 100644 index 0000000..34dd070 --- /dev/null +++ b/lnet/ulnds/dispatch.h @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +/* this file is only called dispatch.h to prevent it + from colliding with /usr/include/sys/select.h */ + +typedef struct io_handler *io_handler; + +struct io_handler{ + io_handler *last; + io_handler next; + int fd; + int type; + int (*function)(void *); + void *argument; + int disabled; +}; + + +#define READ_HANDLER 1 +#define WRITE_HANDLER 2 +#define EXCEPTION_HANDLER 4 +#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER) + +io_handler register_io_handler(int fd, + int type, + int (*function)(void *), + void *arg); + +void remove_io_handler (io_handler i); +void init_unix_timer(void); +void select_timer_block(when until); +when now(void); diff --git a/lnet/ulnds/ipmap.h b/lnet/ulnds/ipmap.h new file mode 100644 index 0000000..85b1e18 --- /dev/null +++ b/lnet/ulnds/ipmap.h @@ -0,0 +1,38 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
/* ==== file: lnet/ulnds/ipmap.h ==== */
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 *  Copyright (c) 2002 Cray Inc.
 *
 *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
 */

#ifndef _IPMAP_H_
#define _IPMAP_H_

/* Conversions between (address, port) pairs and portals (nid, pid)
 * pairs.  With DIRECT_IP_MODE defined - as it unconditionally is here -
 * they are identity mappings: the nid is the address, the pid the port. */
#define DIRECT_IP_MODE
#ifdef DIRECT_IP_MODE
#define PNAL_NID(in_addr, port) (in_addr)
#define PNAL_PID(pid) (pid)
#define PNAL_IP(in_addr, port) (in_addr)
#define PNAL_PORT(nid, pid) (pid)
#else

/* Packed mode: ports start at PNAL_BASE_PORT; the field widths are
   given by the shifts and masks below. */
#define PNAL_BASE_PORT 4096
#define PNAL_HOSTID_SHIFT 24
#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1)
#define PNAL_VNODE_SHIFT 8
#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1)
#define PNAL_PID_SHIFT 8
#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1)

#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \
                                   << PNAL_VNODE_SHIFT) \
                                  | (((ntohs(port)-PNAL_BASE_PORT) >>\
                                      PNAL_PID_SHIFT)))
#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT) & PNAL_PID_MASK)

/* t must point at a structure with an iptop8 member */
#define PNAL_IP(nid,t)  (htonl((((unsigned)(nid))\
                                >> PNAL_VNODE_SHIFT)\
                               | ((t)->iptop8 << PNAL_HOSTID_SHIFT)))
#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \
                               << PNAL_VNODE_SHIFT) \
                              | ((pid) & PNAL_PID_MASK)) \
                             + PNAL_BASE_PORT))
#endif

#endif /* _IPMAP_H_ */
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* timer.c: + * this file implements a simple priority-queue based timer system. when + * combined with a file which implements now() and block(), it can + * be used to provide course-grained time-based callbacks. + */ + +#include +#include +#include + +struct timer { + void (*function)(void *); + void *arg; + when w; + int interval; + int disable; +}; + +typedef struct thunk *thunk; +struct thunk { + void (*f)(void *); + void *a; + thunk next; +}; + +extern when now(void); + +static thunk thunks; +static int internal; +static void (*block_function)(when); +static int number_of_timers; +static int size_of_pqueue; +static timer *timers; + + +static void heal(int where) +{ + int left=(where<<1); + int right=(where<<1)+1; + int min=where; + timer temp; + + if (left <= number_of_timers) + if (timers[left]->w < timers[min]->w) min=left; + if (right <= number_of_timers) + if (timers[right]->w < timers[min]->w) min=right; + if (min != where){ + temp=timers[where]; + timers[where]=timers[min]; + timers[min]=temp; + heal(min); + } +} + +static void add_pqueue(int i) +{ + timer temp; + int parent=(i>>1); + if ((i>1) && (timers[i]->w< timers[parent]->w)){ + temp=timers[i]; + timers[i]=timers[parent]; + timers[parent]=temp; + add_pqueue(parent); + } +} + +static void add_timer(timer t) +{ + if (size_of_pqueue<(number_of_timers+2)){ + int oldsize=size_of_pqueue; + timer *new=(void *)malloc(sizeof(struct timer)*(size_of_pqueue+=10)); + memcpy(new,timers,sizeof(timer)*oldsize); + timers=new; + } + timers[++number_of_timers]=t; + add_pqueue(number_of_timers); +} + +/* Function: register_timer + * Arguments: interval: the time interval from the current time when + * the timer function should be called + * function: the function to call when the time has expired + * argument: 
the argument to call it with. + * Returns: a pointer to a timer structure + */ +timer register_timer(when interval, + void (*function)(void *), + void *argument) +{ + timer t=(timer)malloc(sizeof(struct timer)); + + t->arg=argument; + t->function=function; + t->interval=interval; + t->disable=0; + t->w=now()+interval; + add_timer(t); + if (!internal && (number_of_timers==1)) + block_function(t->w); + return(t); +} + +/* Function: remove_timer + * Arguments: t: + * Returns: nothing + * + * remove_timer removes a timer from the system, insuring + * that it will never be called. It does not actually + * free the timer due to reentrancy issues. + */ + +void remove_timer(timer t) +{ + t->disable=1; +} + + + +void timer_fire() +{ + timer current; + + current=timers[1]; + timers[1]=timers[number_of_timers--]; + heal(1); + if (!current->disable) { + (*current->function)(current->arg); + } + free(current); +} + +when next_timer(void) +{ + when here=now(); + + while (number_of_timers && (timers[1]->w <= here)) timer_fire(); + if (number_of_timers) return(timers[1]->w); + return(0); +} + +/* Function: timer_loop + * Arguments: none + * Returns: never + * + * timer_loop() is the blocking dispatch function for the timer. + * Is calls the block() function registered with init_timer, + * and handles associated with timers that have been registered. + */ +void timer_loop() +{ + when here; + + while (1){ + thunk z; + here=now(); + + for (z=thunks;z;z=z->next) (*z->f)(z->a); + + if (number_of_timers){ + if (timers[1]->w > here){ + (*block_function)(timers[1]->w); + } else { + timer_fire(); + } + } else { + thunk z; + for (z=thunks;z;z=z->next) (*z->f)(z->a); + (*block_function)(0); + } + } +} + + +/* Function: register_thunk + * Arguments: f: the function to call + * a: the single argument to call it with + * + * Thunk functions get called at irregular intervals, they + * should not assume when, or take a particularily long + * amount of time. 
Thunks are for background cleanup tasks. + */ +void register_thunk(void (*f)(void *),void *a) +{ + thunk t=(void *)malloc(sizeof(struct thunk)); + t->f=f; + t->a=a; + t->next=thunks; + thunks=t; +} + +/* Function: initialize_timer + * Arguments: block: the function to call to block for the specified interval + * + * initialize_timer() must be called before any other timer function, + * including timer_loop. + */ +void initialize_timer(void (*block)(when)) +{ + block_function=block; + number_of_timers=0; + size_of_pqueue=10; + timers=(timer *)malloc(sizeof(timer)*size_of_pqueue); + thunks=0; +} diff --git a/lnet/ulnds/pqtimer.h b/lnet/ulnds/pqtimer.h new file mode 100644 index 0000000..11efb0e --- /dev/null +++ b/lnet/ulnds/pqtimer.h @@ -0,0 +1,25 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +typedef unsigned long long when; +when now(void); +typedef struct timer *timer; +timer register_timer(when interval, + void (*function)(void *), + void *argument); +timer register_timer_wait(void); +void remove_timer(timer); +void timer_loop(void); +void initialize_timer(void (*block)(when)); +void timer_fire(void); + + +#define HZ 0x100000000ull + + diff --git a/lnet/ulnds/procapi.c b/lnet/ulnds/procapi.c new file mode 100644 index 0000000..6da3210 --- /dev/null +++ b/lnet/ulnds/procapi.c @@ -0,0 +1,283 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* api.c: + * This file provides the 'api' side for the process-based nals. + * it is responsible for creating the 'library' side thread, + * and passing wrapped portals transactions to it. + * + * Along with initialization, shutdown, and transport to the library + * side, this file contains some stubs to satisfy the nal definition. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Function: forward + * Arguments: nal_t *nal: pointer to my top-side nal structure + * id: the command to pass to the lower layer + * args, args_len:pointer to and length of the request + * ret, ret_len: pointer to and size of the result + * Returns: a portals status code + * + * forwards a packaged api call from the 'api' side to the 'library' + * side, and collects the result + */ +#define forward_failure(operand,fd,buffer,length)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + lib_fini(b->nal_cb);\ + return(PTL_SEGV);\ + } +static int procbridge_forward(nal_t *n, int id, void *args, ptl_size_t args_len, + void *ret, ptl_size_t ret_len) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + int lib=p->to_lib[1]; + int k; + + forward_failure(write,lib, &id, sizeof(id)); + forward_failure(write,lib,&args_len, sizeof(args_len)); + forward_failure(write,lib,&ret_len, sizeof(ret_len)); + forward_failure(write,lib,args, args_len); + + do { + k=syscall(SYS_read, p->from_lib[0], ret, ret_len); + } while ((k!=ret_len) && (errno += EINTR)); + + 
if(k!=ret_len){ + perror("nal: read return block"); + return PTL_SEGV; + } + return (PTL_OK); +} +#undef forward_failure + + +/* Function: shutdown + * Arguments: nal: a pointer to my top side nal structure + * ni: my network interface index + * + * cleanup nal state, reclaim the lower side thread and + * its state using PTL_FINI codepoint + */ +static int procbridge_shutdown(nal_t *n, int ni) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + int code=PTL_FINI; + + syscall(SYS_write, p->to_lib[1],&code,sizeof(code)); + syscall(SYS_read, p->from_lib[0],&code,sizeof(code)); + + syscall(SYS_close, p->to_lib[0]); + syscall(SYS_close, p->to_lib[1]); + syscall(SYS_close, p->from_lib[0]); + syscall(SYS_close, p->from_lib[1]); + + free(p); + return(0); +} + + +/* Function: validate + * useless stub + */ +static int procbridge_validate(nal_t *nal, void *base, ptl_size_t extent) +{ + return(0); +} + + +/* Function: yield + * Arguments: pid: + * + * this function was originally intended to allow the + * lower half thread to be scheduled to allow progress. we + * overload it to explicitly block until signalled by the + * lower half. + */ +static void procbridge_yield(nal_t *n) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + + pthread_mutex_lock(&p->mutex); + pthread_cond_wait(&p->cond,&p->mutex); + pthread_mutex_unlock(&p->mutex); +} + + +static void procbridge_lock(nal_t * nal, unsigned long *flags){} +static void procbridge_unlock(nal_t * nal, unsigned long *flags){} +/* api_nal + * the interface vector to allow the generic code to access + * this nal. this is seperate from the library side nal_cb. 
+ * TODO: should be dyanmically allocated + */ +static nal_t api_nal = { + ni: {0}, + nal_data: NULL, + forward: procbridge_forward, + shutdown: procbridge_shutdown, + validate: procbridge_validate, + yield: procbridge_yield, + lock: procbridge_lock, + unlock: procbridge_unlock +}; + +/* Function: bridge_init + * + * Arguments: pid: requested process id (port offset) + * PTL_ID_ANY not supported. + * desired: limits passed from the application + * and effectively ignored + * actual: limits actually allocated and returned + * + * Returns: a pointer to my statically allocated top side NAL + * structure + * + * initializes the tcp nal. we define unix_failure as an + * error wrapper to cut down clutter. + */ +#define unix_failure(operand,fd,buffer,length,text)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + perror(text);\ + return(NULL);\ + } +#if 0 +static nal_t *bridge_init(ptl_interface_t nal, + ptl_pid_t pid_request, + ptl_ni_limits_t *desired, + ptl_ni_limits_t *actual, + int *rc) +{ + procbridge p; + bridge b; + static int initialized=0; + ptl_ni_limits_t limits = {-1,-1,-1,-1,-1}; + + if(initialized) return (&api_nal); + + init_unix_timer(); + + b=(bridge)malloc(sizeof(struct bridge)); + p=(procbridge)malloc(sizeof(struct procbridge)); + api_nal.nal_data=b; + b->local=p; + + if(pipe(p->to_lib) || pipe(p->from_lib)) { + perror("nal_init: pipe"); + return(NULL); + } + + if (desired) limits = *desired; + unix_failure(write,p->to_lib[1], &pid_request, sizeof(pid_request), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &nal, sizeof(ptl_interface_t), + "nal_init: write"); + + if(pthread_create(&p->t, NULL, nal_thread, b)) { + perror("nal_init: pthread_create"); + return(NULL); + } + + unix_failure(read,p->from_lib[0], actual, sizeof(ptl_ni_limits_t), + "tcp_init: read"); + unix_failure(read,p->from_lib[0], rc, sizeof(rc), + "nal_init: read"); + + if(*rc) 
return(NULL); + + initialized = 1; + pthread_mutex_init(&p->mutex,0); + pthread_cond_init(&p->cond, 0); + + return (&api_nal); +} +#endif + +ptl_nid_t tcpnal_mynid; + +nal_t *procbridge_interface(int num_interface, + ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, + ptl_pid_t requested_pid) +{ + procbridge p; + bridge b; + static int initialized=0; + ptl_ni_limits_t limits = {-1,-1,-1,-1,-1}; + int rc, nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */ + + if(initialized) return (&api_nal); + + init_unix_timer(); + + b=(bridge)malloc(sizeof(struct bridge)); + p=(procbridge)malloc(sizeof(struct procbridge)); + api_nal.nal_data=b; + b->local=p; + + if(pipe(p->to_lib) || pipe(p->from_lib)) { + perror("nal_init: pipe"); + return(NULL); + } + + if (ptl_size) + limits.max_ptable_index = ptl_size; + if (acl_size) + limits.max_atable_index = acl_size; + + unix_failure(write,p->to_lib[1], &requested_pid, sizeof(requested_pid), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &nal_type, sizeof(nal_type), + "nal_init: write"); + + if(pthread_create(&p->t, NULL, nal_thread, b)) { + perror("nal_init: pthread_create"); + return(NULL); + } + + unix_failure(read,p->from_lib[0], &rc, sizeof(rc), + "nal_init: read"); + + if(rc) return(NULL); + + b->nal_cb->ni.nid = tcpnal_mynid; + initialized = 1; + pthread_mutex_init(&p->mutex,0); + pthread_cond_init(&p->cond, 0); + + return (&api_nal); +} +#undef unix_failure diff --git a/lnet/ulnds/procbridge.h b/lnet/ulnds/procbridge.h new file mode 100644 index 0000000..060ae7b --- /dev/null +++ b/lnet/ulnds/procbridge.h @@ -0,0 +1,40 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#ifndef _PROCBRIDGE_H_ +#define _PROCBRIDGE_H_ + +#include +#include +#include + + +typedef struct procbridge { + pthread_t t; + pthread_cond_t cond; + pthread_mutex_t mutex; + int to_lib[2]; + int from_lib[2]; +} *procbridge; + +extern void *nal_thread(void *); + + +#define PTL_INIT (LIB_MAX_DISPATCH+1) +#define PTL_FINI (LIB_MAX_DISPATCH+2) + +#define MAX_ACLS 1 +#define MAX_PTLS 128 + +extern void set_address(bridge t,ptl_pid_t pidrequest); +extern nal_t *procbridge_interface(int num_interface, + ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, + ptl_pid_t requested_pid); + +#endif diff --git a/lnet/ulnds/proclib.c b/lnet/ulnds/proclib.c new file mode 100644 index 0000000..c3ee103 --- /dev/null +++ b/lnet/ulnds/proclib.c @@ -0,0 +1,270 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* lib.c: + * This file provides the 'library' side for the process-based nals. + * it is responsible for communication with the 'api' side and + * providing service to the generic portals 'library' + * implementation. 
'library' might be better termed 'communication' + * or 'kernel'. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include + +/* the following functions are stubs to satisfy the nal definition + without doing anything particularily useful*/ + +static int nal_write(nal_cb_t *nal, + void *private, + user_ptr dst_addr, + void *src_addr, + ptl_size_t len) +{ + memcpy(dst_addr, src_addr, len); + return 0; +} + +static int nal_read(nal_cb_t * nal, + void *private, + void *dst_addr, + user_ptr src_addr, + size_t len) +{ + memcpy(dst_addr, src_addr, len); + return 0; +} + +static void *nal_malloc(nal_cb_t *nal, + ptl_size_t len) +{ + void *buf = malloc(len); + return buf; +} + +static void nal_free(nal_cb_t *nal, + void *buf, + ptl_size_t len) +{ + free(buf); +} + +static void nal_printf(nal_cb_t *nal, + const char *fmt, + ...) +{ + va_list ap; + + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +} + + +static void nal_cli(nal_cb_t *nal, + unsigned long *flags) +{ +} + + +static void nal_sti(nal_cb_t *nal, + unsigned long *flags) +{ +} + + +static int nal_dist(nal_cb_t *nal, + ptl_nid_t nid, + unsigned long *dist) +{ + return 0; +} + + + +/* Function: data_from_api + * Arguments: t: the nal state for this interface + * Returns: whether to continue reading from the pipe + * + * data_from_api() reads data from the api side in response + * to a select. + * + * We define data_failure() for syntactic convenience + * of unix error reporting. 
+ */ + +#define data_failure(operand,fd,buffer,length)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + lib_fini(b->nal_cb);\ + return(0);\ + } +static int data_from_api(void *arg) +{ + bridge b = arg; + procbridge p=(procbridge)b->local; + /* where are these two sizes derived from ??*/ + char arg_block[ 256 ]; + char ret_block[ 128 ]; + ptl_size_t arg_len,ret_len; + int fd=p->to_lib[0]; + int index; + + data_failure(read,fd, &index, sizeof(index)); + + if (index==PTL_FINI) { + lib_fini(b->nal_cb); + if (b->shutdown) (*b->shutdown)(b); + syscall(SYS_write, p->from_lib[1],&b->alive,sizeof(b->alive)); + + /* a heavy-handed but convenient way of shutting down + the lower side thread */ + pthread_exit(0); + } + + data_failure(read,fd, &arg_len, sizeof(arg_len)); + data_failure(read,fd, &ret_len, sizeof(ret_len)); + data_failure(read,fd, arg_block, arg_len); + + lib_dispatch(b->nal_cb, NULL, index, arg_block, ret_block); + + data_failure(write,p->from_lib[1],ret_block, ret_len); + return(1); +} +#undef data_failure + + + +static void wakeup_topside(void *z) +{ + bridge b=z; + procbridge p=b->local; + + pthread_mutex_lock(&p->mutex); + pthread_cond_broadcast(&p->cond); + pthread_mutex_unlock(&p->mutex); +} + + +/* Function: nal_thread + * Arguments: z: an opaque reference to a nal control structure + * allocated and partially populated by the api level code + * Returns: nothing, and only on error or explicit shutdown + * + * This function is the entry point of the pthread initiated on + * the api side of the interface. This thread is used to handle + * asynchronous delivery to the application. 
+ * + * We define a limit macro to place a ceiling on limits + * for syntactic convenience + */ +#define LIMIT(x,y,max)\ + if ((unsigned int)x > max) y = max; + +extern int tcpnal_init(bridge); + +nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0}; + +void *nal_thread(void *z) +{ + bridge b=z; + procbridge p=b->local; + int rc; + ptl_pid_t pid_request; + int nal_type; + ptl_ni_limits_t desired; + ptl_ni_limits_t actual; + + b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t)); + b->nal_cb->nal_data=b; + b->nal_cb->cb_read=nal_read; + b->nal_cb->cb_write=nal_write; + b->nal_cb->cb_malloc=nal_malloc; + b->nal_cb->cb_free=nal_free; + b->nal_cb->cb_map=NULL; + b->nal_cb->cb_unmap=NULL; + b->nal_cb->cb_printf=nal_printf; + b->nal_cb->cb_cli=nal_cli; + b->nal_cb->cb_sti=nal_sti; + b->nal_cb->cb_dist=nal_dist; + + + register_io_handler(p->to_lib[0],READ_HANDLER,data_from_api,(void *)b); + + if(!(rc = syscall(SYS_read, p->to_lib[0], &pid_request, sizeof(pid_request)))) + perror("procbridge read from api"); + if(!(rc = syscall(SYS_read, p->to_lib[0], &desired, sizeof(ptl_ni_limits_t)))) + perror("procbridge read from api"); + if(!(rc = syscall(SYS_read, p->to_lib[0], &nal_type, sizeof(nal_type)))) + perror("procbridge read from api"); + + actual = desired; + LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES); + LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS); + LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS); + LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS); + LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS); + + set_address(b,pid_request); + + if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b); + /* initialize the generic 'library' level code */ + + rc = lib_init(b->nal_cb, + b->nal_cb->ni.nid, + b->nal_cb->ni.pid, + 10, + actual.max_ptable_index, + actual.max_atable_index); + + /* + * Whatever the initialization returned is passed back to the + * user level code for further 
interpretation. We just exit if + * it is non-zero since something went wrong. + */ + /* this should perform error checking */ +#if 0 + write(p->from_lib[1], &actual, sizeof(ptl_ni_limits_t)); +#endif + syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc)); + + if(!rc) { + /* the thunk function is called each time the timer loop + performs an operation and returns to blocking mode. we + overload this function to inform the api side that + it may be interested in looking at the event queue */ + register_thunk(wakeup_topside,b); + timer_loop(); + } + return(0); +} +#undef LIMIT + diff --git a/lnet/ulnds/select.c b/lnet/ulnds/select.c new file mode 100644 index 0000000..c4f84f4 --- /dev/null +++ b/lnet/ulnds/select.c @@ -0,0 +1,165 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* select.c: + * Provides a general mechanism for registering and dispatching + * io events through the select system call. 
+ */ + +#ifdef sun +#include +#else +#include +#endif + +#include +#include +#include +#include +#include + + +static struct timeval beginning_of_epoch; +static io_handler io_handlers; + +/* Function: now + * + * Return: the current time in canonical units: a 64 bit number + * where the most significant 32 bits contains the number + * of seconds, and the least signficant a count of (1/(2^32))ths + * of a second. + */ +when now() +{ + struct timeval result; + + gettimeofday(&result,0); + return((((unsigned long long)result.tv_sec)<<32)| + (((unsigned long long)result.tv_usec)<<32)/1000000); +} + + +/* Function: register_io_handler + * Arguments: fd: the file descriptor of interest + * type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER + * function: a function to call when io is available on fd + * arg: an opaque correlator to return to the handler + * Returns: a pointer to the io_handler structure + */ +io_handler register_io_handler(int fd, + int type, + int (*function)(void *), + void *arg) +{ + io_handler i=(io_handler)malloc(sizeof(struct io_handler)); + if ((i->fd=fd)>=0){ + i->type=type; + i->function=function; + i->argument=arg; + i->disabled=0; + i->last=&io_handlers; + if ((i->next=io_handlers)) i->next->last=&i->next; + io_handlers=i; + } + return(i); +} + +/* Function: remove_io_handler + * Arguments: i: a pointer to the handler to stop servicing + * + * remove_io_handler() doesn't actually free the handler, due + * to reentrancy problems. it just marks the handler for + * later cleanup by the blocking function. 
+ */ +void remove_io_handler (io_handler i) +{ + i->disabled=1; +} + +static void set_flag(io_handler n,fd_set *fds) +{ + if (n->type & READ_HANDLER) FD_SET(n->fd,fds); + if (n->type & WRITE_HANDLER) FD_SET(n->fd,fds+1); + if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd,fds+2); +} + + +/* Function: select_timer_block + * Arguments: until: an absolute time when the select should return, + * or 0 to block until io is available + * + * This function dispatches the various file descriptors' handler + * functions, if the kernel indicates there is io available. + * Handlers marked disabled are unlinked and freed here before the + * select; a handler whose function returns 0 is marked disabled. + */ +void select_timer_block(when until) +{ + fd_set fds[3]; + struct timeval timeout; + struct timeval *timeout_pointer; + int result; + io_handler j; + io_handler *k; + + /* TODO: loop until the entire interval is expired*/ + if (until){ + /* a deadline already in the past must poll, not wrap around */ + when current=now(); + when interval=(until>current)?(until-current):0; + timeout.tv_sec=(interval>>32); + /* the low 32 bits count (1/(2^32))ths of a second; scale to usec */ + timeout.tv_usec=((interval&0xffffffffull)*1000000)>>32; + timeout_pointer=&timeout; + } else timeout_pointer=0; + + FD_ZERO(fds); + FD_ZERO(fds+1); + FD_ZERO(fds+2); + for (k=&io_handlers;*k;){ + if ((*k)->disabled){ + j=*k; + *k=(*k)->next; + free(j); + } + else { + /* only live handlers are watched; after an unlink *k is re-examined */ + set_flag(*k,fds); + k=&(*k)->next; + } + } + result=select(FD_SETSIZE,fds,fds+1,fds+2,timeout_pointer); + + if (result > 0) + for (j=io_handlers;j;j=j->next){ + if (!(j->disabled) && + ((FD_ISSET(j->fd,fds) && (j->type & READ_HANDLER)) || + (FD_ISSET(j->fd,fds+1) && (j->type & WRITE_HANDLER)) || + (FD_ISSET(j->fd,fds+2) && (j->type & EXCEPTION_HANDLER)))){ + if (!(*j->function)(j->argument)) + j->disabled=1; + } + } +} + +/* Function: init_unix_timer() + * is called to initialize the library + */ +void init_unix_timer() +{ + io_handlers=0; + gettimeofday(&beginning_of_epoch, 0); + initialize_timer(select_timer_block); +} diff --git a/lnet/ulnds/socklnd/Makefile.am b/lnet/ulnds/socklnd/Makefile.am new file mode 100644 index 0000000..dc427b0 --- /dev/null +++ b/lnet/ulnds/socklnd/Makefile.am @@ -0,0 +1,5 @@ +CPPFLAGS= +INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
+lib_LIBRARIES = libtcpnal.a +pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h +libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h diff --git a/lnet/ulnds/socklnd/README b/lnet/ulnds/socklnd/README new file mode 100644 index 0000000..6cb93d9 --- /dev/null +++ b/lnet/ulnds/socklnd/README @@ -0,0 +1,53 @@ +This library implements two NAL interfaces, both running over IP. +The first, tcpnal, creates TCP connections between participating +processes in order to transport the portals requests. The second, +ernal, provides a simple transport protocol which runs over +UDP datagrams. + +The interface functions return both of these values in host order for +convenience and readability. However this means that addresses +exchanged in messages between hosts of different orderings will not +function properly. + +Both NALs use the same support functions in order to schedule events +and communicate with the generic portals implementation. + + ------------------------- + | api | + |_______________________| + | lib | + |_______________________| + | ernal | |tcpnal | + |--------| |----------| + | udpsock| |connection| + |-----------------------| + | timer/select | + ------------------------- + + + These NALs use the framework from fdnal of a pipe between the api +and library sides. This is wrapped up in the select on the library +side, and blocks on the api side. Performance could be greatly +enhanced by collapsing this artificial barrier, by using shared +memory queues, or by wiring the api layer directly to the library. + + +nid is defined as the low order 24-bits of the IP address of the +physical node left shifted by 8 plus a virtual node number of 0 +through 255 (really only 239). The virtual node number of a tcpnal +application should be specified using the environment variable +PTL_VIRTNODE.
pid is now a completely arbitrary number in the +range of 0 to 255. The IP interface used can be overridden by +specifying the appropriate hostid by setting the PTL_HOSTID +environment variable. The value can be either dotted decimal +(n.n.n.n) or hex starting with "0x". +TCPNAL: + As the NAL needs to try to send to a particular nid/pid pair, it + will open up connections on demand. Because the port associated with + the connecting socket is different from the bound port, two + connections will normally be established between a pair of peers, with + data flowing from the anonymous connect (active) port to the advertised + or well-known bound (passive) port of each peer. + + Should the connection fail to open, an error is reported to the + library component, which causes the api request to fail. diff --git a/lnet/ulnds/socklnd/address.c b/lnet/ulnds/socklnd/address.c new file mode 100644 index 0000000..b422c3f --- /dev/null +++ b/lnet/ulnds/socklnd/address.c @@ -0,0 +1,146 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +/* address.c: + * this file provides functions to acquire the IP address of the node + * and translate them into a NID/PID pair which supports a static + * mapping of virtual nodes into the port range of an IP socket. +*/ + +#include +#include +#include +#include +#include +#include +#include + + +/* Function: get_node_id + * Returns: a 32 bit id for this node: its IPv4 address in host byte + * order, or 0 if the address cannot be determined + * + * Without PTL_HOSTID in the environment, get_node_id() determines the + * host name and uses the resolver to find out its ip address. This is + * fairly fragile and inflexible, but explicitly asking about interfaces + * and their addresses is very complicated and nonportable. + * With PTL_HOSTID set, the value is parsed either as dotted decimal + * (n.n.n.n) or, when it starts with "0x", as a number via strtoll(). + */ +static unsigned int get_node_id(void) +{ + char buffer[255]; + unsigned int x; + struct hostent *he; + char * host_envp; + + if (!(host_envp = getenv("PTL_HOSTID"))) + { + if (gethostname(buffer,sizeof(buffer))) + return(0); + /* gethostname() need not terminate a truncated name */ + buffer[sizeof(buffer)-1]=0; + he=gethostbyname(buffer); + if (he) + x=*(unsigned int *)he->h_addr_list[0]; + else + x = 0; + return(ntohl(x)); + } + else + { + /* test [0] first so an empty string is never read past its end */ + if (host_envp[0] != '0' || host_envp[1] != 'x') + { + int a, b, c, d; + /* a malformed id would leave the octets uninitialized */ + if (sscanf(host_envp, "%d.%d.%d.%d", &a, &b, &c, &d) != 4) + return(0); + return (((unsigned int)a<<24) | (b<<16) | (c<<8) | d); + } + else + { + long long hostid = strtoll(host_envp, 0, 0); + return((unsigned int) hostid); + } + } +} + + +/* Function: set_address + * Arguments: t: a procnal structure to populate with the request + * + * set_address performs the bit manipulations to set the nid, pid, and + * iptop8 fields of the procnal structures.
+ * + * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY + */ + +#ifdef DIRECT_IP_MODE +void set_address(bridge t,ptl_pid_t pidrequest) +{ + int port; + if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0; + else port=pidrequest; + t->nal_cb->ni.nid=get_node_id(); + t->nal_cb->ni.pid=port; +} +#else + +void set_address(bridge t,ptl_pid_t pidrequest) +{ + int virtnode, in_addr, port; + ptl_pid_t pid; + + /* get and remember my node id*/ + if (!getenv("PTL_VIRTNODE")) + virtnode = 0; + else + { + int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT + >> PNAL_VNODE_SHIFT); + virtnode = atoi(getenv("PTL_VIRTNODE")); + if (virtnode > maxvnode) + { + fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n", + virtnode, maxvnode); + return; + } + } + + in_addr = get_node_id(); + + t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */ + t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) + << PNAL_VNODE_SHIFT) + + virtnode; + + pid=pidrequest; + /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */ +#ifdef notyet + if (pid==(unsigned short)PTL_PID_ANY) port = 0; +#endif + if (pid==(unsigned short)PTL_PID_ANY) + { + fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n"); + return; + } + else if (pid > PNAL_PID_MASK) + { + fprintf(stderr, "portal pid of %d is too large - max %d\n", + pid, PNAL_PID_MASK); + return; + } + else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT; + t->nal_cb->ni.pid=pid; +} +#endif diff --git a/lnet/ulnds/socklnd/bridge.h b/lnet/ulnds/socklnd/bridge.h new file mode 100644 index 0000000..0b4940f --- /dev/null +++ b/lnet/ulnds/socklnd/bridge.h @@ -0,0 +1,29 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#include + +typedef struct bridge { + int alive; + nal_cb_t *nal_cb; + void *lower; + void *local; + void (*shutdown)(struct bridge *); + /* this doesn't really belong here */ + unsigned char iptop8; +} *bridge; + + +nal_t *bridge_init(ptl_interface_t nal, + ptl_pid_t pid_request, + ptl_ni_limits_t *desired, + ptl_ni_limits_t *actual, + int *rc); + +typedef int (*nal_initialize)(bridge); +extern nal_initialize nal_table[PTL_IFACE_MAX]; diff --git a/lnet/ulnds/socklnd/connection.c b/lnet/ulnds/socklnd/connection.c new file mode 100644 index 0000000..310e899 --- /dev/null +++ b/lnet/ulnds/socklnd/connection.c @@ -0,0 +1,294 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* connection.c: + This file provides a simple stateful connection manager which + builds tcp connections on demand and leaves them open for + future use. 
It also provides the machinery to allow peers + to connect to it +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* global variable: acceptor port */ +unsigned short tcpnal_acceptor_port = 988; + + +/* Function: compare_connection + * Arguments: connection c: a connection in the hash table + * ptl_process_id_t: an id to verify agains + * Returns: 1 if the connection is the one requested, 0 otherwise + * + * compare_connection() tests for collisions in the hash table + */ +static int compare_connection(void *arg1, void *arg2) +{ + connection c = arg1; + unsigned int * id = arg2; + return((c->ip==id[0]) && (c->port==id[1])); +} + + +/* Function: connection_key + * Arguments: ptl_process_id_t id: an id to hash + * Returns: a not-particularily-well-distributed hash + * of the id + */ +static unsigned int connection_key(unsigned int *id) +{ + return(id[0]^id[1]); +} + + +/* Function: remove_connection + * Arguments: c: the connection to remove + */ +void remove_connection(void *arg) +{ + connection c = arg; + unsigned int id[2]; + + id[0]=c->ip; + id[1]=c->port; + hash_table_remove(c->m->connections,id); + close(c->fd); + free(c); +} + + +/* Function: read_connection: + * Arguments: c: the connection to read from + * dest: the buffer to read into + * len: the number of bytes to read + * Returns: success as 1, or failure as 0 + * + * read_connection() reads data from the connection, continuing + * to read partial results until the request is satisfied or + * it errors. TODO: this read should be covered by signal protection. 
+ */ +int read_connection(connection c, + unsigned char *dest, + int len) +{ + int offset=0,rc; + + if (len){ + do { + if((rc=syscall(SYS_read, c->fd, dest+offset, len-offset))<=0){ + if (errno==EINTR) { + rc=0; + } else { + remove_connection(c); + return(0); + } + } + offset+=rc; + } while (offsetm->handler)(c->m->handler_arg,c)); +} + + +/* Function: allocate_connection + * Arguments: t: tcpnal the allocation is occuring in the context of + * dest: portal endpoint address for this connection + * fd: open file descriptor for the socket + * Returns: an allocated connection structure + * + * just encompasses the action common to active and passive + * connections of allocation and placement in the global table + */ +static connection allocate_connection(manager m, + unsigned int ip, + unsigned short port, + int fd) +{ + connection c=malloc(sizeof(struct connection)); + unsigned int id[2]; + c->m=m; + c->fd=fd; + c->ip=ip; + c->port=port; + id[0]=ip; + id[1]=port; + register_io_handler(fd,READ_HANDLER,connection_input,c); + hash_table_insert(m->connections,c,id); + return(c); +} + + +/* Function: new_connection + * Arguments: t: opaque argument holding the tcpname + * Returns: 1 in order to reregister for new connection requests + * + * called when the bound service socket recieves + * a new connection request, it always accepts and + * installs a new connection + */ +static int new_connection(void *z) +{ + manager m=z; + struct sockaddr_in s; + int len=sizeof(struct sockaddr_in); + int fd=accept(m->bound,(struct sockaddr *)&s,&len); + unsigned int nid=*((unsigned int *)&s.sin_addr); + /* cfs specific hack */ + //unsigned short pid=s.sin_port; + allocate_connection(m,htonl(nid),0/*pid*/,fd); + return(1); +} + + +/* Function: force_tcp_connection + * Arguments: t: tcpnal + * dest: portals endpoint for the connection + * Returns: an allocated connection structure, either + * a pre-existing one, or a new connection + */ +connection force_tcp_connection(manager m, + 
unsigned int ip, + unsigned short port) +{ + connection c; + struct sockaddr_in addr; + unsigned int id[2]; + + port = tcpnal_acceptor_port; + + id[0]=ip; + id[1]=port; + + if (!(c=hash_table_find(m->connections,id))){ + int fd; + + bzero((char *) &addr, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(ip); + addr.sin_port = htons(port); + + if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("tcpnal socket failed"); + exit(-1); + } + if (connect(fd, + (struct sockaddr *)&addr, + sizeof(struct sockaddr_in))) + { + perror("tcpnal connect"); + return(0); + } + return(allocate_connection(m,ip,port,fd)); + } + return(c); +} + + +/* Function: bind_socket + * Arguments: t: the nal state for this interface + * port: the port to attempt to bind to + * Returns: 1 on success, or 0 on error + * + * bind_socket() attempts to allocate and bind a socket to the requested + * port, or dynamically assign one from the kernel should the port be + * zero. Sets the bound and bound_handler elements of m. + * + * TODO: The port should be an explicitly sized type. 
+ */ +static int bind_socket(manager m,unsigned short port) +{ + struct sockaddr_in addr; + int alen=sizeof(struct sockaddr_in); + + if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0) + return(0); + + bzero((char *) &addr, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = 0; + addr.sin_port = port; + + if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){ + perror ("tcpnal bind"); + return(0); + } + + getsockname(m->bound,(struct sockaddr *)&addr, &alen); + + m->bound_handler=register_io_handler(m->bound,READ_HANDLER, + new_connection,m); + listen(m->bound,5); + m->port=addr.sin_port; + return(1); +} + + +/* Function: shutdown_connections + * Arguments: m: the manager structure + * + * close all connections and reclaim resources + */ +void shutdown_connections(manager m) +{ + close(m->bound); + remove_io_handler(m->bound_handler); + hash_destroy_table(m->connections,remove_connection); + free(m); +} + + +/* Function: init_connections + * Arguments: t: the nal state for this interface + * port: the port to attempt to bind to + * Returns: a newly allocated manager structure, or + * zero if the fixed port could not be bound + */ +manager init_connections(unsigned short pid, + int (*input)(void *, void *), + void *a) +{ + manager m=(manager)malloc(sizeof(struct manager)); + m->connections=hash_create_table(compare_connection,connection_key); + m->handler=input; + m->handler_arg=a; + if (bind_socket(m,pid)) return(m); + free(m); + return(0); +} diff --git a/lnet/ulnds/socklnd/connection.h b/lnet/ulnds/socklnd/connection.h new file mode 100644 index 0000000..6f57287 --- /dev/null +++ b/lnet/ulnds/socklnd/connection.h @@ -0,0 +1,32 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#include + +typedef struct manager { + table connections; + int bound; + io_handler bound_handler; + int (*handler)(void *, void *); + void *handler_arg; + unsigned short port; +} *manager; + + +typedef struct connection { + unsigned int ip; + unsigned short port; + int fd; + manager m; +} *connection; + +connection force_tcp_connection(manager m, unsigned int ip, unsigned int short); +manager init_connections(unsigned short, int (*f)(void *, void *), void *); +void remove_connection(void *arg); +void shutdown_connections(manager m); +int read_connection(connection c, unsigned char *dest, int len); diff --git a/lnet/ulnds/socklnd/debug.c b/lnet/ulnds/socklnd/debug.c new file mode 100644 index 0000000..529bb2d --- /dev/null +++ b/lnet/ulnds/socklnd/debug.c @@ -0,0 +1,119 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Phil Schwan + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include +#include +#include + +int smp_processor_id = 1; +char debug_file_path[1024] = "/tmp/lustre-log"; +char debug_file_name[1024]; +FILE *debug_file_fd; + +int portals_do_debug_dumplog(void *arg) +{ + printf("Look in %s\n", debug_file_name); + return 0; +} + + +void portals_debug_print(void) +{ + return; +} + + +void portals_debug_dumplog(void) +{ + printf("Look in %s\n", debug_file_name); + return; +} + + +int portals_debug_init(unsigned long bufsize) +{ + debug_file_fd = stdout; + return 0; +} + +int portals_debug_cleanup(void) +{ + return 0; //close(portals_debug_fd); +} + +int portals_debug_clear_buffer(void) +{ + return 0; +} + +int portals_debug_mark_buffer(char *text) +{ + + fprintf(debug_file_fd, "*******************************************************************************\n"); + fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text); + fprintf(debug_file_fd, "*******************************************************************************\n"); + + return 0; +} + +int portals_debug_copy_to_user(char *buf, unsigned long len) +{ + return 0; +} + +/* FIXME: I'm not very smart; someone smarter should make this better. */ +void +portals_debug_msg (int subsys, int mask, char *file, char *fn, int line, + const char *format, ...) +{ + va_list ap; + unsigned long flags; + struct timeval tv; + int nob; + + + /* NB since we pass a non-zero sized buffer (at least) on the first + * print, we can be assured that by the end of all the snprinting, + * we _do_ have a terminated buffer, even if our message got truncated. 
+ */ + + gettimeofday(&tv, NULL); + + /* first print: assign, nob has no value yet */ + nob = fprintf(debug_file_fd, + "%02x:%06x:%d:%lu.%06lu ", + subsys >> 24, mask, smp_processor_id, + (unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec); + + nob += fprintf(debug_file_fd, + "(%s:%d:%s() %d+%ld): ", + file, line, fn, 0, + 8192 - ((unsigned long)&flags & 8191UL)); + + /* a va_list must be formatted with vfprintf, not fprintf */ + va_start (ap, format); + nob += vfprintf(debug_file_fd, format, ap); + va_end (ap); + + +} + diff --git a/lnet/ulnds/socklnd/dispatch.h b/lnet/ulnds/socklnd/dispatch.h new file mode 100644 index 0000000..34dd070 --- /dev/null +++ b/lnet/ulnds/socklnd/dispatch.h @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +/* this file is only called dispatch.h to prevent it + from colliding with /usr/include/sys/select.h */ + +typedef struct io_handler *io_handler; + +struct io_handler{ + io_handler *last; + io_handler next; + int fd; + int type; + int (*function)(void *); + void *argument; + int disabled; +}; + + +#define READ_HANDLER 1 +#define WRITE_HANDLER 2 +#define EXCEPTION_HANDLER 4 +#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER) + +io_handler register_io_handler(int fd, + int type, + int (*function)(void *), + void *arg); + +void remove_io_handler (io_handler i); +void init_unix_timer(void); +void select_timer_block(when until); +when now(void); diff --git a/lnet/ulnds/socklnd/ipmap.h b/lnet/ulnds/socklnd/ipmap.h new file mode 100644 index 0000000..85b1e18 --- /dev/null +++ b/lnet/ulnds/socklnd/ipmap.h @@ -0,0 +1,38 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc.
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#define DIRECT_IP_MODE +#ifdef DIRECT_IP_MODE +#define PNAL_NID(in_addr, port) (in_addr) +#define PNAL_PID(pid) (pid) +#define PNAL_IP(in_addr, port) (in_addr) +#define PNAL_PORT(nid, pid) (pid) +#else + +#define PNAL_BASE_PORT 4096 +#define PNAL_HOSTID_SHIFT 24 +#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1) +#define PNAL_VNODE_SHIFT 8 +#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1) +#define PNAL_PID_SHIFT 8 +#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1) + +#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \ + << PNAL_VNODE_SHIFT) \ + | (((ntohs(port)-PNAL_BASE_PORT) >>\ + PNAL_PID_SHIFT))) +#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT) & PNAL_PID_MASK) + +#define PNAL_IP(nid,t) (htonl((((unsigned)(nid))\ + >> PNAL_VNODE_SHIFT)\ + | (t->iptop8 << PNAL_HOSTID_SHIFT))) +#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \ + << PNAL_VNODE_SHIFT) \ + | ((pid) & PNAL_PID_MASK)) \ + + PNAL_BASE_PORT)) +#endif diff --git a/lnet/ulnds/socklnd/pqtimer.c b/lnet/ulnds/socklnd/pqtimer.c new file mode 100644 index 0000000..fa2fb4f --- /dev/null +++ b/lnet/ulnds/socklnd/pqtimer.c @@ -0,0 +1,226 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* timer.c: + * this file implements a simple priority-queue based timer system. when + * combined with a file which implements now() and block(), it can + * be used to provide coarse-grained time-based callbacks. + */ + +#include +#include +#include + +struct timer { + void (*function)(void *); + void *arg; + when w; + int interval; + int disable; +}; + +typedef struct thunk *thunk; +struct thunk { + void (*f)(void *); + void *a; + thunk next; +}; + +extern when now(void); + +static thunk thunks; +static int internal; +static void (*block_function)(when); +static int number_of_timers; +static int size_of_pqueue; +static timer *timers; + + +static void heal(int where) +{ + int left=(where<<1); + int right=(where<<1)+1; + int min=where; + timer temp; + + if (left <= number_of_timers) + if (timers[left]->w < timers[min]->w) min=left; + if (right <= number_of_timers) + if (timers[right]->w < timers[min]->w) min=right; + if (min != where){ + temp=timers[where]; + timers[where]=timers[min]; + timers[min]=temp; + heal(min); + } +} + +static void add_pqueue(int i) +{ + timer temp; + int parent=(i>>1); + if ((i>1) && (timers[i]->w< timers[parent]->w)){ + temp=timers[i]; + timers[i]=timers[parent]; + timers[parent]=temp; + add_pqueue(parent); + } +} + +/* append t to the 1-based heap in timers[], growing the array by 10 + * slots when it is full, then sift it up into place */ +static void add_timer(timer t) +{ + if (size_of_pqueue<(number_of_timers+2)){ + int oldsize=size_of_pqueue; + /* the array holds timer pointers, not struct timer objects */ + timer *new=(void *)malloc(sizeof(timer)*(size_of_pqueue+=10)); + memcpy(new,timers,sizeof(timer)*oldsize); + /* release the outgrown array instead of leaking it */ + free(timers); + timers=new; + } + timers[++number_of_timers]=t; + add_pqueue(number_of_timers); +} + +/* Function: register_timer + * Arguments: interval: the time interval from the current time when + * the timer function should be called + * function: the function to call when the time has expired + * argument:
the argument to call it with. + * Returns: a pointer to a timer structure + */ +timer register_timer(when interval, + void (*function)(void *), + void *argument) +{ + timer t=(timer)malloc(sizeof(struct timer)); + + t->arg=argument; + t->function=function; + t->interval=interval; + t->disable=0; + t->w=now()+interval; + add_timer(t); + if (!internal && (number_of_timers==1)) + block_function(t->w); + return(t); +} + +/* Function: remove_timer + * Arguments: t: + * Returns: nothing + * + * remove_timer removes a timer from the system, insuring + * that it will never be called. It does not actually + * free the timer due to reentrancy issues. + */ + +void remove_timer(timer t) +{ + t->disable=1; +} + + + +void timer_fire() +{ + timer current; + + current=timers[1]; + timers[1]=timers[number_of_timers--]; + heal(1); + if (!current->disable) { + (*current->function)(current->arg); + } + free(current); +} + +when next_timer(void) +{ + when here=now(); + + while (number_of_timers && (timers[1]->w <= here)) timer_fire(); + if (number_of_timers) return(timers[1]->w); + return(0); +} + +/* Function: timer_loop + * Arguments: none + * Returns: never + * + * timer_loop() is the blocking dispatch function for the timer. + * Is calls the block() function registered with init_timer, + * and handles associated with timers that have been registered. + */ +void timer_loop() +{ + when here; + + while (1){ + thunk z; + here=now(); + + for (z=thunks;z;z=z->next) (*z->f)(z->a); + + if (number_of_timers){ + if (timers[1]->w > here){ + (*block_function)(timers[1]->w); + } else { + timer_fire(); + } + } else { + thunk z; + for (z=thunks;z;z=z->next) (*z->f)(z->a); + (*block_function)(0); + } + } +} + + +/* Function: register_thunk + * Arguments: f: the function to call + * a: the single argument to call it with + * + * Thunk functions get called at irregular intervals, they + * should not assume when, or take a particularily long + * amount of time. 
Thunks are for background cleanup tasks. + */ +void register_thunk(void (*f)(void *),void *a) +{ + thunk t=(void *)malloc(sizeof(struct thunk)); + t->f=f; + t->a=a; + t->next=thunks; + thunks=t; +} + +/* Function: initialize_timer + * Arguments: block: the function to call to block for the specified interval + * + * initialize_timer() must be called before any other timer function, + * including timer_loop. + */ +void initialize_timer(void (*block)(when)) +{ + block_function=block; + number_of_timers=0; + size_of_pqueue=10; + timers=(timer *)malloc(sizeof(timer)*size_of_pqueue); + thunks=0; +} diff --git a/lnet/ulnds/socklnd/pqtimer.h b/lnet/ulnds/socklnd/pqtimer.h new file mode 100644 index 0000000..11efb0e --- /dev/null +++ b/lnet/ulnds/socklnd/pqtimer.h @@ -0,0 +1,25 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +typedef unsigned long long when; +when now(void); +typedef struct timer *timer; +timer register_timer(when interval, + void (*function)(void *), + void *argument); +timer register_timer_wait(void); +void remove_timer(timer); +void timer_loop(void); +void initialize_timer(void (*block)(when)); +void timer_fire(void); + + +#define HZ 0x100000000ull + + diff --git a/lnet/ulnds/socklnd/procapi.c b/lnet/ulnds/socklnd/procapi.c new file mode 100644 index 0000000..6da3210 --- /dev/null +++ b/lnet/ulnds/socklnd/procapi.c @@ -0,0 +1,283 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* api.c: + * This file provides the 'api' side for the process-based nals. + * it is responsible for creating the 'library' side thread, + * and passing wrapped portals transactions to it. + * + * Along with initialization, shutdown, and transport to the library + * side, this file contains some stubs to satisfy the nal definition. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Function: forward + * Arguments: nal_t *nal: pointer to my top-side nal structure + * id: the command to pass to the lower layer + * args, args_len:pointer to and length of the request + * ret, ret_len: pointer to and size of the result + * Returns: a portals status code + * + * forwards a packaged api call from the 'api' side to the 'library' + * side, and collects the result; the result read is retried on EINTR + */ +#define forward_failure(operand,fd,buffer,length)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + lib_fini(b->nal_cb);\ + return(PTL_SEGV);\ + } +static int procbridge_forward(nal_t *n, int id, void *args, ptl_size_t args_len, + void *ret, ptl_size_t ret_len) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + int lib=p->to_lib[1]; + int k; + + forward_failure(write,lib, &id, sizeof(id)); + forward_failure(write,lib,&args_len, sizeof(args_len)); + forward_failure(write,lib,&ret_len, sizeof(ret_len)); + forward_failure(write,lib,args, args_len); + + do { + k=syscall(SYS_read, p->from_lib[0], ret, ret_len); + } while ((k!=ret_len) && (errno == EINTR)); + + if(k!=ret_len){ + perror("nal: read return block"); + return PTL_SEGV; + } + return (PTL_OK); +} +#undef forward_failure + + +/* Function: shutdown + * Arguments: nal: a pointer to my top side nal structure + * ni: my network interface index + * + * cleanup nal state, reclaim the lower side thread and + * its state using PTL_FINI codepoint + */ +static int procbridge_shutdown(nal_t *n, int ni) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + int code=PTL_FINI; + + syscall(SYS_write, p->to_lib[1],&code,sizeof(code)); + syscall(SYS_read, p->from_lib[0],&code,sizeof(code)); + + syscall(SYS_close, p->to_lib[0]); + syscall(SYS_close, p->to_lib[1]); + syscall(SYS_close, p->from_lib[0]); + syscall(SYS_close, p->from_lib[1]); + + free(p); + return(0); +} + + +/* Function: validate + * 
useless stub + */ +static int procbridge_validate(nal_t *nal, void *base, ptl_size_t extent) +{ + return(0); +} + + +/* Function: yield + * Arguments: n: the nal whose lower half we wait on + * + * this function was originally intended to allow the + * lower half thread to be scheduled to allow progress. we + * overload it to explicitly block until signalled by the + * lower half. NOTE: the wait below has no predicate, so a wakeup sent before the wait starts is missed and a spurious wakeup returns early. + */ +static void procbridge_yield(nal_t *n) +{ + bridge b=(bridge)n->nal_data; + procbridge p=(procbridge)b->local; + + pthread_mutex_lock(&p->mutex); + pthread_cond_wait(&p->cond,&p->mutex); + pthread_mutex_unlock(&p->mutex); +} + + +static void procbridge_lock(nal_t * nal, unsigned long *flags){} +static void procbridge_unlock(nal_t * nal, unsigned long *flags){} +/* api_nal + * the interface vector to allow the generic code to access + * this nal. this is separate from the library side nal_cb. + * TODO: should be dynamically allocated + */ +static nal_t api_nal = { + ni: {0}, + nal_data: NULL, + forward: procbridge_forward, + shutdown: procbridge_shutdown, + validate: procbridge_validate, + yield: procbridge_yield, + lock: procbridge_lock, + unlock: procbridge_unlock +}; + +/* Function: bridge_init + * + * Arguments: pid: requested process id (port offset) + * PTL_ID_ANY not supported. + * desired: limits passed from the application + * and effectively ignored + * actual: limits actually allocated and returned + * + * Returns: a pointer to my statically allocated top side NAL + * structure + * + * initializes the tcp nal. we define unix_failure as an + * error wrapper to cut down clutter. 
+ */ +#define unix_failure(operand,fd,buffer,length,text)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + perror(text);\ + return(NULL);\ + } +#if 0 +static nal_t *bridge_init(ptl_interface_t nal, + ptl_pid_t pid_request, + ptl_ni_limits_t *desired, + ptl_ni_limits_t *actual, + int *rc) +{ + procbridge p; + bridge b; + static int initialized=0; + ptl_ni_limits_t limits = {-1,-1,-1,-1,-1}; + + if(initialized) return (&api_nal); + + init_unix_timer(); + + b=(bridge)malloc(sizeof(struct bridge)); + p=(procbridge)malloc(sizeof(struct procbridge)); + api_nal.nal_data=b; + b->local=p; + + if(pipe(p->to_lib) || pipe(p->from_lib)) { + perror("nal_init: pipe"); + return(NULL); + } + + if (desired) limits = *desired; + unix_failure(write,p->to_lib[1], &pid_request, sizeof(pid_request), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &nal, sizeof(ptl_interface_t), + "nal_init: write"); + + if(pthread_create(&p->t, NULL, nal_thread, b)) { + perror("nal_init: pthread_create"); + return(NULL); + } + + unix_failure(read,p->from_lib[0], actual, sizeof(ptl_ni_limits_t), + "tcp_init: read"); + unix_failure(read,p->from_lib[0], rc, sizeof(rc), + "nal_init: read"); + + if(*rc) return(NULL); + + initialized = 1; + pthread_mutex_init(&p->mutex,0); + pthread_cond_init(&p->cond, 0); + + return (&api_nal); +} +#endif + +ptl_nid_t tcpnal_mynid; + +nal_t *procbridge_interface(int num_interface, + ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, + ptl_pid_t requested_pid) +{ + procbridge p; + bridge b; + static int initialized=0; + ptl_ni_limits_t limits = {-1,-1,-1,-1,-1}; + int rc, nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */ + + if(initialized) return (&api_nal); + + init_unix_timer(); + + b=(bridge)malloc(sizeof(struct bridge)); + p=(procbridge)malloc(sizeof(struct procbridge)); + api_nal.nal_data=b; + b->local=p; + + if(pipe(p->to_lib) || pipe(p->from_lib)) { + 
perror("nal_init: pipe"); + return(NULL); + } + + if (ptl_size) + limits.max_ptable_index = ptl_size; + if (acl_size) + limits.max_atable_index = acl_size; + + unix_failure(write,p->to_lib[1], &requested_pid, sizeof(requested_pid), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &nal_type, sizeof(nal_type), + "nal_init: write"); + + if(pthread_create(&p->t, NULL, nal_thread, b)) { + perror("nal_init: pthread_create"); + return(NULL); + } + + unix_failure(read,p->from_lib[0], &rc, sizeof(rc), + "nal_init: read"); + + if(rc) return(NULL); + + b->nal_cb->ni.nid = tcpnal_mynid; + initialized = 1; + pthread_mutex_init(&p->mutex,0); + pthread_cond_init(&p->cond, 0); + + return (&api_nal); +} +#undef unix_failure diff --git a/lnet/ulnds/socklnd/procbridge.h b/lnet/ulnds/socklnd/procbridge.h new file mode 100644 index 0000000..060ae7b --- /dev/null +++ b/lnet/ulnds/socklnd/procbridge.h @@ -0,0 +1,40 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#ifndef _PROCBRIDGE_H_ +#define _PROCBRIDGE_H_ + +#include +#include +#include + + +typedef struct procbridge { + pthread_t t; + pthread_cond_t cond; + pthread_mutex_t mutex; + int to_lib[2]; + int from_lib[2]; +} *procbridge; + +extern void *nal_thread(void *); + + +#define PTL_INIT (LIB_MAX_DISPATCH+1) +#define PTL_FINI (LIB_MAX_DISPATCH+2) + +#define MAX_ACLS 1 +#define MAX_PTLS 128 + +extern void set_address(bridge t,ptl_pid_t pidrequest); +extern nal_t *procbridge_interface(int num_interface, + ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, + ptl_pid_t requested_pid); + +#endif diff --git a/lnet/ulnds/socklnd/proclib.c b/lnet/ulnds/socklnd/proclib.c new file mode 100644 index 0000000..c3ee103 --- /dev/null +++ b/lnet/ulnds/socklnd/proclib.c @@ -0,0 +1,270 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* lib.c: + * This file provides the 'library' side for the process-based nals. + * it is responsible for communication with the 'api' side and + * providing service to the generic portals 'library' + * implementation. 
'library' might be better termed 'communication' + * or 'kernel'. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include + +/* the following functions are stubs to satisfy the nal definition + without doing anything particularily useful*/ + +static int nal_write(nal_cb_t *nal, + void *private, + user_ptr dst_addr, + void *src_addr, + ptl_size_t len) +{ + memcpy(dst_addr, src_addr, len); + return 0; +} + +static int nal_read(nal_cb_t * nal, + void *private, + void *dst_addr, + user_ptr src_addr, + size_t len) +{ + memcpy(dst_addr, src_addr, len); + return 0; +} + +static void *nal_malloc(nal_cb_t *nal, + ptl_size_t len) +{ + void *buf = malloc(len); + return buf; +} + +static void nal_free(nal_cb_t *nal, + void *buf, + ptl_size_t len) +{ + free(buf); +} + +static void nal_printf(nal_cb_t *nal, + const char *fmt, + ...) +{ + va_list ap; + + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +} + + +static void nal_cli(nal_cb_t *nal, + unsigned long *flags) +{ +} + + +static void nal_sti(nal_cb_t *nal, + unsigned long *flags) +{ +} + + +static int nal_dist(nal_cb_t *nal, + ptl_nid_t nid, + unsigned long *dist) +{ + return 0; +} + + + +/* Function: data_from_api + * Arguments: t: the nal state for this interface + * Returns: whether to continue reading from the pipe + * + * data_from_api() reads data from the api side in response + * to a select. + * + * We define data_failure() for syntactic convenience + * of unix error reporting. 
+ */ + +#define data_failure(operand,fd,buffer,length)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + lib_fini(b->nal_cb);\ + return(0);\ + } +static int data_from_api(void *arg) +{ + bridge b = arg; + procbridge p=(procbridge)b->local; + /* where are these two sizes derived from ??*/ + char arg_block[ 256 ]; + char ret_block[ 128 ]; + ptl_size_t arg_len,ret_len; + int fd=p->to_lib[0]; + int index; + + data_failure(read,fd, &index, sizeof(index)); + + if (index==PTL_FINI) { + lib_fini(b->nal_cb); + if (b->shutdown) (*b->shutdown)(b); + syscall(SYS_write, p->from_lib[1],&b->alive,sizeof(b->alive)); + + /* a heavy-handed but convenient way of shutting down + the lower side thread */ + pthread_exit(0); + } + + data_failure(read,fd, &arg_len, sizeof(arg_len)); + data_failure(read,fd, &ret_len, sizeof(ret_len)); + if (arg_len > sizeof(arg_block) || ret_len > sizeof(ret_block)) { lib_fini(b->nal_cb); return(0); } /* lengths come off the pipe; refuse anything that would overrun the fixed-size blocks, failing the same way data_failure does */ + data_failure(read,fd, arg_block, arg_len); + lib_dispatch(b->nal_cb, NULL, index, arg_block, ret_block); + + data_failure(write,p->from_lib[1],ret_block, ret_len); + return(1); +} +#undef data_failure + + + +static void wakeup_topside(void *z) +{ + bridge b=z; + procbridge p=b->local; + + pthread_mutex_lock(&p->mutex); + pthread_cond_broadcast(&p->cond); + pthread_mutex_unlock(&p->mutex); +} + + +/* Function: nal_thread + * Arguments: z: an opaque reference to a nal control structure + * allocated and partially populated by the api level code + * Returns: nothing, and only on error or explicit shutdown + * + * This function is the entry point of the pthread initiated on + * the api side of the interface. This thread is used to handle + * asynchronous delivery to the application. 
+ * + * We define a limit macro to place a ceiling on limits + * for syntactic convenience + */ +#define LIMIT(x,y,max)\ + if ((unsigned int)x > max) y = max; + +extern int tcpnal_init(bridge); + +nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0}; + +void *nal_thread(void *z) +{ + bridge b=z; + procbridge p=b->local; + int rc; + ptl_pid_t pid_request; + int nal_type; + ptl_ni_limits_t desired; + ptl_ni_limits_t actual; + + b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t)); + b->nal_cb->nal_data=b; + b->nal_cb->cb_read=nal_read; + b->nal_cb->cb_write=nal_write; + b->nal_cb->cb_malloc=nal_malloc; + b->nal_cb->cb_free=nal_free; + b->nal_cb->cb_map=NULL; + b->nal_cb->cb_unmap=NULL; + b->nal_cb->cb_printf=nal_printf; + b->nal_cb->cb_cli=nal_cli; + b->nal_cb->cb_sti=nal_sti; + b->nal_cb->cb_dist=nal_dist; + + + register_io_handler(p->to_lib[0],READ_HANDLER,data_from_api,(void *)b); + + if((rc = syscall(SYS_read, p->to_lib[0], &pid_request, sizeof(pid_request))) != sizeof(pid_request)) + perror("procbridge read from api"); + if((rc = syscall(SYS_read, p->to_lib[0], &desired, sizeof(ptl_ni_limits_t))) != sizeof(ptl_ni_limits_t)) + perror("procbridge read from api"); + if((rc = syscall(SYS_read, p->to_lib[0], &nal_type, sizeof(nal_type))) != sizeof(nal_type)) + perror("procbridge read from api"); + + actual = desired; + LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES); + LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS); + LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS); + LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS); + LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS); + + set_address(b,pid_request); + + rc = nal_table[nal_type] ? (*nal_table[nal_type])(b) : 0; + /* initialize the generic 'library' level code, but only if the nal itself came up; otherwise its failure code is what gets reported */ + + if (!rc) rc = lib_init(b->nal_cb, + b->nal_cb->ni.nid, + b->nal_cb->ni.pid, + 10, + actual.max_ptable_index, + actual.max_atable_index); + + /* + * Whatever the initialization returned is passed back to the + * user level code for further 
interpretation. We just exit if + * it is non-zero since something went wrong. + */ + /* this should perform error checking */ +#if 0 + write(p->from_lib[1], &actual, sizeof(ptl_ni_limits_t)); +#endif + syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc)); + + if(!rc) { + /* the thunk function is called each time the timer loop + performs an operation and returns to blocking mode. we + overload this function to inform the api side that + it may be interested in looking at the event queue */ + register_thunk(wakeup_topside,b); + timer_loop(); + } + return(0); +} +#undef LIMIT + diff --git a/lnet/ulnds/socklnd/select.c b/lnet/ulnds/socklnd/select.c new file mode 100644 index 0000000..c4f84f4 --- /dev/null +++ b/lnet/ulnds/socklnd/select.c @@ -0,0 +1,165 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* select.c: + * Provides a general mechanism for registering and dispatching + * io events through the select system call. 
+ */ + +#ifdef sun +#include +#else +#include +#endif + +#include +#include +#include +#include +#include + + +static struct timeval beginning_of_epoch; +static io_handler io_handlers; + +/* Function: now + * + * Return: the current time in canonical units: a 64 bit number + * where the most significant 32 bits contains the number + * of seconds, and the least signficant a count of (1/(2^32))ths + * of a second. + */ +when now() +{ + struct timeval result; + + gettimeofday(&result,0); + return((((unsigned long long)result.tv_sec)<<32)| + (((unsigned long long)result.tv_usec)<<32)/1000000); +} + + +/* Function: register_io_handler + * Arguments: fd: the file descriptor of interest + * type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER + * function: a function to call when io is available on fd + * arg: an opaque correlator to return to the handler + * Returns: a pointer to the io_handler structure + */ +io_handler register_io_handler(int fd, + int type, + int (*function)(void *), + void *arg) +{ + io_handler i=(io_handler)malloc(sizeof(struct io_handler)); + if ((i->fd=fd)>=0){ + i->type=type; + i->function=function; + i->argument=arg; + i->disabled=0; + i->last=&io_handlers; + if ((i->next=io_handlers)) i->next->last=&i->next; + io_handlers=i; + } + return(i); +} + +/* Function: remove_io_handler + * Arguments: i: a pointer to the handler to stop servicing + * + * remove_io_handler() doesn't actually free the handler, due + * to reentrancy problems. it just marks the handler for + * later cleanup by the blocking function. 
+ */ +void remove_io_handler (io_handler i) +{ + i->disabled=1; +} + +static void set_flag(io_handler n,fd_set *fds) +{ + if (n->type & READ_HANDLER) FD_SET(n->fd,fds); + if (n->type & WRITE_HANDLER) FD_SET(n->fd,fds+1); + if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd,fds+2); +} + + +/* Function: select_timer_block + * Arguments: until: an absolute time when the select should return + * + * This function dispatches the various file descriptors' handler + * functions, if the kernel indicates there is io available. + */ +void select_timer_block(when until) +{ + fd_set fds[3]; + struct timeval timeout; + struct timeval *timeout_pointer; + int result; + io_handler j; + io_handler *k; + + /* TODO: loop until the entire interval is expired*/ + if (until){ + when current=now(), interval=(until>current)?(until-current):0; /* a deadline already in the past must poll, not wrap to a huge wait */ + timeout.tv_sec=(interval>>32); + timeout.tv_usec=((interval&0xffffffffull)*1000000)>>32; /* low 32 bits count 1/2^32ths of a second */ + timeout_pointer=&timeout; + } else timeout_pointer=0; + + FD_ZERO(fds); + FD_ZERO(fds+1); + FD_ZERO(fds+2); + for (k=&io_handlers;*k;){ + if ((*k)->disabled){ + j=*k; + *k=(*k)->next; + free(j); + } + else { /* re-test the new *k on the next pass so runs of disabled handlers are all reaped */ + set_flag(*k,fds); + k=&(*k)->next; + } + } + result=select(FD_SETSIZE,fds,fds+1,fds+2,timeout_pointer); + + if (result > 0) + for (j=io_handlers;j;j=j->next){ + if (!(j->disabled) && + ((FD_ISSET(j->fd,fds) && (j->type & READ_HANDLER)) || + (FD_ISSET(j->fd,fds+1) && (j->type & WRITE_HANDLER)) || + (FD_ISSET(j->fd,fds+2) && (j->type & EXCEPTION_HANDLER)))){ + if (!(*j->function)(j->argument)) + j->disabled=1; + } + } +} + +/* Function: init_unix_timer() + * is called to initialize the library + */ +void init_unix_timer() +{ + io_handlers=0; + gettimeofday(&beginning_of_epoch, 0); + initialize_timer(select_timer_block); +} diff --git a/lnet/ulnds/socklnd/table.c b/lnet/ulnds/socklnd/table.c new file mode 100644 index 0000000..bef13c5 --- /dev/null +++ b/lnet/ulnds/socklnd/table.c @@ -0,0 +1,264 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + 
* + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include + + +/* table.c: + * a very simple hash table implementation with parameterizable + * comparison and key generation functions. it does resize + * in order to accommodate more entries, but never collapses + * the table + */ + +static table_entry *table_lookup (table t,void *comparator, + unsigned int k, + int (*compare_function)(void *, void *), + int *success) +{ + unsigned int key=k%t->size; + table_entry *i; + + for (i=&(t->entries[key]);*i;i=&((*i)->next)){ + if (compare_function && ((*i)->key==k)) + if ((*t->compare_function)((*i)->value,comparator)){ + *success=1; + return(i); + } + } + *success=0; + return(&(t->entries[key])); +} + + +static void resize_table(table t, int size) +{ + int old_size=t->size; + table_entry *old_entries=t->entries; + int i; + table_entry j,n; + table_entry *position; + int success; + + t->size=size; + t->entries=(table_entry *)malloc(sizeof(table_entry)*t->size); + memset(t->entries,0,sizeof(table_entry)*t->size); + + for (i=0;i<old_size;i++) + for (j=old_entries[i];j;j=n){ + n=j->next; + position=table_lookup(t,0,j->key,0,&success); + j->next= *position; + *position=j; + } + free(old_entries); +} + + +/* Function: key_from_int + * Arguments: int i: value 
to compute the key of + * Returns: the key + */ +unsigned int key_from_int(int i) +{ + return(i); +} + + +/* Function: key_from_string + * Arguments: char *s: the null terminated string + * to compute the key of + * Returns: the key + */ +unsigned int key_from_string(char *s) +{ + unsigned int result=0; + unsigned char *n; + int i; + if (!s) return(1); + for (n=s,i=0;*n;n++,i++) result^=(*n*57)^*n*i; + return(result); +} + + +/* Function: hash_create_table + * Arguments: compare_function: a function to compare + * a table instance with a correlator + * key_function: a function to generate a 32 bit + * hash key from a correlator + * Returns: a pointer to the new table + */ +table hash_create_table (int (*compare_function)(void *, void *), + unsigned int (*key_function)(unsigned int *)) +{ + table new=(table)malloc(sizeof(struct table)); + memset(new, 0, sizeof(struct table)); + + new->compare_function=compare_function; + new->key_function=key_function; + new->number_of_entries=0; + new->size=4; + new->entries=(table_entry *)malloc(sizeof(table_entry)*new->size); + memset(new->entries,0,sizeof(table_entry)*new->size); + return(new); +} + + +/* Function: hash_table_find + * Arguments: t: a table to look in + * comparator: a value to access the table entry + * Returns: the element references to by comparator, or null + */ +void *hash_table_find (table t, void *comparator) +{ + int success; + table_entry* entry=table_lookup(t,comparator, + (*t->key_function)(comparator), + t->compare_function, + &success); + if (success) return((*entry)->value); + return(0); +} + + +/* Function: hash_table_insert + * Arguments: t: a table to insert the object + * value: the object to put in the table + * comparator: the value by which the object + * will be addressed + * Returns: nothing + */ +void hash_table_insert (table t, void *value, void *comparator) +{ + int success; + unsigned int k=(*t->key_function)(comparator); + table_entry *position=table_lookup(t,comparator,k, + 
t->compare_function,&success); + table_entry entry; + + if (success) { + entry = *position; + } else { + entry = (table_entry)malloc(sizeof(struct table_entry)); + memset(entry, 0, sizeof(struct table_entry)); + entry->next= *position; + *position=entry; + t->number_of_entries++; + } + entry->value=value; + entry->key=k; + if (t->number_of_entries > t->size) resize_table(t,t->size*2); +} + +/* Function: hash_table_remove + * Arguments: t: the table to remove the object from + * comparator: the index value of the object to remove + * Returns: + */ +void hash_table_remove (table t, void *comparator) +{ + int success; + table_entry temp; + table_entry *position=table_lookup(t,comparator, + (*t->key_function)(comparator), + t->compare_function,&success); + if(success) { + temp=*position; + *position=(*position)->next; + free(temp); /* the value? */ + t->number_of_entries--; + } +} + +/* Function: hash_iterate_table_entries + * Arguments: t: the table to iterate over + * handler: a function to call with each element + * of the table, along with arg + * arg: the opaque object to pass to handler + * Returns: nothing + */ +void hash_iterate_table_entries(table t, + void (*handler)(void *,void *), + void *arg) +{ + int i; + table_entry *j,*next; + + for (i=0;i<t->size;i++) + for (j=t->entries+i;*j;j=next){ + next=&((*j)->next); + (*handler)(arg,(*j)->value); + } +} + +/* Function: hash_filter_table_entries + * Arguments: t: the table to iterate over + * handler: a function to call with each element + * of the table, along with arg + * arg: the opaque object to pass to handler + * Returns: nothing + * Notes: operations on the table inside handler are not safe + * + * filter_table_entires() calls the handler function for each + * item in the table, passing it and arg. The handler function + * returns 1 if it is to be retained in the table, and 0 + * if it is to be removed. 
+ */ +void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg) +{ + int i; + table_entry *j,*next,v; + + for (i=0;i<t->size;i++) + for (j=t->entries+i;*j;j=next){ + next=&((*j)->next); + if (!(*handler)(arg,(*j)->value)){ + next=j; + v=*j; + *j=(*j)->next; + free(v); + t->number_of_entries--; + } + } +} + +/* Function: destroy_table + * Arguments: t: the table to free + * thunk: a function to call with each element, + * most likely free() + * Returns: nothing + */ +void hash_destroy_table(table t,void (*thunk)(void *)) +{ + table_entry j,next; + int i; + for (i=0;i<t->size;i++) + for (j=t->entries[i];j;j=next){ + next=j->next; + if (thunk) (*thunk)(j->value); + free(j); + } + free(t->entries); + free(t); +} diff --git a/lnet/ulnds/socklnd/table.h b/lnet/ulnds/socklnd/table.h new file mode 100644 index 0000000..7fab586 --- /dev/null +++ b/lnet/ulnds/socklnd/table.h @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#ifndef E_TABLE +#define E_TABLE + +typedef struct table_entry { + unsigned int key; + void *value; + struct table_entry *next; +} *table_entry; + + +typedef struct table { + unsigned int size; + int number_of_entries; + table_entry *entries; + int (*compare_function)(void *, void *); + unsigned int (*key_function)(unsigned int *); +} *table; + +/* table.c */ +unsigned int key_from_int(int i); +unsigned int key_from_string(char *s); +table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *)); +void *hash_table_find(table t, void *comparator); +void hash_table_insert(table t, void *value, void *comparator); +void hash_table_remove(table t, void *comparator); +void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg); +void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg); +void hash_destroy_table(table t, void (*thunk)(void *)); + +#endif diff --git a/lnet/ulnds/socklnd/tcplnd.c b/lnet/ulnds/socklnd/tcplnd.c new file mode 100644 index 0000000..534fc17 --- /dev/null +++ b/lnet/ulnds/socklnd/tcplnd.c @@ -0,0 +1,198 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* tcpnal.c: + This file implements the TCP-based nal by providing glue + between the connection service and the generic NAL implementation */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Function: tcpnal_send + * Arguments: nal: pointer to my nal control block + * private: unused + * cookie: passed back to the portals library + * hdr: pointer to the portals header + * nid: destination node + * pid: destination process + * data: body of the message + * len: length of the body + * Returns: zero on success + * + * sends a packet to the peer, after insuring that a connection exists + */ +#warning FIXME: "param 'type' is newly added, make use of it!!" +int tcpnal_send(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int niov, + struct iovec *iov, + size_t len) +{ + connection c; + bridge b=(bridge)n->nal_data; + struct iovec tiov[2]; + int count = 1; + + if (!(c=force_tcp_connection((manager)b->lower, + PNAL_IP(nid,b), + PNAL_PORT(nid,pid)))) + return(1); + +#if 0 + /* TODO: these results should be checked. furthermore, provision + must be made for the SIGPIPE which is delivered when + writing on a tcp socket which has closed underneath + the application. 
there is a linux flag in the sendmsg + call which turns off the signally behaviour, but its + nonstandard */ + syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t)); + LASSERT (niov <= 1); + if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len); +#else + LASSERT (niov <= 1); + + tiov[0].iov_base = hdr; + tiov[0].iov_len = sizeof(ptl_hdr_t); + + if (len) { + tiov[1].iov_base = iov[0].iov_base; + tiov[1].iov_len = len; + count++; + } + + syscall(SYS_writev, c->fd, tiov, count); +#endif + lib_finalize(n, private, cookie); + + return(0); +} + + +/* Function: tcpnal_recv + * Arguments: nal_cb_t *nal: pointer to my nal control block + * void *private: connection pointer passed through + * lib_parse() + * lib_msg_t *cookie: passed back to portals library + * user_ptr data: pointer to the destination buffer + * size_t mlen: length of the body + * size_t rlen: length of data in the network + * Returns: zero on success + * + * blocking read of the requested data. must drain out the + * difference of mainpulated and requested lengths from the network + */ +int tcpnal_recv(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + ptl_size_t mlen, + ptl_size_t rlen) + +{ + if (mlen) { + LASSERT (niov <= 1); + read_connection(private,iov[0].iov_base,mlen); + lib_finalize(n, private, cookie); + } + + if (mlen!=rlen){ + char *trash=malloc(rlen-mlen); + + /*TODO: check error status*/ + read_connection(private,trash,rlen-mlen); + free(trash); + } + + return(rlen); +} + + +/* Function: from_connection: + * Arguments: c: the connection to read from + * Returns: whether or not to continue reading from this connection, + * expressed as a 1 to continue, and a 0 to not + * + * from_connection() is called from the select loop when i/o is + * available. It attempts to read the portals header and + * pass it to the generic library for processing. 
+ */ +static int from_connection(void *a, void *d) +{ + connection c = d; + bridge b=a; + ptl_hdr_t hdr; + + if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){ + lib_parse(b->nal_cb, &hdr, c); + return(1); + } + return(0); +} + + +static void tcpnal_shutdown(bridge b) +{ + shutdown_connections(b->lower); +} + +/* Function: PTL_IFACE_TCP + * Arguments: pid_request: desired port number to bind to + * desired: passed NAL limits structure + * actual: returned NAL limits structure + * Returns: a nal structure on success, or null on failure + */ +int tcpnal_init(bridge b) +{ + manager m; + + b->nal_cb->cb_send=tcpnal_send; + b->nal_cb->cb_recv=tcpnal_recv; + b->shutdown=tcpnal_shutdown; + + if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid, + b->nal_cb->ni.pid), + from_connection,b))){ + /* TODO: this needs to shut down the + newly created junk */ + return(PTL_NAL_FAILED); + } + /* XXX cfs hack */ + b->nal_cb->ni.pid=0; + b->lower=m; + return(PTL_OK); +} diff --git a/lnet/ulnds/socklnd/timer.h b/lnet/ulnds/socklnd/timer.h new file mode 100644 index 0000000..aaf39d2 --- /dev/null +++ b/lnet/ulnds/socklnd/timer.h @@ -0,0 +1,30 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +/* TODO: make this an explicit type when they become available */ +typedef unsigned long long when; + +typedef struct timer { + void (*function)(void *); + void *arg; + when w; + int interval; + int disable; +} *timer; + +timer register_timer(when, void (*f)(void *), void *a); +void remove_timer(timer t); +void timer_loop(void); +void initialize_timer(void); +void register_thunk(void (*f)(void *),void *a); + + +#define HZ 0x100000000ull + + diff --git a/lnet/ulnds/socklnd/utypes.h b/lnet/ulnds/socklnd/utypes.h new file mode 100644 index 0000000..7eca959 --- /dev/null +++ b/lnet/ulnds/socklnd/utypes.h @@ -0,0 +1,12 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +typedef unsigned short uint16; +typedef unsigned long uint32; +typedef unsigned long long uint64; +typedef unsigned char uint8; diff --git a/lnet/ulnds/table.c b/lnet/ulnds/table.c new file mode 100644 index 0000000..bef13c5 --- /dev/null +++ b/lnet/ulnds/table.c @@ -0,0 +1,264 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include + + +/* table.c: + * a very simple hash table implementation with paramerterizable + * comparison and key generation functions. it does resize + * in order to accomidate more entries, but never collapses + * the table + */ + +static table_entry *table_lookup (table t,void *comparator, + unsigned int k, + int (*compare_function)(void *, void *), + int *success) +{ + unsigned int key=k%t->size; + table_entry *i; + + for (i=&(t->entries[key]);*i;i=&((*i)->next)){ + if (compare_function && ((*i)->key==k)) + if ((*t->compare_function)((*i)->value,comparator)){ + *success=1; + return(i); + } + } + *success=0; + return(&(t->entries[key])); +} + + +static void resize_table(table t, int size) +{ + int old_size=t->size; + table_entry *old_entries=t->entries; + int i; + table_entry j,n; + table_entry *position; + int success; + + t->size=size; + t->entries=(table_entry *)malloc(sizeof(table_entry)*t->size); + memset(t->entries,0,sizeof(table_entry)*t->size); + + for (i=0;inext; + position=table_lookup(t,0,j->key,0,&success); + j->next= *position; + *position=j; + } + free(old_entries); +} + + +/* Function: key_from_int + * Arguments: int i: value to compute the key of + * Returns: the key + */ +unsigned int key_from_int(int i) +{ + return(i); +} + + +/* Function: key_from_string + * Arguments: char *s: the null terminated string + * to compute the key of + * Returns: the key + */ +unsigned int key_from_string(char *s) +{ + unsigned int result=0; + unsigned char *n; + int i; + if (!s) return(1); + for (n=s,i=0;*n;n++,i++) result^=(*n*57)^*n*i; + return(result); +} + + +/* Function: hash_create_table + * Arguments: compare_function: a function to compare + * a table instance with a correlator + * key_function: a function to generate a 32 bit + 
* hash key from a correlator + * Returns: a pointer to the new table + */ +table hash_create_table (int (*compare_function)(void *, void *), + unsigned int (*key_function)(unsigned int *)) +{ + table new=(table)malloc(sizeof(struct table)); + memset(new, 0, sizeof(struct table)); + + new->compare_function=compare_function; + new->key_function=key_function; + new->number_of_entries=0; + new->size=4; + new->entries=(table_entry *)malloc(sizeof(table_entry)*new->size); + memset(new->entries,0,sizeof(table_entry)*new->size); + return(new); +} + + +/* Function: hash_table_find + * Arguments: t: a table to look in + * comparator: a value to access the table entry + * Returns: the element references to by comparator, or null + */ +void *hash_table_find (table t, void *comparator) +{ + int success; + table_entry* entry=table_lookup(t,comparator, + (*t->key_function)(comparator), + t->compare_function, + &success); + if (success) return((*entry)->value); + return(0); +} + + +/* Function: hash_table_insert + * Arguments: t: a table to insert the object + * value: the object to put in the table + * comparator: the value by which the object + * will be addressed + * Returns: nothing + */ +void hash_table_insert (table t, void *value, void *comparator) +{ + int success; + unsigned int k=(*t->key_function)(comparator); + table_entry *position=table_lookup(t,comparator,k, + t->compare_function,&success); + table_entry entry; + + if (success) { + entry = *position; + } else { + entry = (table_entry)malloc(sizeof(struct table_entry)); + memset(entry, 0, sizeof(struct table_entry)); + entry->next= *position; + *position=entry; + t->number_of_entries++; + } + entry->value=value; + entry->key=k; + if (t->number_of_entries > t->size) resize_table(t,t->size*2); +} + +/* Function: hash_table_remove + * Arguments: t: the table to remove the object from + * comparator: the index value of the object to remove + * Returns: + */ +void hash_table_remove (table t, void *comparator) +{ + int 
success; + table_entry temp; + table_entry *position=table_lookup(t,comparator, + (*t->key_function)(comparator), + t->compare_function,&success); + if(success) { + temp=*position; + *position=(*position)->next; + free(temp); /* the value? */ + t->number_of_entries--; + } +} + +/* Function: hash_iterate_table_entries + * Arguments: t: the table to iterate over + * handler: a function to call with each element + * of the table, along with arg + * arg: the opaque object to pass to handler + * Returns: nothing + */ +void hash_iterate_table_entries(table t, + void (*handler)(void *,void *), + void *arg) +{ + int i; + table_entry *j,*next; + + for (i=0;isize;i++) + for (j=t->entries+i;*j;j=next){ + next=&((*j)->next); + (*handler)(arg,(*j)->value); + } +} + +/* Function: hash_filter_table_entries + * Arguments: t: the table to iterate over + * handler: a function to call with each element + * of the table, along with arg + * arg: the opaque object to pass to handler + * Returns: nothing + * Notes: operations on the table inside handler are not safe + * + * filter_table_entires() calls the handler function for each + * item in the table, passing it and arg. The handler function + * returns 1 if it is to be retained in the table, and 0 + * if it is to be removed. 
+ */ +void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg) +{ + int i; + table_entry *j,*next,v; + + for (i=0;isize;i++) + for (j=t->entries+i;*j;j=next){ + next=&((*j)->next); + if (!(*handler)(arg,(*j)->value)){ + next=j; + v=*j; + *j=(*j)->next; + free(v); + t->number_of_entries--; + } + } +} + +/* Function: destroy_table + * Arguments: t: the table to free + * thunk: a function to call with each element, + * most likely free() + * Returns: nothing + */ +void hash_destroy_table(table t,void (*thunk)(void *)) +{ + table_entry j,next; + int i; + for (i=0;isize;i++) + for (j=t->entries[i];j;j=next){ + next=j->next; + if (thunk) (*thunk)(j->value); + free(j); + } + free(t->entries); + free(t); +} diff --git a/lnet/ulnds/table.h b/lnet/ulnds/table.h new file mode 100644 index 0000000..7fab586 --- /dev/null +++ b/lnet/ulnds/table.h @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#ifndef E_TABLE +#define E_TABLE + +typedef struct table_entry { + unsigned int key; + void *value; + struct table_entry *next; +} *table_entry; + + +typedef struct table { + unsigned int size; + int number_of_entries; + table_entry *entries; + int (*compare_function)(void *, void *); + unsigned int (*key_function)(unsigned int *); +} *table; + +/* table.c */ +unsigned int key_from_int(int i); +unsigned int key_from_string(char *s); +table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *)); +void *hash_table_find(table t, void *comparator); +void hash_table_insert(table t, void *value, void *comparator); +void hash_table_remove(table t, void *comparator); +void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg); +void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg); +void hash_destroy_table(table t, void (*thunk)(void *)); + +#endif diff --git a/lnet/ulnds/tcplnd.c b/lnet/ulnds/tcplnd.c new file mode 100644 index 0000000..534fc17 --- /dev/null +++ b/lnet/ulnds/tcplnd.c @@ -0,0 +1,198 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* tcpnal.c: + This file implements the TCP-based nal by providing glue + between the connection service and the generic NAL implementation */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Function: tcpnal_send + * Arguments: nal: pointer to my nal control block + * private: unused + * cookie: passed back to the portals library + * hdr: pointer to the portals header + * nid: destination node + * pid: destination process + * data: body of the message + * len: length of the body + * Returns: zero on success + * + * sends a packet to the peer, after insuring that a connection exists + */ +#warning FIXME: "param 'type' is newly added, make use of it!!" +int tcpnal_send(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int niov, + struct iovec *iov, + size_t len) +{ + connection c; + bridge b=(bridge)n->nal_data; + struct iovec tiov[2]; + int count = 1; + + if (!(c=force_tcp_connection((manager)b->lower, + PNAL_IP(nid,b), + PNAL_PORT(nid,pid)))) + return(1); + +#if 0 + /* TODO: these results should be checked. furthermore, provision + must be made for the SIGPIPE which is delivered when + writing on a tcp socket which has closed underneath + the application. 
there is a linux flag in the sendmsg + call which turns off the signally behaviour, but its + nonstandard */ + syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t)); + LASSERT (niov <= 1); + if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len); +#else + LASSERT (niov <= 1); + + tiov[0].iov_base = hdr; + tiov[0].iov_len = sizeof(ptl_hdr_t); + + if (len) { + tiov[1].iov_base = iov[0].iov_base; + tiov[1].iov_len = len; + count++; + } + + syscall(SYS_writev, c->fd, tiov, count); +#endif + lib_finalize(n, private, cookie); + + return(0); +} + + +/* Function: tcpnal_recv + * Arguments: nal_cb_t *nal: pointer to my nal control block + * void *private: connection pointer passed through + * lib_parse() + * lib_msg_t *cookie: passed back to portals library + * user_ptr data: pointer to the destination buffer + * size_t mlen: length of the body + * size_t rlen: length of data in the network + * Returns: zero on success + * + * blocking read of the requested data. must drain out the + * difference of mainpulated and requested lengths from the network + */ +int tcpnal_recv(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + ptl_size_t mlen, + ptl_size_t rlen) + +{ + if (mlen) { + LASSERT (niov <= 1); + read_connection(private,iov[0].iov_base,mlen); + lib_finalize(n, private, cookie); + } + + if (mlen!=rlen){ + char *trash=malloc(rlen-mlen); + + /*TODO: check error status*/ + read_connection(private,trash,rlen-mlen); + free(trash); + } + + return(rlen); +} + + +/* Function: from_connection: + * Arguments: c: the connection to read from + * Returns: whether or not to continue reading from this connection, + * expressed as a 1 to continue, and a 0 to not + * + * from_connection() is called from the select loop when i/o is + * available. It attempts to read the portals header and + * pass it to the generic library for processing. 
+ */ +static int from_connection(void *a, void *d) +{ + connection c = d; + bridge b=a; + ptl_hdr_t hdr; + + if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){ + lib_parse(b->nal_cb, &hdr, c); + return(1); + } + return(0); +} + + +static void tcpnal_shutdown(bridge b) +{ + shutdown_connections(b->lower); +} + +/* Function: PTL_IFACE_TCP + * Arguments: pid_request: desired port number to bind to + * desired: passed NAL limits structure + * actual: returned NAL limits structure + * Returns: a nal structure on success, or null on failure + */ +int tcpnal_init(bridge b) +{ + manager m; + + b->nal_cb->cb_send=tcpnal_send; + b->nal_cb->cb_recv=tcpnal_recv; + b->shutdown=tcpnal_shutdown; + + if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid, + b->nal_cb->ni.pid), + from_connection,b))){ + /* TODO: this needs to shut down the + newly created junk */ + return(PTL_NAL_FAILED); + } + /* XXX cfs hack */ + b->nal_cb->ni.pid=0; + b->lower=m; + return(PTL_OK); +} diff --git a/lnet/ulnds/timer.h b/lnet/ulnds/timer.h new file mode 100644 index 0000000..aaf39d2 --- /dev/null +++ b/lnet/ulnds/timer.h @@ -0,0 +1,30 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +/* TODO: make this an explicit type when they become available */ +typedef unsigned long long when; + +typedef struct timer { + void (*function)(void *); + void *arg; + when w; + int interval; + int disable; +} *timer; + +timer register_timer(when, void (*f)(void *), void *a); +void remove_timer(timer t); +void timer_loop(void); +void initialize_timer(void); +void register_thunk(void (*f)(void *),void *a); + + +#define HZ 0x100000000ull + + diff --git a/lnet/ulnds/utypes.h b/lnet/ulnds/utypes.h new file mode 100644 index 0000000..7eca959 --- /dev/null +++ b/lnet/ulnds/utypes.h @@ -0,0 +1,12 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +typedef unsigned short uint16; +typedef unsigned long uint32; +typedef unsigned long long uint64; +typedef unsigned char uint8; diff --git a/lnet/utils/.cvsignore b/lnet/utils/.cvsignore new file mode 100644 index 0000000..148310a --- /dev/null +++ b/lnet/utils/.cvsignore @@ -0,0 +1,8 @@ +Makefile +Makefile.in +acceptor +debugctl +ptlctl +.deps +routerstat +wirecheck \ No newline at end of file diff --git a/lnet/utils/Makefile.am b/lnet/utils/Makefile.am new file mode 100644 index 0000000..05af598 --- /dev/null +++ b/lnet/utils/Makefile.am @@ -0,0 +1,27 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + + +COMPILE = gcc -Wall -g -I$(srcdir)/../include +LINK = gcc -o $@ + +sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck +lib_LIBRARIES = libptlctl.a + +acceptor_SOURCES = acceptor.c # -lefence + +wirecheck_SOURCES = wirecheck.c + +libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h + +ptlctl_SOURCES = ptlctl.c +ptlctl_LDADD = -L. -lptlctl -lncurses # -lefence +ptlctl_DEPENDENCIES = libptlctl.a + +debugctl_SOURCES = debugctl.c +debugctl_LDADD = -L. -lptlctl -lncurses # -lefence +debugctl_DEPENDENCIES = libptlctl.a + +routerstat_SOURCES = routerstat.c diff --git a/lnet/utils/acceptor.c b/lnet/utils/acceptor.c new file mode 100644 index 0000000..c6590db --- /dev/null +++ b/lnet/utils/acceptor.c @@ -0,0 +1,466 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +/* should get this from autoconf somehow */ +#ifndef PIDFILE_DIR +#define PIDFILE_DIR "/var/run" +#endif + +#define PROGNAME "acceptor" + +void create_pidfile(char *name, int port) +{ + char pidfile[1024]; + FILE *fp; + + snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", + PIDFILE_DIR, name, port); + + if ((fp = fopen(pidfile, "w"))) { + fprintf(fp, "%d\n", getpid()); + fclose(fp); + } else { + syslog(LOG_ERR, "%s: %s\n", pidfile, + strerror(errno)); + } +} + +int pidfile_exists(char *name, int port) +{ + char pidfile[1024]; + + snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", + PIDFILE_DIR, name, port); + + if (!access(pidfile, F_OK)) { + fprintf(stderr, "%s: exists, acceptor already running.\n", + pidfile); + return (1); + } + return (0); +} + +int +parse_size (int *sizep, char *str) +{ + int size; + char mod[32]; + + switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) + { + default: + return (-1); + + case 1: + 
*sizep = size; + return (0); + + case 2: + switch (*mod) + { + case 'g': + case 'G': + *sizep = size << 30; + return (0); + + case 'm': + case 'M': + *sizep = size << 20; + return (0); + + case 'k': + case 'K': + *sizep = size << 10; + return (0); + + default: + *sizep = size; + return (0); + } + } +} + +void +show_connection (int fd, __u32 net_ip, ptl_nid_t nid) +{ + struct hostent *h = gethostbyaddr ((char *)&net_ip, sizeof net_ip, AF_INET); + __u32 host_ip = ntohl (net_ip); + int rxmem = 0; + int txmem = 0; + int nonagle = 0; + int len; + char host[1024]; + + len = sizeof (txmem); + if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &len) != 0) + perror ("Cannot get write buffer size"); + + len = sizeof (rxmem); + if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &len) != 0) + perror ("Cannot get read buffer size"); + + len = sizeof (nonagle); + if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &len) != 0) + perror ("Cannot get nagle"); + + if (h == NULL) + snprintf (host, sizeof(host), "%d.%d.%d.%d", (host_ip >> 24) & 0xff, + (host_ip >> 16) & 0xff, (host_ip >> 8) & 0xff, host_ip & 0xff); + else + snprintf (host, sizeof(host), "%s", h->h_name); + + syslog (LOG_INFO, "Accepted host: %s NID: "LPX64" snd: %d rcv %d nagle: %s\n", + host, nid, txmem, rxmem, nonagle ? 
"disabled" : "enabled"); +} + +int +sock_write (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = write (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) + { + fprintf (stderr, "Unexpected zero sock_write\n"); + abort(); + } + + nob -= rc; + buffer = (char *)buffer + nob; + } + + return (0); +} + +int +sock_read (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = read (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) /* EOF */ + { + errno = ECONNABORTED; + return (-1); + } + + nob -= rc; + buffer = (char *)buffer + nob; + } + + return (0); +} + +int +exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid) +{ + int rc; + ptl_hdr_t hdr; + ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; + + LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); + + memset (&hdr, 0, sizeof (hdr)); + + hmv->magic = __cpu_to_le32 (PORTALS_PROTO_MAGIC); + hmv->version_major = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); + hmv->version_minor = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); + + hdr.src_nid = __cpu_to_le64 (my_nid); + hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); + + /* Assume there's sufficient socket buffering for a portals HELLO header */ + rc = sock_write (cfd, &hdr, sizeof (hdr)); + if (rc != 0) { + perror ("Can't send initial HELLO"); + return (-1); + } + + /* First few bytes down the wire are the portals protocol magic and + * version, no matter what protocol version we're running. 
*/ + + rc = sock_read (cfd, hmv, sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read from peer"); + return (-1); + } + + if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) { + fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", + __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC); + return (-1); + } + + if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR || + __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) { + fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n", + __cpu_to_le16 (hmv->version_major), + __cpu_to_le16 (hmv->version_minor), + PORTALS_PROTO_VERSION_MAJOR, + PORTALS_PROTO_VERSION_MINOR); + } + + /* version 0 sends magic/version as the dest_nid of a 'hello' header, + * so read the rest of it in now... */ + LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0); + rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read rest of HELLO hdr"); + return (-1); + } + + /* ...and check we got what we expected */ + if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO || + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) { + fprintf (stderr, "Expecting a HELLO hdr with 0 payload," + " but got type %d with %d payload\n", + __cpu_to_le32 (hdr.type), + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr))); + return (-1); + } + + *peer_nid = __le64_to_cpu (hdr.src_nid); + return (0); +} + +void +usage (char *myname) +{ + fprintf (stderr, "Usage: %s [-r recv_mem] [-s send_mem] [-n] [-N nal_id] port\n", myname); + exit (1); +} + +int main(int argc, char **argv) +{ + int o, fd, rc, port, pfd; + struct sockaddr_in srvaddr; + int c; + int rxmem = 0; + int txmem = 0; + int noclose = 0; + int nonagle = 1; + int nal = SOCKNAL; + int xchg_nids = 0; + int bind_irq = 0; + + while ((c = getopt (argc, argv, "N:r:s:nlxi")) != -1) + switch (c) + { + case 'r': + if (parse_size (&rxmem, optarg) != 0 || rxmem < 0) + usage (argv[0]); + break; + + case 's': + if (parse_size (&txmem, optarg) != 0 || txmem < 0) + usage 
(argv[0]); + break; + + case 'n': + nonagle = 0; + break; + + case 'l': + noclose = 1; + break; + + case 'x': + xchg_nids = 1; + break; + + case 'i': + bind_irq = 1; + break; + + case 'N': + if (parse_size(&nal, optarg) != 0 || + nal < 0 || nal > NAL_MAX_NR) + usage(argv[0]); + break; + + default: + usage (argv[0]); + break; + } + + if (optind >= argc) + usage (argv[0]); + + port = atol(argv[optind++]); + + if (pidfile_exists(PROGNAME, port)) + exit(1); + + memset(&srvaddr, 0, sizeof(srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(port); + srvaddr.sin_addr.s_addr = INADDR_ANY; + + fd = socket(PF_INET, SOCK_STREAM, 0); + if (fd < 0) { + perror("opening socket"); + exit(1); + } + + o = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &o, sizeof(o))) { + perror("Cannot set REUSEADDR socket opt"); + exit(1); + } + + if (nonagle) + { + o = 1; + rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)); + if (rc != 0) + { + perror ("Cannot disable nagle"); + exit (1); + } + } + + if (txmem != 0) + { + rc = setsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, sizeof (txmem)); + if (rc != 0) + { + perror ("Cannot set write buffer size"); + exit (1); + } + } + + if (rxmem != 0) + { + rc = setsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, sizeof (rxmem)); + if (rc != 0) + { + perror ("Cannot set read buffer size"); + exit (1); + } + } + + rc = bind(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); + if ( rc == -1 ) { + perror("bind: "); + exit(1); + } + + if (listen(fd, 127)) { + perror("listen: "); + exit(1); + } + fprintf(stderr, "listening on port %d\n", port); + + pfd = open("/dev/portals", O_RDWR); + if ( pfd < 0 ) { + perror("opening portals device"); + exit(1); + } + + rc = daemon(1, noclose); + if (rc < 0) { + perror("daemon(): "); + exit(1); + } + + openlog(PROGNAME, LOG_PID, LOG_DAEMON); + syslog(LOG_INFO, "started, listening on port %d\n", port); + create_pidfile(PROGNAME, port); + + while (1) { + struct sockaddr_in clntaddr; + int len = 
sizeof(clntaddr); + int cfd; + struct portal_ioctl_data data; + ptl_nid_t peer_nid; + + cfd = accept(fd, (struct sockaddr *)&clntaddr, &len); + if ( cfd < 0 ) { + perror("accept"); + exit(0); + continue; + } + + if (!xchg_nids) + peer_nid = ntohl (clntaddr.sin_addr.s_addr); /* HOST byte order */ + else + { + PORTAL_IOC_INIT (data); + data.ioc_nal = nal; + rc = ioctl (pfd, IOC_PORTAL_GET_NID, &data); + if (rc < 0) + { + perror ("Can't get my NID"); + close (cfd); + continue; + } + + rc = exchange_nids (cfd, data.ioc_nid, &peer_nid); + if (rc != 0) + { + close (cfd); + continue; + } + } + + show_connection (cfd, clntaddr.sin_addr.s_addr, peer_nid); + + PORTAL_IOC_INIT(data); + data.ioc_fd = cfd; + data.ioc_nal = nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD; + data.ioc_nid = peer_nid; + data.ioc_flags = bind_irq; + + if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) { + perror("ioctl failed"); + + } else { + printf("client registered\n"); + } + rc = close(cfd); + if (rc) + perror ("close failed"); + } + + closelog(); + exit(0); + +} diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c new file mode 100644 index 0000000..9ab1c73d --- /dev/null +++ b/lnet/utils/debug.c @@ -0,0 +1,618 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Some day I'll split all of this functionality into a cfs_debug module + * of its own. That day is not today. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#define BUG() /* workaround for module.h includes */ +#include + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#include +#endif + +#include +#include +#include "parser.h" + +static char rawbuf[8192]; +static char *buf = rawbuf; +static int max = 8192; +//static int g_pfd = -1; +static int subsystem_array[1 << 8]; +static int debug_mask = ~0; + +static const char *portal_debug_subsystems[] = + {"undefined", "mdc", "mds", "osc", "ost", "class", "obdfs", "llite", + "rpc", "ext2obd", "portals", "socknal", "qswnal", "pinger", "filter", + "obdtrace", "echo", "ldlm", "lov", "gmnal", "router", "ptldb", NULL}; +static const char *portal_debug_masks[] = + {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl", + "blocks", "net", "warning", "buffs", "other", "dentry", "portals", + "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", NULL}; + +struct debug_daemon_cmd { + char *cmd; + unsigned int cmdv; +}; + +static const struct debug_daemon_cmd portal_debug_daemon_cmd[] = { + {"start", DEBUG_DAEMON_START}, + {"stop", DEBUG_DAEMON_STOP}, + {"pause", DEBUG_DAEMON_PAUSE}, + {"continue", DEBUG_DAEMON_CONTINUE}, + {0, 0} +}; + +static int do_debug_mask(char *name, int enable) +{ + int found = 0, i; + + for (i = 0; portal_debug_subsystems[i] != NULL; i++) { + if (strcasecmp(name, portal_debug_subsystems[i]) == 0 || + strcasecmp(name, "all_subs") == 0) { + printf("%s output from subsystem \"%s\"\n", + enable ? 
"Enabling" : "Disabling", + portal_debug_subsystems[i]); + subsystem_array[i] = enable; + found = 1; + } + } + for (i = 0; portal_debug_masks[i] != NULL; i++) { + if (strcasecmp(name, portal_debug_masks[i]) == 0 || + strcasecmp(name, "all_types") == 0) { + printf("%s output of type \"%s\"\n", + enable ? "Enabling" : "Disabling", + portal_debug_masks[i]); + if (enable) + debug_mask |= (1 << i); + else + debug_mask &= ~(1 << i); + found = 1; + } + } + + return found; +} + +int dbg_initialize(int argc, char **argv) +{ + memset(subsystem_array, 1, sizeof(subsystem_array)); + return 0; +} + +int jt_dbg_filter(int argc, char **argv) +{ + int i; + + if (argc < 2) { + fprintf(stderr, "usage: %s \n", + argv[0]); + return 0; + } + + for (i = 1; i < argc; i++) + if (!do_debug_mask(argv[i], 0)) + fprintf(stderr, "Unknown subsystem or debug type: %s\n", + argv[i]); + return 0; +} + +int jt_dbg_show(int argc, char **argv) +{ + int i; + + if (argc < 2) { + fprintf(stderr, "usage: %s \n", + argv[0]); + return 0; + } + + for (i = 1; i < argc; i++) + if (!do_debug_mask(argv[i], 1)) + fprintf(stderr, "Unknown subsystem or debug type: %s\n", + argv[i]); + + return 0; +} + +static int applymask(char* procpath, int value) +{ + int rc; + char buf[64]; + int len = snprintf(buf, 64, "%d", value); + + int fd = open(procpath, O_WRONLY); + if (fd == -1) { + fprintf(stderr, "Unable to open %s: %s\n", + procpath, strerror(errno)); + return fd; + } + rc = write(fd, buf, len+1); + if (rc<0) { + fprintf(stderr, "Write to %s failed: %s\n", + procpath, strerror(errno)); + return rc; + } + close(fd); + return 0; +} + +extern char *dump_filename; +extern int dump(int dev_id, int opc, void *buf); + +static void applymask_all(unsigned int subs_mask, unsigned int debug_mask) +{ + if (!dump_filename) { + applymask("/proc/sys/portals/subsystem_debug", subs_mask); + applymask("/proc/sys/portals/debug", debug_mask); + } else { + struct portals_debug_ioctl_data data; + + data.hdr.ioc_len = sizeof(data); + 
data.hdr.ioc_version = 0; + data.subs = subs_mask; + data.debug = debug_mask; + + dump(OBD_DEV_ID, PTL_IOC_DEBUG_MASK, &data); + } + printf("Applied subsystem_debug=%d, debug=%d to /proc/sys/portals\n", + subs_mask, debug_mask); +} + +int jt_dbg_list(int argc, char **argv) +{ + int i; + + if (argc != 2) { + fprintf(stderr, "usage: %s \n", argv[0]); + return 0; + } + + if (strcasecmp(argv[1], "subs") == 0) { + printf("Subsystems: all_subs"); + for (i = 0; portal_debug_subsystems[i] != NULL; i++) + printf(", %s", portal_debug_subsystems[i]); + printf("\n"); + } else if (strcasecmp(argv[1], "types") == 0) { + printf("Types: all_types"); + for (i = 0; portal_debug_masks[i] != NULL; i++) + printf(", %s", portal_debug_masks[i]); + printf("\n"); + } + else if (strcasecmp(argv[1], "applymasks") == 0) { + unsigned int subsystem_mask = 0; + for (i = 0; portal_debug_subsystems[i] != NULL; i++) { + if (subsystem_array[i]) subsystem_mask |= (1 << i); + } + applymask_all(subsystem_mask, debug_mask); + } + return 0; +} + +/* if 'raw' is true, don't strip the debug information from the front of the + * lines */ +static void dump_buffer(FILE *fd, char *buf, int size, int raw) +{ + char *p, *z; + unsigned long subsystem, debug, dropped = 0, kept = 0; + int max_sub, max_type; + + for (max_sub = 0; portal_debug_subsystems[max_sub] != NULL; max_sub++) + ; + for (max_type = 0; portal_debug_masks[max_type] != NULL; max_type++) + ; + + while (size) { + p = memchr(buf, '\n', size); + if (!p) + break; + subsystem = strtoul(buf, &z, 16); + debug = strtoul(z + 1, &z, 16); + + z++; + /* for some reason %*s isn't working. 
*/ + *p = '\0'; + if (subsystem < max_sub && + subsystem_array[subsystem] && + (!debug || (debug_mask & debug))) { + if (raw) + fprintf(fd, "%s\n", buf); + else + fprintf(fd, "%s\n", z); + //printf("%s\n", buf); + kept++; + } else { + //fprintf(stderr, "dropping line (%lx:%lx): %s\n", subsystem, debug, buf); + dropped++; + } + *p = '\n'; + p++; + size -= (p - buf); + buf = p; + } + + printf("Debug log: %lu lines, %lu kept, %lu dropped.\n", + dropped + kept, kept, dropped); +} + +int jt_dbg_debug_kernel(int argc, char **argv) +{ + int rc, raw = 1; + FILE *fd = stdout; + const int databuf_size = (6 << 20); + struct portal_ioctl_data data, *newdata; + char *databuf = NULL; + + if (argc > 3) { + fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]); + return 0; + } + + if (argc > 1) { + fd = fopen(argv[1], "w"); + if (fd == NULL) { + fprintf(stderr, "fopen(%s) failed: %s\n", argv[1], + strerror(errno)); + return -1; + } + } + if (argc > 2) + raw = atoi(argv[2]); + + databuf = malloc(databuf_size); + if (!databuf) { + fprintf(stderr, "No memory for buffer.\n"); + goto out; + } + + memset(&data, 0, sizeof(data)); + data.ioc_plen1 = databuf_size; + data.ioc_pbuf1 = databuf; + + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + goto out; + } + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_DEBUG, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_GET_DEBUG failed: %s\n", + strerror(errno)); + goto out; + } + + newdata = (struct portal_ioctl_data *)buf; + if (newdata->ioc_size > 0) + dump_buffer(fd, databuf, newdata->ioc_size, raw); + + out: + if (databuf) + free(databuf); + if (fd != stdout) + fclose(fd); + return 0; +} + +int jt_dbg_debug_daemon(int argc, char **argv) +{ + int i, rc; + unsigned int cmd = 0; + FILE *fd = stdout; + struct portal_ioctl_data data; + + if (argc <= 1) { + fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|" + "continue]\n", argv[0]); + return 0; + } + for (i = 0; portal_debug_daemon_cmd[i].cmd 
!= NULL; i++) { + if (strcasecmp(argv[1], portal_debug_daemon_cmd[i].cmd) == 0) { + cmd = portal_debug_daemon_cmd[i].cmdv; + break; + } + } + if (portal_debug_daemon_cmd[i].cmd == NULL) { + fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|" + "continue]\n", argv[0]); + return 0; + } + memset(&data, 0, sizeof(data)); + if (cmd == DEBUG_DAEMON_START) { + if (argc < 3) { + fprintf(stderr, "usage: %s [start file <#MB>|stop|" + "pause|continue]\n", argv[0]); + return 0; + } + if (access(argv[2], F_OK) != 0) { + fd = fopen(argv[2], "w"); + if (fd != NULL) { + fclose(fd); + remove(argv[2]); + goto ok; + } + } + if (access(argv[2], W_OK) == 0) + goto ok; + fprintf(stderr, "fopen(%s) failed: %s\n", argv[2], + strerror(errno)); + return -1; +ok: + data.ioc_inllen1 = strlen(argv[2]) + 1; + data.ioc_inlbuf1 = argv[2]; + data.ioc_misc = 0; + if (argc == 4) { + unsigned long size; + errno = 0; + size = strtoul(argv[3], NULL, 0); + if (errno) { + fprintf(stderr, "file size(%s): error %s\n", + argv[3], strerror(errno)); + return -1; + } + data.ioc_misc = size; + } + } + data.ioc_count = cmd; + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + return -1; + } + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_SET_DAEMON, buf); + if (rc < 0) { + fprintf(stderr, "IOC_PORTAL_SET_DEMON failed: %s\n", + strerror(errno)); + return rc; + } + return 0; +} + +int jt_dbg_debug_file(int argc, char **argv) +{ + int rc, fd = -1, raw = 1; + FILE *output = stdout; + char *databuf = NULL; + struct stat statbuf; + + if (argc > 4 || argc < 2) { + fprintf(stderr, "usage: %s [output] [raw]\n", argv[0]); + return 0; + } + + fd = open(argv[1], O_RDONLY); + if (fd < 0) { + fprintf(stderr, "fopen(%s) failed: %s\n", argv[1], + strerror(errno)); + return -1; + } +#warning FIXME: cleanup fstat issue here +#ifndef SYS_fstat64 +#define __SYS_fstat__ SYS_fstat +#else +#define __SYS_fstat__ SYS_fstat64 +#endif + rc = syscall(__SYS_fstat__, fd, &statbuf); + if (rc < 
0) { + fprintf(stderr, "fstat failed: %s\n", strerror(errno)); + goto out; + } + + if (argc >= 3) { + output = fopen(argv[2], "w"); + if (output == NULL) { + fprintf(stderr, "fopen(%s) failed: %s\n", argv[2], + strerror(errno)); + goto out; + } + } + + if (argc == 4) + raw = atoi(argv[3]); + + databuf = mmap(NULL, statbuf.st_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (databuf == NULL) { + fprintf(stderr, "mmap failed: %s\n", strerror(errno)); + goto out; + } + + dump_buffer(output, databuf, statbuf.st_size, raw); + + out: + if (databuf) + munmap(databuf, statbuf.st_size); + if (output != stdout) + fclose(output); + if (fd > 0) + close(fd); + return 0; +} + +int jt_dbg_clear_debug_buf(int argc, char **argv) +{ + int rc; + struct portal_ioctl_data data; + + if (argc != 1) { + fprintf(stderr, "usage: %s\n", argv[0]); + return 0; + } + + memset(&data, 0, sizeof(data)); + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + return -1; + } + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_CLEAR_DEBUG, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_CLEAR_DEBUG failed: %s\n", + strerror(errno)); + return -1; + } + return 0; +} + +int jt_dbg_mark_debug_buf(int argc, char **argv) +{ + int rc; + struct portal_ioctl_data data; + char *text; + time_t now = time(NULL); + + if (argc > 2) { + fprintf(stderr, "usage: %s [marker text]\n", argv[0]); + return 0; + } + + if (argc == 2) { + text = argv[1]; + } else { + text = ctime(&now); + text[strlen(text) - 1] = '\0'; /* stupid \n */ + } + + memset(&data, 0, sizeof(data)); + data.ioc_inllen1 = strlen(text) + 1; + data.ioc_inlbuf1 = text; + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + return -1; + } + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_MARK_DEBUG, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_MARK_DEBUG failed: %s\n", + strerror(errno)); + return -1; + } + return 0; +} + + +int jt_dbg_modules(int argc, char **argv) 
+{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + struct mod_paths { + char *name, *path; + } *mp, mod_paths[] = { + {"portals", "lustre/portals/libcfs"}, + {"ksocknal", "lustre/portals/knals/socknal"}, + {"obdclass", "lustre/obdclass"}, + {"ptlrpc", "lustre/ptlrpc"}, + {"obdext2", "lustre/obdext2"}, + {"ost", "lustre/ost"}, + {"osc", "lustre/osc"}, + {"mds", "lustre/mds"}, + {"mdc", "lustre/mdc"}, + {"llite", "lustre/llite"}, + {"obdecho", "lustre/obdecho"}, + {"ldlm", "lustre/ldlm"}, + {"obdfilter", "lustre/obdfilter"}, + {"extN", "lustre/extN"}, + {"lov", "lustre/lov"}, + {"fsfilt_ext3", "lustre/obdclass"}, + {"fsfilt_extN", "lustre/obdclass"}, + {"mds_ext2", "lustre/mds"}, + {"mds_ext3", "lustre/mds"}, + {"mds_extN", "lustre/mds"}, + {"ptlbd", "lustre/ptlbd"}, + {NULL, NULL} + }; + char *path = ".."; + char *kernel = "linux"; + + if (argc >= 2) + path = argv[1]; + if (argc == 3) + kernel = argv[2]; + if (argc > 3) { + printf("%s [path] [kernel]\n", argv[0]); + return 0; + } + + for (mp = mod_paths; mp->name != NULL; mp++) { + struct module_info info; + int rc; + size_t crap; + int query_module(const char *name, int which, void *buf, + size_t bufsize, size_t *ret); + + rc = query_module(mp->name, QM_INFO, &info, sizeof(info), + &crap); + if (rc < 0) { + if (errno != ENOENT) + printf("query_module(%s) failed: %s\n", + mp->name, strerror(errno)); + } else { + printf("add-symbol-file %s/%s/%s.o 0x%0lx\n", path, + mp->path, mp->name, + info.addr + sizeof(struct module)); + } + } + + return 0; +#else + printf("jt_dbg_module is not yet implemented for Linux 2.5\n"); + return 0; +#endif /* linux 2.5 */ +} + +int jt_dbg_panic(int argc, char **argv) +{ + int rc; + struct portal_ioctl_data data; + + if (argc != 1) { + fprintf(stderr, "usage: %s\n", argv[0]); + return 0; + } + + memset(&data, 0, sizeof(data)); + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + return -1; + } + + rc = l_ioctl(PORTALS_DEV_ID, 
IOC_PORTAL_PANIC, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_PANIC failed: %s\n", + strerror(errno)); + return -1; + } + return 0; +} diff --git a/lnet/utils/debugctl.c b/lnet/utils/debugctl.c new file mode 100644 index 0000000..02cb9b4 --- /dev/null +++ b/lnet/utils/debugctl.c @@ -0,0 +1,66 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Some day I'll split all of this functionality into a cfs_debug module + * of its own. That day is not today. 
+ * + */ + +#include +#include +#include +#include +#include "parser.h" + + +command_t list[] = { + {"debug_kernel", jt_dbg_debug_kernel, 0, "usage: debug_kernel [file] [raw], get debug buffer and print it [to a file]"}, + {"debug_daemon", jt_dbg_debug_daemon, 0, "usage: debug_daemon [start file [#MB]|stop|pause|continue], control debug daemon to dump debug buffer to a file"}, + {"debug_file", jt_dbg_debug_file, 0, "usage: debug_file [output] [raw], read debug buffer from input and print it [to output]"}, + {"clear", jt_dbg_clear_debug_buf, 0, "clear kernel debug buffer"}, + {"mark", jt_dbg_mark_debug_buf, 0, "insert a marker into the kernel debug buffer (args: [marker text])"}, + {"filter", jt_dbg_filter, 0, "filter certain messages (args: subsystem/debug ID)\n"}, + {"show", jt_dbg_show, 0, "enable certain messages (args: subsystem/debug ID)\n"}, + {"list", jt_dbg_list, 0, "list subsystem and debug types (args: subs or types)\n"}, + {"modules", jt_dbg_modules, 0, "provide gdb-friendly module info (arg: )"}, + {"panic", jt_dbg_panic, 0, "cause the kernel to panic"}, + {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"}, + {"help", Parser_help, 0, "help"}, + {"exit", Parser_quit, 0, "quit"}, + {"quit", Parser_quit, 0, "quit"}, + { 0, 0, 0, NULL } +}; + +int main(int argc, char **argv) +{ + if (dbg_initialize(argc, argv) < 0) + exit(2); + + register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); + + Parser_init("debugctl > ", list); + if (argc > 1) + return Parser_execarg(argc - 1, &argv[1], list); + + Parser_commands(); + + unregister_ioc_dev(PORTALS_DEV_ID); + return 0; +} diff --git a/lnet/utils/l_ioctl.c b/lnet/utils/l_ioctl.c new file mode 100644 index 0000000..722bb57 --- /dev/null +++ b/lnet/utils/l_ioctl.c @@ -0,0 +1,281 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct ioc_dev { + const char * dev_name; + int dev_fd; +}; + +static struct ioc_dev ioc_dev_list[10]; + +struct dump_hdr { + int magic; + int dev_id; + int opc; +}; + +char * dump_filename; + +static int +open_ioc_dev(int dev_id) +{ + const char * dev_name; + + if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) + return -EINVAL; + + dev_name = ioc_dev_list[dev_id].dev_name; + if (dev_name == NULL) { + fprintf(stderr, "unknown device id: %d\n", dev_id); + return -EINVAL; + } + + if (ioc_dev_list[dev_id].dev_fd < 0) { + int fd = open(dev_name, O_RDWR); + + if (fd < 0) { + fprintf(stderr, "opening %s failed: %s\n" + "hint: the kernel modules may not be loaded\n", + dev_name, strerror(errno)); + return fd; + } + ioc_dev_list[dev_id].dev_fd = fd; + } + + return ioc_dev_list[dev_id].dev_fd; +} + + +static int +do_ioctl(int dev_id, int opc, void *buf) +{ + int fd, rc; + + fd = open_ioc_dev(dev_id); + if (fd < 0) + return fd; + + rc = ioctl(fd, opc, buf); + return rc; + +} + +static FILE * +get_dump_file() +{ + FILE *fp = NULL; + + if (!dump_filename) { + fprintf(stderr, "no dump filename\n"); + } else + fp = fopen(dump_filename, "a"); 
+ return fp; +} + +/* + * The dump file should start with a description of which devices are + * used, but for now it will assumed whatever app reads the file will + * know what to do. */ +int +dump(int dev_id, int opc, void *buf) +{ + FILE *fp; + struct dump_hdr dump_hdr; + struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf; + int rc; + + printf("dumping opc %x to %s\n", opc, dump_filename); + + + dump_hdr.magic = 0xdeadbeef; + dump_hdr.dev_id = dev_id; + dump_hdr.opc = opc; + + fp = get_dump_file(); + if (fp == NULL) { + fprintf(stderr, "%s: %s\n", dump_filename, + strerror(errno)); + return -EINVAL; + } + + rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp); + if (rc == 1) + rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp); + fclose(fp); + if (rc != 1) { + fprintf(stderr, "%s: %s\n", dump_filename, + strerror(errno)); + return -EINVAL; + } + + return 0; +} + +/* register a device to send ioctls to. */ +int +register_ioc_dev(int dev_id, const char * dev_name) +{ + + if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) + return -EINVAL; + + unregister_ioc_dev(dev_id); + + ioc_dev_list[dev_id].dev_name = dev_name; + ioc_dev_list[dev_id].dev_fd = -1; + + return dev_id; +} + +void +unregister_ioc_dev(int dev_id) +{ + + if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) + return; + if (ioc_dev_list[dev_id].dev_name != NULL && + ioc_dev_list[dev_id].dev_fd >= 0) + close(ioc_dev_list[dev_id].dev_fd); + + ioc_dev_list[dev_id].dev_name = NULL; + ioc_dev_list[dev_id].dev_fd = -1; +} + +/* If this file is set, then all ioctl buffers will be + appended to the file. */ +int +set_ioctl_dump(char * file) +{ + if (dump_filename) + free(dump_filename); + + dump_filename = strdup(file); + return 0; +} + +int +l_ioctl(int dev_id, int opc, void *buf) +{ + if (dump_filename) + return dump(dev_id, opc, buf); + else + return do_ioctl(dev_id, opc, buf); +} + +/* Read an ioctl dump file, and call the ioc_func for each ioctl buffer + * in the file. 
For example: + * + * parse_dump("lctl.dump", l_ioctl); + * + * Note: if using l_ioctl, then you also need to register_ioc_dev() for + * each device used in the dump. + */ +int +parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)) +{ + int fd, line =0; + struct stat st; + char *buf, *end; + + fd = syscall(SYS_open, dump_file, O_RDONLY); + +#warning FIXME: cleanup fstat issue here +#ifndef SYS_fstat64 +#define __SYS_fstat__ SYS_fstat +#else +#define __SYS_fstat__ SYS_fstat64 +#endif + if (syscall(__SYS_fstat__, fd, &st)) { + perror("stat fails"); + exit(1); + } + + if (st.st_size < 1) { + fprintf(stderr, "KML is empty\n"); + exit(1); + } + + buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0); + end = buf + st.st_size; + close(fd); + while (buf < end) { + struct dump_hdr *dump_hdr = (struct dump_hdr *) buf; + struct portal_ioctl_hdr * data; + char tmp[8096]; + int rc; + + line++; + + data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr)); + if (buf + data->ioc_len > end ) { + fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf, + data->ioc_len, end); + return -1; + } +#if 0 + printf ("dump_hdr: %lx data: %lx\n", + (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf); + + printf("%d: opcode %x len: %d ver: %x ", line, dump_hdr->opc, + data->ioc_len, data->ioc_version); +#endif + + memcpy(tmp, data, data->ioc_len); + + rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp); + if (rc) { + printf("failed: %d\n", rc); + exit(1); + } + + buf += data->ioc_len + sizeof(*dump_hdr); + } + return 0; +} + +int +jt_ioc_dump(int argc, char **argv) +{ + if (argc > 2) { + fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + return 0; + } + printf("setting dumpfile to: %s\n", argv[1]); + + set_ioctl_dump(argv[1]); + return 0; +} diff --git a/lnet/utils/parser.c b/lnet/utils/parser.c new file mode 100644 index 0000000..4d93645 --- /dev/null +++ b/lnet/utils/parser.c @@ -0,0 +1,703 @@ +/* -*- mode: c; 
c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#ifdef HAVE_LIBREADLINE +#define READLINE_LIBRARY +#include +#endif +//extern char **completion_matches __P((char *, rl_compentry_func_t *)); +extern void using_history(void); +extern void stifle_history(int); +extern void add_history(char *); + +#include "parser.h" + +static command_t * top_level; /* Top level of commands, initialized by + * InitParser */ +static char * parser_prompt = NULL;/* Parser prompt, set by InitParser */ +static int done; /* Set to 1 if user types exit or quit */ + + +/* static functions */ +static char *skipwhitespace(char *s); +static char *skiptowhitespace(char *s); +static command_t *find_cmd(char *name, command_t cmds[], char **next); +static int process(char *s, char **next, command_t *lookup, command_t **result, + char **prev); +static void print_commands(char *str, command_t *table); + +static char * skipwhitespace(char * s) +{ + char * t; + int len; + + len = (int)strlen(s); + for (t = s; t <= s + len && isspace(*t); t++); + return(t); +} + + +static char * skiptowhitespace(char * s) +{ + char * t; + 
+ for (t = s; *t && !isspace(*t); t++); + return(t); +} + +static int line2args(char *line, char **argv, int maxargs) +{ + char *arg; + int i = 0; + + arg = strtok(line, " \t"); + if ( arg ) { + argv[i] = arg; + i++; + } else + return 0; + + while( (arg = strtok(NULL, " \t")) && (i <= maxargs)) { + argv[i] = arg; + i++; + } + return i; +} + +/* find a command -- return it if unique otherwise print alternatives */ +static command_t *Parser_findargcmd(char *name, command_t cmds[]) +{ + command_t *cmd; + + for (cmd = cmds; cmd->pc_name; cmd++) { + if (strcmp(name, cmd->pc_name) == 0) + return cmd; + } + return NULL; +} + +int Parser_execarg(int argc, char **argv, command_t cmds[]) +{ + command_t *cmd; + + cmd = Parser_findargcmd(argv[0], cmds); + if ( cmd ) { + return (cmd->pc_func)(argc, argv); + } else { + printf("Try interactive use without arguments or use one of:\n"); + for (cmd = cmds; cmd->pc_name; cmd++) + printf("\"%s\" ", cmd->pc_name); + printf("\nas argument.\n"); + } + return -1; +} + +/* returns the command_t * (NULL if not found) corresponding to a + _partial_ match with the first token in name. It sets *next to + point to the following token. Does not modify *name. */ +static command_t * find_cmd(char * name, command_t cmds[], char ** next) +{ + int i, len; + + if (!cmds || !name ) + return NULL; + + /* This sets name to point to the first non-white space character, + and next to the first whitespace after name, len to the length: do + this with strtok*/ + name = skipwhitespace(name); + *next = skiptowhitespace(name); + len = *next - name; + if (len == 0) + return NULL; + + for (i = 0; cmds[i].pc_name; i++) { + if (strncasecmp(name, cmds[i].pc_name, len) == 0) { + *next = skipwhitespace(*next); + return(&cmds[i]); + } + } + return NULL; +} + +/* Recursively process a command line string s and find the command + corresponding to it. This can be ambiguous, full, incomplete, + non-existent. 
*/ +static int process(char *s, char ** next, command_t *lookup, + command_t **result, char **prev) +{ + *result = find_cmd(s, lookup, next); + *prev = s; + + /* non existent */ + if ( ! *result ) + return CMD_NONE; + + /* found entry: is it ambigous, i.e. not exact command name and + more than one command in the list matches. Note that find_cmd + points to the first ambiguous entry */ + if ( strncasecmp(s, (*result)->pc_name, strlen((*result)->pc_name)) && + find_cmd(s, (*result) + 1, next)) + return CMD_AMBIG; + + /* found a unique command: component or full? */ + if ( (*result)->pc_func ) { + return CMD_COMPLETE; + } else { + if ( *next == '\0' ) { + return CMD_INCOMPLETE; + } else { + return process(*next, next, (*result)->pc_sub_cmd, result, prev); + } + } +} + +#ifdef HAVE_LIBREADLINE +static command_t * match_tbl; /* Command completion against this table */ +static char * command_generator(const char * text, int state) +{ + static int index, + len; + char *name; + + /* Do we have a match table? 
*/ + if (!match_tbl) + return NULL; + + /* If this is the first time called on this word, state is 0 */ + if (!state) { + index = 0; + len = (int)strlen(text); + } + + /* Return next name in the command list that paritally matches test */ + while ( (name = (match_tbl + index)->pc_name) ) { + index++; + + if (strncasecmp(name, text, len) == 0) { + return(strdup(name)); + } + } + + /* No more matches */ + return NULL; +} + +/* probably called by readline */ +static char **command_completion(char * text, int start, int end) +{ + command_t * table; + char * pos; + + match_tbl = top_level; + for (table = find_cmd(rl_line_buffer, match_tbl, &pos); + table; + table = find_cmd(pos, match_tbl, &pos)) { + + if (*(pos - 1) == ' ') match_tbl = table->pc_sub_cmd; + } + + return(completion_matches(text, command_generator)); +} +#endif + +/* take a string and execute the function or print help */ +int execute_line(char * line) +{ + command_t *cmd, *ambig; + char *prev; + char *next, *tmp; + char *argv[MAXARGS]; + int i; + int rc = 0; + + switch( process(line, &next, top_level, &cmd, &prev) ) { + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while( (ambig = find_cmd(prev, cmd, &tmp)) ) { + fprintf(stderr, "%s ", ambig->pc_name); + cmd = ambig + 1; + } + fprintf(stderr, "\n"); + break; + case CMD_NONE: + fprintf(stderr, "No such command, type help\n"); + break; + case CMD_INCOMPLETE: + fprintf(stderr, + "'%s' incomplete command. 
Use '%s x' where x is one of:\n", + line, line); + fprintf(stderr, "\t"); + for (i = 0; cmd->pc_sub_cmd[i].pc_name; i++) { + fprintf(stderr, "%s ", cmd->pc_sub_cmd[i].pc_name); + } + fprintf(stderr, "\n"); + break; + case CMD_COMPLETE: + i = line2args(line, argv, MAXARGS); + rc = (cmd->pc_func)(i, argv); + + if (rc == CMD_HELP) + fprintf(stderr, "%s\n", cmd->pc_help); + + break; + } + + return rc; +} + +int +noop_fn () +{ + return (0); +} + +/* just in case you're ever in an airplane and discover you + forgot to install readline-dev. :) */ +int init_input() +{ + int interactive = isatty (fileno (stdin)); + +#ifdef HAVE_LIBREADLINE + using_history(); + stifle_history(HISTORY); + + if (!interactive) + { + rl_prep_term_function = (rl_vintfunc_t *)noop_fn; + rl_deprep_term_function = (rl_voidfunc_t *)noop_fn; + } + + rl_attempted_completion_function = (CPPFunction *)command_completion; + rl_completion_entry_function = (void *)command_generator; +#endif + return interactive; +} + +#ifndef HAVE_LIBREADLINE +#define add_history(s) +char * readline(char * prompt) +{ + char line[2048]; + int n = 0; + if (prompt) + printf ("%s", prompt); + if (fgets(line, sizeof(line), stdin) == NULL) + return (NULL); + n = strlen(line); + if (n && line[n-1] == '\n') + line[n-1] = '\0'; + return strdup(line); +} +#endif + +/* this is the command execution machine */ +int Parser_commands(void) +{ + char *line, *s; + int rc = 0; + int interactive; + + interactive = init_input(); + + while(!done) { + line = readline(interactive ? 
parser_prompt : NULL); + + if (!line) break; + + s = skipwhitespace(line); + + if (*s) { + add_history(s); + rc = execute_line(s); + } + + free(line); + } + return rc; +} + + +/* sets the parser prompt */ +void Parser_init(char * prompt, command_t * cmds) +{ + done = 0; + top_level = cmds; + if (parser_prompt) free(parser_prompt); + parser_prompt = strdup(prompt); +} + +/* frees the parser prompt */ +void Parser_exit(int argc, char *argv[]) +{ + done = 1; + free(parser_prompt); + parser_prompt = NULL; +} + +/* convert a string to an integer */ +int Parser_int(char *s, int *val) +{ + int ret; + + if (*s != '0') + ret = sscanf(s, "%d", val); + else if (*(s+1) != 'x') + ret = sscanf(s, "%o", val); + else { + s++; + ret = sscanf(++s, "%x", val); + } + + return(ret); +} + + +void Parser_qhelp(int argc, char *argv[]) { + + printf("Available commands are:\n"); + + print_commands(NULL, top_level); + printf("For more help type: help command-name\n"); +} + +int Parser_help(int argc, char **argv) +{ + char line[1024]; + char *next, *prev, *tmp; + command_t *result, *ambig; + int i; + + if ( argc == 1 ) { + Parser_qhelp(argc, argv); + return 0; + } + + line[0]='\0'; + for ( i = 1 ; i < argc ; i++ ) { + strcat(line, argv[i]); + } + + switch ( process(line, &next, top_level, &result, &prev) ) { + case CMD_COMPLETE: + fprintf(stderr, "%s: %s\n",line, result->pc_help); + break; + case CMD_NONE: + fprintf(stderr, "%s: Unknown command.\n", line); + break; + case CMD_INCOMPLETE: + fprintf(stderr, + "'%s' incomplete command. 
Use '%s x' where x is one of:\n", + line, line); + fprintf(stderr, "\t"); + for (i = 0; result->pc_sub_cmd[i].pc_name; i++) { + fprintf(stderr, "%s ", result->pc_sub_cmd[i].pc_name); + } + fprintf(stderr, "\n"); + break; + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while( (ambig = find_cmd(prev, result, &tmp)) ) { + fprintf(stderr, "%s ", ambig->pc_name); + result = ambig + 1; + } + fprintf(stderr, "\n"); + break; + } + return 0; +} + + +void Parser_printhelp(char *cmd) +{ + char *argv[] = { "help", cmd }; + Parser_help(2, argv); +} + +/************************************************************************* + * COMMANDS * + *************************************************************************/ + + +static void print_commands(char * str, command_t * table) { + command_t * cmds; + char buf[80]; + + for (cmds = table; cmds->pc_name; cmds++) { + if (cmds->pc_func) { + if (str) printf("\t%s %s\n", str, cmds->pc_name); + else printf("\t%s\n", cmds->pc_name); + } + if (cmds->pc_sub_cmd) { + if (str) { + sprintf(buf, "%s %s", str, cmds->pc_name); + print_commands(buf, cmds->pc_sub_cmd); + } else { + print_commands(cmds->pc_name, cmds->pc_sub_cmd); + } + } + } +} + +char *Parser_getstr(const char *prompt, const char *deft, char *res, + size_t len) +{ + char *line = NULL; + int size = strlen(prompt) + strlen(deft) + 8; + char *theprompt; + theprompt = malloc(size); + assert(theprompt); + + sprintf(theprompt, "%s [%s]: ", prompt, deft); + + line = readline(theprompt); + free(theprompt); + + if ( line == NULL || *line == '\0' ) { + strncpy(res, deft, len); + } else { + strncpy(res, line, len); + } + + if ( line ) { + free(line); + return res; + } else { + return NULL; + } +} + +/* get integer from prompt, loop forever to get it */ +int Parser_getint(const char *prompt, long min, long max, long deft, int base) +{ + int rc; + long result; + char *line; + int size = strlen(prompt) + 40; + char *theprompt = malloc(size); + 
assert(theprompt); + sprintf(theprompt,"%s [%ld, (0x%lx)]: ", prompt, deft, deft); + + fflush(stdout); + + do { + line = NULL; + line = readline(theprompt); + if ( !line ) { + fprintf(stdout, "Please enter an integer.\n"); + fflush(stdout); + continue; + } + if ( *line == '\0' ) { + free(line); + result = deft; + break; + } + rc = Parser_arg2int(line, &result, base); + free(line); + if ( rc != 0 ) { + fprintf(stdout, "Invalid string.\n"); + fflush(stdout); + } else if ( result > max || result < min ) { + fprintf(stdout, "Error: response must lie between %ld and %ld.\n", + min, max); + fflush(stdout); + } else { + break; + } + } while ( 1 ) ; + + if (theprompt) + free(theprompt); + return result; + +} + +/* get boolean (starting with YyNn; loop forever */ +int Parser_getbool(const char *prompt, int deft) +{ + int result = 0; + char *line; + int size = strlen(prompt) + 8; + char *theprompt = malloc(size); + assert(theprompt); + + fflush(stdout); + + if ( deft != 0 && deft != 1 ) { + fprintf(stderr, "Error: Parser_getbool given bad default (%d).\n", + deft); + assert ( 0 ); + } + sprintf(theprompt, "%s [%s]: ", prompt, (deft==0)? "N" : "Y"); + + do { + line = NULL; + line = readline(theprompt); + if ( line == NULL ) { + result = deft; + break; + } + if ( *line == '\0' ) { + result = deft; + break; + } + if ( *line == 'y' || *line == 'Y' ) { + result = 1; + break; + } + if ( *line == 'n' || *line == 'N' ) { + result = 0; + break; + } + if ( line ) + free(line); + fprintf(stdout, "Invalid string. 
Must start with yY or nN\n"); + fflush(stdout); + } while ( 1 ); + + if ( line ) + free(line); + if ( theprompt ) + free(theprompt); + return result; +} + +/* parse int out of a string or prompt for it */ +long Parser_intarg(const char *inp, const char *prompt, int deft, + int min, int max, int base) +{ + long result; + int rc; + + rc = Parser_arg2int(inp, &result, base); + + if ( rc == 0 ) { + return result; + } else { + return Parser_getint(prompt, deft, min, max, base); + } +} + +/* parse int out of a string or prompt for it */ +char *Parser_strarg(char *inp, const char *prompt, const char *deft, + char *answer, int len) +{ + if ( inp == NULL || *inp == '\0' ) { + return Parser_getstr(prompt, deft, answer, len); + } else + return inp; +} + +/* change a string into a number: return 0 on success. No invalid characters + allowed. The processing of base and validity follows strtol(3)*/ +int Parser_arg2int(const char *inp, long *result, int base) +{ + char *endptr; + + if ( (base !=0) && (base < 2 || base > 36) ) + return 1; + + *result = strtol(inp, &endptr, base); + + if ( *inp != '\0' && *endptr == '\0' ) + return 0; + else + return 1; +} + +/* Convert human readable size string to and int; "1k" -> 1000 */ +int Parser_size (int *sizep, char *str) { + int size; + char mod[32]; + + switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) { + default: + return (-1); + + case 1: + *sizep = size; + return (0); + + case 2: + switch (*mod) { + case 'g': + case 'G': + *sizep = size << 30; + return (0); + + case 'm': + case 'M': + *sizep = size << 20; + return (0); + + case 'k': + case 'K': + *sizep = size << 10; + return (0); + + default: + *sizep = size; + return (0); + } + } +} + +/* Convert a string boolean to an int; "enable" -> 1 */ +int Parser_bool (int *b, char *str) { + if (!strcasecmp (str, "no") || + !strcasecmp (str, "n") || + !strcasecmp (str, "off") || + !strcasecmp (str, "disable")) + { + *b = 0; + return (0); + } + + if (!strcasecmp (str, "yes") || + !strcasecmp 
(str, "y") || + !strcasecmp (str, "on") || + !strcasecmp (str, "enable")) + { + *b = 1; + return (0); + } + + return (-1); +} + +int Parser_quit(int argc, char **argv) +{ + argc = argc; + argv = argv; + done = 1; + return 0; +} diff --git a/lnet/utils/parser.h b/lnet/utils/parser.h new file mode 100644 index 0000000..dead9f5 --- /dev/null +++ b/lnet/utils/parser.h @@ -0,0 +1,73 @@ +#ifndef _PARSER_H_ +#define _PARSER_H_ + +#define HISTORY 100 /* Don't let history grow unbounded */ +#define MAXARGS 100 + +#define CMD_COMPLETE 0 +#define CMD_INCOMPLETE 1 +#define CMD_NONE 2 +#define CMD_AMBIG 3 +#define CMD_HELP 4 + +typedef struct parser_cmd { + char *pc_name; + int (* pc_func)(int, char **); + struct parser_cmd * pc_sub_cmd; + char *pc_help; +} command_t; + +typedef struct argcmd { + char *ac_name; + int (*ac_func)(int, char **); + char *ac_help; +} argcmd_t; + +typedef struct network { + char *type; + char *server; + int port; +} network_t; + +int Parser_quit(int argc, char **argv); +void Parser_init(char *, command_t *); /* Set prompt and load command list */ +int Parser_commands(void); /* Start the command parser */ +void Parser_qhelp(int, char **); /* Quick help routine */ +int Parser_help(int, char **); /* Detailed help routine */ +void Parser_printhelp(char *); /* Detailed help routine */ +void Parser_exit(int, char **); /* Shuts down command parser */ +int Parser_execarg(int argc, char **argv, command_t cmds[]); +int execute_line(char * line); + +/* Converts a string to an integer */ +int Parser_int(char *, int *); + +/* Prompts for a string, with default values and a maximum length */ +char *Parser_getstr(const char *prompt, const char *deft, char *res, + size_t len); + +/* Prompts for an integer, with minimum, maximum and default values and base */ +int Parser_getint(const char *prompt, long min, long max, long deft, + int base); + +/* Prompts for a yes/no, with default */ +int Parser_getbool(const char *prompt, int deft); + +/* Extracts an integer from a 
string, or prompts if it cannot get one */ +long Parser_intarg(const char *inp, const char *prompt, int deft, + int min, int max, int base); + +/* Extracts a word from the input, or propmts if it cannot get one */ +char *Parser_strarg(char *inp, const char *prompt, const char *deft, + char *answer, int len); + +/* Extracts an integer from a string with a base */ +int Parser_arg2int(const char *inp, long *result, int base); + +/* Convert human readable size string to and int; "1k" -> 1000 */ +int Parser_size(int *sizep, char *str); + +/* Convert a string boolean to an int; "enable" -> 1 */ +int Parser_bool(int *b, char *str); + +#endif diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c new file mode 100644 index 0000000..90d66f5 --- /dev/null +++ b/lnet/utils/portals.c @@ -0,0 +1,985 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "parser.h" + +unsigned int portal_debug; +unsigned int portal_printk; +unsigned int portal_stack; + + +static ptl_nid_t g_nid = 0; +static unsigned int g_nal = 0; +static unsigned short g_port = 0; + +static int g_socket_txmem = 0; +static int g_socket_rxmem = 0; +static int g_socket_nonagle = 1; + +typedef struct +{ + char *name; + int num; +} name2num_t; + +static name2num_t nalnames[] = { + {"tcp", SOCKNAL}, + {"toe", TOENAL}, + {"elan", QSWNAL}, + {"gm", GMNAL}, + {"scimac", SCIMACNAL}, + {NULL, -1} +}; + +static name2num_t * +name2num_lookup_name (name2num_t *table, char *str) +{ + while (table->name != NULL) + if (!strcmp (str, table->name)) + return (table); + else + table++; + return (NULL); +} + +static name2num_t * +name2num_lookup_num (name2num_t *table, int num) +{ + while (table->name != NULL) + if (num == table->num) + return (table); + else + table++; + return (NULL); +} + +int +ptl_name2nal (char *str) +{ + name2num_t *e = name2num_lookup_name (nalnames, str); + + return ((e == NULL) ? 0 : e->num); +} + +static char * +nal2name (int nal) +{ + name2num_t *e = name2num_lookup_num (nalnames, nal); + + return ((e == NULL) ? "???" 
: e->name); +} + +int +ptl_parse_nid (ptl_nid_t *nidp, char *str) +{ + struct hostent *he; + int a; + int b; + int c; + int d; + + if (sscanf (str, "%d.%d.%d.%d", &a, &b, &c, &d) == 4 && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) + { + __u32 addr = ((__u32)a<<24)|(b<<16)|(c<<8)|d; /* shift as unsigned: a >= 128 would overflow int */ + + *nidp = (ptl_nid_t)addr; + return (0); + } + + if ((('a' <= str[0] && str[0] <= 'z') || + ('A' <= str[0] && str[0] <= 'Z')) && + (he = gethostbyname (str)) != NULL) + { + __u32 addr = *(__u32 *)he->h_addr; + + *nidp = (ptl_nid_t)ntohl(addr); /* HOST byte order */ + return (0); + } + + if (sscanf (str, "%i", &a) == 1) + { + *nidp = (ptl_nid_t)a; + return (0); + } + + if (sscanf (str, "%x", &a) == 1) + { + *nidp = (ptl_nid_t) a; + return (0); + } + + return (-1); +} + +char * +ptl_nid2str (char *buffer, ptl_nid_t nid) +{ + __u32 addr = htonl((__u32)nid); /* back to NETWORK byte order */ + struct hostent *he = gethostbyaddr ((const char *)&addr, sizeof (addr), AF_INET); + + if (he != NULL) + strcpy (buffer, he->h_name); + else + sprintf (buffer, "0x"LPX64, nid); + + return (buffer); +} + +int +sock_write (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = write (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) + { + fprintf (stderr, "Unexpected zero sock_write\n"); + abort(); + } + + nob -= rc; + buffer = (char *)buffer + rc; /* advance past the bytes just written */ + } + + return (0); +} + +int +sock_read (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = read (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) /* EOF */ + { + errno = ECONNABORTED; + return (-1); + } + + nob -= rc; + buffer = (char *)buffer + rc; /* advance past the bytes just read */ + } + + return (0); +} + +int ptl_initialize(int argc, char **argv) +{ + register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); + return 0; +} + + +int jt_ptl_network(int argc, char **argv) +{ + int nal; + + if (argc != 2 || + (nal = 
ptl_name2nal (argv[1])) == 0) + { + name2num_t *entry; + + fprintf(stderr, "usage: %s \n", argv[0]); + for (entry = nalnames; entry->name != NULL; entry++) + fprintf (stderr, "%s%s", entry == nalnames ? "<" : "|", entry->name); + fprintf(stderr, ">\n"); + } + else + g_nal = nal; + + return (0); +} + +int +exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid) +{ + int rc; + ptl_hdr_t hdr; + ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; + + LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); + + memset (&hdr, 0, sizeof (hdr)); + + hmv->magic = __cpu_to_le32 (PORTALS_PROTO_MAGIC); + hmv->version_major = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); + hmv->version_minor = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); + + hdr.src_nid = __cpu_to_le64 (my_nid); + hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); + + /* Assume there's sufficient socket buffering for a portals HELLO header */ + rc = sock_write (cfd, &hdr, sizeof (hdr)); + if (rc != 0) { + perror ("Can't send initial HELLO"); + return (-1); + } + + /* First few bytes down the wire are the portals protocol magic and + * version, no matter what protocol version we're running. */ + + rc = sock_read (cfd, hmv, sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read from peer"); + return (-1); + } + + if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) { + fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", + __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC); + return (-1); + } + + if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR || + __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) { + fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n", + __cpu_to_le16 (hmv->version_major), + __cpu_to_le16 (hmv->version_minor), + PORTALS_PROTO_VERSION_MAJOR, + PORTALS_PROTO_VERSION_MINOR); + } + + /* version 0 sends magic/version as the dest_nid of a 'hello' header, + * so read the rest of it in now... 
*/ + LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0); + rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read rest of HELLO hdr"); + return (-1); + } + + /* ...and check we got what we expected */ + if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO || + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) { + fprintf (stderr, "Expecting a HELLO hdr with 0 payload," + " but got type %d with %d payload\n", + __cpu_to_le32 (hdr.type), + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr))); + return (-1); + } + + *peer_nid = __le64_to_cpu (hdr.src_nid); + return (0); +} + +int jt_ptl_connect(int argc, char **argv) +{ + if (argc < 2) { + usage: + fprintf(stderr, "usage: %s or \n", + argv[0]); + return 0; + } + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + if (g_nal == SOCKNAL || g_nal == TOENAL) { + ptl_nid_t peer_nid; + struct hostent *he; + struct portal_ioctl_data data; + struct sockaddr_in srvaddr; + char *flag; + int fd, rc; + int nonagle = 0; + int rxmem = 0; + int txmem = 0; + int bind_irq = 0; + int xchange_nids = 0; + int o; + int olen; + + if (argc < 3) { + goto usage; + } + + he = gethostbyname(argv[1]); + if (!he) { + fprintf(stderr, "gethostbyname error: %s\n", + strerror(errno)); + return -1; + } + + g_port = atol(argv[2]); + + if (argc > 3) + for (flag = argv[3]; *flag != 0; flag++) + switch (*flag) + { + case 'i': + bind_irq = 1; + break; + + case 'x': + xchange_nids = 1; + break; + + default: + fprintf (stderr, "unrecognised flag '%c'\n", + *flag); + return (-1); + } + + memset(&srvaddr, 0, sizeof(srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(g_port); + srvaddr.sin_addr.s_addr = *(__u32 *)he->h_addr; + + fd = socket(PF_INET, SOCK_STREAM, 0); + if ( fd < 0 ) { + fprintf(stderr, "socket() failed: %s\n", + strerror(errno)); + return -1; + } + + if (g_socket_nonagle) + { + o = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)) != 0) + { 
+ fprintf(stderr, "cannot disable nagle: %s\n", strerror(errno)); + return (-1); + } + } + + if (g_socket_rxmem != 0) + { + o = g_socket_rxmem; + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o)) != 0) + { + fprintf(stderr, "cannot set receive buffer size: %s\n", strerror(errno)); + return (-1); + } + } + + if (g_socket_txmem != 0) + { + o = g_socket_txmem; + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o)) != 0) + { + fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno)); + return (-1); + } + } + + rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); + if ( rc == -1 ) { + fprintf(stderr, "connect() failed: %s\n", + strerror(errno)); + return -1; + } + + olen = sizeof (txmem); + if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0) + fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno)); + olen = sizeof (rxmem); + if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0) + fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno)); + olen = sizeof (nonagle); + if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0) + fprintf (stderr, "Can't get nagle: %s\n", strerror (errno)); + + if (xchange_nids) { + + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data); + if (rc != 0) + { + fprintf (stderr, "failed to get my nid: %s\n", + strerror (errno)); + close (fd); + return (-1); + } + + rc = exchange_nids (fd, data.ioc_nid, &peer_nid); + if (rc != 0) + { + close (fd); + return (-1); + } + } + else + peer_nid = ntohl (srvaddr.sin_addr.s_addr); /* HOST byte order */ + + printf("Connected host: %s NID "LPX64" snd: %d rcv: %d nagle: %s\n", argv[1], + peer_nid, txmem, rxmem, nonagle ? 
"Disabled" : "Enabled"); + + PORTAL_IOC_INIT(data); + data.ioc_fd = fd; + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD; + data.ioc_nid = peer_nid; + data.ioc_flags = bind_irq; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to register fd with portals: " + "%s\n", strerror(errno)); + close (fd); + return -1; + } + + g_nid = peer_nid; + printf("Connection to "LPX64" registered with socknal\n", g_nid); + + rc = close(fd); + if (rc) { + fprintf(stderr, "close failed: %d\n", rc); + } + } else if (g_nal == QSWNAL) { + g_nid = atoi(argv[1]); + } else if (g_nal == GMNAL) { + g_nid = atoi(argv[1]); + } else if (g_nal == SCIMACNAL) { + unsigned int tmpnid; + if(sscanf(argv[1], "%x", &tmpnid) == 1) { + g_nid=tmpnid; + } + else { + fprintf(stderr, "nid %s invalid for SCI nal\n", argv[1]); + } + + + } else { + fprintf(stderr, "This should never happen. Also it is very " + "bad.\n"); + } + + return 0; +} + +int jt_ptl_disconnect(int argc, char **argv) +{ + if (argc > 2) { + fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + return 0; + } + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + if (g_nal == SOCKNAL || g_nal == TOENAL) { + struct hostent *he; + struct portal_ioctl_data data; + int rc; + + PORTAL_IOC_INIT(data); + if (argc == 2) { + he = gethostbyname(argv[1]); + if (!he) { + fprintf(stderr, "gethostbyname error: %s\n", + strerror(errno)); + return -1; + } + + data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */ + + } else { + printf("Disconnecting ALL connections.\n"); + /* leave ioc_nid zeroed == disconnect all */ + } + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_CLOSE_CONNECTION; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to remove connection: %s\n", + strerror(errno)); + return -1; + } + } else if (g_nal == QSWNAL) { + printf("'disconnect' doesn't make 
any sense for " + "elan.\n"); + } else if (g_nal == GMNAL) { + printf("'disconnect' doesn't make any sense for " + "GM.\n"); + } else if (g_nal == SCIMACNAL) { + printf("'disconnect' doesn't make any sense for " + "SCI.\n"); + } else { + fprintf(stderr, "This should never happen. Also it is very " + "bad.\n"); + return -1; + } + + return 0; +} + +int jt_ptl_push_connection (int argc, char **argv) +{ + if (argc > 2) { + fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + return 0; + } + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + if (g_nal == SOCKNAL || g_nal == TOENAL) { + struct hostent *he; + struct portal_ioctl_data data; + int rc; + + PORTAL_IOC_INIT(data); + if (argc == 2) { + he = gethostbyname(argv[1]); + if (!he) { + fprintf(stderr, "gethostbyname error: %s\n", + strerror(errno)); + return -1; + } + + data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */ + + } else { + printf("Pushing ALL connections.\n"); + /* leave ioc_nid zeroed == disconnect all */ + } + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_PUSH_CONNECTION; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to push connection: %s\n", + strerror(errno)); + return -1; + } + } else if (g_nal == QSWNAL) { + printf("'push' doesn't make any sense for elan.\n"); + } else if (g_nal == GMNAL) { + printf("'push' doesn't make any sense for GM.\n"); + } else if (g_nal == SCIMACNAL) { + printf("'push' doesn't make any sense for SCI.\n"); + } else { + fprintf(stderr, "This should never happen. 
Also it is very " + "bad.\n"); + return -1; + } + + return 0; +} + +int jt_ptl_ping(int argc, char **argv) +{ + int rc; + ptl_nid_t nid; + long count = 1; + long size = 4; + long timeout = 1; + struct portal_ioctl_data data; + + if (argc < 2) { + fprintf(stderr, "usage: %s nid [count] [size] [timeout (secs)]\n", argv[0]); + return 0; + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + + if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); + return (-1); + } + + if (argc > 2) + { + count = atol(argv[2]); + + if (count < 0 || count > 20000) + { + fprintf(stderr, "are you insane? %ld is a crazy count.\n", count); + return -1; + } + } + + if (argc > 3) + size= atol(argv[3]); + + if (argc > 4) + timeout = atol (argv[4]); + + PORTAL_IOC_INIT (data); + data.ioc_count = count; + data.ioc_size = size; + data.ioc_nid = nid; + data.ioc_nal = g_nal; + data.ioc_timeout = timeout; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PING, &data); + if (rc) { + fprintf(stderr, "failed to start pinger: %s\n", + strerror(errno)); + return -1; + } + return 0; +} + +int jt_ptl_shownid(int argc, char **argv) +{ + struct portal_ioctl_data data; + int rc; + + if (argc > 1) { + fprintf(stderr, "usage: %s\n", argv[0]); + return 0; + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command first\n"); + return -1; + } + + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data); + if (rc < 0) + fprintf(stderr, "getting my NID failed: %s\n", + strerror (errno)); + else + printf(LPX64"\n", data.ioc_nid); + return 0; +} + +int jt_ptl_mynid(int argc, char **argv) +{ + int rc; + char hostname[1024]; + char *nidstr; + struct portal_ioctl_data data; + ptl_nid_t mynid; + + if (argc > 2) { + fprintf(stderr, "usage: %s [NID]\n", argv[0]); + fprintf(stderr, "NID defaults to the primary IP address of the machine.\n"); 
+ return 0; + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + + if (argc >= 2) + nidstr = argv[1]; + else if (gethostname(hostname, sizeof(hostname)) != 0) { + fprintf(stderr, "gethostname failed: %s\n", + strerror(errno)); + return -1; + } + else + nidstr = hostname; + + rc = ptl_parse_nid (&mynid, nidstr); + if (rc != 0) { + fprintf (stderr, "Can't convert '%s' into a NID\n", nidstr); + return -1; + } + + PORTAL_IOC_INIT(data); + data.ioc_nid = mynid; + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_MYNID; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc < 0) + fprintf(stderr, "setting my NID failed: %s\n", + strerror(errno)); + else + printf("registered my nid "LPX64" (%s)\n", mynid, nidstr); /* hostname is unset when argv[1] was used */ + return 0; +} + +int +jt_ptl_fail_nid (int argc, char **argv) +{ + int rc; + ptl_nid_t nid; + unsigned int threshold; + struct portal_ioctl_data data; + + if (argc < 2 || argc > 3) + { + fprintf (stderr, "usage: %s nid|\"_all_\" [count (0 == mend)]\n", argv[0]); + return (0); + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return (-1); + } + + if (!strcmp (argv[1], "_all_")) + nid = PTL_NID_ANY; + else if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); + return (-1); + } + + if (argc < 3) + threshold = PTL_MD_THRESH_INF; + else if (sscanf (argv[2], "%i", &threshold) != 1) { + fprintf (stderr, "Can't parse count \"%s\"\n", argv[2]); + return (-1); + } + + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + data.ioc_nid = nid; + data.ioc_count = threshold; + + rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_FAIL_NID, &data); + if (rc < 0) + fprintf (stderr, "IOC_PORTAL_FAIL_NID failed: %s\n", + strerror (errno)); + else + printf ("%s %s\n", threshold == 0 ? 
"Unfailing" : "Failing", argv[1]); + + return (0); +} + +int +jt_ptl_rxmem (int argc, char **argv) +{ + int size; + + if (argc > 1) + { + if (Parser_size (&size, argv[1]) != 0 || size < 0) + { + fprintf (stderr, "Can't parse size %s\n", argv[1]); + return (0); + } + + g_socket_rxmem = size; + } + printf ("Socket rmem = %d\n", g_socket_rxmem); + return (0); +} + +int +jt_ptl_txmem (int argc, char **argv) +{ + int size; + + if (argc > 1) + { + if (Parser_size (&size, argv[1]) != 0 || size < 0) + { + fprintf (stderr, "Can't parse size %s\n", argv[1]); + return (0); + } + g_socket_txmem = size; + } + printf ("Socket txmem = %d\n", g_socket_txmem); + return (0); +} + +int +jt_ptl_nagle (int argc, char **argv) +{ + int enable; + + if (argc > 1) + { + if (Parser_bool (&enable, argv[1]) != 0) + { + fprintf (stderr, "Can't parse boolean %s\n", argv[1]); + return (0); + } + g_socket_nonagle = !enable; + } + printf ("Nagle %s\n", g_socket_nonagle ? "disabled" : "enabled"); + return (0); +} + +int +jt_ptl_add_route (int argc, char **argv) +{ + struct portal_ioctl_data data; + ptl_nid_t nid1; + ptl_nid_t nid2; + ptl_nid_t gateway_nid; + int rc; + + if (argc < 3) + { + fprintf (stderr, "usage: %s gateway target [target]\n", argv[0]); + return (0); + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return (-1); + } + + if (ptl_parse_nid (&gateway_nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]); + return (-1); + } + + if (ptl_parse_nid (&nid1, argv[2]) != 0) + { + fprintf (stderr, "Can't parse first target NID \"%s\"\n", argv[2]); + return (-1); + } + + if (argc < 4) + nid2 = nid1; + else if (ptl_parse_nid (&nid2, argv[3]) != 0) + { + fprintf (stderr, "Can't parse second target NID \"%s\"\n", argv[3]); /* report the argument that was parsed */ + return (-1); + } + + PORTAL_IOC_INIT(data); + data.ioc_nid = gateway_nid; + data.ioc_nal = g_nal; + data.ioc_nid2 = MIN (nid1, nid2); + data.ioc_nid3 = MAX (nid1, nid2); + + rc = 
l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_ADD_ROUTE, &data); + if (rc != 0) + { + fprintf (stderr, "IOC_PORTAL_ADD_ROUTE failed: %s\n", strerror (errno)); + return (-1); + } + + return (0); +} + +int +jt_ptl_del_route (int argc, char **argv) +{ + struct portal_ioctl_data data; + ptl_nid_t nid; + int rc; + + if (argc < 2) + { + fprintf (stderr, "usage: %s targetNID\n", argv[0]); + return (0); + } + + if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]); + return (-1); + } + + PORTAL_IOC_INIT(data); + data.ioc_nid = nid; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_DEL_ROUTE, &data); + if (rc != 0) + { + fprintf (stderr, "IOC_PORTAL_DEL_ROUTE ("LPX64") failed: %s\n", nid, strerror (errno)); + return (-1); + } + + return (0); +} + +int +jt_ptl_print_routes (int argc, char **argv) +{ + char buffer[3][128]; + struct portal_ioctl_data data; + int rc; + int index; + int gateway_nal; + ptl_nid_t gateway_nid; + ptl_nid_t nid1; + ptl_nid_t nid2; + + + for (index = 0;;index++) + { + PORTAL_IOC_INIT(data); + data.ioc_count = index; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_ROUTE, &data); + if (rc != 0) + break; + + gateway_nal = data.ioc_nal; + gateway_nid = data.ioc_nid; + nid1 = data.ioc_nid2; + nid2 = data.ioc_nid3; + + printf ("%8s %18s : %s - %s\n", + nal2name (gateway_nal), + ptl_nid2str (buffer[0], gateway_nid), + ptl_nid2str (buffer[1], nid1), + ptl_nid2str (buffer[2], nid2)); + } + return (0); +} + diff --git a/lnet/utils/ptlctl.c b/lnet/utils/ptlctl.c new file mode 100644 index 0000000..8c56d93 --- /dev/null +++ b/lnet/utils/ptlctl.c @@ -0,0 +1,65 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include +#include +#include +#include + +#include "parser.h" + + +command_t list[] = { + {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"}, + {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: | for tcp/elan respectively)"}, + {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [hostname]"}, + {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [hostname]"}, + {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"}, + {"shownid", jt_ptl_shownid, 0, "print the local NID"}, + {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"}, + {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"}, + {"del_route", jt_ptl_del_route, 0, "delete an entry from the routing table (args: targetNID"}, + {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"}, + {"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"}, + {"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"}, + {"nagle", jt_ptl_nagle, 0, "Enable/Disable Nagle (args: [on/off])"}, + {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"}, + {"fail", 
jt_ptl_fail_nid, 0, "usage: fail nid|_all_ [count]"}, + {"help", Parser_help, 0, "help"}, + {"exit", Parser_quit, 0, "quit"}, + {"quit", Parser_quit, 0, "quit"}, + { 0, 0, 0, NULL } +}; + +int main(int argc, char **argv) +{ + if (ptl_initialize(argc, argv) < 0) + exit(1); + + Parser_init("ptlctl > ", list); + if (argc > 1) + return Parser_execarg(argc - 1, &argv[1], list); + + Parser_commands(); + + return 0; +} diff --git a/lnet/utils/routerstat.c b/lnet/utils/routerstat.c new file mode 100644 index 0000000..37da12c --- /dev/null +++ b/lnet/utils/routerstat.c @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +double +timenow () +{ + struct timeval tv; + + gettimeofday (&tv, NULL); + return (tv.tv_sec + tv.tv_usec / 1000000.0); +} + +void +do_stat (int fd) +{ + static char buffer[1024]; + static double last = 0.0; + double now; + double t; + long long bytes; + long packets; + long errors; + long depth; + int n; + + lseek (fd, 0, SEEK_SET); + now = timenow(); + n = read (fd, buffer, sizeof (buffer) - 1); /* leave room for the terminating NUL written below */ + if (n < 0) + { + fprintf (stderr, "Can't read statfile\n"); + exit (1); + } + buffer[n] = 0; + + n = sscanf (buffer, "%Ld %ld %ld %ld", &bytes, &packets, &errors, &depth); + + if (n < 3) + { + fprintf (stderr, "Can't parse statfile\n"); + exit (1); + } + + if (last == 0.0) + printf ("%Ld bytes, %ld packets (sz %Ld) %ld errors", + bytes, packets, (long long)((packets == 0) ? 0LL : bytes/packets), errors); + else + { + t = now - last; + + printf ("%9Ld (%7.2fMb/s), %7ld packets (sz %5Ld, %5ld/s) %ld errors (%ld/s)", + bytes, ((double)bytes)/((1<<20) * t), + packets, (long long)((packets == 0) ? 
0LL : bytes/packets), (long)(packets/t), + errors, (long)(errors/t)); + } + + if (n == 4) + printf (" (%ld)\n", depth); + else + printf ("\n"); + + fflush (stdout); + + lseek (fd, 0, SEEK_SET); + write (fd, "\n", 1); + last = timenow(); +} + +int main (int argc, char **argv) +{ + int interval = 0; + int fd; + + if (argc > 1) + interval = atoi (argv[1]); + + fd = open ("/proc/sys/portals/router", O_RDWR); + if (fd < 0) + { + fprintf (stderr, "Can't open stat: %s\n", strerror (errno)); + return (1); + } + + do_stat (fd); + if (interval == 0) + return (0); + + for (;;) + { + sleep (interval); + do_stat (fd); + } +} diff --git a/lnet/utils/wirecheck.c b/lnet/utils/wirecheck.c new file mode 100644 index 0000000..6a4377b --- /dev/null +++ b/lnet/utils/wirecheck.c @@ -0,0 +1,141 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#include +#include +#include +#include + +#define BLANK_LINE() \ +do { \ + printf ("\n"); \ +} while (0) + +#define COMMENT(c) \ +do { \ + printf (" /* "c" */\n"); \ +} while (0) + +#define STRINGIFY(a) #a + +#define CHECK_DEFINE(a) \ +do { \ + printf (" LASSERT ("#a" == "STRINGIFY(a)");\n"); \ +} while (0) + +#define CHECK_VALUE(a) \ +do { \ + printf (" LASSERT ("#a" == %d);\n", a); \ +} while (0) + +#define CHECK_MEMBER_OFFSET(s,m) \ +do { \ + CHECK_VALUE(offsetof(s, m)); \ +} while (0) + +#define CHECK_MEMBER_SIZEOF(s,m) \ +do { \ + CHECK_VALUE((int)sizeof(((s *)0)->m)); \ +} while (0) + +#define CHECK_MEMBER(s,m) \ +do { \ + CHECK_MEMBER_OFFSET(s, m); \ + CHECK_MEMBER_SIZEOF(s, m); \ +} while (0) + +#define CHECK_STRUCT(s) \ +do { \ + BLANK_LINE (); \ + COMMENT ("Checks for struct "#s); \ + CHECK_VALUE((int)sizeof(s)); \ +} while (0) + +void +check_ptl_handle_wire (void) +{ + CHECK_STRUCT (ptl_handle_wire_t); + CHECK_MEMBER (ptl_handle_wire_t, wh_interface_cookie); + CHECK_MEMBER (ptl_handle_wire_t, wh_object_cookie); +} + +void +check_ptl_magicversion (void) +{ + CHECK_STRUCT 
(ptl_magicversion_t); + CHECK_MEMBER (ptl_magicversion_t, magic); + CHECK_MEMBER (ptl_magicversion_t, version_major); + CHECK_MEMBER (ptl_magicversion_t, version_minor); +} + +void +check_ptl_hdr (void) +{ + CHECK_STRUCT (ptl_hdr_t); + CHECK_MEMBER (ptl_hdr_t, dest_nid); + CHECK_MEMBER (ptl_hdr_t, src_nid); + CHECK_MEMBER (ptl_hdr_t, dest_pid); + CHECK_MEMBER (ptl_hdr_t, src_pid); + CHECK_MEMBER (ptl_hdr_t, type); + + BLANK_LINE (); + COMMENT ("Ack"); + CHECK_MEMBER (ptl_hdr_t, msg.ack.mlength); + CHECK_MEMBER (ptl_hdr_t, msg.ack.dst_wmd); + CHECK_MEMBER (ptl_hdr_t, msg.ack.match_bits); + CHECK_MEMBER (ptl_hdr_t, msg.ack.length); + + BLANK_LINE (); + COMMENT ("Put"); + CHECK_MEMBER (ptl_hdr_t, msg.put.ptl_index); + CHECK_MEMBER (ptl_hdr_t, msg.put.ack_wmd); + CHECK_MEMBER (ptl_hdr_t, msg.put.match_bits); + CHECK_MEMBER (ptl_hdr_t, msg.put.length); + CHECK_MEMBER (ptl_hdr_t, msg.put.offset); + CHECK_MEMBER (ptl_hdr_t, msg.put.hdr_data); + + BLANK_LINE (); + COMMENT ("Get"); + CHECK_MEMBER (ptl_hdr_t, msg.get.ptl_index); + CHECK_MEMBER (ptl_hdr_t, msg.get.return_wmd); + CHECK_MEMBER (ptl_hdr_t, msg.get.match_bits); + CHECK_MEMBER (ptl_hdr_t, msg.get.length); + CHECK_MEMBER (ptl_hdr_t, msg.get.src_offset); + CHECK_MEMBER (ptl_hdr_t, msg.get.return_offset); + CHECK_MEMBER (ptl_hdr_t, msg.get.sink_length); + + BLANK_LINE (); + COMMENT ("Reply"); + CHECK_MEMBER (ptl_hdr_t, msg.reply.dst_wmd); + CHECK_MEMBER (ptl_hdr_t, msg.reply.dst_offset); + CHECK_MEMBER (ptl_hdr_t, msg.reply.length); +} + +int +main (int argc, char **argv) +{ + printf ("void lib_assert_wire_constants (void)\n" + "{\n"); + + COMMENT ("Wire protocol assertions generated by 'wirecheck'"); + BLANK_LINE (); + + COMMENT ("Constants..."); + CHECK_DEFINE (PORTALS_PROTO_MAGIC); + CHECK_DEFINE (PORTALS_PROTO_VERSION_MAJOR); + CHECK_DEFINE (PORTALS_PROTO_VERSION_MINOR); + + CHECK_VALUE (PTL_MSG_ACK); + CHECK_VALUE (PTL_MSG_PUT); + CHECK_VALUE (PTL_MSG_GET); + CHECK_VALUE (PTL_MSG_REPLY); + CHECK_VALUE 
(PTL_MSG_HELLO); + + check_ptl_handle_wire (); + check_ptl_magicversion (); + check_ptl_hdr (); + + printf ("}\n\n"); + + return (0); +} diff --git a/lustre/.cvsignore b/lustre/.cvsignore index 34373dd..776ef36 100644 --- a/lustre/.cvsignore +++ b/lustre/.cvsignore @@ -1,4 +1,5 @@ .Xrefs +.Xrefs-2.5 aclocal.m4 config.log config.status @@ -13,3 +14,5 @@ lustre*.tar.gz cscope.files cscope.out autom4te-2.53.cache +autom4te.cache + diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 97789a8..89eaef7 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -1,3 +1,45 @@ +tbd + * version v0_7 + * bug fixes + - imports and exports cleanup too early, need refcounts (349, 879, 1045) + - per-import/export recovery handling (958, 931, 959) + - multiple last-rcvd slots, for serving multiple FSes (949) + - connections are again shared between multiple imp/exports (963, 964) + - "umount -f" would hang if any requests needed to be sent (393, 978) + - avoid pinning large req buffer by copying for queued messages (989) + - add "uuid" to "lctl device" command to help upcalls (991) + - "open" RPCs with transnos would confuse recovery counters (1037) + - do proper endian conversion of all wire messages (288, 340, 891) + - remove OST bulk get LBUGs, fix ost_brw_write cleanup (1126) + - call waiting locks callback from LDLM recovery thread (1127, 1151) + - fix ptlrpc_connection leak in target_handle_connect (1174) + - fix import refcounting bug in OST and MDS cleanup (1134) + - if an invalid-at-open-time OSC returned before close(), LBUG (1150) + - fix very unlikely obd_types race condition (501) + - remove osc_open hack for echo_client (1187) + - we leaked exports/dlmimps for forcibly disconnected clients (1143) + - a failure in read_inode2 leads to deadlock (1139) + - cancel ack-locks as soon as transaction is committed (1072) + - fix major leaks and crashes in the bulk I/O path (937, 1057) + - make sure to commitrw after any preprw to avoid deadlock (1162) + - failing to execute a 
file in a lustre FS would lock inode (1203) + - small DEBUG_REQ fix to avoid dereferencing a NULL (1227) + - don't ASSERT while cleaning up an incompletely-setup obd (1248) + - obd_uuid2tgt would walk off the end of the list (1255) + - on IA64 the osc would give portals incorrect bulk size (1258) + - fix debug daemon ioctl interface; allows daemon on ia64 (1274) + - fix lock inversion caused by new llite matching code (1282) + - limit the number of dirty pages on a client to 10MB (1286) + - timed out locks were not being correctly cancelled (1289) + - fix O_DIRECT above 4GB on IA-32 (1292) + * major user-visible changes + - fail out/fail over policy now controlled by the upcall (993) + * protocol changes + - add OBD_PING to check server availability and failure (954) + - lustre messages are now sent in sending host order (288, 340, 891) + - add eadatalen to MDS getattr reply (340) + - OST read replies may contain second buffer, with per-page status (593) + 2003-03-11 Phil Schwan * version v0_6 * bug fixes diff --git a/lustre/Makefile.am b/lustre/Makefile.am index 7ad7358..47d3c28 100644 --- a/lustre/Makefile.am +++ b/lustre/Makefile.am @@ -8,22 +8,21 @@ AUTOMAKE_OPTIONS = foreign if LINUX25 DIRS24 = else -DIRS24 = extN ptlbd +DIRS24 = ptlbd endif if LIBLUSTRE -#SUBDIRS = lov obdclass ptlrpc obdecho ldlm osc liblustre utils -SUBDIRS = lov obdclass ptlrpc obdecho ldlm osc utils +SUBDIRS = portals lov obdclass ptlrpc obdecho ldlm osc utils mdc #liblustre else # NOTE: keep extN before obdclass, mds, and obdfilter. Keep obdclass as early # as possible, to have the best chance at stopping with "wrong kernel version" # instead of some related build failure. 
-SUBDIRS = $(DIRS24) obdclass mds utils ptlrpc ldlm lib obdfilter mdc osc ost -SUBDIRS+= llite obdecho lov cobd tests doc scripts conf +SUBDIRS = portals $(DIRS24) obdclass mds utils ldlm obdfilter mdc osc ost +SUBDIRS+= llite obdecho lov cobd tests doc scripts conf ptlrpc endif DIST_SUBDIRS = $(SUBDIRS) liblustre -EXTRA_DIST = BUGS FDL Rules include archdep.m4 kernel_patches +EXTRA_DIST = BUGS FDL Rules include kernel_patches # We get the version from the spec file. CONFIGURE_DEPENDENCIES = scripts/lustre.spec.in @@ -36,3 +35,4 @@ include $(top_srcdir)/Rules rpms: dist Makefile rpmbuild -ta $(distdir).tar.gz + diff --git a/lustre/Makefile.mk b/lustre/Makefile.mk new file mode 100644 index 0000000..e540148 --- /dev/null +++ b/lustre/Makefile.mk @@ -0,0 +1,4 @@ +include fs/lustre/portals/Kernelenv + +obj-y += portals/ +obj-y += mds/ diff --git a/lustre/README b/lustre/README index a7b7240..1a80657 100644 --- a/lustre/README +++ b/lustre/README @@ -1,4 +1,4 @@ -Instructions for building, configuring, and running Lustre can be found in +Instructions for building, configuring and running Lustre can be found in the file doc/lustre-HOWTO.txt. If you have checked lustre directly out of CVS, then you either need to diff --git a/lustre/Rules b/lustre/Rules index d4e5ed7..0d92246 100644 --- a/lustre/Rules +++ b/lustre/Rules @@ -10,6 +10,17 @@ # name_SOURCES = my.c files.c # include $(top_srcdir)/Rules +if LINUX25 + +# We still need to link each module with vermagic.o to get rid of "kernel tainted" warnings. 
+basename=$(shell echo $< | sed -e 's/\.c//g' | sed -e 's/-//g' | sed -e 's/\.o//g') +AM_CPPFLAGS=-I$(top_builddir)/include -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -mpreferred-stack-boundary=2 -DKBUILD_MODNAME=$(MODULE) -DKBUILD_BASENAME=$(basename) + +else + +AM_CPPFLAGS=-I$(top_builddir)/include + +endif $(MODULE).o: $($(MODULE)_OBJECTS) $(LD) -m "`$(LD) --help | awk '/supported emulations/ {print $$4}'`" -r -o $(MODULE).o $($(MODULE)_OBJECTS) @@ -17,9 +28,6 @@ $(MODULE).o: $($(MODULE)_OBJECTS) tags: rm -f $(top_srcdir)/TAGS rm -f $(top_srcdir)/tags - find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs etags -a find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs etags -a - find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs ctags -a find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs ctags -a -AM_CPPFLAGS=-I$(top_builddir)/include diff --git a/lustre/archdep.m4 b/lustre/archdep.m4 deleted file mode 100644 index 2bdd785..0000000 --- a/lustre/archdep.m4 +++ /dev/null @@ -1,127 +0,0 @@ -AC_ARG_WITH(lib, [ --with-lib compile lustre library], host_cpu="lib") - -AC_MSG_CHECKING(if you are running user mode linux for $host_cpu ...) 
-if test $host_cpu = "lib" ; then - host_cpu="lib" - AC_MSG_RESULT(no building Lustre library) -else -if test -e $LINUX/include/asm-um ; then -if test X`ls -id $LINUX/include/asm/ | awk '{print $1}'` = X`ls -id $LINUX/include/asm-um | awk '{print $1}'` ; then - host_cpu="um"; - AC_MSG_RESULT(yes) -else - AC_MSG_RESULT(no (asm doesn't point at asm-um)) -fi - -else - AC_MSG_RESULT(no (asm-um missing)) -fi -fi - -AC_MSG_CHECKING(setting make flags system architecture: ) -case ${host_cpu} in - lib ) - AC_MSG_RESULT($host_cpu) - KCFLAGS='-g -Wall ' - KCPPFLAGS='-D__arch_lib__ ' - MOD_LINK=elf_i386 -;; - um ) - AC_MSG_RESULT($host_cpu) - KCFLAGS='-g -Wall -pipe -Wno-trigraphs -Wstrict-prototypes -fno-strict-aliasing -fno-common ' - case ${linux25} in - yes ) - KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include -I$(LINUX)/arch/um/kernel/tt/include -O2 -nostdinc -iwithprefix include -DKBUILD_BASENAME=$(MODULE) -DKBUILD_MODNAME=$(MODULE) ' - ;; - * ) - KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include ' - ;; - esac - - MOD_LINK=elf_i386 -;; - i*86 ) - AC_MSG_RESULT($host_cpu) - KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -pipe' - case ${linux25} in - yes ) - KCPPFLAGS='-D__KERNEL__ -DMODULE -march=i686 -I$(LINUX)/include/asm-i386/mach-default -nostdinc -iwithprefix include ' - ;; - * ) - KCPPFLAGS='-D__KERNEL__ -DMODULE ' - ;; - esac - MOD_LINK=elf_i386 -;; - - alphaev6 ) - AC_MSG_RESULT($host_cpu) - KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6' - KCPPFLAGS='-D__KERNEL__ -DMODULE ' - MOD_LINK=elf64alpha -;; - - alphaev67 ) - AC_MSG_RESULT($host_cpu) - KCFLAGS='-g -O2 -Wall 
-Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6' - KCPPFLAGS='-D__KERNEL__ -DMODULE ' - MOD_LINK=elf64alpha -;; - - alpha* ) - AC_MSG_RESULT($host_cpu) - KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev5' - KCPPFLAGS='-D__KERNEL__ -DMODULE ' - MOD_LINK=elf64alpha -;; - - ia64 ) - AC_MSG_RESULT($host_cpu) - KCFLAGS='-gstabs -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -ffixed-r13 -mfixed-range=f10-f15,f32-f127 -falign-functions=32 -mb-step' - KCPPFLAGS='-D__KERNEL__ -DMODULE' - MOD_LINK=elf64_ia64 -;; - - sparc64 ) - AC_MSG_RESULT($host_cpu) - KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -Wno-unused -m64 -pipe -mno-fpu -mcpu=ultrasparc -mcmodel=medlow -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare -Wa,--undeclared-regs' - KCPPFLAGS='-D__KERNEL__' - MOD_LINK=elf64_sparc - -;; - - powerpc ) - AC_MSG_RESULT($host_cpu) - KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float -pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring' - KCPPFLAGS='-D__KERNEL__' - MOD_LINK=elf32ppclinux -;; - - *) - AC_ERROR("Unknown Linux Platform: $host_cpu") -;; -esac - -if test $host_cpu != lib ; then -AC_MSG_CHECKING(for MODVERSIONS) -if egrep -e 'MODVERSIONS.*1' $LINUX/include/linux/autoconf.h >/dev/null 2>&1; -then - MFLAGS="-DMODULE -DMODVERSIONS -include $LINUX/include/linux/modversions.h -DEXPORT_SYMTAB" - AC_MSG_RESULT(yes) -else - MFLAGS= - AC_MSG_RESULT(no) -fi - -AC_MSG_CHECKING(for SMP) -if egrep -e SMP=y $LINUX/.config >/dev/null 2>&1; then - SMPFLAG= - AC_MSG_RESULT(yes) -else - SMPFLAG= - AC_MSG_RESULT(no) -fi -fi - -CFLAGS="$KCFLAGS $MFLAGS" -ARCHCPPFLAGS="$KCPPFLAGS" diff 
--git a/lustre/autogen.sh b/lustre/autogen.sh index 087ff09..9deed73 100644 --- a/lustre/autogen.sh +++ b/lustre/autogen.sh @@ -1,6 +1,5 @@ #!/bin/sh -find . -type d -name .deps | xargs rm -rf aclocal && -${AUTOMAKE:-automake} --add-missing && +automake --add-missing && ${AUTOCONF:-autoconf} diff --git a/lustre/cobd/cache_obd.c b/lustre/cobd/cache_obd.c index 67b4e62..c96b2ad 100644 --- a/lustre/cobd/cache_obd.c +++ b/lustre/cobd/cache_obd.c @@ -71,23 +71,23 @@ cobd_setup (struct obd_device *dev, obd_count len, void *buf) /* don't bother checking attached/setup; * obd_connect() should, and it can change underneath us */ - rc = obd_connect (&cobd->cobd_target, target, &target_uuid, NULL, NULL); + rc = obd_connect (&cobd->cobd_target, target, &target_uuid); if (rc != 0) return (rc); - rc = obd_connect (&cobd->cobd_cache, cache, &cache_uuid, NULL, NULL); + rc = obd_connect (&cobd->cobd_cache, cache, &cache_uuid); if (rc != 0) goto fail_0; return (0); fail_0: - obd_disconnect (&cobd->cobd_target); + obd_disconnect (&cobd->cobd_target, 0 ); return (rc); } static int -cobd_cleanup (struct obd_device *dev) +cobd_cleanup (struct obd_device *dev, int force, int failover) { struct cache_obd *cobd = &dev->u.cobd; int rc; @@ -95,11 +95,11 @@ cobd_cleanup (struct obd_device *dev) if (!list_empty (&dev->obd_exports)) return (-EBUSY); - rc = obd_disconnect (&cobd->cobd_cache); + rc = obd_disconnect (&cobd->cobd_cache, failover); if (rc != 0) CERROR ("error %d disconnecting cache\n", rc); - rc = obd_disconnect (&cobd->cobd_target); + rc = obd_disconnect (&cobd->cobd_target, failover); if (rc != 0) CERROR ("error %d disconnecting target\n", rc); @@ -108,8 +108,7 @@ cobd_cleanup (struct obd_device *dev) static int cobd_connect (struct lustre_handle *conn, struct obd_device *obd, - struct obd_uuid *cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) + struct obd_uuid *cluuid) { int rc = class_connect (conn, obd, cluuid); @@ -118,9 +117,9 @@ cobd_connect (struct 
lustre_handle *conn, struct obd_device *obd, } static int -cobd_disconnect (struct lustre_handle *conn) +cobd_disconnect (struct lustre_handle *conn, int failover) { - int rc = class_disconnect (conn); + int rc = class_disconnect (conn, failover); CERROR ("rc %d\n", rc); return (rc); @@ -128,13 +127,13 @@ cobd_disconnect (struct lustre_handle *conn) static int cobd_get_info(struct lustre_handle *conn, obd_count keylen, - void *key, obd_count *vallen, void **val) + void *key, __u32 *vallen, void *val) { struct obd_device *obd = class_conn2obd(conn); struct cache_obd *cobd; if (obd == NULL) { - CERROR("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", conn->cookie); return -EINVAL; } @@ -142,8 +141,7 @@ cobd_get_info(struct lustre_handle *conn, obd_count keylen, /* intercept cache utilisation info? */ - return (obd_get_info (&cobd->cobd_target, - keylen, key, vallen, val)); + return obd_get_info(&cobd->cobd_target, keylen, key, vallen, val); } static int @@ -153,7 +151,7 @@ cobd_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) struct cache_obd *cobd; if (obd == NULL) { - CERROR("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", conn->cookie); return -EINVAL; } @@ -169,7 +167,7 @@ cobd_getattr(struct lustre_handle *conn, struct obdo *oa, struct cache_obd *cobd; if (obd == NULL) { - CERROR("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", conn->cookie); return -EINVAL; } @@ -179,18 +177,19 @@ cobd_getattr(struct lustre_handle *conn, struct obdo *oa, static int cobd_open(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *lsm, struct obd_trans_info *oti) + struct lov_stripe_md *lsm, struct obd_trans_info *oti, + struct obd_client_handle *och) { struct obd_device *obd = class_conn2obd(conn); struct cache_obd *cobd; if (obd == NULL) { - CERROR("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", conn->cookie); 
return -EINVAL; } cobd = &obd->u.cobd; - return (obd_open (&cobd->cobd_target, oa, lsm, oti)); + return (obd_open (&cobd->cobd_target, oa, lsm, oti, och)); } static int @@ -201,7 +200,7 @@ cobd_close(struct lustre_handle *conn, struct obdo *oa, struct cache_obd *cobd; if (obd == NULL) { - CERROR("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", conn->cookie); return -EINVAL; } @@ -209,66 +208,59 @@ cobd_close(struct lustre_handle *conn, struct obdo *oa, return (obd_close (&cobd->cobd_target, oa, lsm, oti)); } -static int -cobd_preprw(int cmd, struct lustre_handle *conn, - int objcount, struct obd_ioobj *obj, - int niocount, struct niobuf_remote *nb, - struct niobuf_local *res, void **desc_private, - struct obd_trans_info *oti) +static int cobd_preprw(int cmd, struct obd_export *exp, + int objcount, struct obd_ioobj *obj, + int niocount, struct niobuf_remote *nb, + struct niobuf_local *res, void **desc_private, + struct obd_trans_info *oti) { - struct obd_device *obd = class_conn2obd(conn); - struct cache_obd *cobd; + struct obd_export *cobd_exp; + int rc; - if (obd == NULL) { - CERROR("invalid client "LPX64"\n", conn->addr); + if (exp->exp_obd == NULL) return -EINVAL; - } if ((cmd & OBD_BRW_WRITE) != 0) return -EOPNOTSUPP; - cobd = &obd->u.cobd; - return (obd_preprw (cmd, &cobd->cobd_target, - objcount, obj, - niocount, nb, - res, desc_private, oti)); + cobd_exp = class_conn2export(&exp->exp_obd->u.cobd.cobd_target); + rc = obd_preprw(cmd, cobd_exp, objcount, obj, niocount, nb, res, + desc_private, oti); + class_export_put(cobd_exp); + return rc; } -static int -cobd_commitrw(int cmd, struct lustre_handle *conn, - int objcount, struct obd_ioobj *obj, - int niocount, struct niobuf_local *local, - void *desc_private, struct obd_trans_info *oti) +static int cobd_commitrw(int cmd, struct obd_export *exp, + int objcount, struct obd_ioobj *obj, + int niocount, struct niobuf_local *local, + void *desc_private, struct obd_trans_info *oti) 
{ - struct obd_device *obd = class_conn2obd(conn); - struct cache_obd *cobd; + struct obd_export *cobd_exp; + int rc; - if (obd == NULL) { - CERROR("invalid client "LPX64"\n", conn->addr); + if (exp->exp_obd == NULL) return -EINVAL; - } if ((cmd & OBD_BRW_WRITE) != 0) return -EOPNOTSUPP; - cobd = &obd->u.cobd; - return (obd_commitrw (cmd, &cobd->cobd_target, - objcount, obj, - niocount, local, - desc_private, oti)); + cobd_exp = class_conn2export(&exp->exp_obd->u.cobd.cobd_target); + rc = obd_commitrw(cmd, cobd_exp, objcount, obj, niocount, local, + desc_private, oti); + class_export_put(cobd_exp); + return rc; } static inline int cobd_brw(int cmd, struct lustre_handle *conn, struct lov_stripe_md *lsm, obd_count oa_bufs, - struct brw_page *pga, struct obd_brw_set *set, - struct obd_trans_info *oti) + struct brw_page *pga, struct obd_trans_info *oti) { struct obd_device *obd = class_conn2obd(conn); struct cache_obd *cobd; if (obd == NULL) { - CERROR("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", conn->cookie); return -EINVAL; } @@ -277,7 +269,7 @@ cobd_brw(int cmd, struct lustre_handle *conn, cobd = &obd->u.cobd; return (obd_brw (cmd, &cobd->cobd_target, - lsm, oa_bufs, pga, set, oti)); + lsm, oa_bufs, pga, oti)); } static int @@ -288,7 +280,7 @@ cobd_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, struct cache_obd *cobd; if (obd == NULL) { - CERROR("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", conn->cookie); return -EINVAL; } diff --git a/lustre/cobd/lproc_cache.c b/lustre/cobd/lproc_cache.c index 7e5c267..fd7474b 100644 --- a/lustre/cobd/lproc_cache.c +++ b/lustre/cobd/lproc_cache.c @@ -40,13 +40,14 @@ static int rd_target(char *page, char **start, off_t off, int count, LASSERT(dev != NULL); conn = &dev->u.cobd.cobd_target; - if ((dev->obd_flags & OBD_SET_UP) == 0) + if (!dev->obd_set_up) { rc = snprintf (page, count, "not set up\n"); - else { - exp = class_conn2export 
(conn); + } else { + exp = class_conn2export(conn); LASSERT(exp != NULL); rc = snprintf(page, count, "%s\n", exp->exp_obd->obd_uuid.uuid); + class_export_put(exp); } return (rc); } @@ -62,13 +63,14 @@ static int rd_cache(char *page, char **start, off_t off, int count, LASSERT(dev != NULL); conn = &dev->u.cobd.cobd_cache; - if ((dev->obd_flags & OBD_SET_UP) == 0) + if (!dev->obd_set_up) { rc = snprintf (page, count, "not set up\n"); - else { - exp = class_conn2export (conn); + } else { + exp = class_conn2export(conn); LASSERT (exp != NULL); rc = snprintf(page, count, "%s\n", exp->exp_obd->obd_uuid.uuid); + class_export_put(exp); } return (rc); } diff --git a/lustre/conf/lustre.dtd b/lustre/conf/lustre.dtd index 8d575a61..51d1d1a 100644 --- a/lustre/conf/lustre.dtd +++ b/lustre/conf/lustre.dtd @@ -12,19 +12,21 @@ + - + + nettype (tcp | elan | gm | scimac) 'tcp'> - @@ -47,18 +49,20 @@ + target_ref | node_ref | journalsize )*> - - + + - - + + + target_ref | node_ref | journalsize )*> @@ -72,11 +76,14 @@ + + + - - + + diff --git a/lustre/conf/lustre2ldif.xsl b/lustre/conf/lustre2ldif.xsl index f3c1364..5fc6f9c 100644 --- a/lustre/conf/lustre2ldif.xsl +++ b/lustre/conf/lustre2ldif.xsl @@ -10,6 +10,7 @@ dn: uuid: CONFIG_UUID objectClass: LUSTRECONFIG config: +version: @@ -23,8 +24,30 @@ networkRef: profileRef: + +timeout: + + +lustreUpcall: + + +portalsUpcall: + - + + +dn: uuid=, +objectClass: NETWORK +lustreName: +uuid: +nettype: +nid: + +port: + + + + + @@ -76,6 +113,9 @@ devpath: devsize: + +journalsize: + nodeRef: targetRef: @@ -124,6 +164,9 @@ devpath: devsize: + +journalsize: + @@ -132,6 +175,22 @@ devsize: dn: uuid=, objectClass: OST lustreName: +uuid: +activeRef: + +failover: + + +group: + + + + + + + + @@ -217,17 +280,8 @@ echoclientRef: lovRef: - - - - - diff --git a/lustre/conf/slapd-lustre.conf b/lustre/conf/slapd-lustre.conf index 7906908..8558f64 100644 --- a/lustre/conf/slapd-lustre.conf +++ b/lustre/conf/slapd-lustre.conf @@ -1,6 +1,5 @@ 
####################################################################### # lustre ldap config database -# $Id: slapd-lustre.conf,v 1.3 2003/03/11 23:36:45 pschwan Exp $ ####################################################################### database ldbm diff --git a/lustre/configure.in b/lustre/configure.in index 5c5f438..0850115 100644 --- a/lustre/configure.in +++ b/lustre/configure.in @@ -1,201 +1,49 @@ -AC_INIT -AC_CANONICAL_SYSTEM - # Copyright (C) 2001-2003 Cluster File Systems, Inc. # # This code is issued under the GNU General Public License. # See the file COPYING in this distribution +AC_INIT +AC_CANONICAL_SYSTEM + # Automake variables. Steal the version number from lustre.spec.in. AM_INIT_AUTOMAKE(lustre, builtin([esyscmd], [sed -ne '/^%define version /{ s/.*version //; p; q; }' scripts/lustre.spec.in])) #AM_MAINTAINER_MODE -AC_PROG_CC -AC_MSG_CHECKING(for buggy compiler) -CC_VERSION=`$CC -v 2>&1 | grep "^gcc version"` -bad_cc() { - echo - echo " '$CC_VERSION'" - echo " has been known to generate bad code, " - echo " please get an updated compiler." 
- AC_MSG_ERROR(sorry) -} -TMP_VERSION=`echo $CC_VERSION | cut -c 1-16` -if test "$TMP_VERSION" = "gcc version 2.95"; then - bad_cc -fi -case "$CC_VERSION" in - # ost_pack_niobuf putting 64bit NTOH temporaries on the stack - # without "sub $0xc,%esp" to protect the stack from being - # stomped on by interrupts (bug 606) - "gcc version 2.96 20000731 (Red Hat Linux 7.1 2.96-98)") - bad_cc - ;; - *) - AC_MSG_RESULT(no known problems) - ;; -esac - -AC_PROG_RANLIB - -# -# Check for required packages - -# this doesn't seem to work on older autoconf -# AC_CHECK_LIB(readline, readline,,) - -AC_ARG_ENABLE(readline, [ --enable-readline use readline library],, - enable_readline="yes") - -if test "$enable_readline" = "yes" ; then - LIBREADLINE="-lreadline -lncurses" - HAVE_LIBREADLINE="-DHAVE_LIBREADLINE=1" -else - LIBREADLINE="" - HAVE_LIBREADLINE="" -fi -AC_SUBST(LIBREADLINE) -AC_SUBST(HAVE_LIBREADLINE) - -AC_ARG_ENABLE(efence, [ --enable-efence use efence library],, - enable_efence="no") - -if test "$enable_efence" = "yes" ; then - LIBEFENCE="-lefence" - HAVE_LIBEFENCE="-DHAVE_LIBEFENCE=1" -else - LIBEFENCE="" - HAVE_LIBEFENCE="" -fi -AC_SUBST(LIBEFENCE) -AC_SUBST(HAVE_LIBEFENCE) - -# XXX this should be a runtime option -AC_MSG_CHECKING(if you are enabling OST recovery...) -AC_ARG_ENABLE(ost_recovery, [ --enable-ost-recovery: enable support for ost recovery],, - enable_ost_recovery="no") -if test "$enable_ost_recovery" = "yes" ; then - ENABLE_OST_RECOVERY="-DOST_RECOVERY=1" - AC_MSG_RESULT(yes) -else - ENABLE_OST_RECOVERY="" - AC_MSG_RESULT(no) -fi -AC_SUBST(ENABLE_OST_RECOVERY) - - -# Kernel build environment. 
-ac_default_prefix= -bindir='${exec_prefix}/usr/bin' -sbindir='${exec_prefix}/usr/sbin' - -linuxdir_def=/usr/src/linux -AC_ARG_WITH(linux, [ --with-linux=[path] set path to Linux source (default=/usr/src/linux)], enable_linuxdir=$withval) -AC_ARG_ENABLE(linuxdir, [ --enable-linuxdir=[path] (deprecated) set path to Linux source (default=/usr/src/linux)],, enable_linuxdir=$linuxdir_def) - -LINUX=$enable_linuxdir -AC_SUBST(LINUX) - -AC_MSG_CHECKING(if you are running linux 2.5...) -if test -e $LINUX/include/linux/namei.h ; then - linux25="yes" - AC_MSG_RESULT(yes) -else - linux25="no" - AC_MSG_RESULT(no) -fi -AM_CONDITIONAL(LINUX25, test x$linux25 = xyes) - -sinclude(archdep.m4) - - -portalsdir_def='$(top_srcdir)/../portals' -AC_ARG_WITH(portals, [ --with-portals=[path] set path to Portals source (default=../portals)], enable_portalsdir=$withval) -AC_ARG_ENABLE(portalsdir, [ --enable-portalsdir=[path] (deprecated) set path to Portals source (default=$portalsdir_def],, enable_portalsdir=$portalsdir_def) -PORTALS=$enable_portalsdir - -if test $PORTALS = $portalsdir_def; then - PORTALSLOC='../portals' -else - PORTALSLOC=$PORTALS -fi - -AC_SUBST(PORTALS) -AC_SUBST(PORTALSLOC) - -portalslib_def=$enable_portalsdir/linux/utils -AC_ARG_WITH(portalslib, [ --with-portalslib=[path] set path to Portals library (default=../portals/linux/utils)], enable_portalslib=$withval) -AC_ARG_ENABLE(portalslib, [ --enable-portalslib=[path] (deprecated) set path to Portals lib (default=../portals/linux/utils)],, enable_portalslib=$portalslib_def) - - -if ! 
test -z "$enable_portalslib"; then - PORTALSLIB=${enable_portalslib} -fi -AC_SUBST(PORTALSLIB) - -AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib) -AC_MSG_CHECKING(if you are building lib lustre) -if test "$host_cpu" = "lib"; then - AC_MSG_RESULT(yes) - libdir='${exec_prefix}/lib/lustre' -else - AC_MSG_RESULT(no) -fi - -if test $host_cpu != "lib" ; then -KINCFLAGS='-I$(top_srcdir)/include -I$(PORTALS)/include -I$(LINUX)/include' -else -KINCFLAGS='-I$(top_srcdir)/include -I$(PORTALS)/include' -fi -CPPFLAGS="$KINCFLAGS $ARCHCPPFLAGS $ENABLE_OST_RECOVERY" - -if test $host_cpu != "lib" ; then -AC_MSG_CHECKING(if make dep has been run in kernel source (host $host_cpu) ) -if test -f $LINUX/include/linux/config.h ; then - AC_MSG_RESULT(yes) -else - AC_MSG_ERROR(** cannot find $LINUX/include/linux/config.h. Run make dep in $LINUX.) -fi - -AC_MSG_CHECKING(if autoconf.h is in kernel source) -if test -f $LINUX/include/linux/autoconf.h ; then - AC_MSG_RESULT(yes) -else - AC_MSG_ERROR(** cannot find $LINUX/include/linux/autoconf.h. Run make config in $LINUX.) -fi - -AC_MSG_CHECKING(for Linux release) - -dnl We need to rid ourselves of the nasty [ ] quotes. -changequote(, ) -dnl Get release from version.h -RELEASE="`sed -ne 's/.*UTS_RELEASE[ \"]*\([0-9.a-zA-Z_-]*\).*/\1/p' $LINUX/include/linux/version.h`" -changequote([, ]) - -moduledir='$(libdir)/modules/'$RELEASE/kernel -AC_SUBST(moduledir) - -modulefsdir='$(moduledir)/fs/$(PACKAGE)' -AC_SUBST(modulefsdir) - -AC_MSG_RESULT($RELEASE) -AC_SUBST(RELEASE) - -fi -# Directories for documentation and demos. 
-docdir='${prefix}/usr/share/doc/$(PACKAGE)' -AC_SUBST(docdir) - -demodir='$(docdir)/demo' -AC_SUBST(demodir) - -# not needed until the AC_CHECK_LIB(readline) above works -# AM_CONFIG_HEADER(include/config.h) - -AC_OUTPUT(Makefile lib/Makefile ldlm/Makefile obdecho/Makefile ptlrpc/Makefile \ - liblustre/Makefile \ - lov/Makefile osc/Makefile mdc/Makefile mds/Makefile ost/Makefile \ - cobd/Makefile ptlbd/Makefile conf/Makefile \ - utils/Makefile utils/lconf tests/Makefile obdfilter/Makefile \ - obdclass/Makefile llite/Makefile doc/Makefile scripts/Makefile \ - scripts/lustre.spec extN/Makefile, chmod +x utils/lconf) +# LLNL patches their ext3 and calls it extN +AC_ARG_ENABLE(extN, [ --enable-extN use extN instead of ext3 for lustre backend]) +AM_CONDITIONAL(EXTN, test x$enable_extN = xyes) + +AC_ARG_WITH(obd-buffer-size, [ --with-obd-buffer-size=[size] set lctl ioctl maximum (default=8K)],OBD_BUFFER_SIZE=$with_obd_buffer_size,OBD_BUFFER_SIZE=8192) +AC_SUBST(OBD_BUFFER_SIZE) + +sinclude(portals/build.m4) +sinclude(portals/archdep.m4) + +if test x$enable_inkernel = xyes ; then +cp Makefile.mk Makefile.in +cp mds/Makefile.mk mds/Makefile.in +cp portals/Kernelenv.mk portals/Kernelenv.in +cp portals/Makefile.mk portals/Makefile.in +cp portals/libcfs/Makefile.mk portals/libcfs/Makefile.in +cp portals/portals/Makefile.mk portals/portals/Makefile.in +cp portals/knals/Makefile.mk portals/knals/Makefile.in +cp portals/knals/socknal/Makefile.mk portals/knals/socknal/Makefile.in +cp portals/router/Makefile.mk portals/router/Makefile.in +fi + +AM_CONFIG_HEADER(portals/include/config.h) + +AC_OUTPUT([Makefile portals/Makefile portals/Kernelenv \ + portals/libcfs/Makefile portals/portals/Makefile \ + portals/unals/Makefile portals/knals/Makefile \ + portals/router/Makefile portals/knals/socknal/Makefile \ + portals/knals/gmnal/Makefile portals/knals/qswnal/Makefile \ + portals/knals/scimacnal/Makefile portals/knals/toenal/Makefile \ + portals/utils/Makefile portals/tests/Makefile 
portals/doc/Makefile \ + ldlm/Makefile obdecho/Makefile ptlrpc/Makefile liblustre/Makefile \ + lov/Makefile osc/Makefile mdc/Makefile mds/Makefile ost/Makefile \ + cobd/Makefile ptlbd/Makefile conf/Makefile tests/Makefile \ + utils/Makefile utils/Lustre/Makefile obdfilter/Makefile \ + obdclass/Makefile llite/Makefile doc/Makefile scripts/Makefile \ + scripts/lustre.spec]) diff --git a/lustre/doc/lconf.lyx b/lustre/doc/lconf.lyx index 3bfecbd..85c670b 100644 --- a/lustre/doc/lconf.lyx +++ b/lustre/doc/lconf.lyx @@ -48,7 +48,24 @@ lconf\SpecialChar ~ DESCRIPTION \layout Standard -This program configures a node following directives in the . +This program configures a node following directives in the .. + There will be single configuration file for all the nodes in a single cluster. + This file should be distributed to all the nodes in the cluster or kept + in a location accessible to all the nodes. + One option is to store the cluster configuration information in LDAP format + on an LDAP server that can be reached from all the cluster nodes. +\layout Description + +--ldapurl\SpecialChar ~ + LDAP server URL +\layout Description + +--config\SpecialChar ~ + Cluster configuration name used for LDAP query +\layout Description + +--select\SpecialChar ~ + Select a particular node for a service \layout Description --node\SpecialChar ~ @@ -67,7 +84,7 @@ node_name error. \layout Description ---cleanup Unconfigure a node. +--d|--cleanup Unconfigure a node. The same config and \emph on --node @@ -77,6 +94,21 @@ node_name including unloading the kernel modules. \layout Description +--force Forced unmounting and/or obd detach during cleanup. + Default is 0. + +\layout Description + +--mds_ost_conn Open connections to OSTs on MDS. +\layout Description + +--failover Used to shutdown without saving state. + Default is 0. + This will allow the node to give up service to another node for failover + purposes. + This will not be a clean shutdown. 
+\layout Description + --noexec Print, but don't execute, the steps lconf will perform. This is useful for debugging a configuration, and when used with \emph on @@ -90,9 +122,27 @@ node_name module script is always created, however). \layout Description +--gdb_script\SpecialChar ~ + Full name of gdb debug script. + Default is /tmp/ogdb. +\layout Description + +--dump_path\SpecialChar ~ + Path to save debug dumps. + Default is /tmp/lustre_log +\layout Description + +--recover\SpecialChar ~ + Recover a device. +\layout Description + --nosetup Only load modules, do not configure devices or services. \layout Description +--group\SpecialChar ~ + The group of devices to cleanup/configure. +\layout Description + --nomod Only setup devices and services, do not load modules. \layout Description @@ -102,15 +152,44 @@ node_name --verbose,-v Be verbose and show actions while going along. \layout Description ---reformat Reformat all the devices +--timeout\SpecialChar ~ + Set the recovery timeout period. +\layout Description + +--lustre_upcall\SpecialChar ~ + Set the location of the Lustre upcall scripts used + by the client for recovery +\layout Description + +--portals_upcall\SpecialChar ~ + Specify the location of the Portals upcall scripts + used by the client for recovery +\layout Description + +--upcall\SpecialChar ~ + Set the location of both Lustre and Portals upcall scripts + used by the client for recovery +\layout Description + +--lctl-dump\SpecialChar ~ + Dump all ioctls to the specified file +\layout Description + +--dump\SpecialChar ~ + Dump the kernel debug log to the specified file before portals + is unloaded during cleanup. +\layout Description + +--reformat Reformat all the devices. + This is essential on the first time the file system is brought up. \layout Description -h,--help Print help. \layout Description --maxlevel\SpecialChar ~ - [NOT IMPLEMENTED] Perform configuration of devices and - services up to level given. 
+ Perform configuration of devices and services up to level + given. \emph on level @@ -122,6 +201,32 @@ net, dev, svc, fs. \series default When used in conjunction with cleanup, services are torn down up to a certain level. + Default is 100. +\layout Description + +--minlevel\SpecialChar ~ + Specify the minimum level of services to configure/cleanup. + Default is 0. +\layout Description + +--lustre=src_dir Specify the base directory for Lustre sources, this parameter + will cause lconf to load the lustre modules from this source tree. +\layout Description + +--portals=src_dir Portals source directory. + If this is a relative path, it is assumed to be relative to Lustre source + tree location. + +\layout Description + +--ptldebug\SpecialChar ~ +debug\SpecialChar ~ +level This option can be used to set the required debug + level +\layout Description + +--subsystem\SpecialChar ~ + Set the portals debug subsystem \layout Subsection EXAMPLES @@ -136,6 +241,47 @@ lconf --node client config.xml \layout Standard in order to give clients, regardless of hostname, a single configuration. +\layout Standard + +Required debug levels can be set like this: +\layout LyX-Code + + +\size small + ## Everything but these +\layout LyX-Code + + +\size small +lconf --ptldebug +\begin_inset Quotes eld +\end_inset + +~(portals | malloc | trace) +\begin_inset Quotes erd +\end_inset + + +\layout LyX-Code + +\layout LyX-Code + + +\size small +## Only these debug types +\layout LyX-Code + + +\size small +lconf --ptldebug +\begin_inset Quotes eld +\end_inset + +ldlm|ha +\begin_inset Quotes erd +\end_inset + + \layout Subsection BUGS diff --git a/lustre/doc/lctl.lyx b/lustre/doc/lctl.lyx index 33b40b2..b3f3f3e 100644 --- a/lustre/doc/lctl.lyx +++ b/lustre/doc/lctl.lyx @@ -179,8 +179,14 @@ on a device name. devno \emph default option is used as above. 
-\layout LyX-Code +\layout Description + +--ignore_errors\SpecialChar ~ +|\SpecialChar ~ +ignore_errors Ignore errors during script processing +\layout Description +dump Save ioctls to a file \layout LyX-Code \layout Description @@ -276,6 +282,15 @@ send_mem\SpecialChar ~ nagle\SpecialChar ~ [on/off] Enable/disable nagle; omitting the argument will cause the default value to be printed. +\layout Description + +fail\SpecialChar ~ +nid|all\SpecialChar ~ +[count] Fail/restore communications. + Omitting the count implies fail indefinitely, count of zero indicates that + communication should be restored. + A non-zero count indicates the number of portals messages to be dropped + after which the communication is restored. \end_deeper \layout Description @@ -297,6 +312,9 @@ device This will select the specified OBD device. \layout Description device_list Show all the devices. +\layout Description + +lustre_build_version Print the Lustre build version. \end_deeper \layout Description @@ -340,13 +358,13 @@ detach Remove driver (and name and UUID) from the current device. lov_setconfig\SpecialChar ~ lov-uuid\SpecialChar ~ -default-stripe-count\SpecialChar ~ +stripe-count\SpecialChar ~ default-stripe-size\SpecialChar ~ offset\SpecialChar ~ pattern\SpecialChar ~ UUID1\SpecialChar ~ -[U -UID2...] Write LOV configuration to an MDS device. +[UUID2...] + Write LOV configuration to an MDS device. \layout Description lov_getconfig\SpecialChar ~ @@ -371,6 +389,12 @@ probe\SpecialChar ~ close \emph on +\emph default +Close the +\emph on + +\emph default +connection handle \layout Description getattr\SpecialChar ~ @@ -405,7 +429,18 @@ create\SpecialChar ~ \layout Description destroy\SpecialChar ~ - Destroy an OST object. +\SpecialChar ~ +starting\SpecialChar ~ +at\SpecialChar ~ + Destroy < +\emph on +num +\emph default +> number of objects starting from the object with object id < +\emph on +objid +\emph default +>. 
\layout Description test_getattr\SpecialChar ~ @@ -476,9 +511,45 @@ ldlm_regress_stop Stop lock manager stress test. dump_ldlm Dump all lock manager state, this is very useful for debugging \layout Description -newconn\SpecialChar ~ -\SpecialChar ~ -[newuuid] +activate Activate an import +\layout Description + +deactivate De-activate an import +\layout Description + +recover\SpecialChar ~ + +\layout Description + +lookup\SpecialChar ~ +\SpecialChar ~ + +\layout Description + +notransno Disable sending of committed transnumber updates +\layout Description + +readonly Disable writes to the underlying device +\layout Description + +abort_recovery Abort recovery on MDS device +\layout Description + +mount_option Dump mount options to a file +\layout Description + +get_stripe show stripe info for an echo client object. +\layout Description + +set_stripe\SpecialChar ~ +[\SpecialChar ~ +width!count[@offset]\SpecialChar ~ +[:id:id....] set stripe info for an echo + client +\layout Description + +unset_stripe\SpecialChar ~ + unset stripe info for an echo client object. \end_deeper \layout Description @@ -486,6 +557,9 @@ Debug \begin_deeper \layout Description +debug_daemon debug daemon control and dump to a file +\layout Description + debug_kernel\SpecialChar ~ [file]\SpecialChar ~ [raw] Get debug buffer and dump to a diff --git a/lustre/doc/lmc.lyx b/lustre/doc/lmc.lyx index 2cbcdc0..7a90023 100644 --- a/lustre/doc/lmc.lyx +++ b/lustre/doc/lmc.lyx @@ -142,6 +142,44 @@ To generate configuration data associated with systems in a Lustre cluster: - \emph default -add\SpecialChar ~ +node Adds a new node in the cluster configuration. + +\begin_deeper +\layout Standard + +The arguments required are: +\layout Description + +--node\SpecialChar ~ +''node_name'' This will create a new node with the given name if not + already present. 
+\layout Description + +--timeout\SpecialChar ~ + Timeout before going into recovery +\layout Description + +--lustre_upcall\SpecialChar ~ + Set the location of the Lustre upcall scripts used + by the client for recovery +\layout Description + +--portals_upcall\SpecialChar ~ + Specify the location of the Portals upcall scripts + used by the client for recovery +\layout Description + +--upcall\SpecialChar ~ + Specify the location of both (Lustre and Portals) upcall + scripts used by the client for recovery +\end_deeper +\layout Description + + +\emph on +- +\emph default +-add\SpecialChar ~ net Adds a network device descriptor for the given node, with parameters as indicated. \begin_deeper @@ -159,7 +197,7 @@ The arguments required are: --nettype\SpecialChar ~ This can be \series bold -tcp, elan, gm. +tcp, elan, gm, scimac. \layout Description --nid\SpecialChar ~ @@ -193,6 +231,9 @@ client configuration. \layout Description +--hostaddr addr +\layout Description + --router Optional flag to mark this node as a router \layout Description @@ -210,13 +251,26 @@ profiles \layout Description --port\SpecialChar ~ -[port] Optional argument to indicate the tcp port. +[port] Optional argument to indicate the tcp port. The default is 988. \layout Description --tcpbuf\SpecialChar ~ - Optional argument. + Optional argument. + The default TCP buffer size is 1MB. +\layout Description + +--irq_affinity\SpecialChar ~ +0|1 Optional argument. + Default is 0. +\layout Description + +--nid_exchange\SpecialChar ~ +0|1 Optional argument since some OSTs might not have the + required support. + This is turned off by default, value of 1 will turn it ON. 
+ \end_deeper \layout Description @@ -225,6 +279,11 @@ mds \begin_deeper \layout Description +--node\SpecialChar ~ + Name of the node on which the MDS resides +\layout Description + --mds\SpecialChar ~ \layout Description @@ -235,8 +294,8 @@ mds \layout Description --size\SpecialChar ~ - Optional argument indicating the size of the device to be created - (used typically for loop devices). + Optional argument indicating the size of the device to be + created (used typically for loop devices). \layout Description --node\SpecialChar ~ @@ -246,6 +305,23 @@ mds --node \emph default argument, and it must not be a profile node. +\layout Description + +--fstype\SpecialChar ~ +extN|ext3 Optional argument used to specify the file system type. + Default is ext3. +\layout Description + +--journal_size\SpecialChar ~ + Optional argument to specify the journal size for + the ext2/ext3 file system. + The size should be in the units expected by +\series bold +mkfs +\series default +, so for ext3 it should be in MB. + If this option is not used, the ext2/ext3 filesystem will be configured + with the default journal size. \end_deeper \layout Description @@ -272,11 +348,13 @@ mds_name \layout Description --stripe_cnt\SpecialChar ~ - + A value of 0 for this means to stripe on all available + OSTs. + Default is 0. \layout Description --stripe_pattern\SpecialChar ~ - Pattern can be 0. + Only Pattern 0 (RAID 0) is supported currently. \end_deeper \layout Description @@ -286,8 +364,8 @@ ost Creates an OBD, OST, and OSC. \begin_deeper \layout Description ---obd\SpecialChar ~ - Assign a name to the OBD device. +--ost\SpecialChar ~ + Assign a name to the OST device. \layout Description --node\SpecialChar ~ @@ -305,18 +383,36 @@ ost Creates an OBD, OST, and OSC. [size] \layout Description +--osdtype\SpecialChar ~ +obdfilter|obdecho +\layout Description + --lov\SpecialChar ~ - Name of LOV to which this OSC will be attached. + Optional argument. 
+ Name of LOV to which this OSC will be attached. \layout Description ---obduuid\SpecialChar ~ -UUID Specify the UUID of the OBD device. - The default value is -\emph on -OBD_nodename_UUID -\emph default -. +--ostuuid\SpecialChar ~ +UUID Specify the UUID of the OST device. + +\layout Description + +--fstype\SpecialChar ~ +extN|ext3 Optional argument used to specify the file system type. + Default is ext3. +\layout Description + +--journal_size\SpecialChar ~ + Optional argument to specify the journal size for + the ext2/ext3 file system. + The size should be in the units expected by +\series bold +mkfs +\series default +, so for ext3 it should be in MB. + If this option is not used, the ext2/ext3 filesystem will be configured + with the default journal size. \end_deeper \layout Description @@ -343,8 +439,8 @@ mtpt mds_name \layout Description ---obd\SpecialChar ~ -obd_name\SpecialChar ~ +--ost\SpecialChar ~ +ost_name\SpecialChar ~ |\SpecialChar ~ --lov\SpecialChar ~ lov_name @@ -396,7 +492,8 @@ nid. \layout Description --add\SpecialChar ~ -echo-client Used for testing purpose only +echo-client Used for testing purpose only. + \begin_deeper \layout Description diff --git a/lustre/extN/Makefile.am b/lustre/extN/Makefile.am deleted file mode 100644 index d1de59b..0000000 --- a/lustre/extN/Makefile.am +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (C) 2001 Cluster File Systems, Inc. -# -# This code is issued under the GNU General Public License. -# See the file COPYING in this distribution - -DEFS=-DEXPORT_SYMTAB -MODULE = extN -modulefs_DATA = extN.o -EXTRA_PROGRAMS = extN - -# NOTE: If you are not using a RedHat 12.5 or later kernel, then you need to -# apply the "fixes" patch first, as it fixes a number of bugs in ext3. -# It will be applied automatically by the extN build process, or you -# can apply it to the source kernel tree and fix ext3 also. For chaos22 -# (or other RH < 12.5 kernels) use the "chaos22" patch instead. 
-EXTN_FIXES = patch-2.4.18-chaos22 -#EXTN_FIXES = ext3-2.4.18-fixes.diff -EXTNP = htree-ext3-2.4.18.diff linux-2.4.18ea-0.8.26.diff ext3-2.4-ino_t.diff -EXTNP+= ext3-2.4.18-ino_sb_macro.diff extN-misc-fixup.diff extN-noread.diff -EXTNP+= extN-wantedi.diff extN-san.diff extN-2.4.18-ino_sb_fixup.diff -#EXTNP+= extN-iget-debug.diff -EXTNC = balloc.c bitmap.c dir.c file.c fsync.c ialloc.c inode.c ioctl.c -EXTNC+= namei.c super.c symlink.c -EXTNI = extN_fs.h extN_fs_i.h extN_fs_sb.h extN_jbd.h quotaops.h -EXTN_EXTRA = include/linux/xattr.h include/linux/extN_xattr.h fs/extN/xattr.c -EXTN_EXTRA += include/linux/quotaops.h -extN_SOURCES = $(EXTNC) xattr.c # punch.c -extN_DEPENDENCIES = patch-stamp -EXTRA_DIST = $(EXTNP) $(EXTN_FIXES) \ - ext3-largefile.diff extN-2.4.18-exports.diff \ - ext3-use-after-free.diff ext3-unmount_sync.diff $(wildcard extN.patch-*) -DISTCLEANFILES = -r $(extN_SOURCES) sed-stamp patch-stamp *.orig *.rej -SUB=-e "s/ext3/extN/g" -e "s/EXT3/EXTN/g" -e "s/extern __inline__/static inline/" - -distclean: - cd .. && rm -f $(EXTN_EXTRA) - -include $(top_srcdir)/Rules - -# Following 2 vars are for buildind outside the source tree. -extN_orig = $(top_builddir)/$(subdir)/extN.orig -extN_include_orig = $(top_builddir)/$(subdir)/extN-include.orig - -# Create a fresh extN patch. -# This is for when the patch-stamp target fails for your kernel. -# Just edit the files until you like them, then do `make diff', and -# it will create a specialized patch for your particular kernel. -# Check it in, and the build should work for you without disrupting -# the other developers. -# Of course, the ideal is to merge changes so that the default patch -# set works for nearly everybody. This is mainly for damage control. - -diff: - $(RM) extN.patchT - l='$(EXTNC)'; for f in $$l; do \ - echo "$$f"; \ - (diff -u $(extN_orig)/$$f extN/$$f) >> extN.patchT; \ - test $$? 
-le 1 || exit 1; \ - done - l='$(EXTNI)'; for f in $$l; do \ - echo "$$f"; \ - (diff -u $(extN_include_orig)/$$f $(top_srcdir)/include/linux/$$f)>>extN.patchT;\ - test $$? -le 1 || exit 1; \ - done - l='$(EXTN_EXTRA)'; for f in $$l; do \ - f=`echo "$$f" | sed 's%^fs/%%'`; \ - echo "$$f"; \ - (cd $(top_srcdir) && \ - diff -u /dev/null $$f) >> extN.patchT; \ - test $$? -le 1 || exit 1; \ - done - mv -f extN.patchT $(top_builddir)/$(subdir)/extN.patch-$(RELEASE) - echo "Don't forget to add $(srcdir)/extN.patch-$(RELEASE) to CVS!" - -.PHONY: diff - -# Just do the SUB transformation on all our source files. -sed-stamp: - $(RM) $@ - rm -rf $(extN_orig) $(extN_include_orig) - mkdir $(extN_orig) $(extN_include_orig) - list='$(EXTNC)'; for f in $$list; do \ - echo "creating $(extN_orig)/$$f"; \ - sed $(SUB) $(LINUX)/fs/ext3/$$f > $(extN_orig)/$$f; \ - done - list='$(EXTNI)'; for i in $$list; do \ - s=`echo $$i | sed "s/extN/ext3/"`; \ - echo "creating $(extN_include_orig)/$$i"; \ - sed $(SUB) $(LINUX)/include/linux/$$s > $(extN_include_orig)/$$i; \ - done - echo timestamp > $@ - - -# Patch the kernel files with our ext3 patches. We need to go through some -# extra hoops because the include files are in a different tree and because -# patch likes to make local copies of files with (sym)links when it is patching -# them. To avoid this, we copy/patch in the source dir instead of the build -# dir (if they are different). -# We also want to preserve the pristine transformed files for the diff target. - - - -patch-stamp: sed-stamp $(EXTNP) - test -e $(top_builddir)/include/linux || mkdir -p $(top_builddir)/include/linux - cp -a $(extN_orig)/* $(top_builddir)/$(subdir) - cp -a $(extN_include_orig)/* $(top_builddir)/include/linux - test -e $(top_builddir)/fs || ln -s . 
$(top_builddir)/fs - list='$(EXTN_EXTRA)'; for f in $$list; do $(RM) $(top_builddir)/$$f; done - if [ -f $(srcdir)/extN.patch-$(RELEASE) ]; then \ - echo "applying patch $(srcdir)/extN.patch-$(RELEASE)"; \ - (cd $(top_builddir) && patch -p0) < $(srcdir)/extN.patch-$(RELEASE);\ - else \ - list='$(EXTNP)'; \ - grep -q "err = extN_mark_inode_dirty" $(extN_orig)/namei.c || \ - list="ext3-use-after-free.diff $$list"; \ - sed '/i_version/q' $(extN_orig)/namei.c | tail -2 | \ - grep -q extN_mark_inode_dirty && list="$(EXTN_FIXES) $$list"; \ - grep -q "if (do_sync_supers)" $(extN_orig)/super.c && \ - list="ext3-unmount_sync.diff $$list"; \ - grep -q "ext3_journal_start(inode, 2)" $(extN_orig)/inode.c || \ - list="ext3-largefile.diff $$list"; \ - grep -q "EXPORT_SYMBOL(extN_bread)" $(extN_orig)/super.c || \ - list="$$list extN-2.4.18-exports.diff"; \ - for p in $$list; do \ - echo "applying patch $$p"; \ - sed $(SUB) $(srcdir)/$$p | \ - (cd $(top_builddir) && patch -p1) || exit $$?; \ - done; \ - fi - echo timestamp > $@ - - - - -$(extN_SOURCES) $(EXTNI) $(EXTN_EXTRA): patch-stamp - -# Don't distribute any patched files. -dist-hook: - $(RM) $(top_srcdir)/fs - list='$(EXTNC)'; for f in $$list; do $(RM) $(distdir)/$$f; done - list='$(EXTNI)'; for i in $$list; do \ - $(RM) $(distdir)/../include/linux/$$i; \ - done - list='$(EXTN_EXTRA)'; for f in $$list; do $(RM) $(distdir)/../$$f; done diff --git a/lustre/extN/ext3-largefile.diff b/lustre/extN/ext3-largefile.diff deleted file mode 100644 index db41aab..0000000 --- a/lustre/extN/ext3-largefile.diff +++ /dev/null @@ -1,23 +0,0 @@ -Under rare conditions (filesystem corruption, really) it is possible -for ext3_dirty_inode() to require _two_ blocks for the transaction: one -for the inode and one to update the superblock - to set -EXT3_FEATURE_RO_COMPAT_LARGE_FILE. This causes the filesystem to go -BUG. - -So reserve an additional block for that eventuality. 
- - - fs/ext3/inode.c | 2 +- - 1 files changed, 1 insertion(+), 1 deletion(-) - ---- 25/fs/ext3/inode.c~ext3-transaction-reserved-blocks Sat Dec 14 18:28:21 2002 -+++ 25-akpm/fs/ext3/inode.c Sat Dec 14 18:28:21 2002 -@@ -2698,7 +2698,7 @@ void ext3_dirty_inode(struct inode *inod - handle_t *handle; - - lock_kernel(); -- handle = ext3_journal_start(inode, 1); -+ handle = ext3_journal_start(inode, 2); - if (IS_ERR(handle)) - goto out; - if (current_handle && diff --git a/lustre/extN/ext3-unmount_sync.diff b/lustre/extN/ext3-unmount_sync.diff deleted file mode 100644 index 1f9b796..0000000 --- a/lustre/extN/ext3-unmount_sync.diff +++ /dev/null @@ -1,59 +0,0 @@ -From adilger@clusterfs.com Mon Dec 2 10:26:44 2002 -Date: Mon, 2 Dec 2002 10:26:44 -0700 -From: Andreas Dilger -To: Lustre LLNL Mailing list , - Lustre Development Mailing List -Subject: Re: data corrupting bug in 2.4.20 ext3, data=journal -Message-ID: <20021202102644.H1422@schatzie.adilger.int> -Mail-Followup-To: Lustre LLNL Mailing list , - Lustre Development Mailing List -Mime-Version: 1.0 -Content-Type: text/plain; charset=us-ascii -Content-Disposition: inline -User-Agent: Mutt/1.2.5.1i -X-GPG-Key: 1024D/0D35BED6 -X-GPG-Fingerprint: 7A37 5D79 BF1B CECA D44F 8A29 A488 39F5 0D35 BED6 -Status: RO -Content-Length: 1160 -Lines: 39 - -Here is the new-improved fix for the ext3 discarding data at umount bug -discovered late last week. To be used instead of the previous ext3 fix. - -Sadly, this is completely unrelated to the problems Mike is having with -ext3 under UML, since it is an unmount-time problem. - ------ Forwarded message from "Stephen C. Tweedie" ----- -The attached patch seems to fix things for me. 
- -Cheers, - Stephen - - ---- linux-2.4-ext3merge/fs/ext3/super.c.=K0027=.orig 2002-12-02 15:35:13.000000000 +0000 -+++ linux-2.4-ext3merge/fs/ext3/super.c 2002-12-02 15:35:14.000000000 +0000 -@@ -1640,7 +1640,12 @@ - sb->s_dirt = 0; - target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); - -- if (do_sync_supers) { -+ /* -+ * Tricky --- if we are unmounting, the write really does need -+ * to be synchronous. We can detect that by looking for NULL in -+ * sb->s_root. -+ */ -+ if (do_sync_supers || !sb->s_root) { - unlock_super(sb); - log_wait_commit(EXT3_SB(sb)->s_journal, target); - lock_super(sb); - - ------ End forwarded message ----- - -Cheers, Andreas --- -Andreas Dilger -http://sourceforge.net/projects/ext2resize/ -http://www-mddsp.enel.ucalgary.ca/People/adilger/ - - diff --git a/lustre/extN/extN-2.4.18-exports.diff b/lustre/extN/extN-2.4.18-exports.diff deleted file mode 100644 index 8780209..0000000 --- a/lustre/extN/extN-2.4.18-exports.diff +++ /dev/null @@ -1,11 +0,0 @@ ---- linux-2.4.17/fs/extN/super.c.orig Fri Dec 21 10:41:55 2001 -+++ linux-2.4.17/fs/extN/super.c Fri Mar 22 11:00:41 2002 -@@ -1742,7 +1742,7 @@ - unregister_filesystem(&extN_fs_type); - } - --EXPORT_NO_SYMBOLS; -+EXPORT_SYMBOL(extN_bread); - - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); diff --git a/lustre/extN/extN-2.4.18-ino_sb_fixup.diff b/lustre/extN/extN-2.4.18-ino_sb_fixup.diff deleted file mode 100644 index 37fd692..0000000 --- a/lustre/extN/extN-2.4.18-ino_sb_fixup.diff +++ /dev/null @@ -1,33 +0,0 @@ ---- ./include/linux/extN_fs.h.orig Tue May 7 17:06:03 2002 -+++ ./include/linux/extN_fs.h Tue May 7 17:07:11 2002 -@@ -17,6 +17,8 @@ - #define _LINUX_EXTN_FS_H - - #include -+#include -+#include - - /* - * The second extended filesystem constants/structures -@@ -86,8 +88,8 @@ - #define EXTN_MIN_BLOCK_LOG_SIZE 10 - - #ifdef __KERNEL__ --#define 
EXTN_SB(sb) (&((sb)->u.extN_sb)) --#define EXTN_I(inode) (&((inode)->u.extN_i)) -+#define EXTN_SB(sb) ((struct extN_sb_info *)&((sb)->u.generic_sbp)) -+#define EXTN_I(inode) ((struct extN_inode_info *)&((inode)->u.generic_ip)) - - #define EXTN_BLOCK_SIZE(s) ((s)->s_blocksize) - #define EXTN_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) -@@ -447,7 +447,9 @@ - #define NEXT_ORPHAN(inode) EXTN_I(inode)->i_dtime - static inline struct inode *orphan_list_entry(struct list_head *l) - { -- return list_entry(l, struct inode, u.extN_i.i_orphan); -+ return ((struct inode *)((char *)l - -+ (unsigned long)(offsetof(struct inode, u.generic_ip) + -+ offsetof(struct extN_inode_info, i_orphan)))); - } - - /* diff --git a/lustre/extN/extN-san.diff b/lustre/extN/extN-san.diff deleted file mode 100644 index 4d0f277..0000000 --- a/lustre/extN/extN-san.diff +++ /dev/null @@ -1,88 +0,0 @@ ---- lustre/extN/inode.orig.c 2002-12-29 18:48:56.000000000 +0800 -+++ lustre/extN/inode.c 2002-12-29 19:17:24.000000000 +0800 -@@ -2728,3 +2728,85 @@ - * here, in extN_aops_journal_start() to ensure that the forthcoming "see if we - * need to extend" test in extN_prepare_write() succeeds. - */ -+ -+/* for each block: 1 ind + 1 dind + 1 tind -+ * for each block: 3 bitmap blocks -+ * for each block: 3 group descriptor blocks -+ * i inode block -+ * 1 superblock -+ * 2 * EXTN_SINGLEDATA_TRANS_BLOCKS for the quote files -+ * ((1+1+1) * 3 * nblocks) + 1 + 1 + 2 * EXTN_SINGLEDATA_TRANS_BLOCKS -+ * -+ * XXX assuming: -+ * (1) fs logic block size == page size -+ * (2) extN in writeback mode -+ */ -+static inline int extN_san_write_trans_blocks(int nblocks) -+{ -+ int ret; -+ -+ ret = (1 + 1 + 1) * 3 * nblocks + 1 + 1; -+ -+#ifdef CONFIG_QUOTA -+ ret += 2 * EXTN_SINGLEDATA_TRANS_BLOCKS; -+#endif -+ -+ return ret; -+} -+ -+/* Alloc blocks for an inode, while don't create any buffer/page -+ * for data I/O; set the inode size if file is extended. 
-+ * -+ * @inode: target inode -+ * @blocks: array of logic block number -+ * @nblocks: how many blocks need be alloced -+ * @newsize: new filesize we should set -+ * -+ * return: 0 success, otherwise failed -+ * (*blocks) contains physical block number alloced -+ * -+ * XXX this assume the fs block size == page size -+ */ -+int extN_prep_san_write(struct inode *inode, long *blocks, -+ int nblocks, loff_t newsize) -+{ -+ handle_t *handle; -+ struct buffer_head bh_tmp; -+ int needed_blocks; -+ int i, ret = 0, ret2; -+ -+ needed_blocks = extN_san_write_trans_blocks(nblocks); -+ -+ lock_kernel(); -+ handle = extN_journal_start(inode, needed_blocks); -+ if (IS_ERR(handle)) { -+ unlock_kernel(); -+ return PTR_ERR(handle); -+ } -+ unlock_kernel(); -+ -+ /* alloc blocks one by one */ -+ for (i = 0; i < nblocks; i++) { -+ ret = extN_get_block_handle(handle, inode, blocks[i], -+ &bh_tmp, 1); -+ if (ret) -+ break; -+ -+ blocks[i] = bh_tmp.b_blocknr; -+ } -+ -+ /* set inode size if needed */ -+ if (!ret && (newsize > inode->i_size)) { -+ inode->i_size = newsize; -+ extN_mark_inode_dirty(handle, inode); -+ } -+ -+ lock_kernel(); -+ ret2 = extN_journal_stop(handle, inode); -+ unlock_kernel(); -+ -+ if (!ret) -+ ret = ret2; -+ return ret; -+} -+EXPORT_SYMBOL(extN_prep_san_write); diff --git a/lustre/extN/extN-wantedi.diff b/lustre/extN/extN-wantedi.diff deleted file mode 100644 index a55aec0..0000000 --- a/lustre/extN/extN-wantedi.diff +++ /dev/null @@ -1,163 +0,0 @@ ---- lustre/extN-clean/namei.c 2002-12-30 05:56:09.000000000 -0500 -+++ lustre/extN/namei.c 2002-12-30 06:29:39.000000000 -0500 -@@ -1224,7 +1224,8 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = extN_new_inode (handle, dir, mode); -+ inode = extN_new_inode (handle, dir, mode, -+ (unsigned long)dentry->d_fsdata); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &extN_file_inode_operations; -@@ -1254,7 +1254,8 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = extN_new_inode (handle, 
dir, mode); -+ inode = extN_new_inode (handle, dir, mode, -+ (unsigned long)dentry->d_fsdata); - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, mode, rdev); -@@ -1286,7 +1286,8 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = extN_new_inode (handle, dir, S_IFDIR | mode); -+ inode = extN_new_inode (handle, dir, S_IFDIR | mode, -+ (unsigned long)dentry->d_fsdata); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; -@@ -1680,7 +1681,8 @@ - if (IS_SYNC(dir)) - handle->h_sync = 1; - -- inode = extN_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); -+ inode = extN_new_inode (handle, dir, S_IFLNK|S_IRWXUGO, -+ (unsigned long)dentry->d_fsdata); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; ---- lustre/extN-clean/ialloc.c 2002-12-28 23:56:42.000000000 -0500 -+++ lustre/extN/ialloc.c 2002-12-30 06:29:39.000000000 -0500 -@@ -329,8 +329,8 @@ - * For other inodes, search forward from the parent directory's block - * group to find a free inode. - */ --struct inode * extN_new_inode (handle_t *handle, -- const struct inode * dir, int mode) -+struct inode *extN_new_inode(handle_t *handle, const struct inode *dir, -+ int mode, unsigned long goal) - { - struct super_block * sb; - struct buffer_head * bh; -@@ -360,6 +361,38 @@ - - lock_super (sb); - es = sbi->s_es; -+ -+ if (goal) { -+ i = (goal - 1) / EXTN_INODES_PER_GROUP(sb); -+ j = (goal - 1) % EXTN_INODES_PER_GROUP(sb); -+ gdp = extN_get_group_desc(sb, i, &bh2); -+ -+ bitmap_nr = load_inode_bitmap (sb, i); -+ if (bitmap_nr < 0) -+ goto fail; -+ -+ bh = sbi->s_inode_bitmap[bitmap_nr]; -+ -+ BUFFER_TRACE(bh, "get_write_access"); -+ err = extN_journal_get_write_access(handle, bh); -+ if (err) goto fail; -+ -+ if (extN_set_bit(j, bh->b_data)) { -+ printk(KERN_ERR "goal inode %lu unavailable\n", goal); -+ /* Oh well, we tried. 
*/ -+ goto repeat; -+ } -+ -+ BUFFER_TRACE(bh, "call extN_journal_dirty_metadata"); -+ err = extN_journal_dirty_metadata(handle, bh); -+ if (err) goto fail; -+ -+ /* We've shortcircuited the allocation system successfully, -+ * now finish filling in the inode. -+ */ -+ goto have_bit_and_group; -+ } -+ - repeat: - gdp = NULL; - i = 0; -@@ -474,6 +509,7 @@ - } - goto repeat; - } -+have_bit_and_group: - j += i * sbi->s_inodes_per_group + 1; - if (j < sbi->s_first_ino || j > le32_to_cpu(es->s_inodes_count)) { - extN_error (sb, "extN_new_inode", ---- lustre/extN-clean/ioctl.c 2002-12-28 23:56:42.000000000 -0500 -+++ lustre/extN/ioctl.c 2002-12-30 06:29:39.000000000 -0500 -@@ -24,6 +24,31 @@ - extN_debug ("cmd = %u, arg = %lu\n", cmd, arg); - - switch (cmd) { -+ case EXTN_IOC_CREATE_INUM: { -+ char name[32]; -+ struct dentry *dchild, *dparent; -+ int rc = 0; -+ -+ dparent = list_entry(inode->i_dentry.next, struct dentry, -+ d_alias); -+ snprintf(name, sizeof name, "%lu", arg); -+ dchild = lookup_one_len(name, dparent, strlen(name)); -+ if (dchild->d_inode) { -+ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n", -+ dparent->d_name.len, dparent->d_name.name, arg, -+ dchild->d_inode->i_ino); -+ rc = -EEXIST; -+ } else { -+ dchild->d_fsdata = (void *)arg; -+ rc = vfs_create(inode, dchild, 0644); -+ if (rc) -+ printk(KERN_ERR "vfs_create: %d\n", rc); -+ else if (dchild->d_inode->i_ino != arg) -+ rc = -EEXIST; -+ } -+ dput(dchild); -+ return rc; -+ } - case EXTN_IOC_GETFLAGS: - flags = ei->i_flags & EXTN_FL_USER_VISIBLE; - return put_user(flags, (int *) arg); ---- lustre/include/linux/extN_fs.h~ 2002-12-30 06:01:43.000000000 -0500 -+++ lustre/include/linux/extN_fs.h 2002-12-30 06:02:51.000000000 -0500 -@@ -200,6 +200,7 @@ - #define EXTN_IOC_SETFLAGS _IOW('f', 2, long) - #define EXTN_IOC_GETVERSION _IOR('f', 3, long) - #define EXTN_IOC_SETVERSION _IOW('f', 4, long) -+/* EXTN_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ - #define EXTN_IOC_GETVERSION_OLD _IOR('v', 1, long) - #define EXTN_IOC_SETVERSION_OLD _IOW('v', 2, long) - #ifdef CONFIG_JBD_DEBUG -@@ -632,7 +633,8 @@ - extern int extN_sync_file (struct file *, struct dentry *, int); - - /* ialloc.c */ --extern struct inode * extN_new_inode (handle_t *, const struct inode *, int); -+extern struct inode * extN_new_inode (handle_t *, const struct inode *, int, -+ unsigned long); - extern void extN_free_inode (handle_t *, struct inode *); - extern struct inode * extN_orphan_get (struct super_block *, ino_t); - extern unsigned long extN_count_free_inodes (struct super_block *); -@@ -714,4 +716,6 @@ - - #endif /* __KERNEL__ */ - -+#define EXTN_IOC_CREATE_INUM _IOW('f', 5, long) -+ - #endif /* _LINUX_EXTN_FS_H */ diff --git a/lustre/include/.cvsignore b/lustre/include/.cvsignore index 864df96..7b78c04 100644 --- a/lustre/include/.cvsignore +++ b/lustre/include/.cvsignore @@ -4,6 +4,7 @@ config.status configure config.h stamp-h +stamp-h1 stamp-h.in Makefile Makefile.in diff --git a/lustre/include/ioctl.h b/lustre/include/ioctl.h new file mode 100644 index 0000000..a4ec8a5 --- /dev/null +++ b/lustre/include/ioctl.h @@ -0,0 +1,64 @@ +#ifndef _ASMI386_IOCTL_H +#define _ASMI386_IOCTL_H + +/* ioctl command encoding: 32 bits total, command in lower 16 bits, + * size of the parameter structure in the lower 14 bits of the + * upper 16 bits. + * Encoding the size of the parameter structure in the ioctl request + * The highest 2 bits are reserved for indicating the ``access mode''. + * NOTE: This limits the max parameter size to 16kB -1 ! + */ + +/* + * The following is for compatibility across the various Linux + * platforms. The i386 ioctl numbering scheme doesn't really enforce + * a type field. De facto, however, the top 8 bits of the lower 16 + * bits are indeed used as a type field, so we might just as well make + * this explicit here. Please be sure to use the decoding macros + * below from now on. 
+ */ +#define _IOC_NRBITS 8 +#define _IOC_TYPEBITS 8 +#define _IOC_SIZEBITS 14 +#define _IOC_DIRBITS 2 + +#define _IOC_NRMASK ((1 << _IOC_NRBITS)-1) +#define _IOC_TYPEMASK ((1 << _IOC_TYPEBITS)-1) +#define _IOC_SIZEMASK ((1 << _IOC_SIZEBITS)-1) +#define _IOC_DIRMASK ((1 << _IOC_DIRBITS)-1) + +#define _IOC_NRSHIFT 0 +#define _IOC_TYPESHIFT (_IOC_NRSHIFT+_IOC_NRBITS) +#define _IOC_SIZESHIFT (_IOC_TYPESHIFT+_IOC_TYPEBITS) +#define _IOC_DIRSHIFT (_IOC_SIZESHIFT+_IOC_SIZEBITS) + +/* + * Direction bits. + */ +#define _IOC_NONE 0U +#define _IOC_WRITE 1U +#define _IOC_READ 2U + +#define _IOC(dir,type,nr,size) (((dir) << _IOC_DIRSHIFT) | ((type) << _IOC_TYPESHIFT) | ((nr) << _IOC_NRSHIFT) | ((size) << _IOC_SIZESHIFT)) + +/* used to create numbers */ +#define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0) +#define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),sizeof(size)) +#define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),sizeof(size)) +#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size)) + +/* used to decode ioctl numbers.. */ +#define _IOC_DIR(nr) (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK) +#define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK) +#define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK) +#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK) + +/* ...and for the drivers/sound files... 
*/ + +#define IOC_IN (_IOC_WRITE << _IOC_DIRSHIFT) +#define IOC_OUT (_IOC_READ << _IOC_DIRSHIFT) +#define IOC_INOUT ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT) +#define IOCSIZE_MASK (_IOC_SIZEMASK << _IOC_SIZESHIFT) +#define IOCSIZE_SHIFT (_IOC_SIZESHIFT) + +#endif /* _ASMI386_IOCTL_H */ diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index 1e57ea4..017d5b6 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -25,10 +25,15 @@ #define LIBLUSTRE_H__ #include +#ifndef __CYGWIN__ +#include #include +#else +#include +#include "ioctl.h" +#endif #include #include -#include #include #include #include @@ -37,12 +42,24 @@ #include #include +#include /* definitions for liblustre */ +#ifdef __CYGWIN__ + +#define PAGE_SHIFT 12 +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) +#define loff_t __u64 +#define ERESTART 2001 +typedef unsigned short umode_t; + +#endif + /* always adopt 2.5 definitions */ -#define LINUX_VERSION_CODE 1 -#define KERNEL_VERSION(a,b,c) 0 +#define KERNEL_VERSION(a,b,c) ((a)*100+(b)*10+c) +#define LINUX_VERSION_CODE (2*200+5*10+0) static inline void inter_module_put(void *a) { @@ -51,16 +68,7 @@ static inline void inter_module_put(void *a) extern ptl_handle_ni_t tcpnal_ni; -static inline void *inter_module_get(char *arg) -{ - - if (strcmp(arg, "tcpnal_ni") == 0 ) - return &tcpnal_ni; - else - return NULL; - -} - +void *inter_module_get(char *arg); /* cheats for now */ @@ -108,6 +116,93 @@ typedef void *read_proc_t; typedef void *write_proc_t; +/* byteorder */ +#define __swab16(x) \ +({ \ + __u16 __x = (x); \ + ((__u16)( \ + (((__u16)(__x) & (__u16)0x00ffU) << 8) | \ + (((__u16)(__x) & (__u16)0xff00U) >> 8) )); \ +}) + +#define __swab32(x) \ +({ \ + __u32 __x = (x); \ + ((__u32)( \ + (((__u32)(__x) & (__u32)0x000000ffUL) << 24) | \ + (((__u32)(__x) & (__u32)0x0000ff00UL) << 8) | \ + (((__u32)(__x) & (__u32)0x00ff0000UL) >> 8) | \ + (((__u32)(__x) & (__u32)0xff000000UL) >> 24) )); \ +}) + 
+#define __swab64(x) \ +({ \ + __u64 __x = (x); \ + ((__u64)( \ + (__u64)(((__u64)(__x) & (__u64)0x00000000000000ffULL) << 56) | \ + (__u64)(((__u64)(__x) & (__u64)0x000000000000ff00ULL) << 40) | \ + (__u64)(((__u64)(__x) & (__u64)0x0000000000ff0000ULL) << 24) | \ + (__u64)(((__u64)(__x) & (__u64)0x00000000ff000000ULL) << 8) | \ + (__u64)(((__u64)(__x) & (__u64)0x000000ff00000000ULL) >> 8) | \ + (__u64)(((__u64)(__x) & (__u64)0x0000ff0000000000ULL) >> 24) | \ + (__u64)(((__u64)(__x) & (__u64)0x00ff000000000000ULL) >> 40) | \ + (__u64)(((__u64)(__x) & (__u64)0xff00000000000000ULL) >> 56) )); \ +}) + +#define __swab16s(x) __swab16(*(x)) +#define __swab32s(x) __swab32(*(x)) +#define __swab64s(x) __swab64(*(x)) + +#define __LITTLE_ENDIAN__ +#ifdef __LITTLE_ENDIAN__ +# define le16_to_cpu(x) ((__u16)(x)) +# define cpu_to_le16(x) ((__u16)(x)) +# define le32_to_cpu(x) ((__u32)(x)) +# define cpu_to_le32(x) ((__u32)(x)) +# define le64_to_cpu(x) ((__u64)(x)) +# define cpu_to_le64(x) ((__u64)(x)) +#else +# define le16_to_cpu(x) __swab16(x) +# define cpu_to_le16(x) __swab16(x) +# define le32_to_cpu(x) __swab32(x) +# define cpu_to_le32(x) __swab32(x) +# define le64_to_cpu(x) __swab64(x) +# define cpu_to_le64(x) __swab64(x) +# error "do more check here!!!" 
+#endif + +/* bits ops */ +static __inline__ int set_bit(int nr,long * addr) +{ + int mask, retval; + + addr += nr >> 5; + mask = 1 << (nr & 0x1f); + retval = (mask & *addr) != 0; + *addr |= mask; + return retval; +} + +static __inline__ int clear_bit(int nr, long * addr) +{ + int mask, retval; + + addr += nr >> 5; + mask = 1 << (nr & 0x1f); + retval = (mask & *addr) != 0; + *addr &= ~mask; + return retval; +} + +static __inline__ int test_bit(int nr, long * addr) +{ + int mask; + + addr += nr >> 5; + mask = 1 << (nr & 0x1f); + return ((mask & *addr) != 0); +} + /* modules */ struct module { @@ -144,6 +239,7 @@ extern int ptlrpc_init(void); extern int ldlm_init(void); extern int osc_init(void); extern int lov_init(void); +extern int mdc_init(void); extern int echo_client_init(void); @@ -168,21 +264,20 @@ static inline void spin_unlock_bh(spinlock_t *l) { return; } -static inline void spin_lock_irqrestore(a,b) +static inline void spin_unlock_irqrestore(spinlock_t *a, long b) { return; } -static inline void spin_unlock_irqrestore(a,b) -{ - return; -} -static inline void spin_lock_irqsave(a,b) +static inline void spin_lock_irqsave(spinlock_t *a, long b) { return; } #define barrier() do {int a= 1; a++; } while (0) +#define min(x,y) ((x)<(y) ? (x) : (y)) +#define max(x,y) ((x)>(y) ? 
(x) : (y)) + /* registering symbols */ #define ERESTARTSYS ERESTART @@ -192,18 +287,18 @@ static inline void spin_lock_irqsave(a,b) static inline void get_random_bytes(void *ptr, int size) { - static int r; int *p = (int *)ptr; - int *end = p + (size / sizeof(int)); - r = rand(); - while ( p + sizeof(int) < end ) { - *p = r; - p++; - } + int i, count = size/sizeof(int); + + for (i = 0; i< count; i++) + *p++ = rand(); } /* memory */ +/* FIXME */ +#define num_physpages (16 * 1024) + static inline int copy_from_user(void *a,void *b, int c) { memcpy(a,b,c); @@ -222,26 +317,35 @@ typedef struct { int size; } kmem_cache_t; #define SLAB_HWCACHE_ALIGN 0 -static inline kmem_cache_t *kmem_cache_create(name,objsize,cdum,d,e,f) +static inline kmem_cache_t * +kmem_cache_create(const char *name, size_t objsize, size_t cdum, + unsigned long d, + void (*e)(void *, kmem_cache_t *, unsigned long), + void (*f)(void *, kmem_cache_t *, unsigned long)) { kmem_cache_t *c; c = malloc(sizeof(*c)); if (!c) return NULL; c->size = objsize; + CDEBUG(D_MALLOC, "alloc slab cache %s at %p, objsize %d\n", + name, c, (int)objsize); return c; }; static inline int kmem_cache_destroy(kmem_cache_t *a) { + CDEBUG(D_MALLOC, "destroy slab cache %p, objsize %u\n", a, a->size); free(a); return 0; } #define kmem_cache_validate(a,b) 1 #define kmem_cache_alloc(cache, prio) malloc(cache->size) -#define kmem_cache_free(cache, obj) OBD_FREE(obj, cache->size) -#define PORTAL_SLAB_ALLOC(lock,cache,size) do { lock = kmem_cache_alloc(cache,prio); } while (0) -#define PORTAL_SLAB_FREE(lock,cache,size) do { lock = kmem_cache_alloc(cache,prio); } while (0) +#define kmem_cache_free(cache, obj) free(obj) + +#define PAGE_CACHE_SIZE PAGE_SIZE +#define PAGE_CACHE_SHIFT 12 +#define PAGE_CACHE_MASK PAGE_MASK struct page { void *addr; @@ -251,7 +355,7 @@ struct page { #define kmap(page) (page)->addr #define kunmap(a) do { int foo = 1; foo++; } while (0) -static inline struct page *alloc_pages(mask,foo) +static inline struct 
page *alloc_pages(int mask, unsigned long foo) { struct page *pg = malloc(sizeof(*pg)); @@ -280,29 +384,82 @@ static inline void __free_pages(struct page *pg, int what) free(pg); } +static inline struct page* __grab_cache_page(int index) +{ + struct page *pg = alloc_pages(0, 0); + + if (pg) + pg->index = index; + return pg; +} + +#define grab_cache_page(index) __grab_cache_page(index) +#define page_cache_release(page) __free_pages(page, 0) + /* arithmetic */ -#define do_div(a,b) (a)/(b) +#define do_div(a,b) \ + ({ \ + unsigned long ret; \ + ret = (a)%(b); \ + (a) = (a)/(b); \ + (ret); \ + }) + +/* VFS stuff */ +#define ATTR_MODE 1 +#define ATTR_UID 2 +#define ATTR_GID 4 +#define ATTR_SIZE 8 +#define ATTR_ATIME 16 +#define ATTR_MTIME 32 +#define ATTR_CTIME 64 +#define ATTR_ATIME_SET 128 +#define ATTR_MTIME_SET 256 +#define ATTR_FORCE 512 /* Not a change, but a change it */ +#define ATTR_ATTR_FLAG 1024 +#define ATTR_RAW 2048 /* file system, not vfs will massage attrs */ +#define ATTR_FROM_OPEN 4096 /* called from open path, ie O_TRUNC */ -/* dentries / intents */ -struct lookup_intent { - void *it_iattr; +struct iattr { + unsigned int ia_valid; + umode_t ia_mode; + uid_t ia_uid; + gid_t ia_gid; + loff_t ia_size; + time_t ia_atime; + time_t ia_mtime; + time_t ia_ctime; + unsigned int ia_attr_flags; }; -struct iattr { - int mode; +/* copy from kernel header */ +#define IT_OPEN (1) +#define IT_CREAT (1<<1) +#define IT_READDIR (1<<2) +#define IT_GETATTR (1<<3) +#define IT_LOOKUP (1<<4) +#define IT_UNLINK (1<<5) + +struct lookup_intent { + int it_op; + int it_mode; + int it_flags; + int it_disposition; + int it_status; + struct iattr *it_iattr; + __u64 it_lock_handle[2]; + int it_lock_mode; + void *it_data; }; struct dentry { int d_count; }; -struct file { - struct dentry *f_dentry; - void *private_data; -} ; struct vfsmount { void *pwd; }; + #define cpu_to_le32(x) ((__u32)(x)) /* semaphores */ @@ -327,16 +484,24 @@ struct signal { int signal; }; +struct fs_struct { + int 
umask; +}; + struct task_struct { + struct fs_struct *fs; int state; struct signal pending; char comm[32]; int pid; + int fsuid; + int fsgid; + __u32 cap_effective; }; extern struct task_struct *current; - +#define in_group_p(a) 0 /* FIXME */ #define set_current_state(foo) do { current->state = foo; } while (0) @@ -351,9 +516,10 @@ extern struct task_struct *current; #define TASK_UNINTERRUPTIBLE 1 #define TASK_RUNNING 2 +#define in_interrupt() (0) #define schedule() do { int a; a++; } while (0) -static inline int schedule_timeout(t) +static inline int schedule_timeout(signed long t) { return 0; } @@ -364,7 +530,7 @@ static inline int schedule_timeout(t) #define recalc_sigpending(l) do { int a; a++; } while (0) #define kernel_thread(l,m,n) -static inline int call_usermodehelper(char *prog, char **argv, char **evnp) +static inline int call_usermodehelper(char *prog, char **argv, char **evnp, int unknown) { return 0; } @@ -416,7 +582,11 @@ typedef struct { volatile int counter; } atomic_t; #define atomic_add(b,a) do {(a)->counter += b;} while (0) #define atomic_sub(b,a) do {(a)->counter -= b;} while (0) -#define LBUG() do { sleep(1000000); } while (0) +#define LBUG() \ + do { \ + printf("!!!LBUG at %s:%d\n", __FILE__, __LINE__); \ + sleep(1000000); \ + } while (0) #include #include diff --git a/lustre/include/linux/lprocfs_status.h b/lustre/include/linux/lprocfs_status.h index d0060fc..5ce5e98 100644 --- a/lustre/include/linux/lprocfs_status.h +++ b/lustre/include/linux/lprocfs_status.h @@ -36,7 +36,7 @@ #endif struct lprocfs_vars { - char *name; + const char *name; read_proc_t *read_fptr; write_proc_t *write_fptr; void *data; @@ -47,11 +47,121 @@ struct lprocfs_static_vars { struct lprocfs_vars *obd_vars; }; +/* Lprocfs counters are can be configured using the enum bit masks below. + * + * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already + * protects this counter from concurrent updates. 
If not specified, + * lprocfs an internal per-counter lock variable. External locks are + * not used to protect counter increments, but are used to protect + * counter readout and resets. + * + * LPROCFS_CNTR_AVGMINMAX indicates a multi-valued counter samples, + * (i.e. counter can be incremented by more than "1"). When specified, + * the counter maintains min, max and sum in addition to a simple + * invocation count. This allows averages to be be computed. + * If not specified, the counter is an increment-by-1 counter. + * min, max, sum, etc. are not maintained. + * + * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of + * squares (for multi-valued counter samples only). This allows + * external computation of standard deviation, but involves a 64-bit + * multiply per counter increment. + */ + +enum { + LPROCFS_CNTR_EXTERNALLOCK = 1, + LPROCFS_CNTR_AVGMINMAX = 2, + LPROCFS_CNTR_STDDEV = 4, +}; + +struct lprocfs_counter { + union { + spinlock_t internal; /* when there is no external lock */ + spinlock_t *external; /* external lock, when available */ + } l; + unsigned int config; + __u64 count; + __u64 sum; + __u64 min; + __u64 max; + __u64 sumsquare; + const char *name; /* must be static */ + const char *units; /* must be static */ +}; + + +struct lprocfs_counters { + unsigned int num; + unsigned int padto8byteboundary; + struct lprocfs_counter cntr[0]; +}; + + /* class_obd.c */ extern struct proc_dir_entry *proc_lustre_root; - +struct obd_device; #ifdef LPROCFS + +/* Two optimized LPROCFS counter increment macros are provided: + * LPROCFS_COUNTER_INCR(cntr, value) - use for multi-valued counters + * LPROCFS_COUNTER_INCBY1(cntr) - optimized for by-one counters + * Counter data layout allows config flag, counter lock and the + * count itself to reside within a single cache line. 
+ */ + +#define LPROCFS_COUNTER_INCR(cntr, value) \ + do { \ + struct lprocfs_counter *c = (cntr); \ + LASSERT(c != NULL); \ + if (!(c->config & LPROCFS_CNTR_EXTERNALLOCK)) \ + spin_lock(&c->l.internal); \ + c->count++; \ + if (c->config & LPROCFS_CNTR_AVGMINMAX) { \ + __u64 val = (__u64) (value); \ + c->sum += val; \ + if (c->config & LPROCFS_CNTR_STDDEV) \ + c->sumsquare += (val*val); \ + if (val < c->min) c->min = val; \ + if (val > c->max) c->max = val; \ + } \ + if (!(c->config & LPROCFS_CNTR_EXTERNALLOCK)) \ + spin_unlock(&c->l.internal); \ + } while (0) + +#define LPROCFS_COUNTER_INCBY1(cntr) \ + do { \ + struct lprocfs_counter *c = (cntr); \ + LASSERT(c != NULL); \ + if (!(c->config & LPROCFS_CNTR_EXTERNALLOCK)) \ + spin_lock(&c->l.internal); \ + c->count++; \ + if (!(c->config & LPROCFS_CNTR_EXTERNALLOCK)) \ + spin_unlock(&c->l.internal); \ + } while (0) + +#define LPROCFS_COUNTER_INIT(cntr, conf, lck, nam, un) \ + do { \ + struct lprocfs_counter *c = (cntr); \ + LASSERT(c != NULL); \ + memset(c, 0, sizeof(struct lprocfs_counter)); \ + if (conf & LPROCFS_CNTR_EXTERNALLOCK) c->l.external = (lck); \ + else spin_lock_init(&c->l.internal); \ + c->config = conf; \ + c->min = (~(__u64)0); \ + c->name = (nam); \ + c->units = (un); \ + } while (0) + +extern struct lprocfs_counters* lprocfs_alloc_counters(unsigned int num); +extern void lprocfs_free_counters(struct lprocfs_counters* cntrs); +extern int lprocfs_alloc_obd_counters(struct obd_device *obddev, + unsigned int num_private_counters); +extern void lprocfs_free_obd_counters(struct obd_device *obddev); +extern int lprocfs_register_counters(struct proc_dir_entry *root, + const char* name, + struct lprocfs_counters *cntrs); + #define LPROCFS_INIT_MULTI_VARS(array, size) \ void lprocfs_init_multi_vars(unsigned int idx, \ struct lprocfs_static_vars *x) \ @@ -71,7 +181,7 @@ void lprocfs_init_vars(struct lprocfs_static_vars *x) \ } \ extern void lprocfs_init_vars(struct lprocfs_static_vars *var); -extern void 
lprocfs_init_multi_vars(unsigned int idx, +extern void lprocfs_init_multi_vars(unsigned int idx, struct lprocfs_static_vars *var); /* lprocfs_status.c */ extern int lprocfs_add_vars(struct proc_dir_entry *root, @@ -85,7 +195,6 @@ extern struct proc_dir_entry *lprocfs_register(const char *name, extern void lprocfs_remove(struct proc_dir_entry *root); -struct obd_device; extern int lprocfs_obd_attach(struct obd_device *dev, struct lprocfs_vars *list); extern int lprocfs_obd_detach(struct obd_device *dev); @@ -119,18 +228,44 @@ extern int lprocfs_rd_filesfree(char *page, char **start, off_t off, extern int lprocfs_rd_filegroups(char *page, char **start, off_t off, int count, int *eof, struct statfs *sfs); -#define DEFINE_LPROCFS_STATFS_FCT(fct_name, get_statfs_fct) \ -int fct_name(char *page, char **start, off_t off, \ - int count, int *eof, void *data) \ -{ \ - struct statfs sfs; \ - int rc = get_statfs_fct((struct obd_device*)data, &sfs); \ - return (rc==0 \ - ? lprocfs_##fct_name (page, start, off, count, eof, &sfs) \ - : rc); \ +/* lprocfs_status.c: counter read/write functions */ +struct file; +extern int lprocfs_counter_read(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_counter_write(struct file *file, const char *buffer, + unsigned long count, void *data); + +#define DEFINE_LPROCFS_STATFS_FCT(fct_name, get_statfs_fct) \ +int fct_name(char *page, char **start, off_t off, \ + int count, int *eof, void *data) \ +{ \ + struct statfs sfs; \ + int rc = get_statfs_fct((struct obd_device*)data, &sfs); \ + return (rc == 0 ? 
\ + lprocfs_##fct_name (page, start, off, count, eof, &sfs) : \ + rc); \ } #else +/* LPROCFS is not defined */ +#define LPROCFS_COUNTER_INCR(cntr, value) +#define LPROCFS_COUNTER_INCBY1(cntr) +#define LPROCFS_COUNTER_INIT(cntr, conf, lock, nam, un) + +static inline struct lprocfs_counters* lprocfs_alloc_counters(unsigned int num) +{ return NULL; } +static inline void lprocfs_free_counters(struct lprocfs_counters* cntrs) +{ return; } + +static inline int lprocfs_register_counters(struct proc_dir_entry *root, + const char* name, + struct lprocfs_counters *cntrs) +{ return 0; } +static inline int lprocfs_alloc_obd_counters(struct obd_device *obddev, + unsigned int num_private_counters) +{ return 0; } +static inline void lprocfs_free_obd_counters(struct obd_device *obddev) +{ return; } static inline struct proc_dir_entry * lprocfs_register(const char *name, struct proc_dir_entry *parent, @@ -181,6 +316,13 @@ int lprocfs_rd_filesfree(char *page, char **start, off_t off, static inline int lprocfs_rd_filegroups(char *page, char **start, off_t off, int count, int *eof, struct statfs *sfs) { return 0; } +static inline +int lprocfs_counter_read(char *page, char **start, off_t off, + int count, int *eof, void *data) { return 0; } +struct file; +static inline +int lprocfs_counter_write(struct file *file, const char *buffer, + unsigned long count, void *data) { return 0; } #define DEFINE_LPROCFS_STATFS_FCT(fct_name, get_statfs_fct) \ int fct_name(char *page, char **start, off_t off, \ diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h new file mode 100644 index 0000000..4275a10 --- /dev/null +++ b/lustre/include/linux/lustre_compat25.h @@ -0,0 +1,76 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. 
+ * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef _COMPAT25_H +#define _COMPAT25_H + +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) +#define KDEVT_VAL(dev, val) dev.value = 0 +#else +#define KDEVT_VAL(dev, val) dev = 0 +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) +# define PGCACHE_WRLOCK(mapping) write_lock(&mapping->page_lock) +# define PGCACHE_WRUNLOCK(mapping) write_unlock(&mapping->page_lock) +#else +# define PGCACHE_WRLOCK(mapping) spin_lock(&pagecache_lock) +# define PGCACHE_WRUNLOCK(mapping) spin_unlock(&pagecache_lock) +#endif + + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) +# define filemap_fdatasync(mapping) filemap_fdatawrite(mapping) +#endif + + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) +# define TryLockPage(page) TestSetPageLocked(page) +#endif + + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) +# define Page_Uptodate(page) PageUptodate(page) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) +# define USERMODEHELPER(path, argv, envp) call_usermodehelper(path, argv, envp, 0) +#else +# define USERMODEHELPER(path, argv, envp) call_usermodehelper(path, argv, envp) +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) +# define LL_CHECK_DIRTY(sb) do { }while(0) +#else +# define LL_CHECK_DIRTY(sb) ll_check_dirty(sb) +#endif + +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) +#define rb_node_s rb_node +#define 
rb_root_s rb_root +typedef struct rb_root_s rb_root_t; +typedef struct rb_node_s rb_node_t; +#endif + +#endif /* _COMPAT25_H */ diff --git a/lustre/include/linux/lustre_dlm.h b/lustre/include/linux/lustre_dlm.h index 70e7e87..c2a54b9 100644 --- a/lustre/include/linux/lustre_dlm.h +++ b/lustre/include/linux/lustre_dlm.h @@ -7,13 +7,14 @@ #define _LUSTRE_DLM_H__ #ifdef __KERNEL__ -#include +# include #endif #include #include #include #include +#include /* for obd_export, for LDLM_DEBUG */ struct obd_ops; struct obd_device; @@ -26,11 +27,9 @@ typedef enum { ELDLM_LOCK_CHANGED = 300, ELDLM_LOCK_ABORTED = 301, ELDLM_LOCK_REPLACED = 302, - ELDLM_LOCK_MATCHED = 303, ELDLM_NAMESPACE_EXISTS = 400, - ELDLM_BAD_NAMESPACE = 401, - ELDLM_GETATTR_ERROR = 402 + ELDLM_BAD_NAMESPACE = 401 } ldlm_error_t; #define LDLM_NAMESPACE_SERVER 0 @@ -56,10 +55,14 @@ typedef enum { #define LDLM_FL_INTENT_ONLY (1 << 9) /* don't grant lock, just do intent */ #define LDLM_FL_LOCAL_ONLY (1 << 10) /* see ldlm_cli_cancel_unused */ -#define LDLM_FL_NO_CALLBACK (1 << 11) /* see ldlm_cli_cancel_unused */ + +/* don't run the cancel callback under ldlm_cli_cancel_unused */ +#define LDLM_FL_NO_CALLBACK (1 << 11) + #define LDLM_FL_HAS_INTENT (1 << 12) /* lock request has intent */ #define LDLM_FL_CANCELING (1 << 13) /* lock cancel has already been sent */ #define LDLM_FL_LOCAL (1 << 14) // a local lock (ie, no srv/cli split) +#define LDLM_FL_WARN (1 << 15) /* see ldlm_cli_cancel_unused */ /* The blocking callback is overloaded to perform two functions. These flags * indicate which operation should be performed. 
*/ @@ -146,9 +149,8 @@ struct ldlm_lock; typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock, struct ldlm_lock_desc *new, void *data, int flag); -typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, int flags, void *data); -typedef int (*ldlm_granted_callback)(struct ldlm_lock *, - struct lustre_msg *, int offset); +typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, int flags, + void *data); struct ldlm_lock { struct portals_handle l_handle; // must be first in the structure @@ -168,14 +170,12 @@ struct ldlm_lock { ldlm_completion_callback l_completion_ast; ldlm_blocking_callback l_blocking_ast; - ldlm_granted_callback l_granted_cb; struct obd_export *l_export; struct lustre_handle *l_connh; __u32 l_flags; struct lustre_handle l_remote_handle; void *l_data; - void *l_cp_data; struct ldlm_extent l_extent; __u32 l_version[RES_VERSION_SIZE]; @@ -233,12 +233,6 @@ struct ldlm_ast_work { int w_datalen; }; -/* Per-export ldlm state. */ -struct ldlm_export_data { - struct list_head led_held_locks; /* protected by namespace lock */ - struct obd_import led_import; -}; - extern struct obd_ops ldlm_obd_ops; extern char *ldlm_lockname[]; @@ -250,8 +244,8 @@ do { \ if (lock->l_resource == NULL) { \ CDEBUG(level, "### " format \ " ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "\ - "res: \?\? rrc=\?\? type: \?\?\? remote: "LPX64")\n" \ - , ## a, lock, lock->l_handle.h_cookie, \ + "res: \?\? rrc=\?\? type: \?\?\? 
remote: " \ + LPX64"\n" , ## a, lock, lock->l_handle.h_cookie, \ atomic_read(&lock->l_refc), \ lock->l_readers, lock->l_writers, \ ldlm_lockname[lock->l_granted_mode], \ @@ -281,7 +275,8 @@ do { \ CDEBUG(level, "### " format \ " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " \ "res: "LPU64"/"LPU64" rrc: %d type: %s remote: "LPX64 \ - "\n" , ## a, lock->l_resource->lr_namespace->ns_name, \ + "\n" , ## a, \ + lock->l_resource->lr_namespace->ns_name, \ lock, lock->l_handle.h_cookie, \ atomic_read (&lock->l_refc), \ lock->l_readers, lock->l_writers, \ @@ -342,7 +337,7 @@ void ldlm_unregister_intent(void); void ldlm_lock2handle(struct ldlm_lock *lock, struct lustre_handle *lockh); struct ldlm_lock *__ldlm_handle2lock(struct lustre_handle *, int flags); void ldlm_cancel_callback(struct ldlm_lock *); -int ldlm_lock_set_data(struct lustre_handle *, void *data, void *cp_data); +int ldlm_lock_set_data(struct lustre_handle *, void *data); void ldlm_lock_remove_from_lru(struct ldlm_lock *); struct ldlm_lock *ldlm_handle2lock_ns(struct ldlm_namespace *, struct lustre_handle *); @@ -380,11 +375,11 @@ int ldlm_lock_match(struct ldlm_namespace *ns, int flags, struct ldlm_res_id *, struct ldlm_lock * ldlm_lock_create(struct ldlm_namespace *ns, struct lustre_handle *parent_lock_handle, struct ldlm_res_id, - __u32 type, ldlm_mode_t mode, void *data, void *cp_data); + __u32 type, ldlm_mode_t, ldlm_blocking_callback, + void *data); ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock **, void *cookie, int cookie_len, int *flags, - ldlm_completion_callback completion, - ldlm_blocking_callback blocking); + ldlm_completion_callback completion); struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, int *flags); void ldlm_lock_cancel(struct ldlm_lock *lock); @@ -444,7 +439,6 @@ int ldlm_cli_enqueue(struct lustre_handle *conn, ldlm_completion_callback completion, ldlm_blocking_callback callback, void *data, - void *cp_data, struct 
lustre_handle *lockh); int ldlm_match_or_enqueue(struct lustre_handle *connh, struct ptlrpc_request *req, @@ -458,15 +452,13 @@ int ldlm_match_or_enqueue(struct lustre_handle *connh, ldlm_completion_callback completion, ldlm_blocking_callback callback, void *data, - void *cp_data, struct lustre_handle *lockh); int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new, void *data, __u32 data_len); int ldlm_cli_convert(struct lustre_handle *, int new_mode, int *flags); int ldlm_cli_cancel(struct lustre_handle *lockh); int ldlm_cli_cancel_unused(struct ldlm_namespace *, struct ldlm_res_id *, - int flags); -int ldlm_cancel_lru(struct ldlm_namespace *ns); + int flags, void *opaque); /* mds/handler.c */ /* This has to be here because recurisve inclusion sucks. */ diff --git a/lustre/include/linux/lustre_export.h b/lustre/include/linux/lustre_export.h index 694bd3e..6939a95 100644 --- a/lustre/include/linux/lustre_export.h +++ b/lustre/include/linux/lustre_export.h @@ -11,10 +11,22 @@ #define __EXPORT_H #include -#include -#include #include +struct mds_client_data; + +struct mds_export_data { + struct list_head med_open_head; + spinlock_t med_open_lock; + struct mds_client_data *med_mcd; + int med_off; +}; + +struct ldlm_export_data { + struct list_head led_held_locks; /* protected by namespace lock */ + struct obd_import *led_import; +}; + struct lov_export_data { spinlock_t led_lock; struct list_head led_open_head; @@ -26,13 +38,17 @@ struct ec_export_data { /* echo client */ }; struct obd_export { - __u64 exp_cookie; + struct portals_handle exp_handle; + atomic_t exp_refcount; struct obd_uuid exp_client_uuid; struct list_head exp_obd_chain; - struct list_head exp_conn_chain; struct obd_device *exp_obd; struct ptlrpc_connection *exp_connection; struct ldlm_export_data exp_ldlm_data; + struct ptlrpc_request *exp_outstanding_reply; + time_t exp_last_request_time; + spinlock_t exp_lock; /* protects flags int below */ + int exp_failed:1, exp_failover:1; 
union { struct mds_export_data eu_mds_data; struct filter_export_data eu_filter_data; diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h index 6b0cbfa..f736d4b 100644 --- a/lustre/include/linux/lustre_fsfilt.h +++ b/lustre/include/linux/lustre_fsfilt.h @@ -44,7 +44,7 @@ struct fsfilt_operations { void *(* fs_start)(struct inode *inode, int op); void *(* fs_brw_start)(int objcount, struct fsfilt_objinfo *fso, int niocount, struct niobuf_remote *nb); - int (* fs_commit)(struct inode *inode, void *handle); + int (* fs_commit)(struct inode *inode, void *handle,int force_sync); int (* fs_setattr)(struct dentry *dentry, void *handle, struct iattr *iattr); int (* fs_set_md)(struct inode *inode, void *handle, void *md, @@ -79,22 +79,25 @@ extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops); static inline void *fsfilt_start(struct obd_device *obd, struct inode *inode, int op) { - ENTRY; - return obd->obd_fsops->fs_start(inode, op); + void *handle = obd->obd_fsops->fs_start(inode, op); + CDEBUG(D_HA, "starting handle %p\n", handle); + return handle; } static inline void *fsfilt_brw_start(struct obd_device *obd, int objcount, struct fsfilt_objinfo *fso, int niocount, struct niobuf_remote *nb) { - return obd->obd_fsops->fs_brw_start(objcount, fso, niocount, nb); + void *handle = obd->obd_fsops->fs_brw_start(objcount, fso, niocount,nb); + CDEBUG(D_HA, "starting handle %p\n", handle); + return handle; } static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode, - void *handle) + void *handle, int force_sync) { - return obd->obd_fsops->fs_commit(inode, handle); - EXIT; + CDEBUG(D_HA, "committing handle %p\n", handle); + return obd->obd_fsops->fs_commit(inode, handle, force_sync); } static inline int fsfilt_setattr(struct obd_device *obd, struct dentry *dentry, diff --git a/lustre/include/linux/lustre_ha.h b/lustre/include/linux/lustre_ha.h index 87b0bf3..fffbd60 100644 --- a/lustre/include/linux/lustre_ha.h +++ 
b/lustre/include/linux/lustre_ha.h @@ -5,60 +5,21 @@ #ifndef _LUSTRE_HA_H #define _LUSTRE_HA_H -#define LUSTRE_HA_NAME "ptlrpc" - -struct recovd_data; -struct recovd_obd; struct obd_import; -struct ptlrpc_connection; - -/* rd_phase/rd_next_phase values */ -#define RD_IDLE 0 -#define RD_TROUBLED 1 -#define RD_PREPARING 2 -#define RD_PREPARED 3 -#define RD_RECOVERING 4 -#define RD_RECOVERED 5 -#define RD_FAILED 6 - -/* recovd_state values */ -#define RECOVD_READY 1 -#define RECOVD_STOPPING 2 /* how cleanup tells recovd to quit */ -#define RECOVD_STOPPED 4 /* after recovd has stopped */ - -#define PTLRPC_RECOVD_PHASE_PREPARE 1 -#define PTLRPC_RECOVD_PHASE_RECOVER 2 -#define PTLRPC_RECOVD_PHASE_FAILURE 3 -#define PTLRPC_RECOVD_PHASE_NOTCONN 4 - -typedef int (*ptlrpc_recovery_cb_t)(struct recovd_data *, int); - -struct recovd_data { - /* you must hold recovd->recovd_lock when touching rd_managed_chain */ - struct list_head rd_managed_chain; - ptlrpc_recovery_cb_t rd_recover; - struct recovd_obd *rd_recovd; - __u32 rd_phase; - __u32 rd_next_phase; - __u32 rd_flags; -}; - -void recovd_conn_fail(struct ptlrpc_connection *conn); -void recovd_conn_manage(struct ptlrpc_connection *conn, struct recovd_obd *mgr, - ptlrpc_recovery_cb_t recover); -void recovd_conn_unmanage(struct ptlrpc_connection *conn); -void recovd_conn_fixed(struct ptlrpc_connection *conn); -int recovd_setup(struct recovd_obd *mgr); -int recovd_cleanup(struct recovd_obd *mgr); - -extern struct recovd_obd *ptlrpc_recovd; +struct obd_export; +struct obd_device; struct ptlrpc_request; -int ptlrpc_run_recovery_upcall(struct ptlrpc_connection *conn); -int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc, +void ptlrpc_run_failed_import_upcall(struct obd_import *imp); +void ptlrpc_run_recovery_over_upcall(struct obd_device *obd); +int ptlrpc_reconnect_import(struct obd_import *imp, struct ptlrpc_request **reqptr); int ptlrpc_replay(struct obd_import *imp); int ptlrpc_resend(struct obd_import *imp); void 
ptlrpc_free_committed(struct obd_import *imp); void ptlrpc_wake_delayed(struct obd_import *imp); +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid); +int ptlrpc_set_import_active(struct obd_import *imp, int active); +void ptlrpc_fail_import(struct obd_import *imp, int generation); +void ptlrpc_fail_export(struct obd_export *exp); #endif diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h index b99d996..b3acada 100644 --- a/lustre/include/linux/lustre_idl.h +++ b/lustre/include/linux/lustre_idl.h @@ -19,6 +19,23 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * (Un)packing of OST requests + * + * We assume all nodes are either little-endian or big-endian, and we + * always send messages in the sender's native format. The receiver + * detects the message format by checking the 'magic' field of the message + * (see lustre_msg_swabbed() below). + * + * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines, + * implemented either here, inline (trivial implementations) or in + * ptlrpc/pack_generic.c. These 'swabbers' convert the type from "other" + * endian, in-place in the message buffer. + * + * A swabber takes a single pointer argument. The caller must already have + * verified that the length of the message buffer >= sizeof (type). + * + * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine + * may be defined that swabs just the variable part, after the caller has + * verified that the message buffer is large enough. 
*/ #ifndef _LUSTRE_IDL_H_ @@ -30,13 +47,16 @@ # include # include # include /* for strncpy, below */ +# include +#else +#ifdef __CYGWIN__ +# include #else -# define __KERNEL__ # include -# include -# undef __KERNEL__ # include #endif +# include +#endif /* * this file contains all data structures used in Lustre interfaces: * - obdo and obd_request records @@ -52,12 +72,19 @@ struct obd_uuid { __u8 uuid[37]; }; +static inline int obd_uuid_equals(struct obd_uuid *u1, struct obd_uuid *u2) +{ + return strcmp(u1->uuid, u2->uuid) == 0; +} + static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp) { strncpy(uuid->uuid, tmp, sizeof(*uuid)); uuid->uuid[sizeof(*uuid) - 1] = '\0'; } +extern struct obd_uuid lctl_fake_uuid; + /* FOO_REQUEST_PORTAL is for incoming requests on the FOO * FOO_REPLY_PORTAL is for incoming replies on the FOO * FOO_BULK_PORTAL is for incoming bulk on the FOO @@ -67,7 +94,7 @@ static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp) #define CONNMGR_REPLY_PORTAL 2 //#define OSC_REQUEST_PORTAL 3 #define OSC_REPLY_PORTAL 4 -#define OSC_BULK_PORTAL 5 +//#define OSC_BULK_PORTAL 5 #define OST_REQUEST_PORTAL 6 //#define OST_REPLY_PORTAL 7 #define OST_BULK_PORTAL 8 @@ -96,32 +123,27 @@ static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp) #define LUSTRE_CONN_NEW 1 #define LUSTRE_CONN_CON 2 -#define LUSTRE_CONN_RECOVD 3 -#define LUSTRE_CONN_FULL 4 +#define LUSTRE_CONN_NOTCONN 3 +#define LUSTRE_CONN_RECOVD 4 +#define LUSTRE_CONN_FULL 5 /* packet types */ #define PTL_RPC_MSG_REQUEST 4711 #define PTL_RPC_MSG_ERR 4712 #define PTL_RPC_MSG_REPLY 4713 -#define PTLRPC_MSG_MAGIC (cpu_to_le32(0x0BD00BD0)) -#define PTLRPC_MSG_VERSION (cpu_to_le32(0x00040001)) +#define PTLRPC_MSG_MAGIC 0x0BD00BD0 +#define PTLRPC_MSG_VERSION 0x00040002 struct lustre_handle { - __u64 addr; __u64 cookie; }; #define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabe -static inline void ptlrpc_invalidate_handle(struct lustre_handle *hdl) -{ - hdl->addr = hdl->cookie = 0; /* XXX 
invalid enough? */ -} - /* we depend on this structure to be 8-byte aligned */ +/* this type is only endian-adjusted in lustre_unpack_msg() */ struct lustre_msg { - __u64 addr; - __u64 cookie; /* security token */ + struct lustre_handle handle; __u32 magic; __u32 type; __u32 version; @@ -130,11 +152,16 @@ struct lustre_msg { __u64 last_committed; __u64 transno; __u32 status; - __u32 bufcount; __u32 flags; + __u32 bufcount; __u32 buflens[0]; }; +static inline int lustre_msg_swabbed (struct lustre_msg *msg) +{ + return (msg->magic == __swab32 (PTLRPC_MSG_MAGIC)); +} + /* Flags that are operation-specific go in the top 16 bits. */ #define MSG_OP_FLAG_MASK 0xffff0000 #define MSG_OP_FLAG_SHIFT 16 @@ -206,6 +233,10 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) #define OST_SAN_READ 14 #define OST_SAN_WRITE 15 #define OST_SYNCFS 16 +/* When adding OST RPC opcodes, please update + * LAST/FIRST macros used in ptlrpc/ptlrpc_internals.h */ +#define OST_LAST_OPC (OST_SYNCFS+1) +#define OST_FIRST_OPC OST_REPLY typedef uint64_t obd_id; @@ -226,10 +257,7 @@ typedef uint32_t obd_count; #define OBD_FL_OBDMDEXISTS (0x00000002) #define OBD_INLINESZ 60 -#define FD_OSTDATA_SIZE 32 -#if (FD_OSTDATA_SIZE > OBD_INLINESZ) -# error FD_OSTDATA_SIZE must be smaller than OBD_INLINESZ -#endif +#define FD_OSTDATA_SIZE sizeof(struct obd_client_handle) /* Note: 64-bit types are 64-bit aligned in structure */ struct obdo { @@ -241,7 +269,7 @@ struct obdo { obd_size o_size; obd_blocks o_blocks; obd_rdev o_rdev; - obd_blksize o_blksize; + obd_blksize o_blksize; /* optimal IO blocksize */ obd_mode o_mode; obd_uid o_uid; obd_gid o_gid; @@ -254,6 +282,8 @@ struct obdo { char o_inline[OBD_INLINESZ]; }; +extern void lustre_swab_obdo (struct obdo *o); + struct lov_object_id { /* per-child structure */ __u64 l_object_id; }; @@ -305,16 +335,20 @@ struct obd_statfs { __u8 os_fsid[40]; __u32 os_bsize; __u32 os_namelen; - __u32 os_spare[12]; + __u64 os_maxbytes; + __u32 
os_spare[10]; }; +extern void lustre_swab_obd_statfs (struct obd_statfs *os); + /* ost_body.data values for OST_BRW */ -#define OBD_BRW_READ 0x1 -#define OBD_BRW_WRITE 0x2 -#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) -#define OBD_BRW_CREATE 0x4 -#define OBD_BRW_SYNC 0x8 +#define OBD_BRW_READ 0x01 +#define OBD_BRW_WRITE 0x02 +#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) +#define OBD_BRW_CREATE 0x04 +#define OBD_BRW_SYNC 0x08 +#define OBD_BRW_CHECK 0x10 #define OBD_OBJECT_EOF 0xffffffffffffffffULL @@ -325,13 +359,17 @@ struct obd_ioobj { __u32 ioo_bufcnt; } __attribute__((packed)); +extern void lustre_swab_obd_ioobj (struct obd_ioobj *ioo); + +/* multiple of 8 bytes => can array */ struct niobuf_remote { __u64 offset; __u32 len; - __u32 xid; __u32 flags; } __attribute__((packed)); +extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr); + /* request structure for OST's */ #define OST_REQ_HAS_OA1 0x1 @@ -340,6 +378,8 @@ struct ost_body { struct obdo oa; }; +extern void lustre_swab_ost_body (struct ost_body *b); + /* * MDS REQ RECORDS */ @@ -355,6 +395,10 @@ struct ost_body { #define MDS_GETSTATUS 40 #define MDS_STATFS 41 #define MDS_GETLOVINFO 42 +/* When adding MDS RPC opcodes, please update + * LAST/FIRST macros used in ptlrpc/ptlrpc_internals.h */ +#define MDS_LAST_OPC (MDS_GETLOVINFO+1) +#define MDS_FIRST_OPC MDS_GETATTR /* * Do not exceed 63 */ @@ -374,15 +418,13 @@ struct ost_body { #define IT_OPEN_CREATE (1 << 4) #define IT_OPEN_OPEN (1 << 5) -#define REINT_OPCODE_MASK 0xff /* opcodes must fit into this mask */ -#define REINT_REPLAYING 0x1000 /* masked into the opcode to indicate replay */ - struct ll_fid { __u64 id; __u32 generation; __u32 f_type; }; +extern void lustre_swab_ll_fid (struct ll_fid *fid); #define MDS_STATUS_CONN 1 #define MDS_STATUS_LOV 2 @@ -392,24 +434,20 @@ struct mds_status_req { __u32 repbuf; }; +extern void lustre_swab_mds_status_req (struct mds_status_req *r); + struct mds_fileh_body { struct ll_fid f_fid; 
struct lustre_handle f_handle; }; -struct mds_conn_status { - struct ll_fid rootfid; - __u64 xid; - __u64 last_committed; - __u64 last_rcvd; - /* XXX preallocated quota & obj fields here */ -}; +extern void lustre_swab_mds_fileh_body (struct mds_fileh_body *f); struct mds_body { struct ll_fid fid1; struct ll_fid fid2; struct lustre_handle handle; - __u64 size; + __u64 size; /* Offset, in the case of MDS_READPAGE */ __u64 blocks; /* XID, in the case of MDS_READPAGE */ __u32 ino; /* make this a __u64 */ __u32 valid; @@ -424,17 +462,19 @@ struct mds_body { __u32 atime; __u32 flags; __u32 rdev; - __u32 nlink; + __u32 nlink; /* #bytes to read in the case of MDS_READPAGE */ __u32 generation; __u32 suppgid; + __u32 eadatasize; }; +extern void lustre_swab_mds_body (struct mds_body *b); + /* This is probably redundant with OBD_MD_FLEASIZE, but we need an audit */ #define MDS_OPEN_HAS_EA 1 /* this open has an EA, for a delayed create*/ /* MDS update records */ - //struct mds_update_record_hdr { // __u32 ur_opcode; //}; @@ -458,6 +498,8 @@ struct mds_rec_setattr { __u32 sa_suppgid; }; +extern void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa); + struct mds_rec_create { __u32 cr_opcode; __u32 cr_fsuid; @@ -474,16 +516,21 @@ struct mds_rec_create { __u32 cr_suppgid; }; +extern void lustre_swab_mds_rec_create (struct mds_rec_create *cr); + struct mds_rec_link { __u32 lk_opcode; __u32 lk_fsuid; __u32 lk_fsgid; __u32 lk_cap; - __u32 lk_suppgid; + __u32 lk_suppgid1; + __u32 lk_suppgid2; struct ll_fid lk_fid1; struct ll_fid lk_fid2; }; +extern void lustre_swab_mds_rec_link (struct mds_rec_link *lk); + struct mds_rec_unlink { __u32 ul_opcode; __u32 ul_fsuid; @@ -496,6 +543,8 @@ struct mds_rec_unlink { struct ll_fid ul_fid2; }; +extern void lustre_swab_mds_rec_unlink (struct mds_rec_unlink *ul); + struct mds_rec_rename { __u32 rn_opcode; __u32 rn_fsuid; @@ -507,6 +556,7 @@ struct mds_rec_rename { struct ll_fid rn_fid2; }; +extern void lustre_swab_mds_rec_rename (struct 
mds_rec_rename *rn); /* * LOV data structures @@ -515,6 +565,11 @@ struct mds_rec_rename { #define LOV_RAID0 0 #define LOV_RAIDRR 1 +#define LOV_MAX_UUID_BUFFER_SIZE 8192 +/* The size of the buffer the lov/mdc reserves for the + * array of UUIDs returned by the MDS. With the current + * protocol, this will limit the max number of OSTs per LOV */ + struct lov_desc { __u32 ld_tgt_count; /* how many OBD's */ __u32 ld_active_tgt_count; /* how many active */ @@ -525,6 +580,8 @@ struct lov_desc { struct obd_uuid ld_uuid; }; +extern void lustre_swab_lov_desc (struct lov_desc *ld); + /* * LDLM requests: */ @@ -534,6 +591,10 @@ struct lov_desc { #define LDLM_CANCEL 103 #define LDLM_BL_CALLBACK 104 #define LDLM_CP_CALLBACK 105 +/* When adding LDLM RPC opcodes, please update + * LAST/FIRST macros used in ptlrpc/ptlrpc_internals.h */ +#define LDLM_LAST_OPC (LDLM_CP_CALLBACK+1) +#define LDLM_FIRST_OPC LDLM_ENQUEUE #define RES_NAME_SIZE 3 #define RES_VERSION_SIZE 4 @@ -542,6 +603,8 @@ struct ldlm_res_id { __u64 name[RES_NAME_SIZE]; }; +extern void lustre_swab_ldlm_res_id (struct ldlm_res_id *id); + /* lock types */ typedef enum { LCK_EX = 1, @@ -557,10 +620,14 @@ struct ldlm_extent { __u64 end; }; +extern void lustre_swab_ldlm_extent (struct ldlm_extent *e); + struct ldlm_intent { __u64 opc; }; +extern void lustre_swab_ldlm_intent (struct ldlm_intent *i); + /* Note this unaligned structure; as long as it's only used in ldlm_request * below, we're probably fine. 
*/ struct ldlm_resource_desc { @@ -569,6 +636,8 @@ struct ldlm_resource_desc { __u32 lr_version[RES_VERSION_SIZE]; }; +extern void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r); + struct ldlm_lock_desc { struct ldlm_resource_desc l_resource; ldlm_mode_t l_req_mode; @@ -577,6 +646,8 @@ struct ldlm_lock_desc { __u32 l_version[RES_VERSION_SIZE]; }; +extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l); + struct ldlm_request { __u32 lock_flags; struct ldlm_lock_desc lock_desc; @@ -584,6 +655,8 @@ struct ldlm_request { struct lustre_handle lock_handle2; }; +extern void lustre_swab_ldlm_request (struct ldlm_request *rq); + struct ldlm_reply { __u32 lock_flags; __u32 lock_mode; @@ -594,6 +667,8 @@ struct ldlm_reply { __u64 lock_policy_res2; }; +extern void lustre_swab_ldlm_reply (struct ldlm_reply *r); + /* * ptlbd, portal block device requests */ @@ -601,7 +676,14 @@ typedef enum { PTLBD_QUERY = 200, PTLBD_READ = 201, PTLBD_WRITE = 202, + PTLBD_FLUSH = 203, + PTLBD_CONNECT = 204, + PTLBD_DISCONNECT = 205, } ptlbd_cmd_t; +/* When adding PTLBD RPC opcodes, please update + * LAST/FIRST macros used in ptlrpc/ptlrpc_internals.h */ +#define PTLBD_LAST_OPC (PTLBD_FLUSH+1) +#define PTLBD_FIRST_OPC PTLBD_QUERY struct ptlbd_op { __u16 op_cmd; @@ -611,6 +693,8 @@ struct ptlbd_op { __u32 op_block_cnt; }; +extern void lustre_swab_ptlbd_op (struct ptlbd_op *op); + struct ptlbd_niob { __u64 n_xid; __u64 n_block_nr; @@ -618,8 +702,19 @@ struct ptlbd_niob { __u32 n_length; }; +extern void lustre_swab_ptlbd_niob (struct ptlbd_niob *n); + struct ptlbd_rsp { __u16 r_status; __u16 r_error_cnt; }; + +extern void lustre_swab_ptlbd_rsp (struct ptlbd_rsp *r); + +/* + * Opcodes for multiple servers. 
+ */ + +#define OBD_PING 400 + #endif diff --git a/lustre/include/linux/lustre_import.h b/lustre/include/linux/lustre_import.h index 4fc2581..c1af641 100644 --- a/lustre/include/linux/lustre_import.h +++ b/lustre/include/linux/lustre_import.h @@ -10,21 +10,17 @@ #ifndef __IMPORT_H #define __IMPORT_H - -#define IMP_INVALID 1 -#define IMP_REPLAYABLE 2 - - -struct obd_import; -typedef int (*import_recover_t)(struct obd_import *imp, int phase); +#include #include struct obd_import { - import_recover_t imp_recover; + struct portals_handle imp_handle; + atomic_t imp_refcount; + struct lustre_handle imp_dlm_handle; /* client's ldlm export */ struct ptlrpc_connection *imp_connection; struct ptlrpc_client *imp_client; - struct lustre_handle imp_handle; - struct list_head imp_chain; + struct list_head imp_observers; + struct list_head imp_pinger_chain; /* Lists of requests that are retained for replay, waiting for a reply, * or waiting for recovery to complete, respectively. @@ -34,17 +30,43 @@ struct obd_import { struct list_head imp_delayed_list; struct obd_device *imp_obd; - int imp_flags; int imp_level; + int imp_generation; __u64 imp_max_transno; __u64 imp_peer_committed_transno; + struct obd_uuid imp_target_uuid; /* XXX -> lustre_name */ + struct lustre_handle imp_remote_handle; - /* Protects flags, level, *_list */ + /* Protects flags, level, generation, *_list */ spinlock_t imp_lock; + + /* flags */ + int imp_invalid:1, imp_replayable:1, + imp_dlm_fake:1; + __u32 imp_connect_op; +}; + +typedef void (*obd_import_callback)(struct obd_import *imp, void *closure, + int event, void *event_arg, void *cb_data); + +struct obd_import_observer { + struct list_head oio_chain; + obd_import_callback oio_cb; + void *oio_cb_data; }; +void class_observe_import(struct obd_import *imp, obd_import_callback cb, + void *cb_data); +void class_unobserve_import(struct obd_import *imp, obd_import_callback cb, + void *cb_data); +void class_notify_import_observers(struct obd_import *imp, int 
event, + void *event_arg); + +#define IMP_EVENT_ACTIVE 1 +#define IMP_EVENT_INACTIVE 2 + +/* genops.c */ extern struct obd_import *class_conn2cliimp(struct lustre_handle *); extern struct obd_import *class_conn2ldlmimp(struct lustre_handle *); - #endif /* __IMPORT_H */ diff --git a/lustre/include/linux/lustre_lib.h b/lustre/include/linux/lustre_lib.h index 6f38be0..c43cf95 100644 --- a/lustre/include/linux/lustre_lib.h +++ b/lustre/include/linux/lustre_lib.h @@ -27,12 +27,13 @@ #ifndef __KERNEL__ # include +# include #else # include # include # include +# include #endif -#include #include #include /* XXX just for LASSERT! */ #include @@ -51,19 +52,19 @@ /* target.c */ struct ptlrpc_request; -struct obd_device; struct recovd_data; struct recovd_obd; struct obd_export; #include #include - +#include int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler); int target_handle_disconnect(struct ptlrpc_request *req); int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, struct obd_uuid *cluuid); -int target_revoke_connection(struct recovd_data *rd, int phase); +int target_handle_ping(struct ptlrpc_request *req); +void target_cancel_recovery_timer(struct obd_device *obd); #define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */ void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler); @@ -71,18 +72,26 @@ void target_abort_recovery(void *data); int target_queue_recovery_request(struct ptlrpc_request *req, struct obd_device *obd); int target_queue_final_reply(struct ptlrpc_request *req, int rc); +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id); /* client.c */ -int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd, - struct obd_uuid *cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover); -int client_obd_disconnect(struct lustre_handle *conn); + int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf); int 
client_sanobd_setup(struct obd_device *obddev, obd_count len, void *buf); -int client_obd_cleanup(struct obd_device * obddev); +int client_obd_cleanup(struct obd_device * obddev, int force, int failover); struct client_obd *client_conn2cli(struct lustre_handle *conn); struct obd_device *client_tgtuuid2obd(struct obd_uuid *tgtuuid); +/* It is important that och_fh remain the first item in this structure: that + * way, we don't have to re-pack the obdo's inline data before we send it to + * the server, we can just send the whole struct unaltered. */ +struct obd_client_handle { + struct lustre_handle och_fh; + struct ptlrpc_request *och_req; + __u32 och_magic; +}; +#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed + /* statfs_pack.c */ int obd_self_statfs(struct obd_device *dev, struct statfs *sfs); @@ -99,45 +108,74 @@ void l_lock(struct lustre_lock *); void l_unlock(struct lustre_lock *); int l_has_lock(struct lustre_lock *); -#define CB_PHASE_START 12 -#define CB_PHASE_FINISH 13 - -/* This list head doesn't need to be locked, because it's only manipulated by - * one thread at a time. 
*/ -struct obd_brw_set { - struct list_head brw_desc_head; /* list of ptlrpc_bulk_desc */ - wait_queue_head_t brw_waitq; - atomic_t brw_refcount; - atomic_t brw_desc_count; - int brw_flags; +/* simple.c */ +struct obd_ucred { + __u32 ouc_fsuid; + __u32 ouc_fsgid; + __u32 ouc_cap; + __u32 ouc_suppgid1; + __u32 ouc_suppgid2; +}; - int (*brw_callback)(struct obd_brw_set *, int phase); +#define OBD_RUN_CTXT_MAGIC 0xC0FFEEAA +#define OBD_CTXT_DEBUG /* development-only debugging */ +struct obd_run_ctxt { + struct vfsmount *pwdmnt; + struct dentry *pwd; + mm_segment_t fs; + struct obd_ucred ouc; + int ngroups; +#ifdef OBD_CTXT_DEBUG + __u32 magic; +#endif }; -/* simple.c */ -struct obd_run_ctxt; -struct obd_ucred; + +#ifdef OBD_CTXT_DEBUG +#define OBD_SET_CTXT_MAGIC(ctxt) (ctxt)->magic = OBD_RUN_CTXT_MAGIC +#else +#define OBD_SET_CTXT_MAGIC(ctxt) do {} while(0) +#endif + +#ifdef __KERNEL__ + void push_ctxt(struct obd_run_ctxt *save, struct obd_run_ctxt *new_ctx, struct obd_ucred *cred); void pop_ctxt(struct obd_run_ctxt *saved, struct obd_run_ctxt *new_ctx, struct obd_ucred *cred); struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode); struct dentry *simple_mknod(struct dentry *dir, char *name, int mode); -int lustre_fread(struct file *file, char *str, int len, loff_t *off); -int lustre_fwrite(struct file *file, const char *str, int len, loff_t *off); +int lustre_fread(struct file *file, void *buf, int len, loff_t *off); +int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off); int lustre_fsync(struct file *file); -#ifdef __KERNEL__ - static inline void l_dput(struct dentry *de) { if (!de || IS_ERR(de)) return; - shrink_dcache_parent(de); + //shrink_dcache_parent(de); LASSERT(atomic_read(&de->d_count) > 0); dput(de); } +/* We need to hold the inode semaphore over the dcache lookup itself, or we + * run the risk of entering the filesystem lookup path concurrently on SMP + * systems, and instantiating two inodes for the same entry. 
We still + * protect against concurrent addition/removal races with the DLM locking. + */ +static inline struct dentry *ll_lookup_one_len(char *fid_name, + struct dentry *dparent, + int fid_namelen) +{ + struct dentry *dchild; + + down(&dparent->d_inode->i_sem); + dchild = lookup_one_len(fid_name, dparent, fid_namelen); + up(&dparent->d_inode->i_sem); + + return dchild; +} + static inline void ll_sleep(int t) { set_current_state(TASK_INTERRUPTIBLE); @@ -146,17 +184,10 @@ static inline void ll_sleep(int t) } #endif -/* FIXME: This needs to validate pointers and cookies */ -static inline void *lustre_handle2object(struct lustre_handle *handle) -{ - if (handle) - return (void *)(unsigned long)(handle->addr); - return NULL; -} - -static inline void ldlm_object2handle(void *object, struct lustre_handle *handle) +#define LL_FID_NAMELEN (16 + 1 + 8 + 1) +static inline int ll_fid2str(char *str, __u64 id, __u32 generation) { - handle->addr = (__u64)(unsigned long)object; + return sprintf(str, "%llx:%08x", (unsigned long long)id, generation); } #include @@ -170,7 +201,6 @@ struct obd_ioctl_data { uint32_t ioc_len; uint32_t ioc_version; - uint64_t ioc_addr; uint64_t ioc_cookie; uint32_t ioc_conn1; uint32_t ioc_conn2; @@ -368,6 +398,8 @@ static inline int obd_ioctl_unpack(struct obd_ioctl_data *data, char *pbuf, #include +#define OBD_MAX_IOCTL_BUFFER 8192 + /* buffer MUST be at least the size of obd_ioctl_hdr */ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg) { @@ -383,12 +415,13 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg) } if (hdr.ioc_version != OBD_IOCTL_VERSION) { - printk("OBD: version mismatch kernel vs application\n"); + CERROR("Version mismatch kernel vs application\n"); return -EINVAL; } - if (hdr.ioc_len > 8192) { - printk("OBD: user buffer exceeds 8192 max buffer\n"); + if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("User buffer len %d exceeds %d max buffer\n", + hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); return -EINVAL; 
} @@ -397,8 +430,10 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg) return -EINVAL; } - OBD_ALLOC(*buf, hdr.ioc_len); - if (!*buf) { + /* XXX allocate this more intelligently, using kmalloc when + * appropriate */ + OBD_VMALLOC(*buf, hdr.ioc_len); + if (*buf == NULL) { CERROR("Cannot allocate control buffer of len %d\n", hdr.ioc_len); RETURN(-EINVAL); @@ -413,7 +448,7 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg) } if (obd_ioctl_is_invalid(data)) { - printk("OBD: ioctl not correctly formatted\n"); + CERROR("ioctl not correctly formatted\n"); return -EINVAL; } @@ -436,6 +471,15 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg) return 0; } +static inline void obd_ioctl_freedata(char *buf, int len) +{ + ENTRY; + + OBD_VFREE(buf, len); + EXIT; + return; +} + #define OBD_IOC_CREATE _IOR ('f', 101, long) #define OBD_IOC_SETUP _IOW ('f', 102, long) #define OBD_IOC_CLEANUP _IO ('f', 103 ) @@ -467,19 +511,18 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg) #define OBD_IOC_LIST _IOWR('f', 129, long) #define OBD_IOC_UUID2DEV _IOWR('f', 130, long) -#define OBD_IOC_RECOVD_NEWCONN _IOWR('f', 131, long) -#define OBD_IOC_LOV_SET_CONFIG _IOWR('f', 132, long) -#define OBD_IOC_LOV_GET_CONFIG _IOWR('f', 133, long) +#define OBD_IOC_LOV_SET_CONFIG _IOWR('f', 131, long) +#define OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, long) #define OBD_IOC_LOV_CONFIG OBD_IOC_LOV_SET_CONFIG +#define OBD_IOC_CLIENT_RECOVER _IOW ('f', 133, long) #define OBD_IOC_OPEN _IOWR('f', 134, long) #define OBD_IOC_CLOSE _IOWR('f', 135, long) -#define OBD_IOC_RECOVD_FAILCONN _IOWR('f', 136, long) - #define OBD_IOC_DEC_FS_USE_COUNT _IO ('f', 139 ) #define OBD_IOC_NO_TRANSNO _IOW ('f', 140, long) #define OBD_IOC_SET_READONLY _IOW ('f', 141, long) +#define OBD_IOC_ABORT_RECOVERY _IOR ('f', 142, long) #define OBD_GET_VERSION _IOWR ('f', 144, long) @@ -487,11 +530,20 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void 
*arg) #define OBD_IOC_DEL_UUID _IOWR ('f', 146, long) #define OBD_IOC_CLOSE_UUID _IOWR ('f', 147, long) +#define OBD_IOC_MOUNTOPT _IOWR('f', 170, long) + #define ECHO_IOC_GET_STRIPE _IOWR('f', 200, long) #define ECHO_IOC_SET_STRIPE _IOWR('f', 201, long) #define ECHO_IOC_ENQUEUE _IOWR('f', 202, long) #define ECHO_IOC_CANCEL _IOWR('f', 203, long) +/* XXX _IOWR('f', 250, long) has been defined in + * portals/include/linux/kp30.h for debug, don't use it + */ + +/* Until such time as we get_info the per-stripe maximum from the OST, + * we define this to be 2T - 4k, which is the ext3 maxbytes. */ +#define LUSTRE_STRIPE_MAXBYTES 0x1fffffff000ULL #define CHECKSUM_BULK 0 @@ -507,8 +559,6 @@ static inline void ost_checksum(__u64 *cksum, void *addr, int len) *cksum = (*cksum << 2) + sum; } -#else -#define ost_checksum(cksum, addr, len) do {} while (0) #endif /* @@ -551,7 +601,7 @@ struct l_wait_info { long lwi_timeout; int (*lwi_on_timeout)(void *); long lwi_signals; - int (*lwi_on_signal)(void *); /* XXX return is ignored for now */ + void (*lwi_on_signal)(void *); void *lwi_cb_data; }; @@ -587,11 +637,11 @@ static inline sigset_t l_w_e_set_sigs(int sigs) sigset_t old; unsigned long irqflags; - spin_lock_irqsave(¤t->sigmask_lock, irqflags); + SIGNAL_MASK_LOCK(current, irqflags); old = current->blocked; siginitsetinv(¤t->blocked, sigs); - recalc_sigpending(current); - spin_unlock_irqrestore(¤t->sigmask_lock, irqflags); + RECALC_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, irqflags); return old; } @@ -639,10 +689,10 @@ do { \ } \ } \ \ - spin_lock_irqsave(¤t->sigmask_lock, irqflags); \ + SIGNAL_MASK_LOCK(current, irqflags); \ current->blocked = blocked; \ - recalc_sigpending(current); \ - spin_unlock_irqrestore(¤t->sigmask_lock, irqflags); \ + RECALC_SIGPENDING; \ + SIGNAL_MASK_UNLOCK(current, irqflags); \ \ current->state = TASK_RUNNING; \ remove_wait_queue(&wq, &__wait); \ @@ -656,6 +706,11 @@ do { \ __l_wait_event(wq, condition, __info, __ret); \ __ret; \ }) +#else +#define 
l_wait_event(wq, condition, info) \ +({ \ + 0; \ +}) #endif /* __KERNEL__ */ #endif /* _LUSTRE_LIB_H */ diff --git a/lustre/include/linux/lustre_lite.h b/lustre/include/linux/lustre_lite.h index 9657f24..35d4994 100644 --- a/lustre/include/linux/lustre_lite.h +++ b/lustre/include/linux/lustre_lite.h @@ -25,12 +25,17 @@ #include #include +#include +#include +#include + +/* careful, this is easy to screw up */ +#define PAGE_CACHE_MAXBYTES ((__u64)(~0UL) << PAGE_CACHE_SHIFT) extern kmem_cache_t *ll_file_data_slab; struct ll_file_data { - struct lustre_handle fd_mdshandle; - struct ptlrpc_request *fd_req; - char fd_ostdata[FD_OSTDATA_SIZE]; + struct obd_client_handle fd_mds_och; + struct obd_client_handle fd_ost_och; __u32 fd_flags; }; @@ -47,30 +52,34 @@ struct ll_dentry_data { #define ll_d2d(dentry) ((struct ll_dentry_data*) dentry->d_fsdata) -struct ll_read_inode2_cookie { - struct mds_body *lic_body; - struct lov_mds_md *lic_lmm; +struct ll_dirty_offsets { + rb_root_t do_root; + spinlock_t do_lock; + unsigned long do_num_dirty; }; +void ll_lldo_init(struct ll_dirty_offsets *lldo); +void ll_record_dirty(struct inode *inode, unsigned long offset); +void ll_remove_dirty(struct inode *inode, unsigned long start, + unsigned long end); +int ll_find_dirty(struct ll_dirty_offsets *lldo, unsigned long *start, + unsigned long *end); +int ll_farthest_dirty(struct ll_dirty_offsets *lldo, unsigned long *farthest); +extern struct file_operations ll_pgcache_seq_fops; + struct ll_inode_info { - struct lov_stripe_md *lli_smd; - char *lli_symlink_name; - struct semaphore lli_open_sem; - atomic_t lli_open_count; /* see ll_file_release */ - /* - * the VALID flag and valid_sem are temporary measures to serialize - * the manual getattrs that we're doing at lock acquisition. in - * the future the OST will always return its notion of the file - * size with the granted locks. 
- */ - unsigned long lli_flags; -#define LLI_F_DID_GETATTR 0 - struct semaphore lli_getattr_sem; - struct list_head lli_read_extents; - spinlock_t lli_read_extent_lock; + struct lov_stripe_md *lli_smd; + char *lli_symlink_name; + struct semaphore lli_open_sem; + struct list_head lli_read_extents; + loff_t lli_maxbytes; + spinlock_t lli_read_extent_lock; + struct ll_dirty_offsets lli_dirty; + unsigned long lli_flags; +#define LLI_F_HAVE_SIZE_LOCK 0 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) - struct inode lli_vfs_inode; + struct inode lli_vfs_inode; #endif }; @@ -89,6 +98,32 @@ struct ll_read_extent { int ll_check_dirty( struct super_block *sb ); int ll_batch_writepage( struct inode *inode, struct page *page ); +struct file_io_stats { + spinlock_t fis_lock; + __u64 fis_dirty_pages; + __u64 fis_dirty_hits; + __u64 fis_dirty_misses; + __u64 fis_forced_pages; + __u64 fis_writepage_pages; + __u64 fis_wb_ok; + __u64 fis_wb_fail; + __u64 fis_wb_from_writepage; + __u64 fis_wb_from_pressure; +}; + +#define IO_STAT_ADD(FIS, STAT, VAL) do { \ + struct file_io_stats *_fis_ = (FIS); \ + spin_lock(&_fis_->fis_lock); \ + _fis_->fis_##STAT += VAL; \ + spin_unlock(&_fis_->fis_lock); \ +} while (0) + +#define INODE_IO_STAT_ADD(INODE, STAT, VAL) \ + IO_STAT_ADD(&ll_i2sbi(INODE)->ll_iostats, STAT, VAL) + +#define PAGE_IO_STAT_ADD(PAGE, STAT, VAL) \ + INODE_IO_STAT_ADD((PAGE)->mapping, STAT, VAL) + /* interpet return codes from intent lookup */ #define LL_LOOKUP_POSITIVE 1 #define LL_LOOKUP_NEGATIVE 2 @@ -119,6 +154,8 @@ struct ll_sb_info { struct list_head ll_conn_chain; /* per-conn chain of SBs */ struct list_head ll_orphan_dentry_list; /*please don't ask -p*/ + + struct file_io_stats ll_iostats; }; static inline struct ll_sb_info *ll_s2sbi(struct super_block *sb) @@ -189,12 +226,7 @@ static inline struct lustre_handle *ll_i2obdconn(struct inode *inode) } static inline void ll_ino2fid(struct ll_fid *fid, obd_id ino, __u32 generation, - int type) -{ - fid->id = ino; - 
fid->generation = generation; - fid->f_type = type; -} + int type); static inline void ll_inode2fid(struct ll_fid *fid, struct inode *inode) { @@ -207,16 +239,28 @@ static inline int ll_mds_max_easize(struct super_block *sb) return sbi2mdc(ll_s2sbi(sb))->cl_max_mds_easize; } +static inline loff_t ll_file_maxbytes(struct inode *inode) +{ + return ll_i2info(inode)->lli_maxbytes; +} + /* namei.c */ int ll_lock(struct inode *dir, struct dentry *dentry, struct lookup_intent *it, struct lustre_handle *lockh); int ll_unlock(__u32 mode, struct lustre_handle *lockh); typedef int (*intent_finish_cb)(int flag, struct ptlrpc_request *, - struct dentry **, struct lookup_intent *, - int offset, obd_id ino); + struct inode *parent, struct dentry **, + struct lookup_intent *, int offset, obd_id ino); int ll_intent_lock(struct inode *parent, struct dentry **, struct lookup_intent *, intent_finish_cb); +int ll_mdc_blocking_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag); +void ll_mdc_lock_set_inode(struct lustre_handle *lock, struct inode *inode); +void ll_prepare_mdc_op_data(struct mdc_op_data *data, + struct inode *i1, struct inode *i2, + const char *name, int namelen, int mode); /* dcache.c */ void ll_intent_release(struct dentry *, struct lookup_intent *); @@ -260,6 +304,8 @@ do { \ up(&ll_d2d(de)->lld_it_sem); \ } while(0) +#define LL_IT2STR(it) ((it) ? 
ldlm_it2str((it)->it_op) : "0") + /* dcache.c */ int ll_have_md_lock(struct dentry *de); @@ -285,6 +331,9 @@ int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode, struct lustre_handle *lockh); int ll_create_objects(struct super_block *sb, obd_id id, uid_t uid, gid_t gid, struct lov_stripe_md **lsmp); +int ll_file_open(struct inode *inode, struct file *file); +int ll_file_release(struct inode *inode, struct file *file); + /* rw.c */ struct page *ll_getpage(struct inode *inode, unsigned long offset, @@ -292,7 +341,7 @@ struct page *ll_getpage(struct inode *inode, unsigned long offset, void ll_truncate(struct inode *inode); /* super.c */ -void ll_update_inode(struct inode *, struct mds_body *, struct lov_mds_md *); +void ll_update_inode(struct inode *, struct mds_body *, struct lov_stripe_md *); int ll_setattr_raw(struct inode *inode, struct iattr *attr); /* symlink.c */ @@ -303,8 +352,25 @@ extern struct inode_operations ll_symlink_inode_operations; void ll_sysctl_init(void); void ll_sysctl_clean(void); +#else +#include #endif /* __KERNEL__ */ +static inline void ll_ino2fid(struct ll_fid *fid, + obd_id ino, + __u32 generation, + int type) +{ + fid->id = ino; + fid->generation = generation; + fid->f_type = type; +} + +struct ll_read_inode2_cookie { + struct mds_body *lic_body; + struct lov_stripe_md *lic_lsm; +}; + #include #define LL_IOC_GETFLAGS _IOR ('f', 151, long) diff --git a/lustre/include/linux/lustre_mds.h b/lustre/include/linux/lustre_mds.h index c951637..683d78d 100644 --- a/lustre/include/linux/lustre_mds.h +++ b/lustre/include/linux/lustre_mds.h @@ -27,10 +27,14 @@ #define _LUSTRE_MDS_H #ifdef __KERNEL__ -#include +# include +# include #endif +#include #include #include +#include +#include struct ldlm_lock_desc; struct mds_obd; @@ -58,6 +62,36 @@ static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck) lck->rpcl_it = NULL; } +#ifdef __KERNEL__ +/* Compat code for kernel patch v18 users, can be removed when everyone has + * upgraded 
--phik 02 June 2003 */ +#ifdef IT_FL_LOCKED +static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + down(&lck->rpcl_sem); + if (it) { + lck->rpcl_it = it; + it->it_int_flags |= IT_FL_LOCKED; + } +} + +static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + if (it == NULL) { + LASSERT(it == lck->rpcl_it); + up(&lck->rpcl_sem); + return; + } + if (it != NULL && (it->it_int_flags & IT_FL_LOCKED)) { + it->it_int_flags &= ~IT_FL_LOCKED; + LASSERT(it == lck->rpcl_it); + lck->rpcl_it = NULL; + up(&lck->rpcl_sem); + } +} +#else static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, struct lookup_intent *it) { @@ -83,18 +117,24 @@ static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, up(&lck->rpcl_sem); } } -struct mdc_unlink_data { - struct inode *unl_dir; - struct inode *unl_de; - int unl_mode; - const char *unl_name; - int unl_len; +#endif +#endif + +struct mdc_op_data { + __u64 ino1; + __u32 gen1; + __u32 typ1; + __u32 gid1; + __u64 ino2; + __u32 gen2; + __u32 typ2; + __u32 gid2; + const char *name; + int namelen; + int mode; }; struct mds_update_record { - __u32 ur_fsuid; - __u32 ur_fsgid; - __u32 ur_cap; __u32 ur_opcode; struct ll_fid *ur_fid1; struct ll_fid *ur_fid2; @@ -102,17 +142,24 @@ struct mds_update_record { char *ur_name; int ur_tgtlen; char *ur_tgt; + int ur_eadatalen; + void *ur_eadata; struct iattr ur_iattr; + struct obd_ucred ur_uc; __u64 ur_rdev; __u32 ur_mode; __u32 ur_uid; __u32 ur_gid; __u64 ur_time; __u32 ur_flags; - __u32 ur_suppgid1; - __u32 ur_suppgid2; }; +#define ur_fsuid ur_uc.ouc_fsuid +#define ur_fsgid ur_uc.ouc_fsgid +#define ur_cap ur_uc.ouc_cap +#define ur_suppgid1 ur_uc.ouc_suppgid1 +#define ur_suppgid2 ur_uc.ouc_suppgid2 + #define MDS_LR_CLIENT 8192 #define MDS_LR_SIZE 128 @@ -141,21 +188,14 @@ struct mds_client_data { __u8 padding[MDS_LR_SIZE - 74]; }; -/* In-memory access to client data from MDS struct */ -struct mds_export_data { - struct 
list_head med_open_head; - spinlock_t med_open_lock; - struct mds_client_data *med_mcd; - int med_off; - struct ptlrpc_request *med_outstanding_reply; -}; - /* file data for open files on MDS */ struct mds_file_data { - struct list_head mfd_list; - __u64 mfd_servercookie; - __u64 mfd_xid; - struct file *mfd_file; + struct portals_handle mfd_handle; /* must be first */ + atomic_t mfd_refcount; + struct list_head mfd_list; + __u64 mfd_xid; + int mfd_mode; + struct dentry *mfd_dentry; }; /* mds/mds_reint.c */ @@ -166,41 +206,8 @@ int mds_reint_rec(struct mds_update_record *r, int offset, int mds_open(struct mds_update_record *rec, int offset, struct ptlrpc_request *req, struct lustre_handle *); -/* lib/mds_updates.c */ -void mds_unpack_body(struct mds_body *b); -void mds_unpack_fid(struct ll_fid *fid); -void mds_pack_fid(struct ll_fid *fid); -void mds_pack_req_body(struct ptlrpc_request *); -void mds_pack_rep_body(struct ptlrpc_request *); -int mds_update_unpack(struct ptlrpc_request *, int offset, - struct mds_update_record *); - -void mds_readdir_pack(struct ptlrpc_request *req, __u64 offset, obd_id ino, - int type, __u64 xid); -void mds_getattr_pack(struct ptlrpc_request *req, int valid, int offset, int fl, - struct inode *inode, const char *name, int namelen); -void mds_setattr_pack(struct ptlrpc_request *, struct inode *, - struct iattr *, void *ea, int ealen); -void mds_create_pack(struct ptlrpc_request *, int offset, struct inode *dir, - __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time, - const char *name, int namelen, const void *data, - int datalen); -void mds_open_pack(struct ptlrpc_request *, int offset, struct inode *dir, - __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time, - __u32 flags, const char *name, int namelen, - const void *data, int datalen); -void mds_unlink_pack(struct ptlrpc_request *, int offset, struct inode *inode, - struct inode *child, __u32 mode, const char *name, - int namelen); -void mds_link_pack(struct ptlrpc_request 
*, int offset, struct inode *ino, - struct inode *dir, const char *name, int namelen); -void mds_rename_pack(struct ptlrpc_request *, int offset, struct inode *srcdir, - struct inode *tgtdir, const char *name, int namelen, - const char *tgt, int tgtlen); -void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode); -void mds_pack_inode2body(struct mds_body *body, struct inode *inode); - /* mds/handler.c */ +#ifdef __KERNEL__ struct dentry *mds_name2locked_dentry(struct obd_device *, struct dentry *dir, struct vfsmount **mnt, char *name, int namelen, int lock_mode, @@ -214,64 +221,60 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid, int mds_reint(struct ptlrpc_request *req, int offset, struct lustre_handle *); int mds_pack_md(struct obd_device *mds, struct lustre_msg *msg, int offset, struct mds_body *body, struct inode *inode); -void mds_steal_ack_locks(struct mds_export_data *med, +void mds_steal_ack_locks(struct obd_export *exp, struct ptlrpc_request *req); /* mds/mds_fs.c */ int mds_fs_setup(struct obd_device *obddev, struct vfsmount *mnt); -int mds_fs_cleanup(struct obd_device *obddev); +int mds_fs_cleanup(struct obd_device *obddev, int failover); +#endif /* mdc/mdc_request.c */ int mdc_enqueue(struct lustre_handle *conn, int lock_type, - struct lookup_intent *it, int lock_mode, struct inode *dir, - struct dentry *de, struct lustre_handle *lockh, char *tgt, - int tgtlen, void *data, int datalen); -int mdc_cancel_unused(struct lustre_handle *conn, struct inode *, int flags); + struct lookup_intent *it, int lock_mode, + struct mdc_op_data *enq_data, + struct lustre_handle *lockh, char *tgt, int tgtlen, + ldlm_completion_callback cb_completion, + ldlm_blocking_callback cb_blocking, + void *cb_data); int mdc_getlovinfo(struct obd_device *obd, struct lustre_handle *mdc_connh, struct ptlrpc_request **request); int mdc_getstatus(struct lustre_handle *conn, struct ll_fid *rootfid); -int mdc_getattr(struct lustre_handle *conn, - obd_id ino, 
int type, unsigned long valid, unsigned int ea_size, +int mdc_getattr(struct lustre_handle *conn, struct ll_fid *fid, + unsigned long valid, unsigned int ea_size, struct ptlrpc_request **request); -int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent, +int mdc_getattr_name(struct lustre_handle *conn, struct ll_fid *fid, char *filename, int namelen, unsigned long valid, unsigned int ea_size, struct ptlrpc_request **request); int mdc_setattr(struct lustre_handle *conn, - struct inode *, struct iattr *iattr, - void *ea, int ealen, struct ptlrpc_request **); + struct mdc_op_data *data, + struct iattr *iattr, void *ea, int ealen, + struct ptlrpc_request **request); int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags, struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh, struct ptlrpc_request **); -void mdc_set_open_replay_data(struct ll_file_data *fd); +struct obd_client_handle; +void mdc_set_open_replay_data(struct obd_client_handle *och); int mdc_close(struct lustre_handle *conn, obd_id ino, int type, struct lustre_handle *fh, struct ptlrpc_request **req); -int mdc_readpage(struct lustre_handle *conn, obd_id ino, - int type, __u64 offset, char *addr, struct ptlrpc_request **); -int mdc_create(struct lustre_handle *conn, - struct inode *dir, const char *name, int namelen, +int mdc_readpage(struct lustre_handle *conn, obd_id ino, int type, __u64 offset, + struct page *, struct ptlrpc_request **); +int mdc_create(struct lustre_handle *conn, struct mdc_op_data *op_data, const void *data, int datalen, int mode, __u32 uid, __u32 gid, - __u64 time, __u64 rdev, struct ptlrpc_request **); -int mdc_unlink(struct lustre_handle *, struct inode *dir, struct inode *child, - __u32 mode, const char *name, int namelen, - struct ptlrpc_request **); -int mdc_link(struct lustre_handle *conn, - struct inode *src, struct inode *dir, const char *name, - int namelen, struct ptlrpc_request **); -int mdc_rename(struct lustre_handle *conn, - 
struct inode *src, struct inode *tgt, const char *old, - int oldlen, const char *new, int newlen, - struct ptlrpc_request **); + __u64 time, __u64 rdev, struct ptlrpc_request **request); +int mdc_unlink(struct lustre_handle *conn, struct mdc_op_data *data, + struct ptlrpc_request **request); +int mdc_link(struct lustre_handle *conn, struct mdc_op_data *data, + struct ptlrpc_request **); +int mdc_rename(struct lustre_handle *conn, struct mdc_op_data *data, + const char *old, int oldlen, const char *new, int newlen, + struct ptlrpc_request **request); int mdc_create_client(struct obd_uuid uuid, struct ptlrpc_client *cl); -void mdc_lock_set_inode(struct lustre_handle *lock, struct inode *inode); /* Store the generation of a newly-created inode in |req| for replay. */ void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff, int repoff); -int mds_client_add(struct mds_obd *mds, struct mds_export_data *med, - int cl_off); -int mds_client_free(struct obd_export *exp); - /* ioctls for trying requests */ #define IOC_REQUEST_TYPE 'f' diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index 6966424..ed5db88 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -1,7 +1,7 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. * * This file is part of Lustre, http://www.lustre.org. * @@ -30,7 +30,7 @@ #else #include #endif -#endif +#endif #include // #include @@ -38,6 +38,7 @@ #include #include #include +#include /* The following constants determine how much memory is devoted to * buffering in the lustre services. 
@@ -62,7 +63,8 @@ #define LDLM_BUFSIZE (8 * 1024) #define LDLM_MAXREQSIZE 1024 -#define MDT_NUM_THREADS 8 +#define MDT_MAX_THREADS 32UL +#define MDT_NUM_THREADS min(num_physpages / 8192, MDT_MAX_THREADS) #define MDS_NEVENT_MAX 8192UL #define MDS_NEVENTS min(num_physpages / 64, MDS_NEVENT_MAX) #define MDS_NBUF_MAX 512UL @@ -84,16 +86,17 @@ */ #define MDS_MAXREQSIZE (5 * 1024) -#define OST_NUM_THREADS 6 +#define OST_MAX_THREADS 36UL +#define OST_NUM_THREADS min(num_physpages / 8192, OST_MAX_THREADS) #define OST_NEVENT_MAX 32768UL #define OST_NEVENTS min(num_physpages / 16, OST_NEVENT_MAX) #define OST_NBUF_MAX 1280UL #define OST_NBUFS min(OST_NEVENTS / 64, OST_NBUF_MAX) #define OST_BUFSIZE (8 * 1024) -/* OST_MAXREQSIZE ~= 1896 bytes = +/* OST_MAXREQSIZE ~= 1640 bytes = * lustre_msg + obdo + 16 * obd_ioobj + 64 * niobuf_remote * - * single object with 16 pages is 576 bytes + * single object with 16 pages is 512 bytes */ #define OST_MAXREQSIZE (2 * 1024) @@ -120,19 +123,13 @@ struct ptlrpc_connection { __u32 c_epoch; /* changes when peer changes */ __u32 c_bootcount; /* peer's boot count */ - spinlock_t c_lock; /* also protects req->rq_list */ + spinlock_t c_lock; atomic_t c_refcount; __u64 c_token; __u64 c_remote_conn; __u64 c_remote_token; - struct list_head c_delayed_head;/* delayed until post-recovery XXX imp? */ - struct recovd_data c_recovd_data; - - struct list_head c_imports; - struct list_head c_exports; - struct list_head c_sb_chain; __u32 c_flags; // can we indicate INVALID elsewhere? }; @@ -147,28 +144,76 @@ struct ptlrpc_client { }; /* state flags of requests */ +/* XXX only ones left are those used by the bulk descs as well! 
*/ #define PTL_RPC_FL_INTR (1 << 0) /* reply wait was interrupted by user */ -#define PTL_RPC_FL_REPLIED (1 << 1) /* reply was received */ -#define PTL_RPC_FL_SENT (1 << 2) /* request was sent */ -#define PTL_RPC_FL_WANT_ACK (1 << 3) /* reply is awaiting an ACK */ -#define PTL_BULK_FL_SENT (1 << 4) /* outgoing bulk was sent */ -#define PTL_BULK_FL_RCVD (1 << 5) /* incoming bulk was recieved */ -#define PTL_RPC_FL_ERR (1 << 6) /* request failed due to RPC error */ #define PTL_RPC_FL_TIMEOUT (1 << 7) /* request timed out waiting for reply */ -#define PTL_RPC_FL_RESEND (1 << 8) /* retransmit the request */ -#define PTL_RPC_FL_RESTART (1 << 9) /* operation must be restarted */ -#define PTL_RPC_FL_RETAIN (1 << 10) /* retain for replay after reply */ -#define PTL_RPC_FL_REPLAY (1 << 11) /* replay upon recovery */ -#define PTL_RPC_FL_ALLOCREP (1 << 12) /* reply buffer allocated */ -#define PTL_RPC_FL_NO_RESEND (1 << 13) /* don't automatically resend this req */ -#define PTL_RPC_FL_RESENT (1 << 14) /* server rcvd resend of this req */ + +#define REQ_MAX_ACK_LOCKS 4 + +#define SWAB_PARANOIA 1 +#if SWAB_PARANOIA +/* unpacking: assert idx not unpacked already */ +#define LASSERT_REQSWAB(rq, idx) \ +do { \ + LASSERT ((idx) < sizeof ((rq)->rq_req_swab_mask) * 8); \ + LASSERT (((rq)->rq_req_swab_mask & (1 << (idx))) == 0); \ + (rq)->rq_req_swab_mask |= (1 << (idx)); \ +} while (0) + +#define LASSERT_REPSWAB(rq, idx) \ +do { \ + LASSERT ((idx) < sizeof ((rq)->rq_rep_swab_mask) * 8); \ + LASSERT (((rq)->rq_rep_swab_mask & (1 << (idx))) == 0); \ + (rq)->rq_rep_swab_mask |= (1 << (idx)); \ +} while (0) + +/* just looking: assert idx already unpacked */ +#define LASSERT_REQSWABBED(rq, idx) \ +LASSERT ((idx) < sizeof ((rq)->rq_req_swab_mask) * 8 && \ + ((rq)->rq_req_swab_mask & (1 << (idx))) != 0) + +#define LASSERT_REPSWABBED(rq, idx) \ +LASSERT ((idx) < sizeof ((rq)->rq_rep_swab_mask) * 8 && \ + ((rq)->rq_rep_swab_mask & (1 << (idx))) != 0) +#else +#define LASSERT_REQSWAB(rq, idx) 
+#define LASSERT_REPSWAB(rq, idx) +#define LASSERT_REQSWABBED(rq, idx) +#define LASSERT_REPSWABBED(rq, idx) +#endif + +union ptlrpc_async_args { + /* Scratchpad for passing args to completion interpreter. Users + * cast to the struct of their choosing, and LASSERT that this is + * big enough. For _tons_ of context, OBD_ALLOC a struct and store + * a pointer to it here. The pointer_arg ensures this struct is at + * least big enough for that. */ + void *pointer_arg[4]; + __u64 space[4]; +}; + +struct ptlrpc_request_set { + int set_remaining; /* # uncompleted requests */ + wait_queue_head_t set_waitq; + struct list_head set_requests; + void *set_interpret; /* completion callback */ + union ptlrpc_async_args set_args; /* completion context */ +}; + +struct ptlrpc_bulk_desc; struct ptlrpc_request { int rq_type; /* one of PTL_RPC_MSG_* */ struct list_head rq_list; struct obd_device *rq_obd; int rq_status; - int rq_flags; + spinlock_t rq_lock; + unsigned int rq_intr:1, rq_replied:1, rq_want_ack:1, rq_err:1, + rq_timedout:1, rq_resend:1, rq_restart:1, rq_replay:1, + rq_no_resend:1, rq_resent:1, rq_no_recov:1, rq_waiting:1, + rq_receiving_reply:1; + int rq_phase; + atomic_t rq_refcount; int rq_request_portal; /* XXX FIXME bug 249 */ @@ -183,12 +228,18 @@ struct ptlrpc_request { __u64 rq_transno; __u64 rq_xid; +#if SWAB_PARANOIA + __u32 rq_req_swab_mask; + __u32 rq_rep_swab_mask; +#endif + + int rq_import_generation; int rq_level; wait_queue_head_t rq_wait_for_rep; /* XXX also _for_ack */ /* incoming reply */ ptl_md_t rq_reply_md; - ptl_handle_me_t rq_reply_me_h; + ptl_handle_md_t rq_reply_md_h; /* outgoing req/rep */ ptl_md_t rq_req_md; @@ -202,26 +253,60 @@ struct ptlrpc_request { void (*rq_replay_cb)(struct ptlrpc_request *); void *rq_replay_data; + struct ptlrpc_bulk_desc *rq_bulk; /* client side bulk */ + time_t rq_sent; /* when the request was sent */ + + /* Multi-rpc bits */ + struct list_head rq_set_chain; + struct ptlrpc_request_set *rq_set; + void 
*rq_interpret_reply; /* Async completion handler */ + union ptlrpc_async_args rq_async_args; /* Async completion context */ + /* Only used on the server side for tracking acks. */ struct ptlrpc_req_ack_lock { struct lustre_handle lock; __u32 mode; - } rq_ack_locks[4]; + } rq_ack_locks[REQ_MAX_ACK_LOCKS]; }; +#define RQ_PHASE_NEW 0xebc0de00 +#define RQ_PHASE_RPC 0xebc0de01 +#define RQ_PHASE_BULK 0xebc0de02 +#define RQ_PHASE_INTERPRET 0xebc0de03 +#define RQ_PHASE_COMPLETE 0xebc0de04 + +/* Spare the preprocessor, spoil the bugs. */ +#define FLAG(field, str) (field ? str : "") + +#define DEBUG_REQ_FLAGS(req) \ + ((req->rq_phase == RQ_PHASE_NEW) ? "New" : \ + (req->rq_phase == RQ_PHASE_RPC) ? "Rpc" : \ + (req->rq_phase == RQ_PHASE_INTERPRET) ? "Interpret" : \ + (req->rq_phase == RQ_PHASE_COMPLETE) ? "Complete" : "?phase?"), \ + FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \ + FLAG(req->rq_want_ack, "A"), FLAG(req->rq_err, "E"), \ + FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \ + FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \ + FLAG(req->rq_no_resend, "N"), FLAG(req->rq_resent, "s"), \ + FLAG(req->rq_no_recov, "n"), FLAG(req->rq_waiting, "W") + +#define REQ_FLAGS_FMT "%s%s%s%s%s%s%s%s%s%s%s%s%s" + #define DEBUG_REQ(level, req, fmt, args...) \ do { \ -CDEBUG(level, \ - "@@@ " fmt " req@%p x"LPD64"/t"LPD64" o%d->%s:%d lens %d/%d ref %d fl " \ - "%x/%x/%x rc %x\n" , ## args, req, req->rq_xid, \ +CDEBUG(level, "@@@ " fmt \ + " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d lens %d/%d ref %d fl " \ + REQ_FLAGS_FMT"/%x/%x rc %x\n" , ## args, req, req->rq_xid, \ req->rq_reqmsg ? req->rq_reqmsg->transno : -1, \ req->rq_reqmsg ? req->rq_reqmsg->opc : -1, \ + req->rq_import ? (char *)req->rq_import->imp_target_uuid.uuid : "", \ req->rq_connection ? \ (char *)req->rq_connection->c_remote_uuid.uuid : "", \ (req->rq_import && req->rq_import->imp_client) ? 
\ req->rq_import->imp_client->cli_request_portal : -1, \ req->rq_reqlen, req->rq_replen, \ - atomic_read (&req->rq_refcount), req->rq_flags, \ + atomic_read(&req->rq_refcount), \ + DEBUG_REQ_FLAGS(req), \ req->rq_reqmsg ? req->rq_reqmsg->flags : 0, \ req->rq_repmsg ? req->rq_repmsg->flags : 0, \ req->rq_status); \ @@ -230,45 +315,43 @@ CDEBUG(level, \ struct ptlrpc_bulk_page { struct ptlrpc_bulk_desc *bp_desc; struct list_head bp_link; - void *bp_buf; int bp_buflen; + int bp_pageoffset; /* offset within a page */ struct page *bp_page; - __u32 bp_xid; - __u32 bp_flags; - struct dentry *bp_dentry; - int (*bp_cb)(struct ptlrpc_bulk_page *); }; +#define BULK_GET_SOURCE 0 +#define BULK_PUT_SINK 1 +#define BULK_GET_SINK 2 +#define BULK_PUT_SOURCE 3 struct ptlrpc_bulk_desc { - struct list_head bd_set_chain; /* entry in obd_brw_set */ - struct obd_brw_set *bd_brw_set; - int bd_flags; - struct ptlrpc_connection *bd_connection; - struct ptlrpc_client *bd_client; + unsigned int bd_complete:1; + unsigned int bd_network_rw:1; /* accessible to the network */ + unsigned int bd_type:2; /* {put,get}{source,sink} */ + unsigned int bd_registered:1; /* client side */ + spinlock_t bd_lock; /* serialise with callback */ + int bd_import_generation; + struct obd_export *bd_export; + struct obd_import *bd_import; __u32 bd_portal; - struct lustre_handle bd_conn; - void (*bd_ptl_ev_hdlr)(struct ptlrpc_bulk_desc *); - - wait_queue_head_t bd_waitq; + struct ptlrpc_request *bd_req; /* associated request */ + wait_queue_head_t bd_waitq; /* server side only WQ */ struct list_head bd_page_list; __u32 bd_page_count; - atomic_t bd_refcount; - void *bd_desc_private; - -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) - struct work_struct bd_queue; -#else - struct tq_struct bd_queue; -#endif - + __u32 bd_last_xid; + ptl_md_t bd_md; ptl_handle_md_t bd_md_h; ptl_handle_me_t bd_me_h; - atomic_t bd_source_callback_count; + int bd_callback_count; /* server side callbacks */ +#ifdef __KERNEL__ + ptl_kiov_t 
bd_iov[16]; /* self-sized pre-allocated iov */ +#else struct iovec bd_iov[16]; /* self-sized pre-allocated iov */ +#endif }; struct ptlrpc_thread { @@ -289,6 +372,7 @@ struct ptlrpc_request_buffer_desc { struct ptlrpc_ni { /* Generic interface state */ char *pni_name; + int pni_number; ptl_handle_ni_t pni_ni_h; ptl_handle_eq_t pni_request_out_eq_h; ptl_handle_eq_t pni_reply_in_eq_h; @@ -328,29 +412,23 @@ struct ptlrpc_service { struct list_head srv_threads; int (*srv_handler)(struct ptlrpc_request *req); char *srv_name; /* only statically allocated strings here; we don't clean them */ + struct proc_dir_entry *svc_procroot; + struct lprocfs_counters *svc_counters; int srv_interface_rover; struct ptlrpc_srv_ni srv_interfaces[0]; }; -static inline void ptlrpc_hdl2req(struct ptlrpc_request *req, - struct lustre_handle *h) -{ - req->rq_reqmsg->addr = h->addr; - req->rq_reqmsg->cookie = h->cookie; -} - -typedef void (*bulk_callback_t)(struct ptlrpc_bulk_desc *, void *); - typedef int (*svc_handler_t)(struct ptlrpc_request *req); -/* rpc/events.c */ +/* ptlrpc/events.c */ extern struct ptlrpc_ni ptlrpc_interfaces[]; extern int ptlrpc_ninterfaces; -extern int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer); +extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, struct ptlrpc_peer *peer); -/* rpc/connection.c */ -void ptlrpc_readdress_connection(struct ptlrpc_connection *, struct obd_uuid *uuid); +/* ptlrpc/connection.c */ +void ptlrpc_dump_connections(void); +void ptlrpc_readdress_connection(struct ptlrpc_connection *, struct obd_uuid *); struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer, struct obd_uuid *uuid); int ptlrpc_put_connection(struct ptlrpc_connection *c); @@ -358,58 +436,74 @@ struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *); void ptlrpc_init_connection(void); void ptlrpc_cleanup_connection(void); -/* rpc/niobuf.c */ -int ptlrpc_check_bulk_sent(struct ptlrpc_bulk_desc *bulk); -int 
ptlrpc_check_bulk_received(struct ptlrpc_bulk_desc *bulk); +/* ptlrpc/niobuf.c */ int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *); int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *); -int ptlrpc_register_bulk_put(struct ptlrpc_bulk_desc *); -int ptlrpc_register_bulk_get(struct ptlrpc_bulk_desc *); -int ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *bulk); -struct obd_brw_set *obd_brw_set_new(void); -void obd_brw_set_add(struct obd_brw_set *, struct ptlrpc_bulk_desc *); -void obd_brw_set_del(struct ptlrpc_bulk_desc *); -void obd_brw_set_decref(struct obd_brw_set *set); -void obd_brw_set_addref(struct obd_brw_set *set); - -int ptlrpc_reply(struct ptlrpc_service *svc, struct ptlrpc_request *req); -int ptlrpc_error(struct ptlrpc_service *svc, struct ptlrpc_request *req); +void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *bulk); +int ptlrpc_register_bulk(struct ptlrpc_request *req); +void ptlrpc_unregister_bulk (struct ptlrpc_request *req); + +static inline int ptlrpc_bulk_complete (struct ptlrpc_bulk_desc *desc) +{ + unsigned long flags; + int rc; + + spin_lock_irqsave (&desc->bd_lock, flags); + rc = desc->bd_complete; + spin_unlock_irqrestore (&desc->bd_lock, flags); + return (rc); +} + +int ptlrpc_reply(struct ptlrpc_request *req); +int ptlrpc_error(struct ptlrpc_request *req); void ptlrpc_resend_req(struct ptlrpc_request *request); int ptl_send_rpc(struct ptlrpc_request *request); void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd); -/* rpc/client.c */ +/* ptlrpc/client.c */ void ptlrpc_init_client(int req_portal, int rep_portal, char *name, struct ptlrpc_client *); void ptlrpc_cleanup_client(struct obd_import *imp); struct obd_uuid *ptlrpc_req_to_uuid(struct ptlrpc_request *req); struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid); -int ll_brw_sync_wait(struct obd_brw_set *, int phase); - int ptlrpc_queue_wait(struct ptlrpc_request *req); -void ptlrpc_continue_req(struct ptlrpc_request *req); int ptlrpc_replay_req(struct ptlrpc_request 
*req); -int ptlrpc_abort(struct ptlrpc_request *req); +void ptlrpc_unregister_reply(struct ptlrpc_request *req); void ptlrpc_restart_req(struct ptlrpc_request *req); -void ptlrpc_abort_inflight(struct obd_import *imp, int dying_import); +void ptlrpc_abort_inflight(struct obd_import *imp); + +struct ptlrpc_request_set *ptlrpc_prep_set(void); +int ptlrpc_set_wait(struct ptlrpc_request_set *); +void ptlrpc_set_destroy(struct ptlrpc_request_set *); +void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, int count, int *lengths, char **bufs); void ptlrpc_free_req(struct ptlrpc_request *request); void ptlrpc_req_finished(struct ptlrpc_request *request); struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); -struct ptlrpc_bulk_desc *ptlrpc_prep_bulk(struct ptlrpc_connection *); +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req, + int type, int portal); +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp(struct ptlrpc_request *req, + int type, int portal); void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk); -struct ptlrpc_bulk_page *ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc); +int ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len); void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *page); void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, struct obd_import *imp); +__u64 ptlrpc_next_xid(void); -/* rpc/service.c */ +/* ptlrpc/ptlrpc_module.c */ +void ptlrpc_put_ldlm_hooks(void); +int ptlrpc_ldlm_hooks_referenced(void); + +/* ptlrpc/service.c */ struct ptlrpc_service * ptlrpc_init_svc(__u32 nevents, __u32 nbufs, __u32 bufsize, __u32 max_req_size, - int req_portal, int rep_portal, svc_handler_t, char *name); + int req_portal, int rep_portal, svc_handler_t, char *name, + struct obd_device *dev); void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); int 
ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, char *name); @@ -422,31 +516,21 @@ struct ptlrpc_svc_data { struct obd_device *dev; }; -/* rpc/pack_generic.c */ +/* ptlrpc/pack_generic.c */ int lustre_pack_msg(int count, int *lens, char **bufs, int *len, struct lustre_msg **msg); int lustre_msg_size(int count, int *lengths); int lustre_unpack_msg(struct lustre_msg *m, int len); -void *lustre_msg_buf(struct lustre_msg *m, int n); - -/* rpc/rpc.c */ -__u32 ptlrpc_next_xid(void); - -static inline void ptlrpc_bulk_decref(struct ptlrpc_bulk_desc *desc) -{ - CDEBUG(D_PAGE, "%p -> %d\n", desc, atomic_read(&desc->bd_refcount) - 1); - - if (atomic_dec_and_test(&desc->bd_refcount)) { - CDEBUG(D_PAGE, "Released last ref on %p, freeing\n", desc); - ptlrpc_free_bulk(desc); - } -} - -static inline void ptlrpc_bulk_addref(struct ptlrpc_bulk_desc *desc) -{ - atomic_inc(&desc->bd_refcount); - CDEBUG(D_PAGE, "Set refcount of %p to %d\n", desc, - atomic_read(&desc->bd_refcount)); -} +void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen); +char *lustre_msg_string (struct lustre_msg *m, int n, int max_len); +void *lustre_swab_reqbuf (struct ptlrpc_request *req, int n, int minlen, + void *swabber); +void *lustre_swab_repbuf (struct ptlrpc_request *req, int n, int minlen, + void *swabber); + +/* ldlm/ldlm_lib.c */ +int client_import_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid); +int client_import_disconnect(struct lustre_handle *conn, int failover); #endif diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index f3163fe..fe53974 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -17,7 +17,11 @@ struct lov_oinfo { /* per-child structure */ }; struct lov_stripe_md { + /* Public members. */ __u64 lsm_object_id; /* lov object id */ + __u64 lsm_maxbytes; + + /* LOV-private members start here -- only for use in lov/. 
*/ __u32 lsm_magic; __u32 lsm_stripe_size; /* size of the stripe */ unsigned lsm_stripe_offset; /* offset of first stripe in lmd_objects */ @@ -28,6 +32,7 @@ struct lov_stripe_md { #define IOC_OSC_TYPE 'h' #define IOC_OSC_MIN_NR 20 #define IOC_OSC_REGISTER_LOV _IOWR(IOC_OSC_TYPE, 20, struct obd_device *) +#define IOC_OSC_SET_ACTIVE _IOWR(IOC_OSC_TYPE, 21, struct obd_device *) #define IOC_OSC_MAX_NR 50 #define IOC_MDC_TYPE 'i' @@ -66,48 +71,8 @@ struct brw_page { /* Individual type definitions */ -struct ext2_obd { - struct super_block *e2_sb; - struct vfsmount *e2_vfsmnt; -}; - -struct obd_ucred { - __u32 ouc_fsuid; - __u32 ouc_fsgid; - __u32 ouc_cap; - __u32 ouc_suppgid1; - __u32 ouc_suppgid2; -}; - -#define OBD_RUN_CTXT_MAGIC 0xC0FFEEAA -#define OBD_CTXT_DEBUG /* development-only debugging */ -struct obd_run_ctxt { - struct vfsmount *pwdmnt; - struct dentry *pwd; - mm_segment_t fs; - __u32 fsuid; - __u32 fsgid; - __u32 cap; -#ifdef OBD_CTXT_DEBUG - __u32 magic; -#endif -}; - - -#ifdef OBD_CTXT_DEBUG -#define OBD_SET_CTXT_MAGIC(ctxt) (ctxt)->magic = OBD_RUN_CTXT_MAGIC -#else -#define OBD_SET_CTXT_MAGIC(ctxt) do {} while(0) -#endif - struct ost_server_data; -#define FILTER_TRANSNO_SEM - -#ifndef OST_RECOVERY -#undef FILTER_TRANSNO_SEM -#endif - struct filter_obd { char *fo_fstype; struct super_block *fo_sb; @@ -117,11 +82,7 @@ struct filter_obd { struct dentry *fo_dentry_O_mode[16]; struct dentry **fo_dentry_O_sub; spinlock_t fo_objidlock; /* protects fo_lastobjid increment */ -#ifdef FILTER_TRANSNO_SEM - struct semaphore fo_transno_sem; -#else spinlock_t fo_translock; /* protects fsd_last_rcvd increment */ -#endif struct file *fo_rcvd_filp; struct filter_server_data *fo_fsd; unsigned long *fo_last_rcvd_slots; @@ -137,10 +98,9 @@ struct filter_obd { struct mds_server_data; struct client_obd { - struct obd_import cl_import; + struct obd_import *cl_import; struct semaphore cl_sem; int cl_conn_count; - struct obd_uuid cl_target_uuid; /* XXX -> lustre_name */ /* 
max_mds_easize is purely a performance thing so we don't have to * call obd_size_wiremd() all the time. */ int cl_max_mds_easize; @@ -155,6 +115,7 @@ struct mds_obd { struct super_block *mds_sb; struct vfsmount *mds_vfsmnt; + struct dentry *mds_fid_de; struct obd_run_ctxt mds_ctxt; struct file_operations *mds_fop; struct inode_operations *mds_iop; @@ -170,6 +131,7 @@ struct mds_obd { int mds_has_lov_desc; struct lov_desc mds_lov_desc; + unsigned long *mds_client_bitmap; }; struct ldlm_obd { @@ -202,8 +164,10 @@ struct ptlbd_obd { struct ptlrpc_service *ptlbd_service; struct file *filp; /* client's */ - struct ptlrpc_client bd_client; - struct obd_import bd_import; + struct ptlrpc_client bd_client; + struct obd_import *bd_import; + struct obd_uuid bd_server_uuid; + struct lustre_handle bd_connect_handle; int refcount; /* XXX sigh */ }; @@ -219,18 +183,6 @@ struct recovd_obd { __u32 recovd_state; }; -struct trace_obd { - struct obdtrace_opstats *stats; -}; - -#if 0 -struct snap_obd { - unsigned int snap_index; /* which snapshot index are we accessing */ - int snap_tableno; -}; - -#endif - struct ost_obd { struct ptlrpc_service *ost_service; }; @@ -245,8 +197,8 @@ struct echo_client_obd { }; struct cache_obd { - struct lustre_handle cobd_target; /* local connection to target obd */ - struct lustre_handle cobd_cache; /* local connection to cache obd */ + struct lustre_handle cobd_target; /* local connection to target obd */ + struct lustre_handle cobd_cache; /* local connection to cache obd */ }; struct lov_tgt_desc { @@ -267,11 +219,9 @@ struct lov_obd { struct niobuf_local { __u64 offset; __u32 len; - __u32 xid; __u32 flags; - void *addr; + __u32 rc; struct page *page; - void *target_private; struct dentry *dentry; }; @@ -280,6 +230,11 @@ struct niobuf_local { struct obd_trans_info { __u64 oti_transno; + /* Only used on the server side for tracking acks. 
*/ + struct oti_req_ack_lock { + struct lustre_handle lock; + __u32 mode; + } oti_ack_locks[4]; }; /* corresponds to one of the obd's */ @@ -291,7 +246,11 @@ struct obd_device { struct obd_uuid obd_uuid; int obd_minor; - int obd_flags; + int obd_attached:1, obd_set_up:1, obd_recovering:1, + obd_abort_recovery:1, obd_replayable:1, obd_no_transno:1, + obd_no_recov:1, obd_stopping:1; + atomic_t obd_refcount; + wait_queue_head_t obd_refcount_waitq; struct proc_dir_entry *obd_proc_entry; struct list_head obd_exports; struct list_head obd_imports; @@ -309,12 +268,12 @@ struct obd_device { pid_t obd_processing_task; __u64 obd_next_recovery_transno; wait_queue_head_t obd_next_transno_waitq; + wait_queue_head_t obd_commit_waitq; struct timer_list obd_recovery_timer; struct list_head obd_recovery_queue; struct list_head obd_delayed_reply_queue; union { - struct ext2_obd ext2; struct filter_obd filter; struct mds_obd mds; struct client_obd cli; @@ -323,16 +282,12 @@ struct obd_device { struct ldlm_obd ldlm; struct echo_obd echo; struct recovd_obd recovd; - struct trace_obd trace; struct lov_obd lov; struct cache_obd cobd; struct ptlbd_obd ptlbd; -#if 0 - struct snap_obd snap; -#endif } u; /* Fields used by LProcFS */ - unsigned int cntr_mem_size; + unsigned int cntr_base; void *counters; }; @@ -340,27 +295,25 @@ struct obd_ops { struct module *o_owner; int (*o_iocontrol)(unsigned int cmd, struct lustre_handle *, int len, void *karg, void *uarg); - int (*o_get_info)(struct lustre_handle *, obd_count keylen, void *key, - obd_count *vallen, void **val); - int (*o_set_info)(struct lustre_handle *, obd_count keylen, void *key, - obd_count vallen, void *val); + int (*o_get_info)(struct lustre_handle *, __u32 keylen, void *key, + __u32 *vallen, void *val); + int (*o_set_info)(struct lustre_handle *, __u32 keylen, void *key, + __u32 vallen, void *val); int (*o_attach)(struct obd_device *dev, obd_count len, void *data); int (*o_detach)(struct obd_device *dev); int (*o_setup) (struct 
obd_device *dev, obd_count len, void *data); - int (*o_cleanup)(struct obd_device *dev); + int (*o_cleanup)(struct obd_device *dev, int force, int failover); int (*o_connect)(struct lustre_handle *conn, struct obd_device *src, - struct obd_uuid *cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover); - int (*o_disconnect)(struct lustre_handle *conn); - + struct obd_uuid *cluuid); + int (*o_disconnect)(struct lustre_handle *conn, int failover); int (*o_statfs)(struct lustre_handle *conn, struct obd_statfs *osfs); - int (*o_syncfs)(struct lustre_handle *conn); - int (*o_packmd)(struct lustre_handle *, struct lov_mds_md **wire_tgt, + int (*o_syncfs)(struct obd_export *); + int (*o_packmd)(struct lustre_handle *, struct lov_mds_md **disk_tgt, struct lov_stripe_md *mem_src); int (*o_unpackmd)(struct lustre_handle *, struct lov_stripe_md **mem_tgt, - struct lov_mds_md *wire_src); + struct lov_mds_md *disk_src, int disk_len); int (*o_preallocate)(struct lustre_handle *, obd_count *req, obd_id *ids); int (*o_create)(struct lustre_handle *conn, struct obdo *oa, @@ -371,14 +324,21 @@ struct obd_ops { struct lov_stripe_md *ea, struct obd_trans_info *oti); int (*o_getattr)(struct lustre_handle *conn, struct obdo *oa, struct lov_stripe_md *ea); + int (*o_getattr_async)(struct lustre_handle *conn, struct obdo *oa, + struct lov_stripe_md *ea, + struct ptlrpc_request_set *set); int (*o_open)(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *ea, struct obd_trans_info *oti); + struct lov_stripe_md *ea, struct obd_trans_info *oti, + struct obd_client_handle *och); int (*o_close)(struct lustre_handle *conn, struct obdo *oa, struct lov_stripe_md *ea, struct obd_trans_info *oti); int (*o_brw)(int rw, struct lustre_handle *conn, struct lov_stripe_md *ea, obd_count oa_bufs, - struct brw_page *pgarr, struct obd_brw_set *, - struct obd_trans_info *oti); + struct brw_page *pgarr, struct obd_trans_info *oti); + int (*o_brw_async)(int rw, struct lustre_handle 
*conn, + struct lov_stripe_md *ea, obd_count oa_bufs, + struct brw_page *pgarr, struct ptlrpc_request_set *, + struct obd_trans_info *oti); int (*o_punch)(struct lustre_handle *conn, struct obdo *tgt, struct lov_stripe_md *ea, obd_size count, obd_off offset, struct obd_trans_info *oti); @@ -392,12 +352,12 @@ struct obd_ops { int (*o_iterate)(struct lustre_handle *conn, int (*)(obd_id, obd_gr, void *), obd_id *startid, obd_gr group, void *data); - int (*o_preprw)(int cmd, struct lustre_handle *conn, + int (*o_preprw)(int cmd, struct obd_export *, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_remote *remote, struct niobuf_local *local, void **desc_private, struct obd_trans_info *oti); - int (*o_commitrw)(int cmd, struct lustre_handle *conn, + int (*o_commitrw)(int cmd, struct obd_export *, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_local *local, void *desc_private, struct obd_trans_info *oti); @@ -406,12 +366,37 @@ struct obd_ops { __u32 type, void *cookie, int cookielen, __u32 mode, int *flags, void *cb, void *data, int datalen, struct lustre_handle *lockh); + int (*o_match)(struct lustre_handle *conn, struct lov_stripe_md *md, + __u32 type, void *cookie, int cookielen, __u32 mode, + int *flags, struct lustre_handle *lockh); int (*o_cancel)(struct lustre_handle *, struct lov_stripe_md *md, __u32 mode, struct lustre_handle *); int (*o_cancel_unused)(struct lustre_handle *, struct lov_stripe_md *, - int local_only); + int local_only, void *opaque); int (*o_san_preprw)(int cmd, struct lustre_handle *conn, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_remote *remote); + void (*o_destroy_export)(struct obd_export *export); }; + +static inline void obd_transno_commit_cb(struct obd_device *obd, __u64 transno, + int error) +{ + if (error) { + CDEBUG(D_ERROR, "%s: transno "LPD64" commit error: %d\n", + obd->obd_name, transno, error); + return; + } + CDEBUG(D_HA, "%s: transno "LPD64" committed\n", + obd->obd_name, 
transno); + if (transno > obd->obd_last_committed) { + obd->obd_last_committed = transno; + wake_up(&obd->obd_commit_waitq); + } +} + +/* When adding a function pointer to struct obd_ops, please update + * function lprocfs_alloc_obd_counters() in obdclass/lprocfs_status.c + * accordingly. */ + #endif /* __OBD_H */ diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h index b571b06..64b0a68 100644 --- a/lustre/include/linux/obd_class.h +++ b/lustre/include/linux/obd_class.h @@ -24,10 +24,8 @@ #define __LINUX_CLASS_OBD_H #ifndef __KERNEL__ -# include -# define __KERNEL__ -# include -# undef __KERNEL__ +#include +#include #else #include #include @@ -51,16 +49,36 @@ #define MAX_OBD_DEVICES 128 extern struct obd_device obd_dev[MAX_OBD_DEVICES]; -#define OBD_ATTACHED 0x01 -#define OBD_SET_UP 0x02 -#define OBD_RECOVERING 0x04 -#define OBD_ABORT_RECOVERY 0x08 -#define OBD_REPLAYABLE 0x10 -#define OBD_NO_TRANSNO 0x20 /* XXX needs better name */ - /* OBD Operations Declarations */ extern struct obd_device *class_conn2obd(struct lustre_handle *); -extern struct obd_export *class_conn2export(struct lustre_handle *); + +/* genops.c */ +struct obd_export *class_conn2export(struct lustre_handle *); +int class_register_type(struct obd_ops *ops, struct lprocfs_vars *, char *nm); +int class_unregister_type(char *nm); +int class_name2dev(char *name); +int class_uuid2dev(struct obd_uuid *uuid); +struct obd_device *class_uuid2obd(struct obd_uuid *uuid); + +struct obd_export *class_export_get(struct obd_export *); +void class_export_put(struct obd_export *); +struct obd_export *class_new_export(struct obd_device *obddev); +void class_unlink_export(struct obd_export *exp); + +struct obd_import *class_import_get(struct obd_import *); +void class_import_put(struct obd_import *); +struct obd_import *class_new_import(void); +void class_destroy_import(struct obd_import *exp); + +struct obd_type *class_get_type(char *name); +void class_put_type(struct obd_type 
*type); +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid); +int class_disconnect(struct lustre_handle *conn, int failover); +void class_disconnect_exports(struct obd_device *obddev, int failover); +/* generic operations shared by various OBD types */ +int class_multi_setup(struct obd_device *obddev, uint32_t len, void *data); +int class_multi_cleanup(struct obd_device *obddev); static inline int obd_check_conn(struct lustre_handle *conn) { @@ -76,12 +94,12 @@ static inline int obd_check_conn(struct lustre_handle *conn) RETURN(-ENODEV); } - if (!obd->obd_flags & OBD_ATTACHED ) { + if (!obd->obd_attached) { CERROR("obd %d not attached\n", obd->obd_minor); RETURN(-ENODEV); } - if (!obd->obd_flags & OBD_SET_UP) { + if (!obd->obd_set_up) { CERROR("obd %d not setup\n", obd->obd_minor); RETURN(-ENODEV); } @@ -103,41 +121,108 @@ static inline int obd_check_conn(struct lustre_handle *conn) #define OBT(dev) (dev)->obd_type #define OBP(dev, op) (dev)->obd_type->typ_ops->o_ ## op -#define OBD_CHECK_SETUP(conn, exp) \ +/* Ensure obd_setup: used for disconnect which might be called while + an obd is stopping. */ +#define OBD_CHECK_SETUP(conn, exp) \ +do { \ + if (!(conn)) { \ + CERROR("NULL connection\n"); \ + RETURN(-EINVAL); \ + } \ + \ + exp = class_conn2export(conn); \ + if (!(exp)) { \ + CERROR("No export for conn "LPX64"\n", (conn)->cookie); \ + RETURN(-EINVAL); \ + } \ + \ + if (!(exp)->exp_obd->obd_set_up) { \ + CERROR("Device %d not setup\n", \ + (exp)->exp_obd->obd_minor); \ + class_export_put(exp); \ + RETURN(-EINVAL); \ + } \ +} while (0) + +/* Ensure obd_setup and !obd_stopping. 
*/ +#define OBD_CHECK_ACTIVE(conn, exp) \ +do { \ + if (!(conn)) { \ + CERROR("NULL connection\n"); \ + RETURN(-EINVAL); \ + } \ + \ + exp = class_conn2export(conn); \ + if (!(exp)) { \ + CERROR("No export for conn "LPX64"\n", (conn)->cookie); \ + RETURN(-EINVAL); \ + } \ + \ + if (!(exp)->exp_obd->obd_set_up || (exp)->exp_obd->obd_stopping) { \ + CERROR("Device %d not setup\n", \ + (exp)->exp_obd->obd_minor); \ + class_export_put(exp); \ + RETURN(-EINVAL); \ + } \ +} while (0) + +/* Ensure obd_setup: used for cleanup which must be called + while obd is stopping */ +#define OBD_CHECK_DEV_STOPPING(obd) \ do { \ - if (!(conn)) { \ - CERROR("NULL connection\n"); \ - RETURN(-EINVAL); \ + if (!(obd)) { \ + CERROR("NULL device\n"); \ + RETURN(-ENODEV); \ } \ \ - exp = class_conn2export(conn); \ - if (!(exp)) { \ - CERROR("No export for conn "LPX64":"LPX64"\n", \ - conn->addr, conn->cookie); \ - RETURN(-EINVAL); \ + if (!(obd)->obd_set_up) { \ + CERROR("Device %d not setup\n", \ + (obd)->obd_minor); \ + RETURN(-ENODEV); \ } \ \ - if (!((exp)->exp_obd->obd_flags & OBD_SET_UP)) { \ - CERROR("Device %d not setup\n", \ - (exp)->exp_obd->obd_minor); \ - RETURN(-EINVAL); \ + if (!(obd)->obd_stopping) { \ + CERROR("Device %d not stopping\n", \ + (obd)->obd_minor); \ + RETURN(-ENODEV); \ } \ } while (0) -#define OBD_CHECK_DEVSETUP(obd) \ +/* ensure obd_setup and !obd_stopping */ +#define OBD_CHECK_DEV_ACTIVE(obd) \ do { \ if (!(obd)) { \ CERROR("NULL device\n"); \ - RETURN(-EINVAL); \ + RETURN(-ENODEV); \ } \ \ - if (!((obd)->obd_flags & OBD_SET_UP)) { \ + if (!(obd)->obd_set_up || (obd)->obd_stopping) { \ CERROR("Device %d not setup\n", \ (obd)->obd_minor); \ - RETURN(-EINVAL); \ + RETURN(-ENODEV); \ } \ } while (0) + +#ifdef LPROCFS +#define OBD_COUNTER_OFFSET(op) \ + ((offsetof(struct obd_ops, o_ ## op) - \ + offsetof(struct obd_ops, o_iocontrol)) \ + / sizeof(((struct obd_ops *)(0))->o_iocontrol)) + +#define OBD_COUNTER_INCREMENT(obd, op) \ + if ((obd)->counters != NULL) { \ 
+ struct lprocfs_counters* cntrs = obd->counters; \ + unsigned int coffset; \ + coffset = (obd)->cntr_base + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < cntrs->num); \ + LPROCFS_COUNTER_INCBY1(&cntrs->cntr[coffset]); \ + } +#else +#define OBD_COUNTER_OFFSET(op) +#define OBD_COUNTER_INCREMENT(obd, op) +#endif + #define OBD_CHECK_OP(obd, op) \ do { \ if (!OBP((obd), op)) { \ @@ -145,19 +230,21 @@ do { \ obd->obd_minor); \ RETURN(-EOPNOTSUPP); \ } \ + OBD_COUNTER_INCREMENT(obd, op); \ } while (0) -static inline int obd_get_info(struct lustre_handle *conn, obd_count keylen, - void *key, obd_count *vallen, void **val) +static inline int obd_get_info(struct lustre_handle *conn, __u32 keylen, + void *key, __u32 *vallen, void *val) { struct obd_export *exp; int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, get_info); rc = OBP(exp->exp_obd, get_info)(conn, keylen, key, vallen, val); + class_export_put(exp); RETURN(rc); } @@ -168,10 +255,11 @@ static inline int obd_set_info(struct lustre_handle *conn, obd_count keylen, int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, set_info); rc = OBP(exp->exp_obd, set_info)(conn, keylen, key, vallen, val); + class_export_put(exp); RETURN(rc); } @@ -186,85 +274,93 @@ static inline int obd_setup(struct obd_device *obd, int datalen, void *data) RETURN(rc); } -static inline int obd_cleanup(struct obd_device *obd) +static inline int obd_cleanup(struct obd_device *obd, int force, int failover) { int rc; ENTRY; - OBD_CHECK_DEVSETUP(obd); + OBD_CHECK_DEV_STOPPING(obd); OBD_CHECK_OP(obd, cleanup); - rc = OBP(obd, cleanup)(obd); + rc = OBP(obd, cleanup)(obd, force, failover); RETURN(rc); } -/* Pack an in-memory MD struct for sending to the MDS and/or disk. +/* Pack an in-memory MD struct for storage on disk. * Returns +ve size of packed MD (0 for free), or -ve error. 
* - * If @wire_tgt == NULL, MD size is returned (max size if @mem_src == NULL). - * If @*wire_tgt != NULL and @mem_src == NULL, @*wire_tgt will be freed. - * If @*wire_tgt == NULL, it will be allocated + * If @disk_tgt == NULL, MD size is returned (max size if @mem_src == NULL). + * If @*disk_tgt != NULL and @mem_src == NULL, @*disk_tgt will be freed. + * If @*disk_tgt == NULL, it will be allocated */ static inline int obd_packmd(struct lustre_handle *conn, - struct lov_mds_md **wire_tgt, + struct lov_mds_md **disk_tgt, struct lov_stripe_md *mem_src) { struct obd_export *exp; + int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, packmd); - RETURN(OBP(exp->exp_obd, packmd)(conn, wire_tgt, mem_src)); + rc = OBP(exp->exp_obd, packmd)(conn, disk_tgt, mem_src); + class_export_put(exp); + RETURN(rc); } -static inline int obd_size_wiremd(struct lustre_handle *conn, +static inline int obd_size_diskmd(struct lustre_handle *conn, struct lov_stripe_md *mem_src) { return obd_packmd(conn, NULL, mem_src); } /* helper functions */ -static inline int obd_alloc_wiremd(struct lustre_handle *conn, - struct lov_mds_md **wire_tgt) +static inline int obd_alloc_diskmd(struct lustre_handle *conn, + struct lov_mds_md **disk_tgt) { - LASSERT(wire_tgt); - LASSERT(*wire_tgt == NULL); - return obd_packmd(conn, wire_tgt, NULL); + LASSERT(disk_tgt); + LASSERT(*disk_tgt == NULL); + return obd_packmd(conn, disk_tgt, NULL); } -static inline int obd_free_wiremd(struct lustre_handle *conn, - struct lov_mds_md **wire_tgt) +static inline int obd_free_diskmd(struct lustre_handle *conn, + struct lov_mds_md **disk_tgt) { - LASSERT(wire_tgt); - LASSERT(*wire_tgt); - return obd_packmd(conn, wire_tgt, NULL); + LASSERT(disk_tgt); + LASSERT(*disk_tgt); + return obd_packmd(conn, disk_tgt, NULL); } -/* Unpack an MD struct from the MDS and/or disk to in-memory format. +/* Unpack an MD struct from disk to in-memory format. 
* Returns +ve size of unpacked MD (0 for free), or -ve error. * - * If @mem_tgt == NULL, MD size is returned (max size if @wire_src == NULL). - * If @*mem_tgt != NULL and @wire_src == NULL, @*mem_tgt will be freed. + * If @mem_tgt == NULL, MD size is returned (max size if @disk_src == NULL). + * If @*mem_tgt != NULL and @disk_src == NULL, @*mem_tgt will be freed. * If @*mem_tgt == NULL, it will be allocated */ static inline int obd_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **mem_tgt, - struct lov_mds_md *wire_src) + struct lov_mds_md *disk_src, + int disk_len) { struct obd_export *exp; + int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, unpackmd); - RETURN(OBP(exp->exp_obd, unpackmd)(conn, mem_tgt, wire_src)); + rc = OBP(exp->exp_obd, unpackmd)(conn, mem_tgt, disk_src, disk_len); + class_export_put(exp); + RETURN(rc); } static inline int obd_size_memmd(struct lustre_handle *conn, - struct lov_mds_md *wire_src) + struct lov_mds_md *disk_src, + int disk_len) { - return obd_unpackmd(conn, NULL, wire_src); + return obd_unpackmd(conn, NULL, disk_src, disk_len); } /* helper functions */ @@ -273,7 +369,7 @@ static inline int obd_alloc_memmd(struct lustre_handle *conn, { LASSERT(mem_tgt); LASSERT(*mem_tgt == NULL); - return obd_unpackmd(conn, mem_tgt, NULL); + return obd_unpackmd(conn, mem_tgt, NULL, 0); } static inline int obd_free_memmd(struct lustre_handle *conn, @@ -281,7 +377,7 @@ static inline int obd_free_memmd(struct lustre_handle *conn, { LASSERT(mem_tgt); LASSERT(*mem_tgt); - return obd_unpackmd(conn, mem_tgt, NULL); + return obd_unpackmd(conn, mem_tgt, NULL, 0); } static inline int obd_create(struct lustre_handle *conn, struct obdo *obdo, @@ -292,10 +388,11 @@ static inline int obd_create(struct lustre_handle *conn, struct obdo *obdo, int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, create); rc = OBP(exp->exp_obd, create)(conn, obdo, ea, 
oti); + class_export_put(exp); RETURN(rc); } @@ -307,10 +404,11 @@ static inline int obd_destroy(struct lustre_handle *conn, struct obdo *obdo, int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, destroy); rc = OBP(exp->exp_obd, destroy)(conn, obdo, ea, oti); + class_export_put(exp); RETURN(rc); } @@ -321,10 +419,27 @@ static inline int obd_getattr(struct lustre_handle *conn, struct obdo *obdo, int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, getattr); rc = OBP(exp->exp_obd, getattr)(conn, obdo, ea); + class_export_put(exp); + RETURN(rc); +} + +static inline int obd_getattr_async(struct lustre_handle *conn, struct obdo *obdo, + struct lov_stripe_md *ea, + struct ptlrpc_request_set *set) +{ + struct obd_export *exp; + int rc; + ENTRY; + + OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_OP(exp->exp_obd, getattr); + + rc = OBP(exp->exp_obd, getattr_async)(conn, obdo, ea, set); + class_export_put(exp); RETURN(rc); } @@ -336,24 +451,27 @@ static inline int obd_close(struct lustre_handle *conn, struct obdo *obdo, int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, close); rc = OBP(exp->exp_obd, close)(conn, obdo, ea, oti); + class_export_put(exp); RETURN(rc); } static inline int obd_open(struct lustre_handle *conn, struct obdo *obdo, - struct lov_stripe_md *ea, struct obd_trans_info *oti) + struct lov_stripe_md *ea, struct obd_trans_info *oti, + struct obd_client_handle *och) { struct obd_export *exp; int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, open); - rc = OBP(exp->exp_obd, open)(conn, obdo, ea, oti); + rc = OBP(exp->exp_obd, open)(conn, obdo, ea, oti, och); + class_export_put(exp); RETURN(rc); } @@ -365,29 +483,28 @@ static inline int obd_setattr(struct lustre_handle *conn, struct obdo *obdo, int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, 
exp); OBD_CHECK_OP(exp->exp_obd, setattr); rc = OBP(exp->exp_obd, setattr)(conn, obdo, ea, oti); + class_export_put(exp); RETURN(rc); } static inline int obd_connect(struct lustre_handle *conn, - struct obd_device *obd, struct obd_uuid *cluuid, - struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) + struct obd_device *obd, struct obd_uuid *cluuid) { int rc; ENTRY; - OBD_CHECK_DEVSETUP(obd); + OBD_CHECK_DEV_ACTIVE(obd); OBD_CHECK_OP(obd, connect); - rc = OBP(obd, connect)(conn, obd, cluuid, recovd, recover); + rc = OBP(obd, connect)(conn, obd, cluuid); RETURN(rc); } -static inline int obd_disconnect(struct lustre_handle *conn) +static inline int obd_disconnect(struct lustre_handle *conn, int failover) { struct obd_export *exp; int rc; @@ -396,33 +513,41 @@ static inline int obd_disconnect(struct lustre_handle *conn) OBD_CHECK_SETUP(conn, exp); OBD_CHECK_OP(exp->exp_obd, disconnect); - rc = OBP(exp->exp_obd, disconnect)(conn); + rc = OBP(exp->exp_obd, disconnect)(conn, failover); + class_export_put(exp); RETURN(rc); } +static inline void obd_destroy_export(struct obd_export *exp) +{ + ENTRY; + if (OBP(exp->exp_obd, destroy_export)) + OBP(exp->exp_obd, destroy_export)(exp); + EXIT; +} + static inline int obd_statfs(struct lustre_handle *conn,struct obd_statfs *osfs) { struct obd_export *exp; int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, statfs); rc = OBP(exp->exp_obd, statfs)(conn, osfs); + class_export_put(exp); RETURN(rc); } -static inline int obd_syncfs(struct lustre_handle *conn) +static inline int obd_syncfs(struct obd_export *exp) { - struct obd_export *exp; int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); OBD_CHECK_OP(exp->exp_obd, syncfs); - rc = OBP(exp->exp_obd, syncfs)(conn); + rc = OBP(exp->exp_obd, syncfs)(exp); RETURN(rc); } @@ -434,65 +559,86 @@ static inline int obd_punch(struct lustre_handle *conn, struct obdo *oa, int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); 
OBD_CHECK_OP(exp->exp_obd, punch); rc = OBP(exp->exp_obd, punch)(conn, oa, ea, start, end, oti); + class_export_put(exp); RETURN(rc); } static inline int obd_brw(int cmd, struct lustre_handle *conn, struct lov_stripe_md *ea, obd_count oa_bufs, - struct brw_page *pg, struct obd_brw_set *set, - struct obd_trans_info *oti) + struct brw_page *pg, struct obd_trans_info *oti) { struct obd_export *exp; int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, brw); + if (!(cmd & (OBD_BRW_RWMASK | OBD_BRW_CHECK))) { + CERROR("obd_brw: cmd must be OBD_BRW_READ, OBD_BRW_WRITE, " + "or OBD_BRW_CHECK\n"); + LBUG(); + } + + rc = OBP(exp->exp_obd, brw)(cmd, conn, ea, oa_bufs, pg, oti); + class_export_put(exp); + RETURN(rc); +} + +static inline int obd_brw_async(int cmd, struct lustre_handle *conn, + struct lov_stripe_md *ea, obd_count oa_bufs, + struct brw_page *pg, + struct ptlrpc_request_set *set, + struct obd_trans_info *oti) +{ + struct obd_export *exp; + int rc; + ENTRY; + + OBD_CHECK_ACTIVE(conn, exp); + OBD_CHECK_OP(exp->exp_obd, brw_async); + if (!(cmd & OBD_BRW_RWMASK)) { CERROR("obd_brw: cmd must be OBD_BRW_READ or OBD_BRW_WRITE\n"); LBUG(); } - rc = OBP(exp->exp_obd, brw)(cmd, conn, ea, oa_bufs, pg, set, oti); + rc = OBP(exp->exp_obd, brw_async)(cmd, conn, ea, oa_bufs, pg, set, oti); + class_export_put(exp); RETURN(rc); } -static inline int obd_preprw(int cmd, struct lustre_handle *conn, +static inline int obd_preprw(int cmd, struct obd_export *exp, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_remote *remote, struct niobuf_local *local, void **desc_private, struct obd_trans_info *oti) { - struct obd_export *exp; int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); OBD_CHECK_OP(exp->exp_obd, preprw); - rc = OBP(exp->exp_obd, preprw)(cmd, conn, objcount, obj, niocount, + rc = OBP(exp->exp_obd, preprw)(cmd, exp, objcount, obj, niocount, remote, local, desc_private, oti); RETURN(rc); } -static inline int 
obd_commitrw(int cmd, struct lustre_handle *conn, +static inline int obd_commitrw(int cmd, struct obd_export *exp, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_local *local, void *desc_private, struct obd_trans_info *oti) { - struct obd_export *exp; int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); OBD_CHECK_OP(exp->exp_obd, commitrw); - rc = OBP(exp->exp_obd, commitrw)(cmd, conn, objcount, obj, niocount, + rc = OBP(exp->exp_obd, commitrw)(cmd, exp, objcount, obj, niocount, local, desc_private, oti); RETURN(rc); } @@ -504,10 +650,11 @@ static inline int obd_iocontrol(unsigned int cmd, struct lustre_handle *conn, int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, iocontrol); rc = OBP(exp->exp_obd, iocontrol)(cmd, conn, len, karg, uarg); + class_export_put(exp); RETURN(rc); } @@ -522,15 +669,36 @@ static inline int obd_enqueue(struct lustre_handle *conn, int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, enqueue); rc = OBP(exp->exp_obd, enqueue)(conn, ea, parent_lock, type, cookie, cookielen, mode, flags, cb, data, datalen, lockh); + class_export_put(exp); + RETURN(rc); +} + +static inline int obd_match(struct lustre_handle *conn, + struct lov_stripe_md *ea, + __u32 type, void *cookie, int cookielen, + __u32 mode, int *flags, + struct lustre_handle *lockh) +{ + struct obd_export *exp; + int rc; + ENTRY; + + OBD_CHECK_ACTIVE(conn, exp); + OBD_CHECK_OP(exp->exp_obd, match); + + rc = OBP(exp->exp_obd, match)(conn, ea, type, cookie, cookielen, mode, + flags, lockh); + class_export_put(exp); RETURN(rc); } + static inline int obd_cancel(struct lustre_handle *conn, struct lov_stripe_md *ea, __u32 mode, struct lustre_handle *lockh) @@ -539,24 +707,27 @@ static inline int obd_cancel(struct lustre_handle *conn, int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, cancel); rc = OBP(exp->exp_obd, cancel)(conn, 
ea, mode, lockh); + class_export_put(exp); RETURN(rc); } static inline int obd_cancel_unused(struct lustre_handle *conn, - struct lov_stripe_md *ea, int local) + struct lov_stripe_md *ea, int flags, + void *opaque) { struct obd_export *exp; int rc; ENTRY; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, cancel_unused); - rc = OBP(exp->exp_obd, cancel_unused)(conn, ea, local); + rc = OBP(exp->exp_obd, cancel_unused)(conn, ea, flags, opaque); + class_export_put(exp); RETURN(rc); } @@ -567,11 +738,12 @@ static inline int obd_san_preprw(int cmd, struct lustre_handle *conn, struct obd_export *exp; int rc; - OBD_CHECK_SETUP(conn, exp); + OBD_CHECK_ACTIVE(conn, exp); OBD_CHECK_OP(exp->exp_obd, preprw); rc = OBP(exp->exp_obd, san_preprw)(cmd, conn, objcount, obj, niocount, remote); + class_export_put(exp); RETURN(rc); } @@ -607,39 +779,28 @@ static inline void obdo_free(struct obdo *oa) kmem_cache_free(obdo_cachep, oa); } +#if !defined(__KERNEL__) || (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define to_kdev_t(dev) dev +#define kdev_t_to_nr(dev) dev +#endif + #ifdef __KERNEL__ static inline void obdo_from_iattr(struct obdo *oa, struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - if (ia_valid & ATTR_ATIME) { - oa->o_atime = attr->ia_atime; - oa->o_valid |= OBD_MD_FLATIME; - } - if (ia_valid & ATTR_MTIME) { - oa->o_mtime = attr->ia_mtime; - oa->o_valid |= OBD_MD_FLMTIME; - } - if (ia_valid & ATTR_CTIME) { - oa->o_ctime = attr->ia_ctime; - oa->o_valid |= OBD_MD_FLCTIME; - } -#else if (ia_valid & ATTR_ATIME) { - oa->o_atime = attr->ia_atime.tv_sec; + oa->o_atime = LTIME_S(attr->ia_atime); oa->o_valid |= OBD_MD_FLATIME; } if (ia_valid & ATTR_MTIME) { - oa->o_mtime = attr->ia_mtime.tv_sec; + oa->o_mtime = LTIME_S(attr->ia_mtime); oa->o_valid |= OBD_MD_FLMTIME; } if (ia_valid & ATTR_CTIME) { - oa->o_ctime = attr->ia_ctime.tv_sec; + oa->o_ctime = LTIME_S(attr->ia_ctime); 
oa->o_valid |= OBD_MD_FLCTIME; } -#endif - if (ia_valid & ATTR_SIZE) { oa->o_size = attr->ia_size; oa->o_valid |= OBD_MD_FLSIZE; @@ -665,33 +826,18 @@ static inline void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid) { memset(attr, 0, sizeof(*attr)); -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - if (valid & OBD_MD_FLATIME) { - attr->ia_atime = oa->o_atime; - attr->ia_valid |= ATTR_ATIME; - } - if (valid & OBD_MD_FLMTIME) { - attr->ia_mtime = oa->o_mtime; - attr->ia_valid |= ATTR_MTIME; - } - if (valid & OBD_MD_FLCTIME) { - attr->ia_ctime = oa->o_ctime; - attr->ia_valid |= ATTR_CTIME; - } -#else if (valid & OBD_MD_FLATIME) { - attr->ia_atime.tv_sec = oa->o_atime; + LTIME_S(attr->ia_atime) = oa->o_atime; attr->ia_valid |= ATTR_ATIME; } if (valid & OBD_MD_FLMTIME) { - attr->ia_mtime.tv_sec = oa->o_mtime; + LTIME_S(attr->ia_mtime) = oa->o_mtime; attr->ia_valid |= ATTR_MTIME; } if (valid & OBD_MD_FLCTIME) { - attr->ia_ctime.tv_sec = oa->o_ctime; + LTIME_S(attr->ia_ctime) = oa->o_ctime; attr->ia_valid |= ATTR_CTIME; } -#endif if (valid & OBD_MD_FLSIZE) { attr->ia_size = oa->o_size; attr->ia_valid |= ATTR_SIZE; @@ -721,29 +867,16 @@ static inline void iattr_from_obdo(struct iattr *attr, struct obdo *oa, /* WARNING: the file systems must take care not to tinker with attributes they don't manage (such as blocks). 
*/ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -#define to_kdev_t(dev) dev -#define kdev_t_to_nr(dev) dev -#endif static inline void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid) { -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - if (valid & OBD_MD_FLATIME) - dst->o_atime = src->i_atime; - if (valid & OBD_MD_FLMTIME) - dst->o_mtime = src->i_mtime; - if (valid & OBD_MD_FLCTIME) - dst->o_ctime = src->i_ctime; -#else if (valid & OBD_MD_FLATIME) - dst->o_atime = src->i_atime.tv_sec; + dst->o_atime = LTIME_S(src->i_atime); if (valid & OBD_MD_FLMTIME) - dst->o_mtime = src->i_mtime.tv_sec; + dst->o_mtime = LTIME_S(src->i_mtime); if (valid & OBD_MD_FLCTIME) - dst->o_ctime = src->i_ctime.tv_sec; -#endif + dst->o_ctime = LTIME_S(src->i_ctime); if (valid & OBD_MD_FLSIZE) dst->o_size = src->i_size; if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ @@ -775,21 +908,12 @@ static inline void obdo_refresh_inode(struct inode *dst, struct obdo *src, { valid &= src->o_valid; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - if (valid & OBD_MD_FLATIME && src->o_atime > dst->i_atime) - dst->i_atime = src->o_atime; - if (valid & OBD_MD_FLMTIME && src->o_mtime > dst->i_mtime) - dst->i_mtime = src->o_mtime; - if (valid & OBD_MD_FLCTIME && src->o_ctime > dst->i_ctime) - dst->i_ctime = src->o_ctime; -#else - if (valid & OBD_MD_FLATIME && src->o_atime > dst->i_atime.tv_sec) - dst->i_atime.tv_sec = src->o_atime; - if (valid & OBD_MD_FLMTIME && src->o_mtime > dst->i_mtime.tv_sec) - dst->i_mtime.tv_sec = src->o_mtime; - if (valid & OBD_MD_FLCTIME && src->o_ctime > dst->i_ctime.tv_sec) - dst->i_ctime.tv_sec = src->o_ctime; -#endif + if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(dst->i_atime)) + LTIME_S(dst->i_atime) = src->o_atime; + if (valid & OBD_MD_FLMTIME && src->o_mtime > LTIME_S(dst->i_mtime)) + LTIME_S(dst->i_mtime) = src->o_mtime; + if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) + LTIME_S(dst->i_ctime) = src->o_ctime; if 
(valid & OBD_MD_FLSIZE && src->o_size > dst->i_size) dst->i_size = src->o_size; /* allocation of space */ @@ -802,21 +926,12 @@ static inline void obdo_to_inode(struct inode *dst, struct obdo *src, { valid &= src->o_valid; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) if (valid & OBD_MD_FLATIME) - dst->i_atime = src->o_atime; + LTIME_S(dst->i_atime) = src->o_atime; if (valid & OBD_MD_FLMTIME) - dst->i_mtime = src->o_mtime; - if (valid & OBD_MD_FLCTIME && src->o_ctime > dst->i_ctime) - dst->i_ctime = src->o_ctime; -#else - if (valid & OBD_MD_FLATIME) - dst->i_atime.tv_sec = src->o_atime; - if (valid & OBD_MD_FLMTIME) - dst->i_mtime.tv_sec = src->o_mtime; - if (valid & OBD_MD_FLCTIME && src->o_ctime > dst->i_ctime.tv_sec) - dst->i_ctime.tv_sec = src->o_ctime; -#endif + LTIME_S(dst->i_mtime) = src->o_mtime; + if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) + LTIME_S(dst->i_ctime) = src->o_ctime; if (valid & OBD_MD_FLSIZE) dst->i_size = src->o_size; if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ @@ -931,49 +1046,17 @@ static inline int obdo_cmp_md(struct obdo *dst, struct obdo *src, return res; } - /* I'm as embarrassed about this as you are. 
* * // XXX do not look into _superhack with remaining eye * // XXX if this were any uglier, I'd get my own show on MTV */ extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); -extern void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp, - int dying_import); - -int class_register_type(struct obd_ops *ops, struct lprocfs_vars* vars, - char *nm); -int class_unregister_type(char *nm); -int class_name2dev(char *name); -int class_uuid2dev(struct obd_uuid *uuid); -struct obd_device *class_uuid2obd(struct obd_uuid *uuid); -struct obd_export *class_new_export(struct obd_device *obddev); -struct obd_type *class_get_type(char *name); -void class_put_type(struct obd_type *type); -void class_destroy_export(struct obd_export *exp); -int class_connect(struct lustre_handle *conn, struct obd_device *obd, - struct obd_uuid *cluuid); -int class_disconnect(struct lustre_handle *conn); -void class_disconnect_all(struct obd_device *obddev); - -/* generic operations shared by various OBD types */ -int class_multi_setup(struct obd_device *obddev, uint32_t len, void *data); -int class_multi_cleanup(struct obd_device *obddev); - -extern void (*class_signal_connection_failure)(struct ptlrpc_connection *); - -static inline struct ptlrpc_connection *class_rd2conn(struct recovd_data *rd) -{ - /* reuse list_entry's member-pointer offset stuff */ - return list_entry(rd, struct ptlrpc_connection, c_recovd_data); -} +extern void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp); struct obd_statfs; struct statfs; void statfs_pack(struct obd_statfs *osfs, struct statfs *sfs); void statfs_unpack(struct statfs *sfs, struct obd_statfs *osfs); -void obd_statfs_pack(struct obd_statfs *tgt, struct obd_statfs *src); -void obd_statfs_unpack(struct obd_statfs *tgt, struct obd_statfs *src); - struct obd_class_user_state { struct obd_device *ocus_current_obd; diff --git a/lustre/include/linux/obd_echo.h b/lustre/include/linux/obd_echo.h index 273779a..c344d8a 100644 --- 
a/lustre/include/linux/obd_echo.h +++ b/lustre/include/linux/obd_echo.h @@ -1,41 +1,42 @@ -#ifndef _OBD_ECHO_H -#define _OBD_ECHO_H -/* +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * * Copyright (C) 2001 Cluster File Systems, Inc. * * This code is issued under the GNU General Public License. * See the file COPYING in this distribution */ +#ifndef _OBD_ECHO_H +#define _OBD_ECHO_H + #define OBD_ECHO_DEVICENAME "obdecho" #define OBD_ECHO_CLIENT_DEVICENAME "echo_client" -struct ec_object -{ - struct list_head eco_obj_chain; - struct obd_device *eco_device; - int eco_refcount; - int eco_deleted; - obd_id eco_id; - struct lov_stripe_md *eco_lsm; +struct ec_object { + struct list_head eco_obj_chain; + struct obd_device *eco_device; + int eco_refcount; + int eco_deleted; + obd_id eco_id; + struct lov_stripe_md *eco_lsm; }; -struct ec_open_object -{ - struct list_head ecoo_exp_chain; - struct ec_object *ecoo_object; - struct obdo ecoo_oa; - __u64 ecoo_cookie; +struct ec_open_object { + struct list_head ecoo_exp_chain; + struct ec_object *ecoo_object; + __u64 ecoo_cookie; + struct obdo ecoo_oa; + struct obd_client_handle ecoo_och; }; -struct ec_lock -{ - struct list_head ecl_exp_chain; - struct lustre_handle ecl_handle; - struct ldlm_extent ecl_extent; - __u32 ecl_mode; - struct ec_object *ecl_object; - __u64 ecl_cookie; +struct ec_lock { + struct list_head ecl_exp_chain; + struct ec_object *ecl_object; + __u64 ecl_cookie; + struct lustre_handle ecl_lock_handle; + struct ldlm_extent ecl_extent; + __u32 ecl_mode; }; #endif diff --git a/lustre/include/linux/obd_filter.h b/lustre/include/linux/obd_filter.h index 26850d8..74bb784 100644 --- a/lustre/include/linux/obd_filter.h +++ b/lustre/include/linux/obd_filter.h @@ -23,6 +23,11 @@ #ifndef _OBD_FILTER_H #define _OBD_FILTER_H +#ifdef __KERNEL__ +#include +#endif +#include + #ifndef OBD_FILTER_DEVICENAME #define OBD_FILTER_DEVICENAME "obdfilter" #endif @@ -79,9 +84,10 
@@ struct filter_export_data { /* file data for open files on OST */ struct filter_file_data { - struct list_head ffd_export_list; /* export open list - fed_lock */ - struct file *ffd_file; /* file handle */ - __u64 ffd_servercookie; /* cookie for lustre handle */ + struct portals_handle ffd_handle; + atomic_t ffd_refcount; + struct list_head ffd_export_list; /* export open list - fed_lock */ + struct file *ffd_file; /* file handle */ }; struct filter_dentry_data { diff --git a/lustre/include/linux/obd_lov.h b/lustre/include/linux/obd_lov.h index ff3e689..b12a062 100644 --- a/lustre/include/linux/obd_lov.h +++ b/lustre/include/linux/obd_lov.h @@ -7,8 +7,16 @@ #define OBD_LOV_DEVICENAME "lov" -void lov_unpackdesc(struct lov_desc *ld); -void lov_packdesc(struct lov_desc *ld); +struct lov_brw_async_args { + obd_count aa_oa_bufs; + struct brw_page *aa_ioarr; +}; + +struct lov_getattr_async_args { + struct lov_stripe_md *aa_lsm; + struct obdo *aa_oa; + struct obdo *aa_stripe_oas; +}; static inline int lov_stripe_md_size(int stripes) { @@ -20,6 +28,15 @@ static inline int lov_mds_md_size(int stripes) return sizeof(struct lov_mds_md) + stripes*sizeof(struct lov_object_id); } +extern int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmm, + struct lov_stripe_md *lsm); +extern int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsm, + struct lov_mds_md *lmm, int lmmsize); +extern int lov_setstripe(struct lustre_handle *conn, + struct lov_stripe_md **lsmp, struct lov_mds_md *lmmu); +extern int lov_getstripe(struct lustre_handle *conn, + struct lov_stripe_md *lsm, struct lov_mds_md *lmmu); + #define IOC_LOV_TYPE 'g' #define IOC_LOV_MIN_NR 50 #define IOC_LOV_SET_OSC_ACTIVE _IOWR('g', 50, long) diff --git a/lustre/include/linux/obd_ost.h b/lustre/include/linux/obd_ost.h index 9ef7052..22fe694 100644 --- a/lustre/include/linux/obd_ost.h +++ b/lustre/include/linux/obd_ost.h @@ -34,11 +34,15 @@ #define LUSTRE_SANOSC_NAME "sanosc" #define 
LUSTRE_SANOST_NAME "sanost" -/* ost/ost_pack.c */ -void ost_pack_niobuf(struct niobuf_remote *nb, __u64 offset, __u32 len, - __u32 flags, __u32 xid); -void ost_unpack_niobuf(struct niobuf_remote *dst, struct niobuf_remote *src); -void ost_pack_ioo(struct obd_ioobj *ioo, struct lov_stripe_md *lsm, int bufcnt); -void ost_unpack_ioo(struct obd_ioobj *dst, struct obd_ioobj *src); +struct osc_brw_async_args { + int aa_requested_nob; + int aa_nio_count; + obd_count aa_page_count; + struct brw_page *aa_pga; +}; + +struct osc_getattr_async_args { + struct obdo *aa_oa; +}; #endif diff --git a/lustre/include/linux/obd_ptlbd.h b/lustre/include/linux/obd_ptlbd.h index 3af66b5..1e6de5a 100644 --- a/lustre/include/linux/obd_ptlbd.h +++ b/lustre/include/linux/obd_ptlbd.h @@ -22,9 +22,12 @@ extern void ptlbd_blk_exit(void); extern void ptlbd_cl_exit(void); extern void ptlbd_sv_exit(void); +extern int ptlbd_do_connect(struct ptlbd_obd *); +extern int ptlbd_do_disconnect(struct ptlbd_obd *); extern void ptlbd_blk_register(struct ptlbd_obd *ptlbd); -extern int ptlbd_send_req(struct ptlbd_obd *, ptlbd_cmd_t cmd, - struct request *); -extern int ptlbd_parse_req(struct ptlrpc_request *req); +extern int ptlbd_send_rw_req(struct ptlbd_obd *, ptlbd_cmd_t cmd, + struct buffer_head *); +extern int ptlbd_send_flush_req(struct ptlbd_obd *, ptlbd_cmd_t cmd); +extern int ptlbd_handle(struct ptlrpc_request *req); #endif diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index 85e577a..69a47dc 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -38,7 +38,7 @@ extern atomic_t obd_memory; extern int obd_memmax; extern unsigned long obd_fail_loc; extern unsigned long obd_timeout; -extern char obd_recovery_upcall[128]; +extern char obd_lustre_upcall[128]; extern unsigned long obd_sync_filter; #define OBD_FAIL_MDS 0x100 @@ -93,6 +93,9 @@ extern unsigned long obd_sync_filter; #define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e #define 
OBD_FAIL_OST_BRW_READ_BULK 0x20f #define OBD_FAIL_OST_SYNCFS_NET 0x210 +#define OBD_FAIL_OST_ALL_REPLY_NET 0x211 +#define OBD_FAIL_OST_ALL_REQUESTS_NET 0x212 +#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213 #define OBD_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 @@ -153,9 +156,11 @@ do { \ #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) #define ll_bdevname(a) __bdevname((a)) #define ll_lock_kernel lock_kernel() +#define LTIME_S(time) (time.tv_sec) #else #define ll_lock_kernel #define ll_bdevname(a) bdevname((a)) +#define LTIME_S(time) (time) #endif @@ -185,7 +190,8 @@ static inline void OBD_FAIL_WRITE(int id, kdev_t dev) obd_fail_loc |= OBD_FAILED | OBD_FAIL_ONCE; } } - +#else /* !__KERNEL__ */ +#define LTIME_S(time) (time) #endif /* __KERNEL__ */ #define OBD_ALLOC(ptr, size) \ @@ -208,6 +214,30 @@ do { \ } \ } while (0) +#ifdef __arch_um__ +# define OBD_VMALLOC(ptr, size) OBD_ALLOC(ptr, size) +#else +# define OBD_VMALLOC(ptr, size) \ +do { \ + void *lptr; \ + int s = (size); \ + (ptr) = lptr = vmalloc(s); \ + if (lptr == NULL) { \ + CERROR("vmalloc of '" #ptr "' (%d bytes) failed " \ + "at %s:%d\n", s, __FILE__, __LINE__); \ + } else { \ + int obd_curmem; \ + memset(lptr, 0, s); \ + atomic_add(s, &obd_memory); \ + obd_curmem = atomic_read(&obd_memory); \ + if (obd_curmem > obd_memmax) \ + obd_memmax = obd_curmem; \ + CDEBUG(D_MALLOC, "vmalloced '" #ptr "': %d at %p " \ + "(tot %d)\n", s, lptr, obd_curmem); \ + } \ +} while (0) +#endif + #ifdef CONFIG_DEBUG_SLAB #define POISON(lptr, c, s) do {} while (0) #else @@ -227,11 +257,55 @@ do { \ (ptr) = (void *)0xdeadbeef; \ } while (0) -#ifdef CONFIG_HIGHMEM -extern void obd_kmap_get(int count, int server); -extern void obd_kmap_put(int count); +#ifdef __arch_um__ +# define OBD_VFREE(ptr, size) OBD_FREE(ptr, size) #else -#define obd_kmap_get(count, server) do {} while (0) -#define obd_kmap_put(count) do {} while (0) +# define OBD_VFREE(ptr, size) \ +do { \ + void *lptr = (ptr); \ + int s = (size); \ + 
LASSERT(lptr); \ + POISON(lptr, 0x5a, s); \ + vfree(lptr); \ + atomic_sub(s, &obd_memory); \ + CDEBUG(D_MALLOC, "vfreed '" #ptr "': %d at %p (tot %d).\n", \ + s, lptr, atomic_read(&obd_memory)); \ + (ptr) = (void *)0xdeadbeef; \ +} while (0) #endif + +#define OBD_SLAB_ALLOC(ptr, slab, type, size) \ +do { \ + long s = (size); \ + void *lptr; \ + LASSERT (!in_interrupt()); \ + (ptr) = lptr = kmem_cache_alloc((slab), type); \ + if (lptr == NULL) { \ + CERROR("slab-alloc of '" #ptr "' (%ld bytes) failed " \ + "at %s:%d\n", s, __FILE__, __LINE__); \ + } else { \ + int obd_curmem; \ + memset(lptr, 0, s); \ + atomic_add(s, &obd_memory); \ + obd_curmem = atomic_read(&obd_memory); \ + if (obd_curmem > obd_memmax) \ + obd_memmax = obd_curmem; \ + CDEBUG(D_MALLOC, "slab-alloced '" #ptr "': %ld at %p " \ + "(tot %d)\n", s, lptr, obd_curmem); \ + } \ +} while (0) + +#define OBD_SLAB_FREE(ptr, slab, size) \ +do { \ + long s = (size); \ + void *lptr = (ptr); \ + LASSERT(lptr); \ + POISON(lptr, 0x5a, s); \ + CDEBUG(D_MALLOC, "slab-freed '" #ptr "': %ld at %p (tot %d).\n", \ + s, lptr, atomic_read(&obd_memory)); \ + kmem_cache_free((slab), lptr); \ + atomic_sub(s, &obd_memory); \ + (ptr) = (void *)0xdeadbeef; \ +} while (0) + #endif diff --git a/lustre/kernel_patches/README b/lustre/kernel_patches/README index 7d4c4b6..1860f13 100644 --- a/lustre/kernel_patches/README +++ b/lustre/kernel_patches/README @@ -57,7 +57,7 @@ within the root of that tree. The scripts manage a "stack" of patches. Each patch is a changeset against the base tree plus the preceding patches. All patches are listed, in order, in the file ./series. You manage the -series file. +series file. Lines in the series file which start with `#' are ignored. Any currently-applied patches are described in the file ./applied-patches. The patch scripts manage this file. @@ -351,6 +351,15 @@ inpatch cat pc/$(toppatch).pc +join-patch patchname + + "joins" the named patch to the current topmost patch. 
+ + Use this when you want to merge two patches into one. All the + files which `patchname' affects are added to pc/$(toppatch).pc (if + they are not already there) and patch `patchname' is applied. The + top patch remains unchanged. You'll need to run refpatch afterwards. + mpatch A low-level thing to generate patches diff --git a/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-i386 b/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-i386 new file mode 100644 index 0000000..94ee0ab --- /dev/null +++ b/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-i386 @@ -0,0 +1,1834 @@ +# +# Automatically generated by make menuconfig: don't edit +# +CONFIG_X86=y +CONFIG_ISA=y +# CONFIG_SBUS is not set +CONFIG_UID16=y + +# +# Code maturity level options +# +CONFIG_EXPERIMENTAL=y + +# +# Loadable module support +# +CONFIG_MODULES=y +CONFIG_MODVERSIONS=y +CONFIG_KMOD=y + +# +# Processor type and features +# +CONFIG_LOLAT=y +# CONFIG_LOLAT_SYSCTL is not set +CONFIG_M386=y +# CONFIG_M486 is not set +# CONFIG_M586 is not set +# CONFIG_M586TSC is not set +# CONFIG_M586MMX is not set +# CONFIG_M686 is not set +# CONFIG_MPENTIUMIII is not set +# CONFIG_MPENTIUM4 is not set +# CONFIG_MK6 is not set +# CONFIG_MK7 is not set +# CONFIG_MELAN is not set +# CONFIG_MCRUSOE is not set +# CONFIG_MWINCHIPC6 is not set +# CONFIG_MWINCHIP2 is not set +# CONFIG_MWINCHIP3D is not set +# CONFIG_MCYRIXIII is not set +# CONFIG_X86_CMPXCHG is not set +# CONFIG_X86_XADD is not set +CONFIG_X86_L1_CACHE_SHIFT=4 +CONFIG_RWSEM_GENERIC_SPINLOCK=y +# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set +CONFIG_X86_PPRO_FENCE=y +CONFIG_X86_MCE=y +# CONFIG_CPU_FREQ is not set +CONFIG_TOSHIBA=m +CONFIG_I8K=m +# CONFIG_MICROCODE is not set +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m +# CONFIG_E820_PROC is not set +CONFIG_NOHIGHMEM=y +# CONFIG_HIGHMEM4G is not set +# CONFIG_HIGHMEM64G is not set +CONFIG_HIGHIO=y +CONFIG_MATH_EMULATION=y +CONFIG_MTRR=y +# CONFIG_SMP is not set +CONFIG_X86_UP_APIC=y 
+CONFIG_X86_UP_IOAPIC=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y + +# +# General setup +# +CONFIG_HZ=100 +CONFIG_NET=y +CONFIG_PCI=y +# CONFIG_PCI_GOBIOS is not set +# CONFIG_PCI_GODIRECT is not set +CONFIG_PCI_GOANY=y +CONFIG_PCI_BIOS=y +CONFIG_PCI_DIRECT=y +CONFIG_PCI_NAMES=y + +# +# Performance-monitoring counters support +# +# CONFIG_PERFCTR is not set +CONFIG_EISA=y +# CONFIG_MCA is not set +CONFIG_HOTPLUG=y + +# +# PCMCIA/CardBus support +# +CONFIG_PCMCIA=m +CONFIG_CARDBUS=y +CONFIG_TCIC=y +CONFIG_I82092=y +CONFIG_I82365=y + +# +# PCI Hotplug Support +# +# CONFIG_HOTPLUG_PCI is not set +# CONFIG_HOTPLUG_PCI_COMPAQ is not set +# CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM is not set +# CONFIG_HOTPLUG_PCI_IBM is not set +# CONFIG_HOTPLUG_PCI_ACPI is not set +CONFIG_SYSVIPC=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_SYSCTL=y +CONFIG_KCORE_ELF=y +# CONFIG_KCORE_AOUT is not set +CONFIG_BINFMT_AOUT=m +CONFIG_BINFMT_ELF=y +CONFIG_BINFMT_MISC=m +# CONFIG_IKCONFIG is not set +CONFIG_PM=y + +# +# Additional device driver support +# +CONFIG_CIPE=m +CONFIG_CRYPTO_AEP=m +CONFIG_MEGARAC=m +CONFIG_FC_QLA2200=m +CONFIG_FC_QLA2300=m +CONFIG_SCSI_ISCSI=m +# CONFIG_IBMASM is not set +# CONFIG_IBMSER is not set +# CONFIG_ACPI is not set +CONFIG_APM=y +# CONFIG_APM_IGNORE_USER_SUSPEND is not set +# CONFIG_APM_DO_ENABLE is not set +CONFIG_APM_CPU_IDLE=y +# CONFIG_APM_DISPLAY_BLANK is not set +CONFIG_APM_RTC_IS_GMT=y +# CONFIG_APM_ALLOW_INTS is not set +# CONFIG_APM_REAL_MODE_POWER_OFF is not set + +# +# Binary emulation of other systems +# +CONFIG_ABI=m +CONFIG_ABI_SVR4=m +CONFIG_ABI_UW7=m +# CONFIG_ABI_SOLARIS is not set +CONFIG_ABI_IBCS=m +CONFIG_ABI_ISC=m +CONFIG_ABI_SCO=m +# CONFIG_ABI_WYSE is not set +CONFIG_BINFMT_COFF=m +CONFIG_BINFMT_XOUT=m +# CONFIG_BINFMT_XOUT_X286 is not set +CONFIG_ABI_SPX=y +CONFIG_ABI_XTI=y +CONFIG_ABI_TLI_OPTMGMT=y +# CONFIG_ABI_XTI_OPTMGMT is not set + +# +# Memory Technology Devices (MTD) +# +# CONFIG_MTD is not set + +# +# Parallel port support +# 
+CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_PC_CML1=m +CONFIG_PARPORT_SERIAL=m +# CONFIG_PARPORT_PC_FIFO is not set +# CONFIG_PARPORT_PC_SUPERIO is not set +CONFIG_PARPORT_PC_PCMCIA=m +# CONFIG_PARPORT_AMIGA is not set +# CONFIG_PARPORT_MFC3 is not set +# CONFIG_PARPORT_ATARI is not set +# CONFIG_PARPORT_GSC is not set +# CONFIG_PARPORT_SUNBPP is not set +# CONFIG_PARPORT_OTHER is not set +CONFIG_PARPORT_1284=y + +# +# Plug and Play configuration +# +CONFIG_PNP=y +CONFIG_ISAPNP=y +# CONFIG_PNPBIOS is not set + +# +# Block devices +# +CONFIG_BLK_DEV_FD=y +CONFIG_BLK_DEV_XD=m +CONFIG_PARIDE=m +CONFIG_PARIDE_PARPORT=m +CONFIG_PARIDE_PD=m +CONFIG_PARIDE_PCD=m +CONFIG_PARIDE_PF=m +CONFIG_PARIDE_PT=m +CONFIG_PARIDE_PG=m +CONFIG_PARIDE_ATEN=m +CONFIG_PARIDE_BPCK=m +CONFIG_PARIDE_BPCK6=m +CONFIG_PARIDE_COMM=m +CONFIG_PARIDE_DSTR=m +CONFIG_PARIDE_FIT2=m +CONFIG_PARIDE_FIT3=m +CONFIG_PARIDE_EPAT=m +CONFIG_PARIDE_EPATC8=y +CONFIG_PARIDE_EPIA=m +CONFIG_PARIDE_FRIQ=m +CONFIG_PARIDE_FRPW=m +CONFIG_PARIDE_KBIC=m +CONFIG_PARIDE_KTTI=m +CONFIG_PARIDE_ON20=m +CONFIG_PARIDE_ON26=m +CONFIG_BLK_CPQ_DA=m +CONFIG_BLK_CPQ_CISS_DA=m +CONFIG_CISS_SCSI_TAPE=y +CONFIG_BLK_DEV_DAC960=m +CONFIG_BLK_DEV_UMEM=m +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_NBD=m +# CONFIG_BLK_DEV_ENBD is not set +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=4096 +CONFIG_BLK_DEV_INITRD=y + +# +# Multi-device support (RAID and LVM) +# +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID5=m +CONFIG_MD_MULTIPATH=m +CONFIG_BLK_DEV_LVM=m + +# +# Cryptography support (CryptoAPI) +# +CONFIG_CRYPTO=m +CONFIG_CIPHERS=m +CONFIG_CIPHER_AES=m +CONFIG_CIPHER_IDENTITY=m +CONFIG_CRYPTODEV=m +CONFIG_CRYPTOLOOP=m + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_MMAP=y +CONFIG_NETLINK_DEV=y +CONFIG_NETFILTER=y +# CONFIG_NETFILTER_DEBUG is not set +CONFIG_FILTER=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_TUX=m +CONFIG_TUX_EXTCGI=y +# CONFIG_TUX_EXTENDED_LOG is not 
set +# CONFIG_TUX_DEBUG is not set +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_FWMARK=y +CONFIG_IP_ROUTE_NAT=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_TOS=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_LARGE_TABLES=y +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +# CONFIG_ARPD is not set +# CONFIG_INET_ECN is not set +CONFIG_SYN_COOKIES=y + +# +# IP: Netfilter Configuration +# +CONFIG_IP_NF_CONNTRACK=m +CONFIG_IP_NF_FTP=m +CONFIG_IP_NF_IRC=m +CONFIG_IP_NF_QUEUE=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_LIMIT=m +CONFIG_IP_NF_MATCH_MAC=m +CONFIG_IP_NF_MATCH_MARK=m +CONFIG_IP_NF_MATCH_MULTIPORT=m +CONFIG_IP_NF_MATCH_TOS=m +CONFIG_IP_NF_MATCH_AH_ESP=m +CONFIG_IP_NF_MATCH_LENGTH=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_MATCH_TCPMSS=m +CONFIG_IP_NF_MATCH_STATE=m +CONFIG_IP_NF_MATCH_UNCLEAN=m +CONFIG_IP_NF_MATCH_OWNER=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_MIRROR=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_NAT_NEEDED=y +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_NAT_LOCAL=y +CONFIG_IP_NF_NAT_SNMP_BASIC=m +CONFIG_IP_NF_NAT_IRC=m +CONFIG_IP_NF_NAT_FTP=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_TOS=m +CONFIG_IP_NF_TARGET_MARK=m +CONFIG_IP_NF_TARGET_LOG=m +CONFIG_IP_NF_TARGET_ULOG=m +CONFIG_IP_NF_TARGET_TCPMSS=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_COMPAT_IPCHAINS=m +CONFIG_IP_NF_NAT_NEEDED=y +CONFIG_IP_NF_COMPAT_IPFWADM=m +CONFIG_IP_NF_NAT_NEEDED=y + +# +# IP: Virtual Server Configuration +# +CONFIG_IP_VS=m +# CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=16 +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_FTP=m +CONFIG_IPV6=m + +# +# IPv6: Netfilter Configuration +# +# 
CONFIG_IP6_NF_QUEUE is not set +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_LIMIT=m +CONFIG_IP6_NF_MATCH_MAC=m +CONFIG_IP6_NF_MATCH_MULTIPORT=m +CONFIG_IP6_NF_MATCH_OWNER=m +CONFIG_IP6_NF_MATCH_MARK=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_LOG=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_TARGET_MARK=m +# CONFIG_KHTTPD is not set +CONFIG_ATM=y +CONFIG_ATM_CLIP=y +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +CONFIG_ATM_BR2684_IPFILTER=y +CONFIG_VLAN_8021Q=m +CONFIG_IPX=m +# CONFIG_IPX_INTERN is not set +CONFIG_ATALK=m + +# +# Appletalk devices +# +CONFIG_DEV_APPLETALK=y +CONFIG_LTPC=m +CONFIG_COPS=m +CONFIG_COPS_DAYNA=y +CONFIG_COPS_TANGENT=y +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +CONFIG_IPDDP_DECAP=y +CONFIG_DECNET=m +CONFIG_DECNET_SIOCGIFCONF=y +CONFIG_DECNET_ROUTER=y +CONFIG_DECNET_ROUTE_FWMARK=y +CONFIG_BRIDGE=m +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_LLC is not set +CONFIG_NET_DIVERT=y +# CONFIG_ECONET is not set +CONFIG_WAN_ROUTER=m +# CONFIG_NET_FASTROUTE is not set +# CONFIG_NET_HW_FLOWCONTROL is not set + +# +# QoS and/or fair queueing +# +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_CSZ=m +# CONFIG_NET_SCH_ATM is not set +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_QOS=y +CONFIG_NET_ESTIMATOR=y +CONFIG_NET_CLS=y +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_ROUTE=y +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_POLICE=y + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set + +# +# Telephony Support +# +CONFIG_PHONE=m +CONFIG_PHONE_IXJ=m +CONFIG_PHONE_IXJ_PCMCIA=m + +# +# ATA/IDE/MFM/RLL support +# +CONFIG_IDE=y + +# +# IDE, ATA and ATAPI Block devices +# +CONFIG_BLK_DEV_IDE=y +# CONFIG_BLK_DEV_HD_IDE is not set +# 
CONFIG_BLK_DEV_HD is not set +CONFIG_BLK_DEV_IDEDISK=y +CONFIG_IDEDISK_MULTI_MODE=y +# CONFIG_IDEDISK_STROKE is not set +# CONFIG_BLK_DEV_IDEDISK_VENDOR is not set +# CONFIG_BLK_DEV_IDEDISK_FUJITSU is not set +# CONFIG_BLK_DEV_IDEDISK_IBM is not set +# CONFIG_BLK_DEV_IDEDISK_MAXTOR is not set +# CONFIG_BLK_DEV_IDEDISK_QUANTUM is not set +# CONFIG_BLK_DEV_IDEDISK_SEAGATE is not set +# CONFIG_BLK_DEV_IDEDISK_WD is not set +# CONFIG_BLK_DEV_COMMERIAL is not set +# CONFIG_BLK_DEV_TIVO is not set +CONFIG_BLK_DEV_IDECS=m +CONFIG_BLK_DEV_IDECD=m +CONFIG_BLK_DEV_IDETAPE=m +CONFIG_BLK_DEV_IDEFLOPPY=y +CONFIG_BLK_DEV_IDESCSI=m +# CONFIG_IDE_TASK_IOCTL is not set +CONFIG_BLK_DEV_CMD640=y +# CONFIG_BLK_DEV_CMD640_ENHANCED is not set +CONFIG_BLK_DEV_ISAPNP=y +CONFIG_BLK_DEV_RZ1000=y +CONFIG_BLK_DEV_IDEPCI=y +CONFIG_IDEPCI_SHARE_IRQ=y +CONFIG_BLK_DEV_IDEDMA_PCI=y +# CONFIG_BLK_DEV_OFFBOARD is not set +# CONFIG_BLK_DEV_IDEDMA_FORCED is not set +CONFIG_IDEDMA_PCI_AUTO=y +# CONFIG_IDEDMA_ONLYDISK is not set +CONFIG_BLK_DEV_IDEDMA=y +# CONFIG_IDEDMA_PCI_WIP is not set +# CONFIG_BLK_DEV_IDEDMA_TIMEOUT is not set +# CONFIG_IDEDMA_NEW_DRIVE_LISTINGS is not set +CONFIG_BLK_DEV_ADMA=y +CONFIG_BLK_DEV_AEC62XX=y +CONFIG_AEC62XX_TUNING=y +CONFIG_BLK_DEV_ALI15X3=y +# CONFIG_WDC_ALI15X3 is not set +CONFIG_BLK_DEV_AMD74XX=y +# CONFIG_AMD74XX_OVERRIDE is not set +CONFIG_BLK_DEV_CMD64X=y +CONFIG_BLK_DEV_CMD680=y +CONFIG_BLK_DEV_CY82C693=y +CONFIG_BLK_DEV_CS5530=y +CONFIG_BLK_DEV_HPT34X=y +# CONFIG_HPT34X_AUTODMA is not set +CONFIG_BLK_DEV_HPT366=y +CONFIG_BLK_DEV_PIIX=y +CONFIG_PIIX_TUNING=y +# CONFIG_BLK_DEV_NS87415 is not set +# CONFIG_BLK_DEV_OPTI621 is not set +CONFIG_BLK_DEV_ADMA100=y +CONFIG_BLK_DEV_PDC202XX=y +# CONFIG_PDC202XX_BURST is not set +CONFIG_PDC202XX_FORCE=y +CONFIG_BLK_DEV_SVWKS=y +CONFIG_BLK_DEV_SIS5513=y +CONFIG_BLK_DEV_SLC90E66=y +# CONFIG_BLK_DEV_TRM290 is not set +CONFIG_BLK_DEV_VIA82CXXX=y +CONFIG_BLK_DEV_CENATEK=y +# CONFIG_IDE_CHIPSETS is not set +# 
CONFIG_BLK_DEV_ELEVATOR_NOOP is not set +CONFIG_IDEDMA_AUTO=y +# CONFIG_IDEDMA_IVB is not set +# CONFIG_DMA_NONPCI is not set +CONFIG_BLK_DEV_IDE_MODES=y +CONFIG_BLK_DEV_ATARAID=m +CONFIG_BLK_DEV_ATARAID_PDC=m +CONFIG_BLK_DEV_ATARAID_HPT=m + +# +# SCSI support +# +CONFIG_SCSI=m +CONFIG_BLK_DEV_SD=m +CONFIG_SD_EXTRA_DEVS=40 +CONFIG_CHR_DEV_ST=m +CONFIG_CHR_DEV_OSST=m +CONFIG_BLK_DEV_SR=m +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_SR_EXTRA_DEVS=4 +CONFIG_CHR_DEV_SG=m +# CONFIG_SCSI_DEBUG_QUEUES is not set +# CONFIG_SCSI_MULTI_LUN is not set +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y + +# +# SCSI low-level drivers +# +CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_7000FASST=m +CONFIG_SCSI_ACARD=m +CONFIG_SCSI_AHA152X=m +CONFIG_SCSI_AHA1542=m +CONFIG_SCSI_AHA1740=m +CONFIG_SCSI_AACRAID=m +CONFIG_SCSI_AIC7XXX=m +CONFIG_AIC7XXX_CMDS_PER_DEVICE=253 +CONFIG_AIC7XXX_RESET_DELAY_MS=15000 +# CONFIG_AIC7XXX_PROBE_EISA_VL is not set +# CONFIG_AIC7XXX_BUILD_FIRMWARE is not set +CONFIG_SCSI_AIC79XX=m +CONFIG_AIC79XX_CMDS_PER_DEVICE=253 +CONFIG_AIC79XX_RESET_DELAY_MS=15000 +# CONFIG_AIC79XX_BUILD_FIRMWARE is not set +CONFIG_AIC79XX_ENABLE_RD_STRM=y +# CONFIG_AIC79XX_DEBUG_ENABLE is not set +CONFIG_AIC79XX_DEBUG_MASK=0 +CONFIG_SCSI_AIC7XXX_OLD=m +CONFIG_AIC7XXX_OLD_TCQ_ON_BY_DEFAULT=y +CONFIG_AIC7XXX_OLD_CMDS_PER_DEVICE=32 +CONFIG_AIC7XXX_OLD_PROC_STATS=y +CONFIG_SCSI_DPT_I2O=m +CONFIG_SCSI_ADVANSYS=m +CONFIG_SCSI_IN2000=m +CONFIG_SCSI_AM53C974=m +CONFIG_SCSI_MEGARAID=m +CONFIG_SCSI_BUSLOGIC=m +# CONFIG_SCSI_OMIT_FLASHPOINT is not set +CONFIG_SCSI_CPQFCTS=m +CONFIG_SCSI_DMX3191D=m +CONFIG_SCSI_DTC3280=m +CONFIG_SCSI_EATA=m +CONFIG_SCSI_EATA_TAGGED_QUEUE=y +# CONFIG_SCSI_EATA_LINKED_COMMANDS is not set +CONFIG_SCSI_EATA_MAX_TAGS=16 +CONFIG_SCSI_EATA_DMA=m +CONFIG_SCSI_EATA_PIO=m +CONFIG_SCSI_FUTURE_DOMAIN=m +CONFIG_SCSI_GDTH=m +CONFIG_SCSI_GENERIC_NCR5380=m +# CONFIG_SCSI_GENERIC_NCR53C400 is not set +CONFIG_SCSI_G_NCR5380_PORT=y +# CONFIG_SCSI_G_NCR5380_MEM is not set +CONFIG_SCSI_IPS=m 
+CONFIG_SCSI_INITIO=m +CONFIG_SCSI_INIA100=m +CONFIG_SCSI_PPA=m +CONFIG_SCSI_IMM=m +# CONFIG_SCSI_IZIP_EPP16 is not set +# CONFIG_SCSI_IZIP_SLOW_CTR is not set +CONFIG_SCSI_NCR53C406A=m +CONFIG_SCSI_NCR53C7xx=m +# CONFIG_SCSI_NCR53C7xx_sync is not set +CONFIG_SCSI_NCR53C7xx_FAST=y +CONFIG_SCSI_NCR53C7xx_DISCONNECT=y +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set +CONFIG_SCSI_NCR53C8XX=m +CONFIG_SCSI_SYM53C8XX=m +CONFIG_SCSI_NCR53C8XX_DEFAULT_TAGS=8 +CONFIG_SCSI_NCR53C8XX_MAX_TAGS=32 +CONFIG_SCSI_NCR53C8XX_SYNC=40 +# CONFIG_SCSI_NCR53C8XX_PROFILE is not set +# CONFIG_SCSI_NCR53C8XX_IOMAPPED is not set +# CONFIG_SCSI_NCR53C8XX_PQS_PDS is not set +# CONFIG_SCSI_NCR53C8XX_SYMBIOS_COMPAT is not set +CONFIG_SCSI_PAS16=m +CONFIG_SCSI_PCI2000=m +CONFIG_SCSI_PCI2220I=m +CONFIG_SCSI_PSI240I=m +CONFIG_SCSI_QLOGIC_FAS=m +CONFIG_SCSI_QLOGIC_ISP=m +CONFIG_SCSI_QLOGIC_FC=m +# CONFIG_SCSI_QLOGIC_FC_FIRMWARE is not set +CONFIG_SCSI_QLOGIC_1280=m +CONFIG_SCSI_NEWISP=m +CONFIG_SCSI_SEAGATE=m +CONFIG_SCSI_SIM710=m +CONFIG_SCSI_SYM53C416=m +CONFIG_SCSI_DC390T=m +# CONFIG_SCSI_DC390T_NOGENSUPP is not set +CONFIG_SCSI_T128=m +CONFIG_SCSI_U14_34F=m +# CONFIG_SCSI_U14_34F_LINKED_COMMANDS is not set +CONFIG_SCSI_U14_34F_MAX_TAGS=8 +CONFIG_SCSI_ULTRASTOR=m +CONFIG_SCSI_DEBUG=m + +# +# PCMCIA SCSI adapter support +# +CONFIG_SCSI_PCMCIA=y +CONFIG_PCMCIA_AHA152X=m +CONFIG_PCMCIA_FDOMAIN=m +CONFIG_PCMCIA_NINJA_SCSI=m +CONFIG_PCMCIA_QLOGIC=m + +# +# Fusion MPT device support +# +CONFIG_FUSION=m +# CONFIG_FUSION_BOOT is not set +# CONFIG_FUSION_ISENSE is not set +CONFIG_FUSION_CTL=m +CONFIG_FUSION_LAN=m +CONFIG_NET_FC=y + +# +# IEEE 1394 (FireWire) support (EXPERIMENTAL) +# +CONFIG_IEEE1394=m +# CONFIG_IEEE1394_PCILYNX is not set +CONFIG_IEEE1394_OHCI1394=m +CONFIG_IEEE1394_VIDEO1394=m +CONFIG_IEEE1394_SBP2=m +CONFIG_IEEE1394_ETH1394=m 
+CONFIG_IEEE1394_DV1394=m +CONFIG_IEEE1394_RAWIO=m +CONFIG_IEEE1394_CMP=m +# CONFIG_IEEE1394_VERBOSEDEBUG is not set + +# +# I2O device support +# +CONFIG_I2O=m +CONFIG_I2O_PCI=m +CONFIG_I2O_BLOCK=m +CONFIG_I2O_LAN=m +CONFIG_I2O_SCSI=m +CONFIG_I2O_PROC=m + +# +# Network device support +# +CONFIG_NETDEVICES=y + +# +# ARCnet devices +# +# CONFIG_ARCNET is not set +CONFIG_DUMMY=m +CONFIG_BONDING=m +CONFIG_EQUALIZER=m +CONFIG_TUN=m +CONFIG_ETHERTAP=m +CONFIG_NET_SB1000=m + +# +# Ethernet (10 or 100Mbit) +# +CONFIG_NET_ETHERNET=y +# CONFIG_SUNLANCE is not set +CONFIG_HAPPYMEAL=m +# CONFIG_SUNBMAC is not set +# CONFIG_SUNQE is not set +CONFIG_SUNGEM=m +CONFIG_NET_VENDOR_3COM=y +CONFIG_EL1=m +CONFIG_EL2=m +CONFIG_ELPLUS=m +CONFIG_EL16=m +CONFIG_EL3=m +CONFIG_3C515=m +# CONFIG_ELMC is not set +# CONFIG_ELMC_II is not set +CONFIG_VORTEX=m +CONFIG_LANCE=m +CONFIG_NET_VENDOR_SMC=y +CONFIG_WD80x3=m +# CONFIG_ULTRAMCA is not set +CONFIG_ULTRA=m +CONFIG_ULTRA32=m +CONFIG_SMC9194=m +CONFIG_NET_VENDOR_RACAL=y +CONFIG_NI5010=m +CONFIG_NI52=m +CONFIG_NI65=m +CONFIG_AT1700=m +CONFIG_DEPCA=m +CONFIG_HP100=m +CONFIG_NET_ISA=y +CONFIG_E2100=m +CONFIG_EWRK3=m +CONFIG_EEXPRESS=m +CONFIG_EEXPRESS_PRO=m +CONFIG_HPLAN_PLUS=m +CONFIG_HPLAN=m +CONFIG_LP486E=m +CONFIG_ETH16I=m +CONFIG_NE2000=m +CONFIG_NET_PCI=y +CONFIG_PCNET32=m +CONFIG_ADAPTEC_STARFIRE=m +CONFIG_AC3200=m +CONFIG_APRICOT=m +CONFIG_CS89x0=m +CONFIG_TULIP=m +CONFIG_TC35815=m +# CONFIG_TULIP_MWI is not set +CONFIG_TULIP_MMIO=y +CONFIG_DE4X5=m +CONFIG_DGRS=m +CONFIG_DM9102=m +CONFIG_EEPRO100=m +CONFIG_NET_E100=m +CONFIG_LNE390=m +CONFIG_FEALNX=m +CONFIG_NATSEMI=m +# CONFIG_NATSEMI_CABLE_MAGIC is not set +CONFIG_NE2K_PCI=m +CONFIG_NE3210=m +CONFIG_ES3210=m +CONFIG_8139CP=m +CONFIG_8139TOO=m +# CONFIG_8139TOO_PIO is not set +# CONFIG_8139TOO_TUNE_TWISTER is not set +CONFIG_8139TOO_8129=y +# CONFIG_8139_NEW_RX_RESET is not set +CONFIG_SIS900=m +CONFIG_SIS900_OLD=m +CONFIG_EPIC100=m +CONFIG_SUNDANCE=m +CONFIG_TLAN=m +CONFIG_VIA_RHINE=m 
+# CONFIG_VIA_RHINE_MMIO is not set +CONFIG_WINBOND_840=m +CONFIG_NET_POCKET=y +CONFIG_ATP=m +CONFIG_DE600=m +CONFIG_DE620=m + +# +# Ethernet (1000 Mbit) +# +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +CONFIG_DL2K=m +# CONFIG_MYRI_SBUS is not set +CONFIG_NS83820=m +CONFIG_HAMACHI=m +CONFIG_YELLOWFIN=m +CONFIG_SK98LIN=m +CONFIG_NET_BROADCOM=m +CONFIG_TIGON3=m +CONFIG_NET_E1000=m +CONFIG_FDDI=y +CONFIG_DEFXX=m +CONFIG_SKFP=m +CONFIG_NETCONSOLE=m +# CONFIG_HIPPI is not set +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPP_FILTER=y +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_PPP_DEFLATE=m +# CONFIG_PPP_BSDCOMP is not set +# CONFIG_PPPOE is not set +CONFIG_PPPOATM=m +CONFIG_SLIP=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y + +# +# Wireless LAN (non-hamradio) +# +CONFIG_NET_RADIO=y +CONFIG_STRIP=m +CONFIG_WAVELAN=m +CONFIG_ARLAN=m +CONFIG_AIRONET4500=m +CONFIG_AIRONET4500_NONCS=m +CONFIG_AIRONET4500_PNP=y +CONFIG_AIRONET4500_PCI=y +CONFIG_AIRONET4500_ISA=y +CONFIG_AIRONET4500_I365=y +CONFIG_AIRONET4500_PROC=m +CONFIG_AIRO=m +CONFIG_HERMES=m +CONFIG_PLX_HERMES=m +CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +CONFIG_AIRO_CS=m +CONFIG_NET_WIRELESS=y +CONFIG_PCMCIA_HERMES_OLD=m + +# +# Token Ring devices +# +CONFIG_TR=y +CONFIG_IBMTR=m +CONFIG_IBMOL=m +CONFIG_IBMLS=m +CONFIG_3C359=m +CONFIG_TMS380TR=m +CONFIG_TMSPCI=m +CONFIG_TMSISA=m +CONFIG_ABYSS=m +# CONFIG_MADGEMC is not set +CONFIG_SMCTR=m +CONFIG_NET_FC=y +CONFIG_IPHASE5526=m +CONFIG_RCPCI=m +CONFIG_SHAPER=m + +# +# Wan interfaces +# +CONFIG_WAN=y +CONFIG_HOSTESS_SV11=m +CONFIG_COSA=m +# CONFIG_COMX is not set +# CONFIG_DSCC4 is not set +CONFIG_FARSYNC=m +# CONFIG_LANMEDIA is not set +CONFIG_ATI_XX20=m +CONFIG_SEALEVEL_4021=m +# CONFIG_SYNCLINK_SYNCPPP is not set +# CONFIG_HDLC is not set +CONFIG_DLCI=m +CONFIG_DLCI_COUNT=24 +CONFIG_DLCI_MAX=8 +CONFIG_SDLA=m +CONFIG_WAN_ROUTER_DRIVERS=y +CONFIG_VENDOR_SANGOMA=m +CONFIG_WANPIPE_CHDLC=y +CONFIG_WANPIPE_FR=y 
+CONFIG_WANPIPE_X25=y +CONFIG_WANPIPE_PPP=y +CONFIG_WANPIPE_MULTPPP=y +CONFIG_CYCLADES_SYNC=m +CONFIG_CYCLOMX_X25=y +# CONFIG_LAPBETHER is not set +# CONFIG_X25_ASY is not set +CONFIG_SBNI=m +CONFIG_SBNI_MULTILINE=y + +# +# PCMCIA network device support +# +CONFIG_NET_PCMCIA=y +CONFIG_PCMCIA_3C589=m +CONFIG_PCMCIA_3C574=m +CONFIG_PCMCIA_FMVJ18X=m +CONFIG_PCMCIA_PCNET=m +CONFIG_PCMCIA_AXNET=m +CONFIG_PCMCIA_NMCLAN=m +CONFIG_PCMCIA_SMC91C92=m +CONFIG_PCMCIA_XIRC2PS=m +# CONFIG_ARCNET_COM20020_CS is not set +CONFIG_PCMCIA_IBMTR=m +CONFIG_PCMCIA_XIRCOM=m +CONFIG_PCMCIA_XIRTULIP=m +CONFIG_NET_PCMCIA_RADIO=y +CONFIG_PCMCIA_RAYCS=m +CONFIG_PCMCIA_NETWAVE=m +CONFIG_PCMCIA_WAVELAN=m +CONFIG_PCMCIA_WVLAN=m +CONFIG_AIRONET4500_CS=m + +# +# Quadrics Supercomputers +# + +# +# QsNet +# +CONFIG_QUADRICS=y +CONFIG_QSNETMOD=m +CONFIG_ELAN3MOD=m +CONFIG_EPMOD=m +CONFIG_EIPMOD=m +CONFIG_RMSMOD=m +CONFIG_JTAG=m + +# +# QsNet II +# + +# +# ATM drivers +# +CONFIG_ATM_TCP=m +CONFIG_ATM_LANAI=m +CONFIG_ATM_ENI=m +# CONFIG_ATM_ENI_DEBUG is not set +# CONFIG_ATM_ENI_TUNE_BURST is not set +CONFIG_ATM_FIRESTREAM=m +CONFIG_ATM_ZATM=m +# CONFIG_ATM_ZATM_DEBUG is not set +CONFIG_ATM_ZATM_EXACT_TS=y +CONFIG_ATM_NICSTAR=m +CONFIG_ATM_NICSTAR_USE_SUNI=y +CONFIG_ATM_NICSTAR_USE_IDT77105=y +CONFIG_ATM_IDT77252=m +# CONFIG_ATM_IDT77252_DEBUG is not set +# CONFIG_ATM_IDT77252_RCV_ALL is not set +CONFIG_ATM_IDT77252_USE_SUNI=y +CONFIG_ATM_AMBASSADOR=m +# CONFIG_ATM_AMBASSADOR_DEBUG is not set +CONFIG_ATM_HORIZON=m +# CONFIG_ATM_HORIZON_DEBUG is not set +CONFIG_ATM_IA=m +# CONFIG_ATM_IA_DEBUG is not set +CONFIG_ATM_FORE200E_MAYBE=m +CONFIG_ATM_FORE200E_PCA=y +CONFIG_ATM_FORE200E_PCA_DEFAULT_FW=y +CONFIG_ATM_FORE200E_TX_RETRY=16 +CONFIG_ATM_FORE200E_DEBUG=0 +CONFIG_ATM_FORE200E=m + +# +# Amateur Radio support +# +CONFIG_HAMRADIO=y +CONFIG_AX25=m +# CONFIG_AX25_DAMA_SLAVE is not set +CONFIG_NETROM=m +CONFIG_ROSE=m + +# +# AX.25 network device drivers +# +# CONFIG_MKISS is not set +# CONFIG_6PACK is not set 
+# CONFIG_BPQETHER is not set +# CONFIG_DMASCC is not set +# CONFIG_SCC is not set +# CONFIG_BAYCOM_SER_FDX is not set +# CONFIG_BAYCOM_SER_HDX is not set +# CONFIG_BAYCOM_PAR is not set +# CONFIG_BAYCOM_EPP is not set +CONFIG_SOUNDMODEM=m +CONFIG_SOUNDMODEM_SBC=y +CONFIG_SOUNDMODEM_WSS=y +CONFIG_SOUNDMODEM_AFSK1200=y +CONFIG_SOUNDMODEM_AFSK2400_7=y +CONFIG_SOUNDMODEM_AFSK2400_8=y +CONFIG_SOUNDMODEM_AFSK2666=y +CONFIG_SOUNDMODEM_HAPN4800=y +CONFIG_SOUNDMODEM_PSK4800=y +CONFIG_SOUNDMODEM_FSK9600=y +# CONFIG_YAM is not set + +# +# IrDA (infrared) support +# +CONFIG_IRDA=m +CONFIG_IRLAN=m +CONFIG_IRNET=m +CONFIG_IRCOMM=m +CONFIG_IRDA_ULTRA=y +CONFIG_IRDA_CACHE_LAST_LSAP=y +CONFIG_IRDA_FAST_RR=y +# CONFIG_IRDA_DEBUG is not set + +# +# Infrared-port device drivers +# +CONFIG_IRTTY_SIR=m +CONFIG_IRPORT_SIR=m +CONFIG_DONGLE=y +CONFIG_ESI_DONGLE=m +CONFIG_ACTISYS_DONGLE=m +CONFIG_TEKRAM_DONGLE=m +CONFIG_GIRBIL_DONGLE=m +CONFIG_LITELINK_DONGLE=m +CONFIG_OLD_BELKIN_DONGLE=m +CONFIG_USB_IRDA=m +CONFIG_NSC_FIR=m +CONFIG_WINBOND_FIR=m +CONFIG_TOSHIBA_FIR=m +CONFIG_SMC_IRCC_FIR=m +CONFIG_ALI_FIR=m +CONFIG_VLSI_FIR=m + +# +# ISDN subsystem +# +CONFIG_ISDN=m +CONFIG_ISDN_BOOL=y +CONFIG_ISDN_PPP=y +CONFIG_ISDN_PPP_VJ=y +CONFIG_ISDN_MPP=y +CONFIG_ISDN_PPP_BSDCOMP=m +CONFIG_ISDN_AUDIO=y +CONFIG_ISDN_TTY_FAX=y + +# +# ISDN feature submodules +# +CONFIG_ISDN_DRV_LOOP=m +# CONFIG_ISDN_DIVERSION is not set + +# +# Passive ISDN cards +# +CONFIG_ISDN_DRV_HISAX=m +CONFIG_ISDN_HISAX=y +CONFIG_HISAX_EURO=y +CONFIG_DE_AOC=y +# CONFIG_HISAX_NO_SENDCOMPLETE is not set +# CONFIG_HISAX_NO_LLC is not set +# CONFIG_HISAX_NO_KEYPAD is not set +CONFIG_HISAX_1TR6=y +CONFIG_HISAX_NI1=y +CONFIG_HISAX_MAX_CARDS=8 +CONFIG_HISAX_16_0=y +CONFIG_HISAX_16_3=y +CONFIG_HISAX_TELESPCI=y +CONFIG_HISAX_S0BOX=y +CONFIG_HISAX_AVM_A1=y +CONFIG_HISAX_FRITZPCI=y +CONFIG_HISAX_AVM_A1_PCMCIA=y +CONFIG_HISAX_ELSA=y +CONFIG_HISAX_IX1MICROR2=y +CONFIG_HISAX_DIEHLDIVA=y +CONFIG_HISAX_ASUSCOM=y +CONFIG_HISAX_TELEINT=y 
+CONFIG_HISAX_HFCS=y +CONFIG_HISAX_SEDLBAUER=y +CONFIG_HISAX_SPORTSTER=y +CONFIG_HISAX_MIC=y +CONFIG_HISAX_NETJET=y +CONFIG_HISAX_NETJET_U=y +CONFIG_HISAX_NICCY=y +CONFIG_HISAX_ISURF=y +CONFIG_HISAX_HSTSAPHIR=y +CONFIG_HISAX_BKM_A4T=y +CONFIG_HISAX_SCT_QUADRO=y +CONFIG_HISAX_GAZEL=y +CONFIG_HISAX_HFC_PCI=y +CONFIG_HISAX_W6692=y +CONFIG_HISAX_HFC_SX=y +CONFIG_HISAX_DEBUG=y +CONFIG_HISAX_SEDLBAUER_CS=m +CONFIG_HISAX_ELSA_CS=m +CONFIG_HISAX_AVM_A1_CS=m +CONFIG_HISAX_ST5481=m +CONFIG_HISAX_FRITZ_PCIPNP=m + +# +# Active ISDN cards +# +CONFIG_ISDN_DRV_ICN=m +CONFIG_ISDN_DRV_PCBIT=m +# CONFIG_ISDN_DRV_SC is not set +# CONFIG_ISDN_DRV_ACT2000 is not set +CONFIG_ISDN_DRV_EICON=y +CONFIG_ISDN_DRV_EICON_DIVAS=m +# CONFIG_ISDN_DRV_EICON_OLD is not set +CONFIG_ISDN_DRV_TPAM=m +CONFIG_ISDN_CAPI=m +CONFIG_ISDN_DRV_AVMB1_VERBOSE_REASON=y +CONFIG_ISDN_CAPI_MIDDLEWARE=y +CONFIG_ISDN_CAPI_CAPI20=m +CONFIG_ISDN_CAPI_CAPIFS_BOOL=y +CONFIG_ISDN_CAPI_CAPIFS=m +CONFIG_ISDN_CAPI_CAPIDRV=m +CONFIG_ISDN_DRV_AVMB1_B1ISA=m +CONFIG_ISDN_DRV_AVMB1_B1PCI=m +CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y +CONFIG_ISDN_DRV_AVMB1_T1ISA=m +CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m +CONFIG_ISDN_DRV_AVMB1_AVM_CS=m +CONFIG_ISDN_DRV_AVMB1_T1PCI=m +CONFIG_ISDN_DRV_AVMB1_C4=m +CONFIG_HYSDN=m +CONFIG_HYSDN_CAPI=y +CONFIG_KALLSYMS=y + +# +# Old CD-ROM drivers (not SCSI, not IDE) +# +# CONFIG_CD_NO_IDESCSI is not set + +# +# Input core support +# +CONFIG_INPUT=m +CONFIG_INPUT_KEYBDEV=m +CONFIG_INPUT_MOUSEDEV=m +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_EVDEV=m + +# +# Character devices +# +CONFIG_ECC=m +# CONFIG_CHAOSTEST is not set +# CONFIG_P4THERM is not set +CONFIG_VT=y +CONFIG_VT_CONSOLE=y +CONFIG_SERIAL=y +CONFIG_SERIAL_CONSOLE=y +CONFIG_SERIAL_EXTENDED=y +CONFIG_SERIAL_MANY_PORTS=y +CONFIG_SERIAL_SHARE_IRQ=y +# CONFIG_SERIAL_DETECT_IRQ is not set +CONFIG_SERIAL_MULTIPORT=y +# CONFIG_HUB6 is not set +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_COMPUTONE=m 
+CONFIG_ROCKETPORT=m +CONFIG_CYCLADES=m +# CONFIG_CYZ_INTR is not set +CONFIG_DIGIEPCA=m +CONFIG_ESPSERIAL=m +CONFIG_MOXA_INTELLIO=m +CONFIG_MOXA_SMARTIO=m +CONFIG_ISI=m +CONFIG_SYNCLINK=m +CONFIG_N_HDLC=m +CONFIG_RISCOM8=m +CONFIG_SPECIALIX=m +CONFIG_SPECIALIX_RTSCTS=y +CONFIG_SX=m +# CONFIG_RIO is not set +CONFIG_STALDRV=y +CONFIG_STALLION=m +CONFIG_ISTALLION=m +CONFIG_UNIX98_PTYS=y +CONFIG_UNIX98_PTY_COUNT=512 +CONFIG_PRINTER=m +CONFIG_LP_CONSOLE=y +CONFIG_PPDEV=m + +# +# I2C support +# +CONFIG_I2C=m +CONFIG_I2C_ALGOBIT=m +CONFIG_I2C_PHILIPSPAR=m +CONFIG_I2C_ELV=m +CONFIG_I2C_VELLEMAN=m +CONFIG_I2C_ALGOPCF=m +CONFIG_I2C_ELEKTOR=m +CONFIG_I2C_MAINBOARD=y +CONFIG_I2C_ALI1535=m +CONFIG_I2C_ALI15X3=m +CONFIG_I2C_HYDRA=m +CONFIG_I2C_AMD756=m +# CONFIG_I2C_TSUNAMI is not set +CONFIG_I2C_I801=m +CONFIG_I2C_I810=m +CONFIG_I2C_PIIX4=m +CONFIG_I2C_SIS5595=m +CONFIG_I2C_VIA=m +CONFIG_I2C_VIAPRO=m +CONFIG_I2C_VOODOO3=m +CONFIG_I2C_ISA=m +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_PROC=m + +# +# Hardware sensors support +# +CONFIG_SENSORS=y +CONFIG_SENSORS_ADM1021=m +CONFIG_SENSORS_ADM1024=m +CONFIG_SENSORS_ADM1025=m +CONFIG_SENSORS_ADM9240=m +CONFIG_SENSORS_DS1621=m +CONFIG_SENSORS_FSCPOS=m +CONFIG_SENSORS_FSCSCY=m +CONFIG_SENSORS_GL518SM=m +CONFIG_SENSORS_GL520SM=m +CONFIG_SENSORS_MAXILIFE=m +CONFIG_SENSORS_IT87=m +CONFIG_SENSORS_MTP008=m +CONFIG_SENSORS_LM75=m +CONFIG_SENSORS_LM78=m +CONFIG_SENSORS_LM80=m +CONFIG_SENSORS_LM87=m +CONFIG_SENSORS_SIS5595=m +CONFIG_SENSORS_THMC50=m +CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_W83781D=m +CONFIG_SENSORS_OTHER=y +CONFIG_SENSORS_BT869=m +CONFIG_SENSORS_DDCMON=m +CONFIG_SENSORS_EEPROM=m +CONFIG_SENSORS_MATORB=m + +# +# Mice +# +CONFIG_BUSMOUSE=m +CONFIG_ATIXL_BUSMOUSE=m +CONFIG_LOGIBUSMOUSE=m +CONFIG_MS_BUSMOUSE=m +CONFIG_MOUSE=y +CONFIG_PSMOUSE=y +CONFIG_82C710_MOUSE=m +CONFIG_PC110_PAD=m +CONFIG_MK712_MOUSE=m + +# +# Joysticks +# +CONFIG_INPUT_GAMEPORT=m +CONFIG_INPUT_NS558=m +CONFIG_INPUT_LIGHTNING=m +CONFIG_INPUT_PCIGAME=m 
+CONFIG_INPUT_CS461X=m +CONFIG_INPUT_EMU10K1=m +CONFIG_INPUT_SERIO=m +CONFIG_INPUT_SERPORT=m +CONFIG_INPUT_ANALOG=m +CONFIG_INPUT_A3D=m +CONFIG_INPUT_ADI=m +CONFIG_INPUT_COBRA=m +CONFIG_INPUT_GF2K=m +CONFIG_INPUT_GRIP=m +CONFIG_INPUT_INTERACT=m +CONFIG_INPUT_TMDC=m +CONFIG_INPUT_SIDEWINDER=m +CONFIG_INPUT_IFORCE_USB=m +CONFIG_INPUT_IFORCE_232=m +CONFIG_INPUT_WARRIOR=m +CONFIG_INPUT_MAGELLAN=m +CONFIG_INPUT_SPACEORB=m +CONFIG_INPUT_SPACEBALL=m +CONFIG_INPUT_STINGER=m +CONFIG_INPUT_DB9=m +CONFIG_INPUT_GAMECON=m +CONFIG_INPUT_TURBOGRAFX=m +# CONFIG_QIC02_TAPE is not set + +# +# Watchdog Cards +# +CONFIG_WATCHDOG=y +# CONFIG_WATCHDOG_NOWAYOUT is not set +CONFIG_ACQUIRE_WDT=m +CONFIG_ADVANTECH_WDT=m +CONFIG_ALIM7101_WDT=m +CONFIG_SC520_WDT=m +CONFIG_PCWATCHDOG=m +CONFIG_EUROTECH_WDT=m +CONFIG_IB700_WDT=m +CONFIG_WAFER_WDT=m +CONFIG_I810_TCO=m +# CONFIG_MIXCOMWD is not set +# CONFIG_60XX_WDT is not set +CONFIG_SC1200_WDT=m +CONFIG_SOFT_WATCHDOG=m +CONFIG_W83877F_WDT=m +CONFIG_WDT=m +CONFIG_WDTPCI=m +# CONFIG_WDT_501 is not set +CONFIG_MACHZ_WDT=m +CONFIG_AMD7XX_TCO=m +CONFIG_AMD_RNG=m +CONFIG_INTEL_RNG=m +CONFIG_AMD_PM768=m +CONFIG_NVRAM=m +CONFIG_RTC=y +CONFIG_DTLK=m +CONFIG_R3964=m +# CONFIG_APPLICOM is not set +CONFIG_SONYPI=m + +# +# Ftape, the floppy tape device driver +# +CONFIG_FTAPE=m +CONFIG_ZFTAPE=m +CONFIG_ZFT_DFLT_BLK_SZ=10240 +CONFIG_ZFT_COMPRESSOR=m +CONFIG_FT_NR_BUFFERS=3 +# CONFIG_FT_PROC_FS is not set +CONFIG_FT_NORMAL_DEBUG=y +# CONFIG_FT_FULL_DEBUG is not set +# CONFIG_FT_NO_TRACE is not set +# CONFIG_FT_NO_TRACE_AT_ALL is not set +CONFIG_FT_STD_FDC=y +# CONFIG_FT_MACH2 is not set +# CONFIG_FT_PROBE_FC10 is not set +# CONFIG_FT_ALT_FDC is not set +CONFIG_FT_FDC_THR=8 +CONFIG_FT_FDC_MAX_RATE=2000 +CONFIG_FT_ALPHA_CLOCK=0 +CONFIG_AGP=m +CONFIG_AGP_INTEL=y +CONFIG_AGP_I810=y +CONFIG_AGP_VIA=y +CONFIG_AGP_AMD=y +CONFIG_AGP_SIS=y +CONFIG_AGP_ALI=y +CONFIG_AGP_SWORKS=y +# CONFIG_DRM is not set + +# +# PCMCIA character devices +# +CONFIG_PCMCIA_SERIAL_CS=m 
+CONFIG_SYNCLINK_CS=m +CONFIG_MWAVE=m +CONFIG_BATTERY_GERICOM=m + +# +# Multimedia devices +# +CONFIG_VIDEO_DEV=m + +# +# Video For Linux +# +CONFIG_VIDEO_PROC_FS=y +CONFIG_I2C_PARPORT=m +CONFIG_VIDEO_BT848=m +# CONFIG_VIDEO_LS220 is not set +# CONFIG_VIDEO_MARGI is not set +CONFIG_VIDEO_PMS=m +CONFIG_VIDEO_BWQCAM=m +CONFIG_VIDEO_CQCAM=m +CONFIG_VIDEO_W9966=m +CONFIG_VIDEO_CPIA=m +CONFIG_VIDEO_CPIA_PP=m +CONFIG_VIDEO_CPIA_USB=m +CONFIG_VIDEO_SAA5249=m +CONFIG_TUNER_3036=m +CONFIG_VIDEO_STRADIS=m +CONFIG_VIDEO_ZORAN=m +CONFIG_VIDEO_ZORAN_BUZ=m +CONFIG_VIDEO_ZORAN_DC10=m +CONFIG_VIDEO_ZORAN_LML33=m +CONFIG_VIDEO_ZR36120=m +CONFIG_VIDEO_MEYE=m + +# +# Radio Adapters +# +CONFIG_RADIO_CADET=m +CONFIG_RADIO_RTRACK=m +CONFIG_RADIO_RTRACK2=m +CONFIG_RADIO_AZTECH=m +CONFIG_RADIO_GEMTEK=m +CONFIG_RADIO_GEMTEK_PCI=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_MAESTRO=m +CONFIG_RADIO_MIROPCM20=m +CONFIG_RADIO_MIROPCM20_RDS=m +CONFIG_RADIO_SF16FMI=m +CONFIG_RADIO_TERRATEC=m +CONFIG_RADIO_TRUST=m +CONFIG_RADIO_TYPHOON=m +CONFIG_RADIO_TYPHOON_PROC_FS=y +CONFIG_RADIO_ZOLTRIX=m + +# +# Crypto Hardware support +# +CONFIG_CRYPTO=m +CONFIG_CRYPTO_BROADCOM=m + +# +# File systems +# +CONFIG_QUOTA=y +CONFIG_AUTOFS_FS=m +CONFIG_AUTOFS4_FS=m +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +# CONFIG_ADFS_FS is not set +# CONFIG_AFS_FS is not set +# CONFIG_ADFS_FS_RW is not set +# CONFIG_AFFS_FS is not set +CONFIG_HFS_FS=m +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +CONFIG_BFS_FS=m +CONFIG_EXT3_FS=m +CONFIG_JBD=m +# CONFIG_JBD_DEBUG is not set +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_UMSDOS_FS=m +CONFIG_VFAT_FS=m +# CONFIG_EFS_FS is not set +# CONFIG_JFFS_FS is not set +# CONFIG_JFFS2_FS is not set +CONFIG_CRAMFS=m +CONFIG_TMPFS=y +CONFIG_RAMFS=y +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_JFS_FS=m +CONFIG_JFS_DEBUG=y +# CONFIG_JFS_STATISTICS is not set +CONFIG_MINIX_FS=m +CONFIG_VXFS_FS=m +# CONFIG_NTFS_FS is not set +# 
CONFIG_NTFS_RW is not set +# CONFIG_HPFS_FS is not set +CONFIG_PROC_FS=y +# CONFIG_DEVFS_FS is not set +# CONFIG_DEVFS_MOUNT is not set +# CONFIG_DEVFS_DEBUG is not set +CONFIG_DEVPTS_FS=y +# CONFIG_QNX4FS_FS is not set +# CONFIG_QNX4FS_RW is not set +CONFIG_ROMFS_FS=m +CONFIG_EXT2_FS=y +CONFIG_SYSV_FS=m +CONFIG_UDF_FS=m +CONFIG_UDF_RW=y +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set + +# +# Network File Systems +# +CONFIG_CODA_FS=m +CONFIG_INTERMEZZO_FS=m +CONFIG_NFS_FS=m +CONFIG_NFS_V3=y +# CONFIG_ROOT_NFS is not set +CONFIG_NFSD=m +CONFIG_NFSD_V3=y +# CONFIG_NFSD_TCP is not set +CONFIG_SUNRPC=m +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +CONFIG_SMB_FS=m +# CONFIG_SMB_NLS_DEFAULT is not set +CONFIG_NCP_FS=m +CONFIG_NCPFS_PACKET_SIGNING=y +CONFIG_NCPFS_IOCTL_LOCKING=y +CONFIG_NCPFS_STRONG=y +CONFIG_NCPFS_NFS_NS=y +CONFIG_NCPFS_OS2_NS=y +CONFIG_NCPFS_SMALLDOS=y +CONFIG_NCPFS_NLS=y +CONFIG_NCPFS_EXTRAS=y +# CONFIG_PFS_FS is not set +CONFIG_ZISOFS_FS=y + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_OSF_PARTITION=y +# CONFIG_AMIGA_PARTITION is not set +# CONFIG_ATARI_PARTITION is not set +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +# CONFIG_LDM_PARTITION is not set +CONFIG_SGI_PARTITION=y +# CONFIG_ULTRIX_PARTITION is not set +CONFIG_SUN_PARTITION=y +CONFIG_SMB_NLS=y +CONFIG_NLS=y + +# +# Native Language Support +# +CONFIG_NLS_DEFAULT="iso8859-1" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m 
+CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_UTF8=m + +# +# Console drivers +# +CONFIG_VGA_CONSOLE=y +CONFIG_VIDEO_SELECT=y +# CONFIG_VIDEO_IGNORE_BAD_MODE is not set +CONFIG_MDA_CONSOLE=m + +# +# Frame-buffer support +# +CONFIG_FB=y +CONFIG_DUMMY_CONSOLE=y +CONFIG_FB_RIVA=m +CONFIG_FB_CLGEN=m +CONFIG_FB_PM2=m +# CONFIG_FB_PM2_FIFO_DISCONNECT is not set +CONFIG_FB_PM2_PCI=y +CONFIG_FB_PM3=m +# CONFIG_FB_CYBER2000 is not set +CONFIG_FB_VESA=y +# CONFIG_FB_VGA16 is not set +CONFIG_FB_HGA=m +CONFIG_VIDEO_SELECT=y +CONFIG_FB_MATROX=m +CONFIG_FB_MATROX_MILLENIUM=y +CONFIG_FB_MATROX_MYSTIQUE=y +CONFIG_FB_MATROX_G100=y +CONFIG_FB_MATROX_I2C=m +CONFIG_FB_MATROX_MAVEN=m +# CONFIG_FB_MATROX_G450 is not set +# CONFIG_FB_MATROX_PROC is not set +CONFIG_FB_MATROX_MULTIHEAD=y +CONFIG_FB_ATY=m +CONFIG_FB_ATY_GX=y +CONFIG_FB_ATY_CT=y +CONFIG_FB_RADEON=m +CONFIG_FB_ATY128=m +CONFIG_FB_SIS=m +CONFIG_FB_SIS_300=y +CONFIG_FB_SIS_315=y +CONFIG_FB_NEOMAGIC=m +CONFIG_FB_3DFX=m +CONFIG_FB_VOODOO1=m +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_VIRTUAL is not set +# CONFIG_FBCON_ADVANCED is not set +CONFIG_FBCON_MFB=m +CONFIG_FBCON_CFB8=y +CONFIG_FBCON_CFB16=y +CONFIG_FBCON_CFB24=y +CONFIG_FBCON_CFB32=y +CONFIG_FBCON_HGA=m +# CONFIG_FBCON_FONTWIDTH8_ONLY is not set +# CONFIG_FBCON_FONTS is not set +CONFIG_FONT_8x8=y +CONFIG_FONT_8x16=y +CONFIG_SPEAKUP=y +CONFIG_SPEAKUP_ACNTSA=y +CONFIG_SPEAKUP_ACNTPC=y +CONFIG_SPEAKUP_APOLO=y +CONFIG_SPEAKUP_AUDPTR=y +CONFIG_SPEAKUP_BNS=y +CONFIG_SPEAKUP_DECTLK=y +CONFIG_SPEAKUP_DECEXT=y +CONFIG_SPEAKUP_DTLK=y 
+CONFIG_SPEAKUP_LTLK=y +CONFIG_SPEAKUP_SPKOUT=y +CONFIG_SPEAKUP_TXPRT=y +CONFIG_SPEAKUP_DEFAULT="none" +# CONFIG_SPEAKUP_KEYMAP is not set + +# +# Sound +# +CONFIG_SOUND=m +CONFIG_SOUND_BT878=m +CONFIG_SOUND_CMPCI=m +CONFIG_SOUND_CMPCI_FM=y +CONFIG_SOUND_CMPCI_FMIO=388 +CONFIG_SOUND_CMPCI_FMIO=388 +CONFIG_SOUND_CMPCI_MIDI=y +CONFIG_SOUND_CMPCI_MPUIO=330 +CONFIG_SOUND_CMPCI_JOYSTICK=y +CONFIG_SOUND_CMPCI_CM8738=y +# CONFIG_SOUND_CMPCI_SPDIFINVERSE is not set +CONFIG_SOUND_CMPCI_SPDIFLOOP=y +CONFIG_SOUND_CMPCI_SPEAKERS=2 +CONFIG_SOUND_EMU10K1=m +CONFIG_MIDI_EMU10K1=y +CONFIG_SOUND_AUDIGY=m +CONFIG_SOUND_FUSION=m +CONFIG_SOUND_CS4281=m +CONFIG_SOUND_ES1370=m +CONFIG_SOUND_ES1371=m +CONFIG_SOUND_ESSSOLO1=m +CONFIG_SOUND_MAESTRO=m +CONFIG_SOUND_MAESTRO3=m +CONFIG_SOUND_ICH=m +CONFIG_SOUND_RME96XX=m +CONFIG_SOUND_SONICVIBES=m +CONFIG_SOUND_TRIDENT=m +CONFIG_SOUND_MSNDCLAS=m +# CONFIG_MSNDCLAS_HAVE_BOOT is not set +CONFIG_MSNDCLAS_INIT_FILE="/etc/sound/msndinit.bin" +CONFIG_MSNDCLAS_PERM_FILE="/etc/sound/msndperm.bin" +CONFIG_SOUND_MSNDPIN=m +# CONFIG_MSNDPIN_HAVE_BOOT is not set +CONFIG_MSNDPIN_INIT_FILE="/etc/sound/pndspini.bin" +CONFIG_MSNDPIN_PERM_FILE="/etc/sound/pndsperm.bin" +CONFIG_SOUND_VIA82CXXX=m +CONFIG_MIDI_VIA82CXXX=y +CONFIG_SOUND_OSS=m +# CONFIG_SOUND_TRACEINIT is not set +CONFIG_SOUND_DMAP=y +CONFIG_SOUND_AD1816=m +CONFIG_SOUND_SGALAXY=m +CONFIG_SOUND_ADLIB=m +CONFIG_SOUND_ACI_MIXER=m +CONFIG_SOUND_CS4232=m +CONFIG_SOUND_SSCAPE=m +CONFIG_SOUND_GUS=m +CONFIG_SOUND_GUS16=y +CONFIG_SOUND_GUSMAX=y +CONFIG_SOUND_VMIDI=m +CONFIG_SOUND_TRIX=m +CONFIG_SOUND_MSS=m +CONFIG_SOUND_MPU401=m +CONFIG_SOUND_NM256=m +CONFIG_SOUND_MAD16=m +CONFIG_MAD16_OLDCARD=y +CONFIG_SOUND_PAS=m +# CONFIG_PAS_JOYSTICK is not set +CONFIG_SOUND_PSS=m +# CONFIG_PSS_MIXER is not set +# CONFIG_PSS_HAVE_BOOT is not set +CONFIG_SOUND_SB=m +CONFIG_SOUND_AWE32_SYNTH=m +CONFIG_SOUND_WAVEFRONT=m +CONFIG_SOUND_MAUI=m +CONFIG_SOUND_YM3812=m +CONFIG_SOUND_OPL3SA1=m +CONFIG_SOUND_OPL3SA2=m 
+CONFIG_SOUND_YMFPCI=m +CONFIG_SOUND_YMFPCI_LEGACY=y +CONFIG_SOUND_UART6850=m +CONFIG_SOUND_AEDSP16=m +CONFIG_SC6600=y +CONFIG_SC6600_JOY=y +CONFIG_SC6600_CDROM=4 +CONFIG_SC6600_CDROMBASE=0 +CONFIG_AEDSP16_SBPRO=y +CONFIG_AEDSP16_MPU401=y +CONFIG_SOUND_TVMIXER=m + +# +# USB support +# +CONFIG_USB=m +# CONFIG_USB_DEBUG is not set +CONFIG_USB_DEVICEFS=y +# CONFIG_USB_BANDWIDTH is not set +CONFIG_USB_LONG_TIMEOUT=y +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_UHCI=m +CONFIG_USB_UHCI_ALT=m +CONFIG_USB_OHCI=m +CONFIG_USB_AUDIO=m +# CONFIG_USB_EMI26 is not set +CONFIG_USB_BLUETOOTH=m +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_DATAFAB=y +CONFIG_USB_STORAGE_FREECOM=y +CONFIG_USB_STORAGE_ISD200=y +CONFIG_USB_STORAGE_DPCM=y +CONFIG_USB_STORAGE_HP8200e=y +CONFIG_USB_STORAGE_SDDR09=y +CONFIG_USB_STORAGE_JUMPSHOT=y +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_HID=m +CONFIG_USB_HIDINPUT=y +CONFIG_USB_HIDDEV=y +# CONFIG_USB_KBD is not set +# CONFIG_USB_MOUSE is not set +CONFIG_USB_WACOM=m +# CONFIG_USB_DC2XX is not set +CONFIG_USB_MDC800=m +CONFIG_USB_SCANNER=m +CONFIG_USB_MICROTEK=m +CONFIG_USB_HPUSBSCSI=m +CONFIG_USB_IBMCAM=m +CONFIG_USB_OV511=m +CONFIG_USB_PWC=m +CONFIG_USB_SE401=m +CONFIG_USB_STV680=m +CONFIG_USB_VICAM=m +CONFIG_USB_DSBR=m +CONFIG_USB_DABUSB=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_KAWETH=m +CONFIG_USB_CATC=m +CONFIG_USB_CDCETHER=m +CONFIG_USB_USBNET=m +CONFIG_USB_USS720=m + +# +# USB Serial Converter support +# +CONFIG_USB_SERIAL=m +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_WHITEHEAT=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +# CONFIG_USB_SERIAL_KEYSPAN_USA28 is not set +# CONFIG_USB_SERIAL_KEYSPAN_USA28X is not set +CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y 
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y +# CONFIG_USB_SERIAL_KEYSPAN_USA19 is not set +# CONFIG_USB_SERIAL_KEYSPAN_USA18X is not set +# CONFIG_USB_SERIAL_KEYSPAN_USA19W is not set +CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y +CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y +CONFIG_USB_SERIAL_KEYSPAN_USA49W=y +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_RIO500=m +CONFIG_USB_AUERSWALD=m +CONFIG_USB_BRLVGER=m +CONFIG_USB_USBLCD=m + +# +# Bluetooth support +# +CONFIG_BLUEZ=m +CONFIG_BLUEZ_L2CAP=m +CONFIG_BLUEZ_SCO=m + +# +# Bluetooth device drivers +# +CONFIG_BLUEZ_HCIUSB=m +CONFIG_BLUEZ_USB_FW_LOAD=y +CONFIG_BLUEZ_USB_ZERO_PACKET=y +CONFIG_BLUEZ_HCIUART=m +CONFIG_BLUEZ_HCIUART_H4=y +CONFIG_BLUEZ_HCIDTL1=m +CONFIG_BLUEZ_HCIVHCI=m + +# +# Kernel hacking +# +CONFIG_DEBUG_KERNEL=y +# CONFIG_FRAME_POINTER is not set +# CONFIG_STACK_TRACE_SCAN is not set +CONFIG_STACK_TRACE_PARAM_COUNT=4 +# CONFIG_DEBUG_HIGHMEM is not set +# CONFIG_DEBUG_SLAB is not set +# CONFIG_DEBUG_IOVIRT is not set +CONFIG_MAGIC_SYSRQ=y +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_MCL_COREDUMP is not set +# CONFIG_OPROFILE is not set + +# +# Library routines +# +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=m diff --git a/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-p4smp-61chaos b/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-p4smp-61chaos new file mode 100644 index 0000000..0de1146 --- /dev/null +++ b/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-p4smp-61chaos @@ -0,0 +1,1035 @@ +# +# Automatically generated by make menuconfig: don't edit +# +CONFIG_X86=y +CONFIG_ISA=y +# CONFIG_SBUS is not set +CONFIG_UID16=y + +# +# Code maturity level options +# +CONFIG_EXPERIMENTAL=y + +# +# Loadable module support +# +CONFIG_MODULES=y +CONFIG_MODVERSIONS=y +CONFIG_KMOD=y + +# +# Processor type and features +# +CONFIG_LOLAT=y +# CONFIG_LOLAT_SYSCTL is not set +# 
CONFIG_M386 is not set +# CONFIG_M486 is not set +# CONFIG_M586 is not set +# CONFIG_M586TSC is not set +# CONFIG_M586MMX is not set +# CONFIG_M686 is not set +# CONFIG_MPENTIUMIII is not set +CONFIG_MPENTIUM4=y +# CONFIG_MK6 is not set +# CONFIG_MK7 is not set +# CONFIG_MELAN is not set +# CONFIG_MCRUSOE is not set +# CONFIG_MWINCHIPC6 is not set +# CONFIG_MWINCHIP2 is not set +# CONFIG_MWINCHIP3D is not set +# CONFIG_MCYRIXIII is not set +CONFIG_X86_WP_WORKS_OK=y +CONFIG_X86_INVLPG=y +CONFIG_X86_CMPXCHG=y +CONFIG_X86_XADD=y +CONFIG_X86_BSWAP=y +CONFIG_X86_POPAD_OK=y +# CONFIG_RWSEM_GENERIC_SPINLOCK is not set +CONFIG_RWSEM_XCHGADD_ALGORITHM=y +CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_TSC=y +CONFIG_X86_GOOD_APIC=y +CONFIG_X86_PGE=y +CONFIG_X86_USE_PPRO_CHECKSUM=y +CONFIG_X86_MCE=y +# CONFIG_CPU_FREQ is not set +# CONFIG_TOSHIBA is not set +# CONFIG_I8K is not set +CONFIG_MICROCODE=m +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m +# CONFIG_E820_PROC is not set +# CONFIG_NOHIGHMEM is not set +CONFIG_HIGHMEM4G=y +# CONFIG_HIGHMEM64G is not set +CONFIG_HIGHMEM=y +CONFIG_HIGHIO=y +# CONFIG_MATH_EMULATION is not set +CONFIG_MTRR=y +CONFIG_SMP=y +# CONFIG_MULTIQUAD is not set +CONFIG_HAVE_DEC_LOCK=y + +# +# General setup +# +CONFIG_HZ=100 +CONFIG_NET=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_PCI=y +# CONFIG_PCI_GOBIOS is not set +# CONFIG_PCI_GODIRECT is not set +CONFIG_PCI_GOANY=y +CONFIG_PCI_BIOS=y +CONFIG_PCI_DIRECT=y +CONFIG_PCI_NAMES=y + +# +# Performance-monitoring counters support +# +CONFIG_PERFCTR=m +CONFIG_KPERFCTR=y +# CONFIG_PERFCTR_DEBUG is not set +# CONFIG_PERFCTR_INIT_TESTS is not set +CONFIG_PERFCTR_VIRTUAL=y +CONFIG_PERFCTR_GLOBAL=y +# CONFIG_EISA is not set +# CONFIG_MCA is not set +# CONFIG_HOTPLUG is not set +# CONFIG_PCMCIA is not set +# CONFIG_HOTPLUG_PCI is not set +CONFIG_SYSVIPC=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_SYSCTL=y +CONFIG_KCORE_ELF=y +# CONFIG_KCORE_AOUT is not set +CONFIG_BINFMT_AOUT=m +CONFIG_BINFMT_ELF=y +CONFIG_BINFMT_MISC=m 
+# CONFIG_IKCONFIG is not set +CONFIG_PM=y + +# +# Additional device driver support +# +# CONFIG_CIPE is not set +# CONFIG_CRYPTO_AEP is not set +# CONFIG_MEGARAC is not set +CONFIG_FC_QLA2200=m +CONFIG_FC_QLA2300=m +# CONFIG_SCSI_ISCSI is not set +CONFIG_IBMASM=m +CONFIG_IBMSER=m +# CONFIG_ACPI is not set +CONFIG_APM=y +CONFIG_APM_IGNORE_USER_SUSPEND=y +# CONFIG_APM_DO_ENABLE is not set +# CONFIG_APM_CPU_IDLE is not set +# CONFIG_APM_DISPLAY_BLANK is not set +CONFIG_APM_RTC_IS_GMT=y +# CONFIG_APM_ALLOW_INTS is not set +# CONFIG_APM_REAL_MODE_POWER_OFF is not set + +# +# Binary emulation of other systems +# +# CONFIG_ABI is not set +# CONFIG_ABI_SVR4 is not set +# CONFIG_BINFMT_COFF is not set +# CONFIG_BINFMT_XOUT is not set +# CONFIG_BINFMT_XOUT_X286 is not set + +# +# Memory Technology Devices (MTD) +# +CONFIG_MTD=y +# CONFIG_MTD_DEBUG is not set +# CONFIG_MTD_PARTITIONS is not set +# CONFIG_MTD_CONCAT is not set +# CONFIG_MTD_REDBOOT_PARTS is not set +# CONFIG_MTD_CMDLINE_PARTS is not set +CONFIG_MTD_CHAR=m +# CONFIG_MTD_BLOCK is not set +# CONFIG_MTD_BLOCK_RO is not set +# CONFIG_FTL is not set +# CONFIG_NFTL is not set + +# +# RAM/ROM/Flash chip drivers +# +# CONFIG_MTD_CFI is not set +CONFIG_MTD_JEDECPROBE=y +CONFIG_MTD_GEN_PROBE=y +CONFIG_MTD_CFI_ADV_OPTIONS=y +CONFIG_MTD_CFI_NOSWAP=y +# CONFIG_MTD_CFI_BE_BYTE_SWAP is not set +# CONFIG_MTD_CFI_LE_BYTE_SWAP is not set +CONFIG_MTD_CFI_GEOMETRY=y +CONFIG_MTD_CFI_B1=y +# CONFIG_MTD_CFI_B2 is not set +# CONFIG_MTD_CFI_B4 is not set +# CONFIG_MTD_CFI_B8 is not set +CONFIG_MTD_CFI_I1=y +# CONFIG_MTD_CFI_I2 is not set +# CONFIG_MTD_CFI_I4 is not set +# CONFIG_MTD_CFI_I8 is not set +CONFIG_MTD_CFI_INTELEXT=y +CONFIG_MTD_CFI_AMDSTD=y +# CONFIG_MTD_RAM is not set +CONFIG_MTD_ROM=y +# CONFIG_MTD_ABSENT is not set +# CONFIG_MTD_OBSOLETE_CHIPS is not set +# CONFIG_MTD_AMDSTD is not set +# CONFIG_MTD_SHARP is not set +# CONFIG_MTD_JEDEC is not set + +# +# Mapping drivers for chip access +# +# CONFIG_MTD_PHYSMAP is not set 
+# CONFIG_MTD_PNC2000 is not set +# CONFIG_MTD_SC520CDP is not set +# CONFIG_MTD_NETSC520 is not set +# CONFIG_MTD_SBC_GXX is not set +# CONFIG_MTD_ELAN_104NC is not set +# CONFIG_MTD_DILNETPC is not set +# CONFIG_MTD_MIXMEM is not set +# CONFIG_MTD_OCTAGON is not set +# CONFIG_MTD_VMAX is not set +# CONFIG_MTD_L440GX is not set +# CONFIG_MTD_AMD766ROM is not set +CONFIG_MTD_ICH2ROM=m +# CONFIG_MTD_PCI is not set + +# +# Self-contained MTD device drivers +# +# CONFIG_MTD_PMC551 is not set +# CONFIG_MTD_SLRAM is not set +# CONFIG_MTD_MTDRAM is not set +# CONFIG_MTD_BLKMTD is not set +# CONFIG_MTD_DOC1000 is not set +# CONFIG_MTD_DOC2000 is not set +# CONFIG_MTD_DOC2001 is not set +# CONFIG_MTD_DOCPROBE is not set + +# +# NAND Flash Device Drivers +# +# CONFIG_MTD_NAND is not set + +# +# Parallel port support +# +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_PC_CML1=m +CONFIG_PARPORT_SERIAL=m +# CONFIG_PARPORT_PC_FIFO is not set +# CONFIG_PARPORT_PC_SUPERIO is not set +# CONFIG_PARPORT_AMIGA is not set +# CONFIG_PARPORT_MFC3 is not set +# CONFIG_PARPORT_ATARI is not set +# CONFIG_PARPORT_GSC is not set +# CONFIG_PARPORT_SUNBPP is not set +# CONFIG_PARPORT_OTHER is not set +CONFIG_PARPORT_1284=y + +# +# Plug and Play configuration +# +CONFIG_PNP=y +CONFIG_ISAPNP=y +# CONFIG_PNPBIOS is not set + +# +# Block devices +# +CONFIG_BLK_DEV_FD=y +# CONFIG_BLK_DEV_XD is not set +# CONFIG_PARIDE is not set +# CONFIG_BLK_CPQ_DA is not set +# CONFIG_BLK_CPQ_CISS_DA is not set +# CONFIG_CISS_SCSI_TAPE is not set +# CONFIG_BLK_DEV_DAC960 is not set +# CONFIG_BLK_DEV_UMEM is not set +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_ENBD=m +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=4096 +CONFIG_BLK_DEV_INITRD=y + +# +# Multi-device support (RAID and LVM) +# +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +# CONFIG_MD_LINEAR is not set +# CONFIG_MD_RAID0 is not set +# CONFIG_MD_RAID1 is not set +# CONFIG_MD_RAID5 is not set +# CONFIG_MD_MULTIPATH is not set 
+CONFIG_BLK_DEV_LVM=m + +# +# Cryptography support (CryptoAPI) +# +# CONFIG_CRYPTO is not set +# CONFIG_CIPHERS is not set +# CONFIG_CRYPTODEV is not set +# CONFIG_CRYPTOLOOP is not set + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_MMAP=y +CONFIG_NETLINK_DEV=y +CONFIG_NETFILTER=y +# CONFIG_NETFILTER_DEBUG is not set +CONFIG_FILTER=y +CONFIG_UNIX=y +CONFIG_INET=y +# CONFIG_TUX is not set +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_FWMARK=y +CONFIG_IP_ROUTE_NAT=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_TOS=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_LARGE_TABLES=y +# CONFIG_IP_PNP is not set +# CONFIG_NET_IPIP is not set +# CONFIG_NET_IPGRE is not set +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +# CONFIG_ARPD is not set +# CONFIG_INET_ECN is not set +CONFIG_SYN_COOKIES=y + +# +# IP: Netfilter Configuration +# +CONFIG_IP_NF_CONNTRACK=m +# CONFIG_IP_NF_FTP is not set +# CONFIG_IP_NF_IRC is not set +# CONFIG_IP_NF_QUEUE is not set +# CONFIG_IP_NF_IPTABLES is not set +# CONFIG_IP_NF_ARPTABLES is not set +# CONFIG_IP_NF_COMPAT_IPCHAINS is not set +# CONFIG_IP_NF_COMPAT_IPFWADM is not set + +# +# IP: Virtual Server Configuration +# +# CONFIG_IP_VS is not set +# CONFIG_IPV6 is not set +CONFIG_KHTTPD=m +# CONFIG_ATM is not set +# CONFIG_VLAN_8021Q is not set +# CONFIG_IPX is not set +# CONFIG_ATALK is not set + +# +# Appletalk devices +# +# CONFIG_DEV_APPLETALK is not set +# CONFIG_DECNET is not set +# CONFIG_BRIDGE is not set +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_LLC is not set +# CONFIG_NET_DIVERT is not set +# CONFIG_ECONET is not set +# CONFIG_WAN_ROUTER is not set +# CONFIG_NET_FASTROUTE is not set +# CONFIG_NET_HW_FLOWCONTROL is not set + +# +# QoS and/or fair queueing +# +# CONFIG_NET_SCHED is not set + +# +# Network testing +# +CONFIG_NET_PKTGEN=m + +# +# Telephony Support +# +# CONFIG_PHONE is not set +# CONFIG_PHONE_IXJ is not set +# 
CONFIG_PHONE_IXJ_PCMCIA is not set + +# +# ATA/IDE/MFM/RLL support +# +CONFIG_IDE=y + +# +# IDE, ATA and ATAPI Block devices +# +CONFIG_BLK_DEV_IDE=y +# CONFIG_BLK_DEV_HD_IDE is not set +# CONFIG_BLK_DEV_HD is not set +CONFIG_BLK_DEV_IDEDISK=y +CONFIG_IDEDISK_MULTI_MODE=y +# CONFIG_IDEDISK_STROKE is not set +# CONFIG_BLK_DEV_IDEDISK_VENDOR is not set +# CONFIG_BLK_DEV_IDEDISK_FUJITSU is not set +# CONFIG_BLK_DEV_IDEDISK_IBM is not set +# CONFIG_BLK_DEV_IDEDISK_MAXTOR is not set +# CONFIG_BLK_DEV_IDEDISK_QUANTUM is not set +# CONFIG_BLK_DEV_IDEDISK_SEAGATE is not set +# CONFIG_BLK_DEV_IDEDISK_WD is not set +# CONFIG_BLK_DEV_COMMERIAL is not set +# CONFIG_BLK_DEV_TIVO is not set +# CONFIG_BLK_DEV_IDECS is not set +CONFIG_BLK_DEV_IDECD=m +# CONFIG_BLK_DEV_IDETAPE is not set +CONFIG_BLK_DEV_IDEFLOPPY=y +# CONFIG_BLK_DEV_IDESCSI is not set +# CONFIG_IDE_TASK_IOCTL is not set +# CONFIG_BLK_DEV_CMD640 is not set +# CONFIG_BLK_DEV_CMD640_ENHANCED is not set +CONFIG_BLK_DEV_ISAPNP=y +# CONFIG_BLK_DEV_RZ1000 is not set +CONFIG_BLK_DEV_IDEPCI=y +CONFIG_IDEPCI_SHARE_IRQ=y +CONFIG_BLK_DEV_IDEDMA_PCI=y +# CONFIG_BLK_DEV_OFFBOARD is not set +# CONFIG_BLK_DEV_IDEDMA_FORCED is not set +CONFIG_IDEDMA_PCI_AUTO=y +# CONFIG_IDEDMA_ONLYDISK is not set +CONFIG_BLK_DEV_IDEDMA=y +# CONFIG_IDEDMA_PCI_WIP is not set +# CONFIG_BLK_DEV_IDEDMA_TIMEOUT is not set +# CONFIG_IDEDMA_NEW_DRIVE_LISTINGS is not set +CONFIG_BLK_DEV_ADMA=y +CONFIG_BLK_DEV_AEC62XX=y +CONFIG_AEC62XX_TUNING=y +CONFIG_BLK_DEV_ALI15X3=y +# CONFIG_WDC_ALI15X3 is not set +CONFIG_BLK_DEV_AMD74XX=y +# CONFIG_AMD74XX_OVERRIDE is not set +CONFIG_BLK_DEV_CMD64X=y +CONFIG_BLK_DEV_CMD680=y +CONFIG_BLK_DEV_CY82C693=y +CONFIG_BLK_DEV_CS5530=y +CONFIG_BLK_DEV_HPT34X=y +# CONFIG_HPT34X_AUTODMA is not set +CONFIG_BLK_DEV_HPT366=y +CONFIG_BLK_DEV_PIIX=y +CONFIG_PIIX_TUNING=y +# CONFIG_BLK_DEV_NS87415 is not set +# CONFIG_BLK_DEV_OPTI621 is not set +# CONFIG_BLK_DEV_ADMA100 is not set +CONFIG_BLK_DEV_PDC202XX=y +# CONFIG_PDC202XX_BURST is 
not set +# CONFIG_PDC202XX_FORCE is not set +CONFIG_BLK_DEV_SVWKS=y +CONFIG_BLK_DEV_SIS5513=y +CONFIG_BLK_DEV_SLC90E66=y +# CONFIG_BLK_DEV_TRM290 is not set +CONFIG_BLK_DEV_VIA82CXXX=y +CONFIG_BLK_DEV_CENATEK=y +# CONFIG_IDE_CHIPSETS is not set +# CONFIG_BLK_DEV_ELEVATOR_NOOP is not set +CONFIG_IDEDMA_AUTO=y +# CONFIG_IDEDMA_IVB is not set +# CONFIG_DMA_NONPCI is not set +CONFIG_BLK_DEV_IDE_MODES=y +# CONFIG_BLK_DEV_ATARAID is not set +# CONFIG_BLK_DEV_ATARAID_PDC is not set +# CONFIG_BLK_DEV_ATARAID_HPT is not set + +# +# SCSI support +# +CONFIG_SCSI=y +CONFIG_BLK_DEV_SD=y +CONFIG_SD_EXTRA_DEVS=40 +# CONFIG_CHR_DEV_ST is not set +# CONFIG_CHR_DEV_OSST is not set +CONFIG_BLK_DEV_SR=m +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_SR_EXTRA_DEVS=4 +CONFIG_CHR_DEV_SG=m +# CONFIG_SCSI_DEBUG_QUEUES is not set +CONFIG_SCSI_MULTI_LUN=y +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y + +# +# SCSI low-level drivers +# +# CONFIG_BLK_DEV_3W_XXXX_RAID is not set +# CONFIG_SCSI_7000FASST is not set +# CONFIG_SCSI_ACARD is not set +# CONFIG_SCSI_AHA152X is not set +# CONFIG_SCSI_AHA1542 is not set +# CONFIG_SCSI_AHA1740 is not set +# CONFIG_SCSI_AACRAID is not set +CONFIG_SCSI_AIC7XXX=y +CONFIG_AIC7XXX_CMDS_PER_DEVICE=253 +CONFIG_AIC7XXX_RESET_DELAY_MS=15000 +# CONFIG_AIC7XXX_PROBE_EISA_VL is not set +# CONFIG_AIC7XXX_BUILD_FIRMWARE is not set +# CONFIG_SCSI_AIC79XX is not set +# CONFIG_SCSI_DPT_I2O is not set +# CONFIG_SCSI_ADVANSYS is not set +# CONFIG_SCSI_IN2000 is not set +# CONFIG_SCSI_AM53C974 is not set +# CONFIG_SCSI_MEGARAID is not set +# CONFIG_SCSI_BUSLOGIC is not set +# CONFIG_SCSI_CPQFCTS is not set +# CONFIG_SCSI_DMX3191D is not set +# CONFIG_SCSI_DTC3280 is not set +# CONFIG_SCSI_EATA is not set +# CONFIG_SCSI_EATA_DMA is not set +# CONFIG_SCSI_EATA_PIO is not set +# CONFIG_SCSI_FUTURE_DOMAIN is not set +# CONFIG_SCSI_GDTH is not set +# CONFIG_SCSI_GENERIC_NCR5380 is not set +# CONFIG_SCSI_IPS is not set +# CONFIG_SCSI_INITIO is not set +# CONFIG_SCSI_INIA100 is not set +# 
CONFIG_SCSI_PPA is not set +# CONFIG_SCSI_IMM is not set +# CONFIG_SCSI_NCR53C406A is not set +# CONFIG_SCSI_NCR53C7xx is not set +# CONFIG_SCSI_SYM53C8XX_2 is not set +# CONFIG_SCSI_NCR53C8XX is not set +# CONFIG_SCSI_SYM53C8XX is not set +# CONFIG_SCSI_PAS16 is not set +# CONFIG_SCSI_PCI2000 is not set +# CONFIG_SCSI_PCI2220I is not set +# CONFIG_SCSI_PSI240I is not set +# CONFIG_SCSI_QLOGIC_FAS is not set +# CONFIG_SCSI_QLOGIC_ISP is not set +# CONFIG_SCSI_QLOGIC_FC is not set +# CONFIG_SCSI_QLOGIC_1280 is not set +# CONFIG_SCSI_NEWISP is not set +# CONFIG_SCSI_SEAGATE is not set +# CONFIG_SCSI_SIM710 is not set +# CONFIG_SCSI_SYM53C416 is not set +# CONFIG_SCSI_DC390T is not set +# CONFIG_SCSI_T128 is not set +# CONFIG_SCSI_U14_34F is not set +# CONFIG_SCSI_ULTRASTOR is not set +CONFIG_SCSI_DEBUG=m + +# +# Fusion MPT device support +# +CONFIG_FUSION=y +CONFIG_FUSION_BOOT=y +CONFIG_FUSION_ISENSE=m +CONFIG_FUSION_CTL=m +# CONFIG_FUSION_LAN is not set + +# +# IEEE 1394 (FireWire) support (EXPERIMENTAL) +# +# CONFIG_IEEE1394 is not set + +# +# I2O device support +# +# CONFIG_I2O is not set +# CONFIG_I2O_PCI is not set +# CONFIG_I2O_BLOCK is not set +# CONFIG_I2O_LAN is not set +# CONFIG_I2O_SCSI is not set +# CONFIG_I2O_PROC is not set + +# +# Network device support +# +CONFIG_NETDEVICES=y + +# +# ARCnet devices +# +# CONFIG_ARCNET is not set +CONFIG_DUMMY=m +# CONFIG_BONDING is not set +# CONFIG_EQUALIZER is not set +# CONFIG_TUN is not set +# CONFIG_ETHERTAP is not set +# CONFIG_NET_SB1000 is not set + +# +# Ethernet (10 or 100Mbit) +# +CONFIG_NET_ETHERNET=y +# CONFIG_SUNLANCE is not set +# CONFIG_HAPPYMEAL is not set +# CONFIG_SUNBMAC is not set +# CONFIG_SUNQE is not set +# CONFIG_SUNGEM is not set +# CONFIG_NET_VENDOR_3COM is not set +# CONFIG_LANCE is not set +# CONFIG_NET_VENDOR_SMC is not set +# CONFIG_NET_VENDOR_RACAL is not set +# CONFIG_AT1700 is not set +# CONFIG_DEPCA is not set +# CONFIG_HP100 is not set +# CONFIG_NET_ISA is not set +CONFIG_NET_PCI=y 
+# CONFIG_PCNET32 is not set +# CONFIG_ADAPTEC_STARFIRE is not set +# CONFIG_AC3200 is not set +# CONFIG_APRICOT is not set +# CONFIG_CS89x0 is not set +CONFIG_TULIP=m +# CONFIG_TC35815 is not set +# CONFIG_TULIP_MWI is not set +CONFIG_TULIP_MMIO=y +CONFIG_DE4X5=m +# CONFIG_DGRS is not set +# CONFIG_DM9102 is not set +CONFIG_EEPRO100=m +CONFIG_NET_E100=m +# CONFIG_LNE390 is not set +# CONFIG_FEALNX is not set +# CONFIG_NATSEMI is not set +# CONFIG_NE2K_PCI is not set +# CONFIG_NE3210 is not set +# CONFIG_ES3210 is not set +# CONFIG_8139CP is not set +# CONFIG_8139TOO is not set +# CONFIG_8139TOO_PIO is not set +# CONFIG_8139TOO_TUNE_TWISTER is not set +# CONFIG_8139TOO_8129 is not set +# CONFIG_8139_NEW_RX_RESET is not set +# CONFIG_SIS900 is not set +# CONFIG_SIS900_OLD is not set +# CONFIG_EPIC100 is not set +# CONFIG_SUNDANCE is not set +# CONFIG_TLAN is not set +# CONFIG_VIA_RHINE is not set +# CONFIG_VIA_RHINE_MMIO is not set +# CONFIG_WINBOND_840 is not set +# CONFIG_NET_POCKET is not set + +# +# Ethernet (1000 Mbit) +# +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +# CONFIG_DL2K is not set +# CONFIG_MYRI_SBUS is not set +CONFIG_NS83820=m +# CONFIG_HAMACHI is not set +# CONFIG_YELLOWFIN is not set +# CONFIG_SK98LIN is not set +CONFIG_NET_BROADCOM=m +CONFIG_TIGON3=m +CONFIG_NET_E1000=m +# CONFIG_FDDI is not set +# CONFIG_NETCONSOLE is not set +# CONFIG_HIPPI is not set +# CONFIG_PLIP is not set +# CONFIG_PPP is not set +# CONFIG_SLIP is not set + +# +# Wireless LAN (non-hamradio) +# +# CONFIG_NET_RADIO is not set + +# +# Token Ring devices +# +# CONFIG_TR is not set +# CONFIG_NET_FC is not set +# CONFIG_RCPCI is not set +# CONFIG_SHAPER is not set + +# +# Wan interfaces +# +# CONFIG_WAN is not set + +# +# Quadrics Supercomputers +# + +# +# QsNet +# +CONFIG_QUADRICS=y +CONFIG_QSNETMOD=m +CONFIG_ELAN3MOD=m +CONFIG_EPMOD=m +CONFIG_EIPMOD=m +CONFIG_RMSMOD=m +CONFIG_JTAG=m + +# +# QsNet II +# + +# +# Amateur Radio support +# +# CONFIG_HAMRADIO is not 
set + +# +# IrDA (infrared) support +# +# CONFIG_IRDA is not set + +# +# ISDN subsystem +# +# CONFIG_ISDN is not set +CONFIG_KALLSYMS=y + +# +# Old CD-ROM drivers (not SCSI, not IDE) +# +# CONFIG_CD_NO_IDESCSI is not set + +# +# Input core support +# +# CONFIG_INPUT is not set +# CONFIG_INPUT_KEYBDEV is not set +# CONFIG_INPUT_MOUSEDEV is not set +# CONFIG_INPUT_JOYDEV is not set +# CONFIG_INPUT_EVDEV is not set + +# +# Character devices +# +CONFIG_ECC=m +CONFIG_CHAOSTEST=m +CONFIG_P4THERM=m +CONFIG_VT=y +CONFIG_VT_CONSOLE=y +CONFIG_SERIAL=y +CONFIG_SERIAL_CONSOLE=y +CONFIG_SERIAL_EXTENDED=y +# CONFIG_SERIAL_MANY_PORTS is not set +CONFIG_SERIAL_SHARE_IRQ=y +# CONFIG_SERIAL_DETECT_IRQ is not set +# CONFIG_SERIAL_MULTIPORT is not set +# CONFIG_HUB6 is not set +# CONFIG_SERIAL_NONSTANDARD is not set +CONFIG_UNIX98_PTYS=y +CONFIG_UNIX98_PTY_COUNT=2048 +# CONFIG_PRINTER is not set +# CONFIG_PPDEV is not set + +# +# I2C support +# +CONFIG_I2C=y +# CONFIG_I2C_ALGOBIT is not set +# CONFIG_I2C_ALGOPCF is not set +CONFIG_I2C_MAINBOARD=y +# CONFIG_I2C_ALI1535 is not set +# CONFIG_I2C_ALI15X3 is not set +# CONFIG_I2C_HYDRA is not set +# CONFIG_I2C_AMD756 is not set +# CONFIG_I2C_TSUNAMI is not set +CONFIG_I2C_I801=m +# CONFIG_I2C_I810 is not set +# CONFIG_I2C_PIIX4 is not set +# CONFIG_I2C_SIS5595 is not set +# CONFIG_I2C_VIA is not set +# CONFIG_I2C_VIAPRO is not set +# CONFIG_I2C_VOODOO3 is not set +CONFIG_I2C_ISA=y +CONFIG_I2C_CHARDEV=y +CONFIG_I2C_PROC=y + +# +# Hardware sensors support +# +CONFIG_SENSORS=y +CONFIG_SENSORS_ADM1021=m +# CONFIG_SENSORS_ADM1024 is not set +# CONFIG_SENSORS_ADM1025 is not set +# CONFIG_SENSORS_ADM9240 is not set +# CONFIG_SENSORS_DS1621 is not set +# CONFIG_SENSORS_FSCPOS is not set +# CONFIG_SENSORS_FSCSCY is not set +# CONFIG_SENSORS_GL518SM is not set +# CONFIG_SENSORS_GL520SM is not set +# CONFIG_SENSORS_MAXILIFE is not set +# CONFIG_SENSORS_IT87 is not set +# CONFIG_SENSORS_MTP008 is not set +# CONFIG_SENSORS_LM75 is not set +# 
CONFIG_SENSORS_LM78 is not set +# CONFIG_SENSORS_LM80 is not set +CONFIG_SENSORS_LM87=m +# CONFIG_SENSORS_SIS5595 is not set +# CONFIG_SENSORS_THMC50 is not set +# CONFIG_SENSORS_VIA686A is not set +CONFIG_SENSORS_W83781D=y +# CONFIG_SENSORS_OTHER is not set + +# +# Mice +# +# CONFIG_BUSMOUSE is not set +CONFIG_MOUSE=y +CONFIG_PSMOUSE=y +# CONFIG_82C710_MOUSE is not set +# CONFIG_PC110_PAD is not set +# CONFIG_MK712_MOUSE is not set + +# +# Joysticks +# +# CONFIG_INPUT_GAMEPORT is not set +# CONFIG_QIC02_TAPE is not set + +# +# Watchdog Cards +# +# CONFIG_WATCHDOG is not set +# CONFIG_AMD_RNG is not set +# CONFIG_INTEL_RNG is not set +# CONFIG_AMD_PM768 is not set +# CONFIG_NVRAM is not set +CONFIG_RTC=y +# CONFIG_DTLK is not set +# CONFIG_R3964 is not set +# CONFIG_APPLICOM is not set +# CONFIG_SONYPI is not set + +# +# Ftape, the floppy tape device driver +# +# CONFIG_FTAPE is not set +CONFIG_AGP=m +CONFIG_AGP_INTEL=y +# CONFIG_AGP_I810 is not set +# CONFIG_AGP_VIA is not set +# CONFIG_AGP_AMD is not set +# CONFIG_AGP_SIS is not set +# CONFIG_AGP_ALI is not set +# CONFIG_AGP_SWORKS is not set +# CONFIG_DRM is not set +# CONFIG_MWAVE is not set +# CONFIG_BATTERY_GERICOM is not set + +# +# Multimedia devices +# +# CONFIG_VIDEO_DEV is not set + +# +# Crypto Hardware support +# +# CONFIG_CRYPTO is not set + +# +# File systems +# +# CONFIG_QUOTA is not set +# CONFIG_AUTOFS_FS is not set +# CONFIG_AUTOFS4_FS is not set +# CONFIG_REISERFS_FS is not set +# CONFIG_REISERFS_CHECK is not set +# CONFIG_REISERFS_PROC_INFO is not set +# CONFIG_ADFS_FS is not set +# CONFIG_AFS_FS is not set +# CONFIG_ADFS_FS_RW is not set +# CONFIG_AFFS_FS is not set +# CONFIG_HFS_FS is not set +# CONFIG_BEFS_FS is not set +# CONFIG_BEFS_DEBUG is not set +# CONFIG_BFS_FS is not set +CONFIG_EXT3_FS=y +CONFIG_EXTN_FS=m +CONFIG_JBD=y +CONFIG_JBD_DEBUG=y +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_UMSDOS_FS=m +CONFIG_VFAT_FS=m +# CONFIG_EFS_FS is not set +# CONFIG_JFFS_FS is not set +# 
CONFIG_JFFS2_FS is not set +CONFIG_CRAMFS=y +CONFIG_TMPFS=y +CONFIG_RAMFS=y +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +# CONFIG_JFS_FS is not set +# CONFIG_JFS_DEBUG is not set +# CONFIG_JFS_STATISTICS is not set +# CONFIG_MINIX_FS is not set +# CONFIG_VXFS_FS is not set +# CONFIG_NTFS_FS is not set +# CONFIG_NTFS_RW is not set +# CONFIG_HPFS_FS is not set +CONFIG_PROC_FS=y +# CONFIG_DEVFS_FS is not set +# CONFIG_DEVFS_MOUNT is not set +# CONFIG_DEVFS_DEBUG is not set +CONFIG_DEVPTS_FS=y +# CONFIG_QNX4FS_FS is not set +# CONFIG_QNX4FS_RW is not set +CONFIG_ROMFS_FS=m +CONFIG_EXT2_FS=y +# CONFIG_SYSV_FS is not set +# CONFIG_UDF_FS is not set +# CONFIG_UDF_RW is not set +# CONFIG_UFS_FS is not set +# CONFIG_UFS_FS_WRITE is not set + +# +# Network File Systems +# +# CONFIG_CODA_FS is not set +# CONFIG_INTERMEZZO_FS is not set +CONFIG_NFS_FS=m +CONFIG_NFS_V3=y +# CONFIG_ROOT_NFS is not set +CONFIG_NFSD=m +CONFIG_NFSD_V3=y +CONFIG_NFSD_TCP=y +CONFIG_SUNRPC=m +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +# CONFIG_SMB_FS is not set +# CONFIG_NCP_FS is not set +# CONFIG_NCPFS_PACKET_SIGNING is not set +# CONFIG_NCPFS_IOCTL_LOCKING is not set +# CONFIG_NCPFS_STRONG is not set +# CONFIG_NCPFS_NFS_NS is not set +# CONFIG_NCPFS_OS2_NS is not set +# CONFIG_NCPFS_SMALLDOS is not set +# CONFIG_NCPFS_NLS is not set +# CONFIG_NCPFS_EXTRAS is not set +# CONFIG_PFS_FS is not set +CONFIG_ZISOFS_FS=y + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_OSF_PARTITION=y +# CONFIG_AMIGA_PARTITION is not set +# CONFIG_ATARI_PARTITION is not set +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +# CONFIG_LDM_PARTITION is not set +CONFIG_SGI_PARTITION=y +# CONFIG_ULTRIX_PARTITION is not set +CONFIG_SUN_PARTITION=y +# CONFIG_SMB_NLS is not set +CONFIG_NLS=y + +# +# Native Language Support +# +CONFIG_NLS_DEFAULT="iso8859-1" 
+CONFIG_NLS_CODEPAGE_437=m +# CONFIG_NLS_CODEPAGE_737 is not set +# CONFIG_NLS_CODEPAGE_775 is not set +CONFIG_NLS_CODEPAGE_850=m +# CONFIG_NLS_CODEPAGE_852 is not set +# CONFIG_NLS_CODEPAGE_855 is not set +# CONFIG_NLS_CODEPAGE_857 is not set +# CONFIG_NLS_CODEPAGE_860 is not set +# CONFIG_NLS_CODEPAGE_861 is not set +# CONFIG_NLS_CODEPAGE_862 is not set +# CONFIG_NLS_CODEPAGE_863 is not set +# CONFIG_NLS_CODEPAGE_864 is not set +# CONFIG_NLS_CODEPAGE_865 is not set +# CONFIG_NLS_CODEPAGE_866 is not set +# CONFIG_NLS_CODEPAGE_869 is not set +# CONFIG_NLS_CODEPAGE_936 is not set +# CONFIG_NLS_CODEPAGE_950 is not set +# CONFIG_NLS_CODEPAGE_932 is not set +# CONFIG_NLS_CODEPAGE_949 is not set +# CONFIG_NLS_CODEPAGE_874 is not set +# CONFIG_NLS_ISO8859_8 is not set +# CONFIG_NLS_CODEPAGE_1250 is not set +# CONFIG_NLS_CODEPAGE_1251 is not set +CONFIG_NLS_ISO8859_1=m +# CONFIG_NLS_ISO8859_2 is not set +# CONFIG_NLS_ISO8859_3 is not set +# CONFIG_NLS_ISO8859_4 is not set +# CONFIG_NLS_ISO8859_5 is not set +# CONFIG_NLS_ISO8859_6 is not set +# CONFIG_NLS_ISO8859_7 is not set +# CONFIG_NLS_ISO8859_9 is not set +# CONFIG_NLS_ISO8859_13 is not set +# CONFIG_NLS_ISO8859_14 is not set +# CONFIG_NLS_ISO8859_15 is not set +# CONFIG_NLS_KOI8_R is not set +# CONFIG_NLS_KOI8_U is not set +# CONFIG_NLS_UTF8 is not set + +# +# Console drivers +# +CONFIG_VGA_CONSOLE=y +CONFIG_VIDEO_SELECT=y +# CONFIG_VIDEO_IGNORE_BAD_MODE is not set +# CONFIG_MDA_CONSOLE is not set + +# +# Frame-buffer support +# +# CONFIG_FB is not set +# CONFIG_SPEAKUP is not set + +# +# Sound +# +# CONFIG_SOUND is not set + +# +# USB support +# +# CONFIG_USB is not set + +# +# Bluetooth support +# +# CONFIG_BLUEZ is not set + +# +# Kernel hacking +# +CONFIG_DEBUG_KERNEL=y +CONFIG_FRAME_POINTER=y +CONFIG_STACK_TRACE_SCAN=y +CONFIG_STACK_TRACE_FPTR=y +CONFIG_STACK_TRACE_PARAM_COUNT=4 +# CONFIG_DEBUG_HIGHMEM is not set +# CONFIG_DEBUG_SLAB is not set +# CONFIG_DEBUG_IOVIRT is not set +CONFIG_MAGIC_SYSRQ=y 
+CONFIG_DEBUG_SPINLOCK=y +CONFIG_MCL_COREDUMP=y +CONFIG_BOOTIMG=y +# CONFIG_OPROFILE is not set + +# +# Library routines +# +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=y diff --git a/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-uml b/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-uml new file mode 100644 index 0000000..bb79c22 --- /dev/null +++ b/lustre/kernel_patches/kernel_configs/config-linux-2.4.18-uml @@ -0,0 +1,458 @@ +# +# Automatically generated make config: don't edit +# +CONFIG_USERMODE=y +# CONFIG_ISA is not set +# CONFIG_SBUS is not set +# CONFIG_PCI is not set +CONFIG_UID16=y +# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set +CONFIG_RWSEM_GENERIC_SPINLOCK=y + +# +# Code maturity level options +# +CONFIG_EXPERIMENTAL=y + +# +# General Setup +# +CONFIG_NET=y +CONFIG_SYSVIPC=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_SYSCTL=y +CONFIG_BINFMT_AOUT=y +CONFIG_BINFMT_ELF=y +CONFIG_BINFMT_MISC=y +CONFIG_HOSTFS=y +# CONFIG_HPPFS is not set +CONFIG_MCONSOLE=y +CONFIG_MAGIC_SYSRQ=y +# CONFIG_HOST_2G_2G is not set +# CONFIG_UML_SMP is not set +# CONFIG_SMP is not set +CONFIG_NEST_LEVEL=0 +CONFIG_KERNEL_HALF_GIGS=1 + +# +# Loadable module support +# +CONFIG_MODULES=y +CONFIG_KMOD=y + +# +# Character Devices +# +CONFIG_STDIO_CONSOLE=y +CONFIG_SSL=y +CONFIG_FD_CHAN=y +# CONFIG_NULL_CHAN is not set +CONFIG_PORT_CHAN=y +CONFIG_PTY_CHAN=y +CONFIG_TTY_CHAN=y +CONFIG_XTERM_CHAN=y +CONFIG_CON_ZERO_CHAN="fd:0,fd:1" +CONFIG_CON_CHAN="xterm" +CONFIG_SSL_CHAN="pty" +CONFIG_UNIX98_PTYS=y +CONFIG_UNIX98_PTY_COUNT=2048 +# CONFIG_WATCHDOG is not set +# CONFIG_UML_SOUND is not set +# CONFIG_SOUND is not set +# CONFIG_HOSTAUDIO is not set +# CONFIG_TTY_LOG is not set + +# +# Block Devices +# +CONFIG_BLK_DEV_UBD=y +# CONFIG_BLK_DEV_UBD_SYNC is not set +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_NBD=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=4096 +CONFIG_BLK_DEV_INITRD=y +# CONFIG_MMAPPER is not set +CONFIG_NETDEVICES=y + +# +# Network Devices +# +CONFIG_UML_NET=y +# 
CONFIG_UML_NET_ETHERTAP is not set +CONFIG_UML_NET_TUNTAP=y +CONFIG_UML_NET_SLIP=y +CONFIG_UML_NET_DAEMON=y +CONFIG_UML_NET_MCAST=y +CONFIG_DUMMY=m +CONFIG_BONDING=m +CONFIG_EQUALIZER=m +CONFIG_TUN=y +CONFIG_PPP=y +CONFIG_PPP_MULTILINK=y +CONFIG_PPP_FILTER=y +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_PPP_DEFLATE=m +# CONFIG_PPP_BSDCOMP is not set +# CONFIG_PPPOE is not set +CONFIG_SLIP=y +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_MMAP=y +CONFIG_NETLINK_DEV=y +CONFIG_NETFILTER=y +# CONFIG_NETFILTER_DEBUG is not set +CONFIG_FILTER=y +CONFIG_UNIX=y +CONFIG_INET=y +# CONFIG_TUX is not set +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_FWMARK=y +CONFIG_IP_ROUTE_NAT=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_TOS=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_LARGE_TABLES=y +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=y +CONFIG_NET_IPGRE=y +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +# CONFIG_ARPD is not set +# CONFIG_INET_ECN is not set +CONFIG_SYN_COOKIES=y + +# +# IP: Netfilter Configuration +# +CONFIG_IP_NF_CONNTRACK=y +CONFIG_IP_NF_FTP=m +CONFIG_IP_NF_IRC=m +CONFIG_IP_NF_QUEUE=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_LIMIT=m +CONFIG_IP_NF_MATCH_MAC=m +CONFIG_IP_NF_MATCH_MARK=m +CONFIG_IP_NF_MATCH_MULTIPORT=m +CONFIG_IP_NF_MATCH_TOS=m +CONFIG_IP_NF_MATCH_AH_ESP=m +CONFIG_IP_NF_MATCH_LENGTH=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_MATCH_TCPMSS=m +CONFIG_IP_NF_MATCH_STATE=m +CONFIG_IP_NF_MATCH_UNCLEAN=m +CONFIG_IP_NF_MATCH_OWNER=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_MIRROR=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_NAT_NEEDED=y +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_NAT_LOCAL=y +CONFIG_IP_NF_NAT_SNMP_BASIC=m +CONFIG_IP_NF_NAT_IRC=m +CONFIG_IP_NF_NAT_FTP=m +CONFIG_IP_NF_MANGLE=m 
+CONFIG_IP_NF_TARGET_TOS=m +CONFIG_IP_NF_TARGET_MARK=m +CONFIG_IP_NF_TARGET_LOG=m +CONFIG_IP_NF_TARGET_ULOG=m +CONFIG_IP_NF_TARGET_TCPMSS=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m + +# +# IP: Virtual Server Configuration +# +CONFIG_IP_VS=y +# CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=16 + +# +# IPVS scheduler +# +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m + +# +# IPVS application helper +# +CONFIG_IP_VS_FTP=m +CONFIG_IPV6=y + +# +# IPv6: Netfilter Configuration +# +# CONFIG_IP6_NF_QUEUE is not set +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_LIMIT=m +CONFIG_IP6_NF_MATCH_MAC=m +CONFIG_IP6_NF_MATCH_MULTIPORT=m +CONFIG_IP6_NF_MATCH_OWNER=m +CONFIG_IP6_NF_MATCH_MARK=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_LOG=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_TARGET_MARK=m +# CONFIG_KHTTPD is not set +CONFIG_ATM=y +CONFIG_ATM_CLIP=y +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=y +CONFIG_ATM_MPOA=y +CONFIG_ATM_BR2684=m +CONFIG_ATM_BR2684_IPFILTER=y +CONFIG_VLAN_8021Q=m + +# +# +# +CONFIG_IPX=m +# CONFIG_IPX_INTERN is not set +CONFIG_ATALK=m + +# +# Appletalk devices +# +# CONFIG_DEV_APPLETALK is not set +CONFIG_DECNET=m +CONFIG_DECNET_SIOCGIFCONF=y +CONFIG_DECNET_ROUTER=y +CONFIG_DECNET_ROUTE_FWMARK=y +CONFIG_BRIDGE=m +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_LLC is not set +CONFIG_NET_DIVERT=y +# CONFIG_ECONET is not set +CONFIG_WAN_ROUTER=y +# CONFIG_NET_FASTROUTE is not set +# CONFIG_NET_HW_FLOWCONTROL is not set + +# +# QoS and/or fair queueing +# +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_CSZ=m +# CONFIG_NET_SCH_ATM is not set +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_QOS=y +CONFIG_NET_ESTIMATOR=y 
+CONFIG_NET_CLS=y +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_ROUTE=y +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_POLICE=y + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set + +# +# File systems +# +CONFIG_QUOTA=y +CONFIG_AUTOFS_FS=m +CONFIG_AUTOFS4_FS=m +CONFIG_REISERFS_FS=y +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +# CONFIG_ADFS_FS is not set +# CONFIG_AFS_FS is not set +# CONFIG_AFFS_FS is not set +CONFIG_HFS_FS=m +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +CONFIG_BFS_FS=m +CONFIG_EXT3_FS=y +CONFIG_JBD=y +# CONFIG_JBD_DEBUG is not set +CONFIG_FAT_FS=y +CONFIG_MSDOS_FS=m +CONFIG_UMSDOS_FS=m +CONFIG_VFAT_FS=y +# CONFIG_EFS_FS is not set +CONFIG_CRAMFS=m +CONFIG_TMPFS=y +CONFIG_RAMFS=y +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_JFS_FS=m +CONFIG_JFS_DEBUG=y +# CONFIG_JFS_STATISTICS is not set +CONFIG_MINIX_FS=m +CONFIG_VXFS_FS=m +# CONFIG_NTFS_FS is not set +# CONFIG_HPFS_FS is not set +CONFIG_PROC_FS=y +CONFIG_DEVFS_FS=y +CONFIG_DEVFS_MOUNT=y +# CONFIG_DEVFS_DEBUG is not set +CONFIG_DEVPTS_FS=y +# CONFIG_QNX4FS_FS is not set +CONFIG_ROMFS_FS=m +CONFIG_EXT2_FS=y +CONFIG_SYSV_FS=m +CONFIG_UDF_FS=m +CONFIG_UDF_RW=y +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set + +# +# Network File Systems +# +CONFIG_CODA_FS=m +CONFIG_INTERMEZZO_FS=m +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +CONFIG_NFSD=m +CONFIG_NFSD_V3=y +# CONFIG_NFSD_TCP is not set +CONFIG_SUNRPC=y +CONFIG_LOCKD=y +CONFIG_LOCKD_V4=y +CONFIG_SMB_FS=m +# CONFIG_SMB_NLS_DEFAULT is not set +CONFIG_NCP_FS=m +CONFIG_NCPFS_PACKET_SIGNING=y +CONFIG_NCPFS_IOCTL_LOCKING=y +CONFIG_NCPFS_STRONG=y +CONFIG_NCPFS_NFS_NS=y +CONFIG_NCPFS_OS2_NS=y +CONFIG_NCPFS_SMALLDOS=y +CONFIG_NCPFS_NLS=y +CONFIG_NCPFS_EXTRAS=y +CONFIG_ZISOFS_FS=y + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_OSF_PARTITION=y +# CONFIG_AMIGA_PARTITION is not set +# 
CONFIG_ATARI_PARTITION is not set +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +# CONFIG_LDM_PARTITION is not set +CONFIG_SGI_PARTITION=y +# CONFIG_ULTRIX_PARTITION is not set +CONFIG_SUN_PARTITION=y +CONFIG_SMB_NLS=y +CONFIG_NLS=y + +# +# Native Language Support +# +CONFIG_NLS_DEFAULT="iso8859-1" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_UTF8=m + +# +# SCSI support +# +# CONFIG_SCSI is not set + +# +# Multi-device support (RAID and LVM) +# +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID5=m +CONFIG_MD_MULTIPATH=m +CONFIG_BLK_DEV_LVM=m + +# +# Memory Technology Devices (MTD) +# +# CONFIG_MTD is not set + +# +# Kernel hacking +# +# CONFIG_DEBUG_SLAB is not set +CONFIG_DEBUGSYM=y +CONFIG_PT_PROXY=y +# CONFIG_GPROF is not set +# CONFIG_GCOV is not set + +# +# Library routines +# +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=m diff --git a/lustre/kernel_patches/kernel_configs/config-linux-2.4.20-i386-rh 
b/lustre/kernel_patches/kernel_configs/config-linux-2.4.20-i386-rh new file mode 100644 index 0000000..dec210a --- /dev/null +++ b/lustre/kernel_patches/kernel_configs/config-linux-2.4.20-i386-rh @@ -0,0 +1,1849 @@ +# +# Automatically generated by make menuconfig: don't edit +# +CONFIG_X86=y +# CONFIG_SBUS is not set +CONFIG_UID16=y + +# +# Code maturity level options +# +CONFIG_EXPERIMENTAL=y + +# +# Loadable module support +# +CONFIG_MODULES=y +CONFIG_MODVERSIONS=y +CONFIG_KMOD=y + +# +# Processor type and features +# +CONFIG_LOLAT=y +# CONFIG_M386 is not set +# CONFIG_M486 is not set +# CONFIG_M586 is not set +# CONFIG_M586TSC is not set +# CONFIG_M586MMX is not set +CONFIG_M686=y +# CONFIG_MPENTIUMIII is not set +# CONFIG_MPENTIUM4 is not set +# CONFIG_MK6 is not set +# CONFIG_MK7 is not set +# CONFIG_MELAN is not set +# CONFIG_MCRUSOE is not set +# CONFIG_MWINCHIPC6 is not set +# CONFIG_MWINCHIP2 is not set +# CONFIG_MWINCHIP3D is not set +# CONFIG_MCYRIXIII is not set +CONFIG_X86_WP_WORKS_OK=y +CONFIG_X86_INVLPG=y +CONFIG_X86_CMPXCHG=y +CONFIG_X86_XADD=y +CONFIG_X86_BSWAP=y +CONFIG_X86_POPAD_OK=y +# CONFIG_RWSEM_GENERIC_SPINLOCK is not set +CONFIG_RWSEM_XCHGADD_ALGORITHM=y +CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_HAS_TSC=y +CONFIG_X86_GOOD_APIC=y +CONFIG_X86_PGE=y +CONFIG_X86_USE_PPRO_CHECKSUM=y +CONFIG_X86_PPRO_FENCE=y +CONFIG_X86_F00F_WORKS_OK=y +CONFIG_X86_MCE=y + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +# CONFIG_CPU_FREQ_24_API is not set +CONFIG_X86_POWERNOW_K6=m +# CONFIG_X86_LONGHAUL is not set +CONFIG_X86_SPEEDSTEP=m +# CONFIG_X86_P4_CLOCKMOD is not set +# CONFIG_X86_LONGRUN is not set +CONFIG_TOSHIBA=m +CONFIG_I8K=m +CONFIG_MICROCODE=m +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m +# CONFIG_E820_PROC is not set +CONFIG_EDD=m +# CONFIG_NOHIGHMEM is not set +CONFIG_HIGHMEM4G=y +# CONFIG_HIGHMEM64G is not set +CONFIG_HIGHMEM=y +CONFIG_HIGHPTE=y +CONFIG_HIGHIO=y +# CONFIG_MATH_EMULATION is not set +CONFIG_MTRR=y +# CONFIG_SMP is not set 
+CONFIG_X86_UP_APIC=y +CONFIG_X86_UP_IOAPIC=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +# CONFIG_X86_TSC_DISABLE is not set +CONFIG_X86_TSC=y + +# +# General setup +# +CONFIG_NET=y +CONFIG_PCI=y +# CONFIG_PCI_GOBIOS is not set +# CONFIG_PCI_GODIRECT is not set +CONFIG_PCI_GOANY=y +CONFIG_PCI_BIOS=y +CONFIG_PCI_DIRECT=y +CONFIG_ISA=y +CONFIG_PCI_NAMES=y +CONFIG_EISA=y +# CONFIG_MCA is not set +CONFIG_HOTPLUG=y + +# +# PCMCIA/CardBus support +# +CONFIG_PCMCIA=m +CONFIG_CARDBUS=y +CONFIG_TCIC=y +CONFIG_I82092=y +CONFIG_I82365=y + +# +# PCI Hotplug Support +# +# CONFIG_HOTPLUG_PCI is not set +# CONFIG_HOTPLUG_PCI_ACPI is not set +# CONFIG_HOTPLUG_PCI_COMPAQ is not set +# CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM is not set +# CONFIG_HOTPLUG_PCI_IBM is not set +# CONFIG_HOTPLUG_PCI_H2999 is not set +CONFIG_SYSVIPC=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_SYSCTL=y +CONFIG_KCORE_ELF=y +# CONFIG_KCORE_AOUT is not set +CONFIG_BINFMT_AOUT=m +CONFIG_BINFMT_ELF=y +CONFIG_BINFMT_MISC=m +CONFIG_PM=y +# CONFIG_ACPI is not set +CONFIG_APM=y +# CONFIG_APM_IGNORE_USER_SUSPEND is not set +# CONFIG_APM_DO_ENABLE is not set +CONFIG_APM_CPU_IDLE=y +# CONFIG_APM_DISPLAY_BLANK is not set +CONFIG_APM_RTC_IS_GMT=y +# CONFIG_APM_ALLOW_INTS is not set +# CONFIG_APM_REAL_MODE_POWER_OFF is not set + +# +# Memory Technology Devices (MTD) +# +# CONFIG_MTD is not set + +# +# Parallel port support +# +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_PC_CML1=m +CONFIG_PARPORT_SERIAL=m +# CONFIG_PARPORT_PC_FIFO is not set +# CONFIG_PARPORT_PC_SUPERIO is not set +CONFIG_PARPORT_PC_PCMCIA=m +# CONFIG_PARPORT_AMIGA is not set +# CONFIG_PARPORT_MFC3 is not set +# CONFIG_PARPORT_ATARI is not set +# CONFIG_PARPORT_GSC is not set +# CONFIG_PARPORT_SUNBPP is not set +# CONFIG_PARPORT_OTHER is not set +CONFIG_PARPORT_1284=y + +# +# Plug and Play configuration +# +CONFIG_PNP=y +CONFIG_ISAPNP=y + +# +# Block devices +# +CONFIG_BLK_DEV_FD=y +CONFIG_BLK_DEV_XD=m +CONFIG_PARIDE=m +CONFIG_PARIDE_PARPORT=m 
+CONFIG_PARIDE_PD=m +CONFIG_PARIDE_PCD=m +CONFIG_PARIDE_PF=m +CONFIG_PARIDE_PT=m +CONFIG_PARIDE_PG=m +CONFIG_PARIDE_ATEN=m +CONFIG_PARIDE_BPCK=m +CONFIG_PARIDE_BPCK6=m +CONFIG_PARIDE_COMM=m +CONFIG_PARIDE_DSTR=m +CONFIG_PARIDE_FIT2=m +CONFIG_PARIDE_FIT3=m +CONFIG_PARIDE_EPAT=m +CONFIG_PARIDE_EPATC8=y +CONFIG_PARIDE_EPIA=m +CONFIG_PARIDE_FRIQ=m +CONFIG_PARIDE_FRPW=m +CONFIG_PARIDE_KBIC=m +CONFIG_PARIDE_KTTI=m +CONFIG_PARIDE_ON20=m +CONFIG_PARIDE_ON26=m +CONFIG_BLK_CPQ_DA=m +CONFIG_BLK_CPQ_CISS_DA=m +CONFIG_CISS_SCSI_TAPE=y +CONFIG_BLK_DEV_DAC960=m +CONFIG_BLK_DEV_UMEM=m +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=4096 +CONFIG_BLK_DEV_INITRD=y +CONFIG_BLK_STATS=y + +# +# Multi-device support (RAID and LVM) +# +CONFIG_MD=y +CONFIG_BLK_DEV_MD=y +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID5=m +CONFIG_MD_MULTIPATH=m +CONFIG_BLK_DEV_LVM=m + +# +# Cryptography support (CryptoAPI) +# +CONFIG_CRYPTO=m +CONFIG_CIPHERS=m +CONFIG_CIPHER_AES=m +CONFIG_CIPHER_IDENTITY=m +CONFIG_CRYPTODEV=m +CONFIG_CRYPTOLOOP=m + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_MMAP=y +CONFIG_NETLINK_DEV=y +CONFIG_NETFILTER=y +# CONFIG_NETFILTER_DEBUG is not set +CONFIG_FILTER=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_TUX=m +CONFIG_TUX_EXTCGI=y +# CONFIG_TUX_EXTENDED_LOG is not set +# CONFIG_TUX_DEBUG is not set +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_FWMARK=y +CONFIG_IP_ROUTE_NAT=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_TOS=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_LARGE_TABLES=y +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +# CONFIG_ARPD is not set +# CONFIG_INET_ECN is not set +CONFIG_SYN_COOKIES=y + +# +# IP: Netfilter Configuration +# +CONFIG_IP_NF_CONNTRACK=m +CONFIG_IP_NF_FTP=m +CONFIG_IP_NF_IRC=m +CONFIG_IP_NF_QUEUE=m 
+CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_LIMIT=m +CONFIG_IP_NF_MATCH_MAC=m +CONFIG_IP_NF_MATCH_PKTTYPE=m +CONFIG_IP_NF_MATCH_MARK=m +CONFIG_IP_NF_MATCH_MULTIPORT=m +CONFIG_IP_NF_MATCH_TOS=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_DSCP=m +CONFIG_IP_NF_MATCH_AH_ESP=m +CONFIG_IP_NF_MATCH_LENGTH=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_MATCH_TCPMSS=m +CONFIG_IP_NF_MATCH_HELPER=m +CONFIG_IP_NF_MATCH_STATE=m +CONFIG_IP_NF_MATCH_CONNTRACK=m +CONFIG_IP_NF_MATCH_UNCLEAN=m +CONFIG_IP_NF_MATCH_OWNER=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_MIRROR=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_NAT_NEEDED=y +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_NAT_LOCAL=y +CONFIG_IP_NF_NAT_SNMP_BASIC=m +CONFIG_IP_NF_NAT_IRC=m +CONFIG_IP_NF_NAT_FTP=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_TOS=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_DSCP=m +CONFIG_IP_NF_TARGET_MARK=m +CONFIG_IP_NF_TARGET_LOG=m +CONFIG_IP_NF_TARGET_ULOG=m +CONFIG_IP_NF_TARGET_TCPMSS=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_COMPAT_IPCHAINS=m +CONFIG_IP_NF_NAT_NEEDED=y +CONFIG_IP_NF_COMPAT_IPFWADM=m +CONFIG_IP_NF_NAT_NEEDED=y +CONFIG_IPV6=m + +# +# IPv6: Netfilter Configuration +# +# CONFIG_IP6_NF_QUEUE is not set +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_LIMIT=m +CONFIG_IP6_NF_MATCH_MAC=m +CONFIG_IP6_NF_MATCH_MULTIPORT=m +CONFIG_IP6_NF_MATCH_OWNER=m +CONFIG_IP6_NF_MATCH_MARK=m +CONFIG_IP6_NF_MATCH_LENGTH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_LOG=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_TARGET_MARK=m +# CONFIG_KHTTPD is not set +CONFIG_ATM=y +CONFIG_ATM_CLIP=y +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +CONFIG_ATM_BR2684_IPFILTER=y +CONFIG_VLAN_8021Q=m +CONFIG_IPX=m +# CONFIG_IPX_INTERN is not set +CONFIG_ATALK=m + +# +# Appletalk devices +# +CONFIG_DEV_APPLETALK=y +CONFIG_LTPC=m +CONFIG_COPS=m +CONFIG_COPS_DAYNA=y 
+CONFIG_COPS_TANGENT=y +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +CONFIG_IPDDP_DECAP=y +CONFIG_DECNET=m +CONFIG_DECNET_SIOCGIFCONF=y +CONFIG_DECNET_ROUTER=y +CONFIG_DECNET_ROUTE_FWMARK=y +CONFIG_BRIDGE=m +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_LLC is not set +CONFIG_NET_DIVERT=y +# CONFIG_ECONET is not set +CONFIG_WAN_ROUTER=m +# CONFIG_NET_FASTROUTE is not set +# CONFIG_NET_HW_FLOWCONTROL is not set + +# +# QoS and/or fair queueing +# +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_CSZ=m +# CONFIG_NET_SCH_ATM is not set +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_QOS=y +CONFIG_NET_ESTIMATOR=y +CONFIG_NET_CLS=y +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_ROUTE=y +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_POLICE=y + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set + +# +# Telephony Support +# +CONFIG_PHONE=m +CONFIG_PHONE_IXJ=m +CONFIG_PHONE_IXJ_PCMCIA=m + +# +# ATA/IDE/MFM/RLL support +# +CONFIG_IDE=y + +# +# IDE, ATA and ATAPI Block devices +# +CONFIG_BLK_DEV_IDE=y +# CONFIG_BLK_DEV_HD_IDE is not set +# CONFIG_BLK_DEV_HD is not set +CONFIG_BLK_DEV_IDEDISK=y +CONFIG_IDEDISK_MULTI_MODE=y +# CONFIG_IDEDISK_STROKE is not set +CONFIG_BLK_DEV_IDECS=m +CONFIG_BLK_DEV_IDECD=m +CONFIG_BLK_DEV_IDETAPE=m +CONFIG_BLK_DEV_IDEFLOPPY=y +CONFIG_BLK_DEV_IDESCSI=m +# CONFIG_IDE_TASK_IOCTL is not set +CONFIG_BLK_DEV_CMD640=y +# CONFIG_BLK_DEV_CMD640_ENHANCED is not set +CONFIG_BLK_DEV_ISAPNP=y +CONFIG_BLK_DEV_IDEPCI=y +CONFIG_BLK_DEV_GENERIC=y +CONFIG_IDEPCI_SHARE_IRQ=y +CONFIG_BLK_DEV_IDEDMA_PCI=y +# CONFIG_BLK_DEV_OFFBOARD is not set +# CONFIG_BLK_DEV_IDEDMA_FORCED is not set +CONFIG_IDEDMA_PCI_AUTO=y +# CONFIG_IDEDMA_ONLYDISK is not set +CONFIG_BLK_DEV_IDEDMA=y +# CONFIG_IDEDMA_PCI_WIP is 
not set +CONFIG_BLK_DEV_ADMA=y +CONFIG_BLK_DEV_AEC62XX=y +CONFIG_BLK_DEV_ALI15X3=y +# CONFIG_WDC_ALI15X3 is not set +CONFIG_BLK_DEV_AMD74XX=y +# CONFIG_AMD74XX_OVERRIDE is not set +CONFIG_BLK_DEV_CMD64X=y +CONFIG_BLK_DEV_TRIFLEX=y +CONFIG_BLK_DEV_CY82C693=y +CONFIG_BLK_DEV_CS5530=y +CONFIG_BLK_DEV_HPT34X=y +# CONFIG_HPT34X_AUTODMA is not set +CONFIG_BLK_DEV_HPT366=y +CONFIG_BLK_DEV_PIIX=y +CONFIG_BLK_DEV_NFORCE=y +# CONFIG_BLK_DEV_NS87415 is not set +# CONFIG_BLK_DEV_OPTI621 is not set +CONFIG_BLK_DEV_PDC202XX_OLD=y +# CONFIG_PDC202XX_BURST is not set +CONFIG_BLK_DEV_PDC202XX_NEW=y +CONFIG_PDC202XX_FORCE=y +CONFIG_BLK_DEV_RZ1000=y +# CONFIG_BLK_DEV_SC1200 is not set +CONFIG_BLK_DEV_SVWKS=y +CONFIG_BLK_DEV_SIIMAGE=y +CONFIG_BLK_DEV_SIS5513=y +CONFIG_BLK_DEV_SLC90E66=y +# CONFIG_BLK_DEV_TRM290 is not set +CONFIG_BLK_DEV_VIA82CXXX=y +# CONFIG_IDE_CHIPSETS is not set +CONFIG_IDEDMA_AUTO=y +# CONFIG_IDEDMA_IVB is not set +# CONFIG_DMA_NONPCI is not set +CONFIG_BLK_DEV_PDC202XX=y +CONFIG_BLK_DEV_IDE_MODES=y +CONFIG_BLK_DEV_ATARAID=m +CONFIG_BLK_DEV_ATARAID_PDC=m +CONFIG_BLK_DEV_ATARAID_HPT=m +CONFIG_BLK_DEV_ATARAID_SII=m + +# +# SCSI support +# +CONFIG_SCSI=m +CONFIG_BLK_DEV_SD=m +CONFIG_SD_EXTRA_DEVS=40 +CONFIG_CHR_DEV_ST=m +CONFIG_CHR_DEV_OSST=m +CONFIG_BLK_DEV_SR=m +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_SR_EXTRA_DEVS=4 +CONFIG_CHR_DEV_SG=m +# CONFIG_SCSI_DEBUG_QUEUES is not set +# CONFIG_SCSI_MULTI_LUN is not set +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y + +# +# SCSI low-level drivers +# +CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_7000FASST=m +CONFIG_SCSI_ACARD=m +CONFIG_SCSI_AHA152X=m +CONFIG_SCSI_AHA1542=m +CONFIG_SCSI_AHA1740=m +CONFIG_SCSI_AACRAID=m +CONFIG_SCSI_AIC7XXX=m +CONFIG_AIC7XXX_CMDS_PER_DEVICE=253 +CONFIG_AIC7XXX_RESET_DELAY_MS=15000 +# CONFIG_AIC7XXX_PROBE_EISA_VL is not set +# CONFIG_AIC7XXX_BUILD_FIRMWARE is not set +CONFIG_SCSI_AIC79XX=m +CONFIG_AIC79XX_CMDS_PER_DEVICE=253 +CONFIG_AIC79XX_RESET_DELAY_MS=15000 +# CONFIG_AIC79XX_BUILD_FIRMWARE is 
not set +CONFIG_AIC79XX_ENABLE_RD_STRM=y +# CONFIG_AIC79XX_DEBUG_ENABLE is not set +CONFIG_AIC79XX_DEBUG_MASK=0 +CONFIG_SCSI_AIC7XXX_OLD=m +CONFIG_AIC7XXX_OLD_TCQ_ON_BY_DEFAULT=y +CONFIG_AIC7XXX_OLD_CMDS_PER_DEVICE=32 +CONFIG_AIC7XXX_OLD_PROC_STATS=y +CONFIG_SCSI_DPT_I2O=m +CONFIG_SCSI_ADVANSYS=m +CONFIG_SCSI_IN2000=m +CONFIG_SCSI_AM53C974=m +CONFIG_SCSI_MEGARAID=m +CONFIG_SCSI_BUSLOGIC=m +# CONFIG_SCSI_OMIT_FLASHPOINT is not set +CONFIG_SCSI_CPQFCTS=m +CONFIG_SCSI_DMX3191D=m +CONFIG_SCSI_DTC3280=m +CONFIG_SCSI_EATA=m +CONFIG_SCSI_EATA_TAGGED_QUEUE=y +# CONFIG_SCSI_EATA_LINKED_COMMANDS is not set +CONFIG_SCSI_EATA_MAX_TAGS=16 +CONFIG_SCSI_EATA_DMA=m +CONFIG_SCSI_EATA_PIO=m +CONFIG_SCSI_FUTURE_DOMAIN=m +CONFIG_SCSI_GDTH=m +CONFIG_SCSI_GENERIC_NCR5380=m +# CONFIG_SCSI_GENERIC_NCR53C400 is not set +CONFIG_SCSI_G_NCR5380_PORT=y +# CONFIG_SCSI_G_NCR5380_MEM is not set +CONFIG_SCSI_IPS=m +CONFIG_SCSI_INITIO=m +CONFIG_SCSI_INIA100=m +CONFIG_SCSI_PPA=m +CONFIG_SCSI_IMM=m +# CONFIG_SCSI_IZIP_EPP16 is not set +# CONFIG_SCSI_IZIP_SLOW_CTR is not set +CONFIG_SCSI_NCR53C406A=m +CONFIG_SCSI_NCR53C7xx=m +# CONFIG_SCSI_NCR53C7xx_sync is not set +CONFIG_SCSI_NCR53C7xx_FAST=y +CONFIG_SCSI_NCR53C7xx_DISCONNECT=y +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set +CONFIG_SCSI_NCR53C8XX=m +CONFIG_SCSI_SYM53C8XX=m +CONFIG_SCSI_NCR53C8XX_DEFAULT_TAGS=8 +CONFIG_SCSI_NCR53C8XX_MAX_TAGS=32 +CONFIG_SCSI_NCR53C8XX_SYNC=40 +# CONFIG_SCSI_NCR53C8XX_PROFILE is not set +# CONFIG_SCSI_NCR53C8XX_IOMAPPED is not set +# CONFIG_SCSI_NCR53C8XX_PQS_PDS is not set +# CONFIG_SCSI_NCR53C8XX_SYMBIOS_COMPAT is not set +CONFIG_SCSI_PAS16=m +CONFIG_SCSI_PCI2000=m +CONFIG_SCSI_PCI2220I=m +CONFIG_SCSI_PSI240I=m +CONFIG_SCSI_QLOGIC_FAS=m +CONFIG_SCSI_QLOGIC_ISP=m +CONFIG_SCSI_QLOGIC_FC=m +# CONFIG_SCSI_QLOGIC_FC_FIRMWARE is not set +CONFIG_SCSI_QLOGIC_1280=m 
+CONFIG_SCSI_NEWISP=m +CONFIG_SCSI_SEAGATE=m +CONFIG_SCSI_SIM710=m +CONFIG_SCSI_SYM53C416=m +CONFIG_SCSI_DC390T=m +# CONFIG_SCSI_DC390T_NOGENSUPP is not set +CONFIG_SCSI_T128=m +CONFIG_SCSI_U14_34F=m +# CONFIG_SCSI_U14_34F_LINKED_COMMANDS is not set +CONFIG_SCSI_U14_34F_MAX_TAGS=8 +CONFIG_SCSI_ULTRASTOR=m +CONFIG_SCSI_NSP32=m +CONFIG_SCSI_DEBUG=m + +# +# PCMCIA SCSI adapter support +# +CONFIG_SCSI_PCMCIA=y +CONFIG_PCMCIA_AHA152X=m +CONFIG_PCMCIA_FDOMAIN=m +CONFIG_PCMCIA_NINJA_SCSI=m +CONFIG_PCMCIA_QLOGIC=m + +# +# Fusion MPT device support +# +CONFIG_FUSION=m +# CONFIG_FUSION_BOOT is not set +CONFIG_FUSION_MAX_SGE=40 +# CONFIG_FUSION_ISENSE is not set +CONFIG_FUSION_CTL=m +CONFIG_FUSION_LAN=m +CONFIG_NET_FC=y + +# +# IEEE 1394 (FireWire) support (EXPERIMENTAL) +# +CONFIG_IEEE1394=m +# CONFIG_IEEE1394_PCILYNX is not set +CONFIG_IEEE1394_OHCI1394=m +CONFIG_IEEE1394_VIDEO1394=m +CONFIG_IEEE1394_SBP2=m +CONFIG_IEEE1394_SBP2_PHYS_DMA=y +CONFIG_IEEE1394_ETH1394=m +CONFIG_IEEE1394_DV1394=m +CONFIG_IEEE1394_RAWIO=m +CONFIG_IEEE1394_CMP=m +CONFIG_IEEE1394_AMDTP=m +# CONFIG_IEEE1394_VERBOSEDEBUG is not set + +# +# I2O device support +# +CONFIG_I2O=m +CONFIG_I2O_PCI=m +CONFIG_I2O_BLOCK=m +CONFIG_I2O_LAN=m +CONFIG_I2O_SCSI=m +CONFIG_I2O_PROC=m + +# +# Network device support +# +CONFIG_NETDEVICES=y + +# +# ARCnet devices +# +# CONFIG_ARCNET is not set +CONFIG_DUMMY=m +CONFIG_BONDING=m +CONFIG_EQUALIZER=m +CONFIG_TUN=m +CONFIG_ETHERTAP=m +CONFIG_NET_SB1000=m + +# +# Ethernet (10 or 100Mbit) +# +CONFIG_NET_ETHERNET=y +# CONFIG_SUNLANCE is not set +CONFIG_HAPPYMEAL=m +# CONFIG_SUNBMAC is not set +# CONFIG_SUNQE is not set +CONFIG_SUNGEM=m +CONFIG_NET_VENDOR_3COM=y +CONFIG_EL1=m +CONFIG_EL2=m +CONFIG_ELPLUS=m +CONFIG_EL16=m +CONFIG_EL3=m +CONFIG_3C515=m +# CONFIG_ELMC is not set +# CONFIG_ELMC_II is not set +CONFIG_VORTEX=m +CONFIG_LANCE=m +CONFIG_NET_VENDOR_SMC=y +CONFIG_WD80x3=m +# CONFIG_ULTRAMCA is not set +CONFIG_ULTRA=m +CONFIG_ULTRA32=m +CONFIG_SMC9194=m 
+CONFIG_NET_VENDOR_RACAL=y +CONFIG_NI5010=m +CONFIG_NI52=m +CONFIG_NI65=m +CONFIG_AT1700=m +CONFIG_DEPCA=m +CONFIG_HP100=m +CONFIG_NET_ISA=y +CONFIG_E2100=m +CONFIG_EWRK3=m +CONFIG_EEXPRESS=m +CONFIG_EEXPRESS_PRO=m +CONFIG_HPLAN_PLUS=m +CONFIG_HPLAN=m +CONFIG_LP486E=m +CONFIG_ETH16I=m +CONFIG_NE2000=m +CONFIG_NET_PCI=y +CONFIG_PCNET32=m +CONFIG_AMD8111_ETH=m +CONFIG_ADAPTEC_STARFIRE=m +CONFIG_AC3200=m +CONFIG_APRICOT=m +CONFIG_CS89x0=m +CONFIG_TULIP=m +# CONFIG_TULIP_MWI is not set +CONFIG_TULIP_MMIO=y +CONFIG_DE4X5=m +CONFIG_DGRS=m +CONFIG_DM9102=m +CONFIG_EEPRO100=m +CONFIG_E100=m +CONFIG_LNE390=m +CONFIG_FEALNX=m +CONFIG_NATSEMI=m +CONFIG_NE2K_PCI=m +CONFIG_NE3210=m +CONFIG_ES3210=m +CONFIG_8139CP=m +CONFIG_8139TOO=m +# CONFIG_8139TOO_PIO is not set +# CONFIG_8139TOO_TUNE_TWISTER is not set +CONFIG_8139TOO_8129=y +# CONFIG_8139_OLD_RX_RESET is not set +CONFIG_SIS900=m +CONFIG_EPIC100=m +CONFIG_SUNDANCE=m +CONFIG_SUNDANCE_MMIO=y +CONFIG_TLAN=m +CONFIG_TC35815=m +CONFIG_VIA_RHINE=m +# CONFIG_VIA_RHINE_MMIO is not set +CONFIG_WINBOND_840=m +CONFIG_NET_POCKET=y +CONFIG_ATP=m +CONFIG_DE600=m +CONFIG_DE620=m + +# +# Ethernet (1000 Mbit) +# +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +CONFIG_DL2K=m +CONFIG_E1000=m +# CONFIG_MYRI_SBUS is not set +CONFIG_NS83820=m +CONFIG_HAMACHI=m +CONFIG_YELLOWFIN=m +CONFIG_R8169=m +CONFIG_SK98LIN=m +CONFIG_TIGON3=m +CONFIG_FDDI=y +CONFIG_DEFXX=m +CONFIG_SKFP=m +CONFIG_NETCONSOLE=m +# CONFIG_HIPPI is not set +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPP_FILTER=y +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_PPP_DEFLATE=m +# CONFIG_PPP_BSDCOMP is not set +# CONFIG_PPPOE is not set +CONFIG_PPPOATM=m +CONFIG_SLIP=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y + +# +# Wireless LAN (non-hamradio) +# +CONFIG_NET_RADIO=y +CONFIG_STRIP=m +CONFIG_WAVELAN=m +CONFIG_ARLAN=m +CONFIG_AIRONET4500=m +CONFIG_AIRONET4500_NONCS=m +CONFIG_AIRONET4500_PNP=y +CONFIG_AIRONET4500_PCI=y 
+CONFIG_AIRONET4500_ISA=y +CONFIG_AIRONET4500_I365=y +CONFIG_AIRONET4500_PROC=m +CONFIG_AIRO=m +CONFIG_HERMES=m +CONFIG_PLX_HERMES=m +CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +CONFIG_AIRO_CS=m +CONFIG_NET_WIRELESS=y +CONFIG_PCMCIA_HERMES_OLD=m + +# +# Token Ring devices +# +CONFIG_TR=y +CONFIG_IBMTR=m +CONFIG_IBMOL=m +CONFIG_IBMLS=m +CONFIG_3C359=m +CONFIG_TMS380TR=m +CONFIG_TMSPCI=m +CONFIG_TMSISA=m +CONFIG_ABYSS=m +# CONFIG_MADGEMC is not set +CONFIG_SMCTR=m +CONFIG_NET_FC=y +CONFIG_IPHASE5526=m +CONFIG_RCPCI=m +CONFIG_SHAPER=m + +# +# Wan interfaces +# +CONFIG_WAN=y +CONFIG_HOSTESS_SV11=m +CONFIG_COSA=m +# CONFIG_COMX is not set +# CONFIG_DSCC4 is not set +# CONFIG_LANMEDIA is not set +CONFIG_ATI_XX20=m +CONFIG_SEALEVEL_4021=m +# CONFIG_SYNCLINK_SYNCPPP is not set +# CONFIG_HDLC is not set +CONFIG_DLCI=m +CONFIG_DLCI_COUNT=24 +CONFIG_DLCI_MAX=8 +CONFIG_SDLA=m +CONFIG_WAN_ROUTER_DRIVERS=y +CONFIG_VENDOR_SANGOMA=m +CONFIG_WANPIPE_CHDLC=y +CONFIG_WANPIPE_FR=y +CONFIG_WANPIPE_X25=y +CONFIG_WANPIPE_PPP=y +CONFIG_WANPIPE_MULTPPP=y +CONFIG_CYCLADES_SYNC=m +CONFIG_CYCLOMX_X25=y +# CONFIG_LAPBETHER is not set +# CONFIG_X25_ASY is not set +CONFIG_SBNI=m +CONFIG_SBNI_MULTILINE=y + +# +# PCMCIA network device support +# +CONFIG_NET_PCMCIA=y +CONFIG_PCMCIA_3C589=m +CONFIG_PCMCIA_3C574=m +CONFIG_PCMCIA_FMVJ18X=m +CONFIG_PCMCIA_PCNET=m +CONFIG_PCMCIA_AXNET=m +CONFIG_PCMCIA_NMCLAN=m +CONFIG_PCMCIA_SMC91C92=m +CONFIG_PCMCIA_XIRC2PS=m +# CONFIG_ARCNET_COM20020_CS is not set +CONFIG_PCMCIA_IBMTR=m +CONFIG_PCMCIA_XIRCOM=m +CONFIG_PCMCIA_XIRTULIP=m +CONFIG_NET_PCMCIA_RADIO=y +CONFIG_PCMCIA_RAYCS=m +CONFIG_PCMCIA_NETWAVE=m +CONFIG_PCMCIA_WAVELAN=m +CONFIG_PCMCIA_WVLAN=m +CONFIG_AIRONET4500_CS=m + +# +# ATM drivers +# +CONFIG_ATM_TCP=m +CONFIG_ATM_LANAI=m +CONFIG_ATM_ENI=m +# CONFIG_ATM_ENI_DEBUG is not set +# CONFIG_ATM_ENI_TUNE_BURST is not set +CONFIG_ATM_FIRESTREAM=m +CONFIG_ATM_ZATM=m +# CONFIG_ATM_ZATM_DEBUG is not set +CONFIG_ATM_ZATM_EXACT_TS=y +CONFIG_ATM_NICSTAR=m 
+CONFIG_ATM_NICSTAR_USE_SUNI=y +CONFIG_ATM_NICSTAR_USE_IDT77105=y +CONFIG_ATM_IDT77252=m +# CONFIG_ATM_IDT77252_DEBUG is not set +# CONFIG_ATM_IDT77252_RCV_ALL is not set +CONFIG_ATM_IDT77252_USE_SUNI=y +CONFIG_ATM_AMBASSADOR=m +# CONFIG_ATM_AMBASSADOR_DEBUG is not set +CONFIG_ATM_HORIZON=m +# CONFIG_ATM_HORIZON_DEBUG is not set +CONFIG_ATM_IA=m +# CONFIG_ATM_IA_DEBUG is not set +CONFIG_ATM_FORE200E_MAYBE=m +CONFIG_ATM_FORE200E_PCA=y +CONFIG_ATM_FORE200E_PCA_DEFAULT_FW=y +CONFIG_ATM_FORE200E_TX_RETRY=16 +CONFIG_ATM_FORE200E_DEBUG=0 +CONFIG_ATM_FORE200E=m + +# +# Amateur Radio support +# +CONFIG_HAMRADIO=y +CONFIG_AX25=m +# CONFIG_AX25_DAMA_SLAVE is not set +CONFIG_NETROM=m +CONFIG_ROSE=m + +# +# AX.25 network device drivers +# +# CONFIG_MKISS is not set +# CONFIG_6PACK is not set +# CONFIG_BPQETHER is not set +# CONFIG_DMASCC is not set +# CONFIG_SCC is not set +# CONFIG_BAYCOM_SER_FDX is not set +# CONFIG_BAYCOM_SER_HDX is not set +# CONFIG_BAYCOM_PAR is not set +# CONFIG_BAYCOM_EPP is not set +CONFIG_SOUNDMODEM=m +CONFIG_SOUNDMODEM_SBC=y +CONFIG_SOUNDMODEM_WSS=y +CONFIG_SOUNDMODEM_AFSK1200=y +CONFIG_SOUNDMODEM_AFSK2400_7=y +CONFIG_SOUNDMODEM_AFSK2400_8=y +CONFIG_SOUNDMODEM_AFSK2666=y +CONFIG_SOUNDMODEM_HAPN4800=y +CONFIG_SOUNDMODEM_PSK4800=y +CONFIG_SOUNDMODEM_FSK9600=y +# CONFIG_YAM is not set + +# +# IrDA (infrared) support +# +CONFIG_IRDA=m +CONFIG_IRLAN=m +CONFIG_IRNET=m +CONFIG_IRCOMM=m +CONFIG_IRDA_ULTRA=y +CONFIG_IRDA_CACHE_LAST_LSAP=y +CONFIG_IRDA_FAST_RR=y +# CONFIG_IRDA_DEBUG is not set + +# +# Infrared-port device drivers +# +CONFIG_IRTTY_SIR=m +CONFIG_IRPORT_SIR=m +CONFIG_DONGLE=y +CONFIG_ESI_DONGLE=m +CONFIG_ACTISYS_DONGLE=m +CONFIG_TEKRAM_DONGLE=m +CONFIG_GIRBIL_DONGLE=m +CONFIG_LITELINK_DONGLE=m +CONFIG_MCP2120_DONGLE=m +CONFIG_OLD_BELKIN_DONGLE=m +CONFIG_ACT200L_DONGLE=m +CONFIG_MA600_DONGLE=m +CONFIG_USB_IRDA=m +CONFIG_NSC_FIR=m +CONFIG_WINBOND_FIR=m +CONFIG_TOSHIBA_OLD=m +CONFIG_TOSHIBA_FIR=m +CONFIG_SMC_IRCC_FIR=m +CONFIG_ALI_FIR=m 
+CONFIG_VLSI_FIR=m + +# +# ISDN subsystem +# +CONFIG_ISDN=m +CONFIG_ISDN_BOOL=y +CONFIG_ISDN_PPP=y +CONFIG_ISDN_PPP_VJ=y +CONFIG_ISDN_MPP=y +CONFIG_ISDN_PPP_BSDCOMP=m +CONFIG_ISDN_AUDIO=y +CONFIG_ISDN_TTY_FAX=y + +# +# ISDN feature submodules +# +CONFIG_ISDN_DRV_LOOP=m +# CONFIG_ISDN_DIVERSION is not set + +# +# Passive ISDN cards +# +CONFIG_ISDN_DRV_HISAX=m +CONFIG_ISDN_HISAX=y +CONFIG_HISAX_EURO=y +CONFIG_DE_AOC=y +# CONFIG_HISAX_NO_SENDCOMPLETE is not set +# CONFIG_HISAX_NO_LLC is not set +# CONFIG_HISAX_NO_KEYPAD is not set +CONFIG_HISAX_1TR6=y +CONFIG_HISAX_NI1=y +CONFIG_HISAX_MAX_CARDS=8 +CONFIG_HISAX_16_0=y +CONFIG_HISAX_16_3=y +CONFIG_HISAX_AVM_A1=y +CONFIG_HISAX_IX1MICROR2=y +CONFIG_HISAX_ASUSCOM=y +CONFIG_HISAX_TELEINT=y +CONFIG_HISAX_HFCS=y +CONFIG_HISAX_SPORTSTER=y +CONFIG_HISAX_MIC=y +CONFIG_HISAX_ISURF=y +CONFIG_HISAX_HSTSAPHIR=y +CONFIG_HISAX_TELESPCI=y +CONFIG_HISAX_S0BOX=y +CONFIG_HISAX_FRITZPCI=y +CONFIG_HISAX_AVM_A1_PCMCIA=y +CONFIG_HISAX_ELSA=y +CONFIG_HISAX_DIEHLDIVA=y +CONFIG_HISAX_SEDLBAUER=y +CONFIG_HISAX_NETJET=y +CONFIG_HISAX_NETJET_U=y +CONFIG_HISAX_NICCY=y +CONFIG_HISAX_BKM_A4T=y +CONFIG_HISAX_SCT_QUADRO=y +CONFIG_HISAX_GAZEL=y +CONFIG_HISAX_HFC_PCI=y +CONFIG_HISAX_W6692=y +CONFIG_HISAX_HFC_SX=y +CONFIG_HISAX_ENTERNOW_PCI=y +CONFIG_HISAX_DEBUG=y +CONFIG_HISAX_SEDLBAUER_CS=m +CONFIG_HISAX_ELSA_CS=m +CONFIG_HISAX_AVM_A1_CS=m +CONFIG_HISAX_ST5481=m +CONFIG_HISAX_FRITZ_PCIPNP=m + +# +# Active ISDN cards +# +CONFIG_ISDN_DRV_ICN=m +CONFIG_ISDN_DRV_PCBIT=m +# CONFIG_ISDN_DRV_SC is not set +# CONFIG_ISDN_DRV_ACT2000 is not set +CONFIG_ISDN_DRV_EICON=y +CONFIG_ISDN_DRV_EICON_DIVAS=m +# CONFIG_ISDN_DRV_EICON_OLD is not set +CONFIG_ISDN_DRV_TPAM=m +CONFIG_ISDN_CAPI=m +CONFIG_ISDN_DRV_AVMB1_VERBOSE_REASON=y +CONFIG_ISDN_CAPI_MIDDLEWARE=y +CONFIG_ISDN_CAPI_CAPI20=m +CONFIG_ISDN_CAPI_CAPIFS_BOOL=y +CONFIG_ISDN_CAPI_CAPIFS=m +CONFIG_ISDN_CAPI_CAPIDRV=m +CONFIG_ISDN_DRV_AVMB1_B1ISA=m +CONFIG_ISDN_DRV_AVMB1_B1PCI=m +CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y 
+CONFIG_ISDN_DRV_AVMB1_T1ISA=m +CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m +CONFIG_ISDN_DRV_AVMB1_AVM_CS=m +CONFIG_ISDN_DRV_AVMB1_T1PCI=m +CONFIG_ISDN_DRV_AVMB1_C4=m +CONFIG_HYSDN=m +CONFIG_HYSDN_CAPI=y +CONFIG_KALLSYMS=y + +# +# Old CD-ROM drivers (not SCSI, not IDE) +# +# CONFIG_CD_NO_IDESCSI is not set + +# +# Input core support +# +CONFIG_INPUT=m +CONFIG_INPUT_KEYBDEV=m +CONFIG_INPUT_MOUSEDEV=m +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_EVDEV=m + +# +# Character devices +# +CONFIG_VT=y +CONFIG_ECC=m +CONFIG_VT_CONSOLE=y +CONFIG_SERIAL=y +CONFIG_SERIAL_CONSOLE=y +CONFIG_SERIAL_EXTENDED=y +CONFIG_SERIAL_MANY_PORTS=y +CONFIG_SERIAL_SHARE_IRQ=y +# CONFIG_SERIAL_DETECT_IRQ is not set +CONFIG_SERIAL_MULTIPORT=y +# CONFIG_HUB6 is not set +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_COMPUTONE=m +CONFIG_ROCKETPORT=m +CONFIG_CYCLADES=m +# CONFIG_CYZ_INTR is not set +CONFIG_DIGIEPCA=m +CONFIG_ESPSERIAL=m +CONFIG_MOXA_INTELLIO=m +CONFIG_MOXA_SMARTIO=m +CONFIG_ISI=m +CONFIG_SYNCLINK=m +# CONFIG_SYNCLINKMP is not set +CONFIG_N_HDLC=m +CONFIG_RISCOM8=m +CONFIG_SPECIALIX=m +CONFIG_SPECIALIX_RTSCTS=y +CONFIG_SX=m +# CONFIG_RIO is not set +CONFIG_STALDRV=y +CONFIG_STALLION=m +CONFIG_ISTALLION=m +CONFIG_UNIX98_PTYS=y +CONFIG_UNIX98_PTY_COUNT=2048 +CONFIG_PRINTER=m +CONFIG_LP_CONSOLE=y +CONFIG_PPDEV=m +CONFIG_TIPAR=m + +# +# I2C support +# +CONFIG_I2C=m +CONFIG_I2C_ALGOBIT=m +CONFIG_I2C_PHILIPSPAR=m +CONFIG_I2C_ELV=m +CONFIG_I2C_VELLEMAN=m +# CONFIG_SCx200_I2C is not set +# CONFIG_SCx200_ACB is not set +CONFIG_I2C_ALGOPCF=m +CONFIG_I2C_ELEKTOR=m +CONFIG_I2C_MAINBOARD=y +CONFIG_I2C_ALI1535=m +CONFIG_I2C_ALI15X3=m +CONFIG_I2C_HYDRA=m +CONFIG_I2C_AMD756=m +# CONFIG_I2C_TSUNAMI is not set +CONFIG_I2C_I801=m +CONFIG_I2C_I810=m +CONFIG_I2C_PIIX4=m +CONFIG_I2C_SIS5595=m +CONFIG_I2C_VIA=m +CONFIG_I2C_VIAPRO=m +CONFIG_I2C_VOODOO3=m +CONFIG_I2C_ISA=m +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_PROC=m + +# +# Hardware sensors support +# 
+CONFIG_SENSORS=y +CONFIG_SENSORS_ADM1021=m +CONFIG_SENSORS_ADM1024=m +CONFIG_SENSORS_ADM1025=m +CONFIG_SENSORS_ADM9240=m +CONFIG_SENSORS_DS1621=m +CONFIG_SENSORS_FSCPOS=m +CONFIG_SENSORS_FSCSCY=m +CONFIG_SENSORS_GL518SM=m +CONFIG_SENSORS_GL520SM=m +CONFIG_SENSORS_MAXILIFE=m +CONFIG_SENSORS_IT87=m +CONFIG_SENSORS_MTP008=m +CONFIG_SENSORS_LM75=m +CONFIG_SENSORS_LM78=m +CONFIG_SENSORS_LM80=m +CONFIG_SENSORS_LM87=m +CONFIG_SENSORS_LM92=m +CONFIG_SENSORS_SIS5595=m +CONFIG_SENSORS_SMSC47M1=m +CONFIG_SENSORS_THMC50=m +CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_VT1211=m +CONFIG_SENSORS_VT8231=m +CONFIG_SENSORS_W83781D=m +CONFIG_SENSORS_OTHER=y +CONFIG_SENSORS_BT869=m +CONFIG_SENSORS_DDCMON=m +CONFIG_SENSORS_EEPROM=m +CONFIG_SENSORS_MATORB=m +CONFIG_SENSORS_PCF8574=m +CONFIG_SENSORS_PCF8591=m + +# +# Mice +# +CONFIG_BUSMOUSE=m +CONFIG_ATIXL_BUSMOUSE=m +CONFIG_LOGIBUSMOUSE=m +CONFIG_MS_BUSMOUSE=m +CONFIG_MOUSE=y +CONFIG_PSMOUSE=y +CONFIG_82C710_MOUSE=m +CONFIG_PC110_PAD=m +CONFIG_MK712_MOUSE=m + +# +# Joysticks +# +CONFIG_INPUT_GAMEPORT=m +CONFIG_INPUT_NS558=m +CONFIG_INPUT_LIGHTNING=m +CONFIG_INPUT_PCIGAME=m +CONFIG_INPUT_CS461X=m +CONFIG_INPUT_EMU10K1=m +CONFIG_INPUT_SERIO=m +CONFIG_INPUT_SERPORT=m +CONFIG_INPUT_ANALOG=m +CONFIG_INPUT_A3D=m +CONFIG_INPUT_ADI=m +CONFIG_INPUT_COBRA=m +CONFIG_INPUT_GF2K=m +CONFIG_INPUT_GRIP=m +CONFIG_INPUT_INTERACT=m +CONFIG_INPUT_TMDC=m +CONFIG_INPUT_SIDEWINDER=m +CONFIG_INPUT_IFORCE_USB=m +CONFIG_INPUT_IFORCE_232=m +CONFIG_INPUT_WARRIOR=m +CONFIG_INPUT_MAGELLAN=m +CONFIG_INPUT_SPACEORB=m +CONFIG_INPUT_SPACEBALL=m +CONFIG_INPUT_STINGER=m +CONFIG_INPUT_DB9=m +CONFIG_INPUT_GAMECON=m +CONFIG_INPUT_TURBOGRAFX=m +# CONFIG_QIC02_TAPE is not set +CONFIG_IPMI_HANDLER=m +# CONFIG_IPMI_PANIC_EVENT is not set +CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_KCS=m +CONFIG_IPMI_WATCHDOG=m + +# +# Watchdog Cards +# +CONFIG_WATCHDOG=y +# CONFIG_WATCHDOG_NOWAYOUT is not set +CONFIG_ACQUIRE_WDT=m +CONFIG_ADVANTECH_WDT=m +CONFIG_ALIM7101_WDT=m +CONFIG_SC520_WDT=m 
+CONFIG_PCWATCHDOG=m +CONFIG_EUROTECH_WDT=m +CONFIG_IB700_WDT=m +CONFIG_WAFER_WDT=m +CONFIG_I810_TCO=m +# CONFIG_MIXCOMWD is not set +# CONFIG_60XX_WDT is not set +CONFIG_SC1200_WDT=m +# CONFIG_SCx200_WDT is not set +CONFIG_SOFT_WATCHDOG=m +CONFIG_W83877F_WDT=m +CONFIG_WDT=m +CONFIG_WDTPCI=m +# CONFIG_WDT_501 is not set +CONFIG_MACHZ_WDT=m +CONFIG_AMD7XX_TCO=m +# CONFIG_SCx200_GPIO is not set +CONFIG_AMD_RNG=m +CONFIG_INTEL_RNG=m +CONFIG_AMD_PM768=m +CONFIG_NVRAM=m +CONFIG_RTC=y +CONFIG_DTLK=m +CONFIG_R3964=m +# CONFIG_APPLICOM is not set +CONFIG_SONYPI=m + +# +# Ftape, the floppy tape device driver +# +CONFIG_FTAPE=m +CONFIG_ZFTAPE=m +CONFIG_ZFT_DFLT_BLK_SZ=10240 +CONFIG_ZFT_COMPRESSOR=m +CONFIG_FT_NR_BUFFERS=3 +# CONFIG_FT_PROC_FS is not set +CONFIG_FT_NORMAL_DEBUG=y +# CONFIG_FT_FULL_DEBUG is not set +# CONFIG_FT_NO_TRACE is not set +# CONFIG_FT_NO_TRACE_AT_ALL is not set +CONFIG_FT_STD_FDC=y +# CONFIG_FT_MACH2 is not set +# CONFIG_FT_PROBE_FC10 is not set +# CONFIG_FT_ALT_FDC is not set +CONFIG_FT_FDC_THR=8 +CONFIG_FT_FDC_MAX_RATE=2000 +CONFIG_FT_ALPHA_CLOCK=0 +CONFIG_AGP=m +CONFIG_AGP_INTEL=y +CONFIG_AGP_I810=y +CONFIG_AGP_VIA=y +CONFIG_AGP_AMD=y +CONFIG_AGP_AMD_8151=y +CONFIG_AGP_SIS=y +CONFIG_AGP_ALI=y +CONFIG_AGP_SWORKS=y +CONFIG_DRM=y +# CONFIG_DRM_OLD is not set +CONFIG_DRM_NEW=y +CONFIG_DRM_TDFX=m +CONFIG_DRM_R128=m +CONFIG_DRM_RADEON=m +CONFIG_DRM_I810=m +# CONFIG_DRM_I810_XFREE_41 is not set +CONFIG_DRM_I830=m +CONFIG_DRM_MGA=m +CONFIG_DRM_SIS=m + +# +# PCMCIA character devices +# +CONFIG_PCMCIA_SERIAL_CS=m +CONFIG_SYNCLINK_CS=m +CONFIG_MWAVE=m +CONFIG_BATTERY_GERICOM=m + +# +# Multimedia devices +# +CONFIG_VIDEO_DEV=m + +# +# Video For Linux +# +CONFIG_VIDEO_PROC_FS=y +CONFIG_I2C_PARPORT=m +CONFIG_VIDEO_BT848=m +CONFIG_VIDEO_PMS=m +CONFIG_VIDEO_BWQCAM=m +CONFIG_VIDEO_CQCAM=m +CONFIG_VIDEO_W9966=m +CONFIG_VIDEO_CPIA=m +CONFIG_VIDEO_CPIA_PP=m +CONFIG_VIDEO_CPIA_USB=m +CONFIG_VIDEO_SAA5249=m +CONFIG_TUNER_3036=m +CONFIG_VIDEO_STRADIS=m 
+CONFIG_VIDEO_ZORAN=m +CONFIG_VIDEO_ZORAN_BUZ=m +CONFIG_VIDEO_ZORAN_DC10=m +CONFIG_VIDEO_ZORAN_LML33=m +CONFIG_VIDEO_ZR36120=m +CONFIG_VIDEO_MEYE=m + +# +# Radio Adapters +# +CONFIG_RADIO_CADET=m +CONFIG_RADIO_RTRACK=m +CONFIG_RADIO_RTRACK2=m +CONFIG_RADIO_AZTECH=m +CONFIG_RADIO_GEMTEK=m +CONFIG_RADIO_GEMTEK_PCI=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_MAESTRO=m +CONFIG_RADIO_MIROPCM20=m +CONFIG_RADIO_MIROPCM20_RDS=m +CONFIG_RADIO_SF16FMI=m +CONFIG_RADIO_SF16FMR2=m +CONFIG_RADIO_TERRATEC=m +CONFIG_RADIO_TRUST=m +CONFIG_RADIO_TYPHOON=m +CONFIG_RADIO_TYPHOON_PROC_FS=y +CONFIG_RADIO_ZOLTRIX=m + +# +# Crypto Hardware support +# +CONFIG_CRYPTO=m +CONFIG_CRYPTO_BROADCOM=m + +# +# File systems +# +CONFIG_QUOTA=y +# CONFIG_QFMT_V1 is not set +CONFIG_QFMT_V2=y +# CONFIG_QIFACE_COMPAT is not set +CONFIG_AUTOFS_FS=m +CONFIG_AUTOFS4_FS=m +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +# CONFIG_ADFS_FS is not set +CONFIG_AFS_FS=m +# CONFIG_ADFS_FS_RW is not set +# CONFIG_AFFS_FS is not set +CONFIG_HFS_FS=m +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +CONFIG_BFS_FS=m +CONFIG_EXT3_FS=m +CONFIG_EXT3_FS_XATTR=y +CONFIG_EXT3_FS_XATTR_SHARING=y +CONFIG_EXT3_FS_XATTR_USER=y +CONFIG_JBD=m +# CONFIG_JBD_DEBUG is not set +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_UMSDOS_FS=m +CONFIG_VFAT_FS=m +# CONFIG_EFS_FS is not set +# CONFIG_JFFS_FS is not set +# CONFIG_JFFS2_FS is not set +CONFIG_CRAMFS=m +CONFIG_TMPFS=y +CONFIG_RAMFS=y +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_JFS_FS=m +CONFIG_JFS_DEBUG=y +# CONFIG_JFS_STATISTICS is not set +CONFIG_MINIX_FS=m +CONFIG_VXFS_FS=m +# CONFIG_NTFS_FS is not set +# CONFIG_NTFS_RW is not set +# CONFIG_HPFS_FS is not set +CONFIG_PROC_FS=y +# CONFIG_DEVFS_FS is not set +# CONFIG_DEVFS_MOUNT is not set +# CONFIG_DEVFS_DEBUG is not set +CONFIG_DEVPTS_FS=y +# CONFIG_QNX4FS_FS is not set +# CONFIG_QNX4FS_RW is not set +CONFIG_ROMFS_FS=m +CONFIG_EXT2_FS=y +# CONFIG_EXT2_FS_XATTR is not set +# 
CONFIG_EXT2_FS_XATTR_SHARING is not set +# CONFIG_EXT2_FS_XATTR_USER is not set +CONFIG_SYSV_FS=m +CONFIG_UDF_FS=m +CONFIG_UDF_RW=y +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set + +# +# Network File Systems +# +CONFIG_CODA_FS=m +CONFIG_INTERMEZZO_FS=m +CONFIG_NFS_FS=m +CONFIG_NFS_V3=y +# CONFIG_ROOT_NFS is not set +CONFIG_NFSD=m +CONFIG_NFSD_V3=y +# CONFIG_NFSD_TCP is not set +CONFIG_SUNRPC=m +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +CONFIG_SMB_FS=m +# CONFIG_SMB_NLS_DEFAULT is not set +CONFIG_NCP_FS=m +CONFIG_NCPFS_PACKET_SIGNING=y +CONFIG_NCPFS_IOCTL_LOCKING=y +CONFIG_NCPFS_STRONG=y +CONFIG_NCPFS_NFS_NS=y +CONFIG_NCPFS_OS2_NS=y +CONFIG_NCPFS_SMALLDOS=y +CONFIG_NCPFS_NLS=y +CONFIG_NCPFS_EXTRAS=y +CONFIG_ZISOFS_FS=y +CONFIG_FS_MBCACHE=y + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_OSF_PARTITION=y +# CONFIG_AMIGA_PARTITION is not set +# CONFIG_ATARI_PARTITION is not set +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +# CONFIG_LDM_PARTITION is not set +CONFIG_SGI_PARTITION=y +# CONFIG_ULTRIX_PARTITION is not set +CONFIG_SUN_PARTITION=y +# CONFIG_EFI_PARTITION is not set +CONFIG_SMB_NLS=y +CONFIG_NLS=y + +# +# Native Language Support +# +CONFIG_NLS_DEFAULT="iso8859-1" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ISO8859_1=m 
+CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_UTF8=m + +# +# Console drivers +# +CONFIG_VGA_CONSOLE=y +CONFIG_VIDEO_SELECT=y +# CONFIG_VIDEO_IGNORE_BAD_MODE is not set +CONFIG_MDA_CONSOLE=m + +# +# Frame-buffer support +# +CONFIG_FB=y +CONFIG_DUMMY_CONSOLE=y +CONFIG_FB_RIVA=m +CONFIG_FB_CLGEN=m +CONFIG_FB_PM2=m +# CONFIG_FB_PM2_FIFO_DISCONNECT is not set +CONFIG_FB_PM2_PCI=y +CONFIG_FB_PM3=m +# CONFIG_FB_CYBER2000 is not set +CONFIG_FB_VESA=y +CONFIG_FB_VGA16=m +CONFIG_FB_HGA=m +CONFIG_VIDEO_SELECT=y +CONFIG_FB_MATROX=m +CONFIG_FB_MATROX_MILLENIUM=y +CONFIG_FB_MATROX_MYSTIQUE=y +# CONFIG_FB_MATROX_G450 is not set +CONFIG_FB_MATROX_G100A=y +CONFIG_FB_MATROX_G100=y +CONFIG_FB_MATROX_I2C=m +CONFIG_FB_MATROX_MAVEN=m +# CONFIG_FB_MATROX_PROC is not set +CONFIG_FB_MATROX_MULTIHEAD=y +CONFIG_FB_ATY=m +CONFIG_FB_ATY_GX=y +CONFIG_FB_ATY_CT=y +CONFIG_FB_ATY_CT_VAIO_LCD=y +CONFIG_FB_RADEON=m +CONFIG_FB_ATY128=m +CONFIG_FB_SIS=m +CONFIG_FB_SIS_300=y +CONFIG_FB_SIS_315=y +CONFIG_FB_NEOMAGIC=m +CONFIG_FB_3DFX=m +CONFIG_FB_VOODOO1=m +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_VIRTUAL is not set +# CONFIG_FBCON_ADVANCED is not set +CONFIG_FBCON_MFB=m +CONFIG_FBCON_CFB8=y +CONFIG_FBCON_CFB16=y +CONFIG_FBCON_CFB24=y +CONFIG_FBCON_CFB32=y +CONFIG_FBCON_VGA_PLANES=m +CONFIG_FBCON_HGA=m +# CONFIG_FBCON_FONTWIDTH8_ONLY is not set +# CONFIG_FBCON_FONTS is not set +CONFIG_FONT_8x8=y +CONFIG_FONT_8x16=y + +# +# Sound +# +CONFIG_SOUND=m +CONFIG_SOUND_ALI5455=m +CONFIG_SOUND_BT878=m +CONFIG_SOUND_CMPCI=m +CONFIG_SOUND_CMPCI_FM=y +CONFIG_SOUND_CMPCI_FMIO=388 +CONFIG_SOUND_CMPCI_FMIO=388 +CONFIG_SOUND_CMPCI_MIDI=y +CONFIG_SOUND_CMPCI_MPUIO=330 +CONFIG_SOUND_CMPCI_JOYSTICK=y +CONFIG_SOUND_CMPCI_CM8738=y +# CONFIG_SOUND_CMPCI_SPDIFINVERSE is not 
set +CONFIG_SOUND_CMPCI_SPDIFLOOP=y +CONFIG_SOUND_CMPCI_SPEAKERS=2 +CONFIG_SOUND_EMU10K1=m +CONFIG_MIDI_EMU10K1=y +CONFIG_SOUND_AUDIGY=m +CONFIG_SOUND_FUSION=m +CONFIG_SOUND_CS4281=m +CONFIG_SOUND_ES1370=m +CONFIG_SOUND_ES1371=m +CONFIG_SOUND_ESSSOLO1=m +CONFIG_SOUND_MAESTRO=m +CONFIG_SOUND_MAESTRO3=m +CONFIG_SOUND_FORTE=m +CONFIG_SOUND_ICH=m +CONFIG_SOUND_RME96XX=m +CONFIG_SOUND_SONICVIBES=m +CONFIG_SOUND_TRIDENT=m +CONFIG_SOUND_MSNDCLAS=m +# CONFIG_MSNDCLAS_HAVE_BOOT is not set +CONFIG_MSNDCLAS_INIT_FILE="/etc/sound/msndinit.bin" +CONFIG_MSNDCLAS_PERM_FILE="/etc/sound/msndperm.bin" +CONFIG_SOUND_MSNDPIN=m +# CONFIG_MSNDPIN_HAVE_BOOT is not set +CONFIG_MSNDPIN_INIT_FILE="/etc/sound/pndspini.bin" +CONFIG_MSNDPIN_PERM_FILE="/etc/sound/pndsperm.bin" +CONFIG_SOUND_VIA82CXXX=m +CONFIG_MIDI_VIA82CXXX=y +CONFIG_SOUND_OSS=m +# CONFIG_SOUND_TRACEINIT is not set +CONFIG_SOUND_DMAP=y +CONFIG_SOUND_AD1816=m +CONFIG_SOUND_AD1889=m +CONFIG_SOUND_SGALAXY=m +CONFIG_SOUND_ADLIB=m +CONFIG_SOUND_ACI_MIXER=m +CONFIG_SOUND_CS4232=m +CONFIG_SOUND_SSCAPE=m +CONFIG_SOUND_GUS=m +CONFIG_SOUND_GUS16=y +CONFIG_SOUND_GUSMAX=y +CONFIG_SOUND_VMIDI=m +CONFIG_SOUND_TRIX=m +CONFIG_SOUND_MSS=m +CONFIG_SOUND_MPU401=m +CONFIG_SOUND_NM256=m +CONFIG_SOUND_MAD16=m +CONFIG_MAD16_OLDCARD=y +CONFIG_SOUND_PAS=m +# CONFIG_PAS_JOYSTICK is not set +CONFIG_SOUND_PSS=m +# CONFIG_PSS_MIXER is not set +# CONFIG_PSS_HAVE_BOOT is not set +CONFIG_SOUND_SB=m +CONFIG_SOUND_AWE32_SYNTH=m +CONFIG_SOUND_WAVEFRONT=m +CONFIG_SOUND_MAUI=m +CONFIG_SOUND_YM3812=m +CONFIG_SOUND_OPL3SA1=m +CONFIG_SOUND_OPL3SA2=m +CONFIG_SOUND_YMFPCI=m +CONFIG_SOUND_YMFPCI_LEGACY=y +CONFIG_SOUND_UART6850=m +CONFIG_SOUND_AEDSP16=m +CONFIG_SC6600=y +CONFIG_SC6600_JOY=y +CONFIG_SC6600_CDROM=4 +CONFIG_SC6600_CDROMBASE=0 +CONFIG_AEDSP16_SBPRO=y +CONFIG_AEDSP16_MPU401=y +CONFIG_SOUND_TVMIXER=m + +# +# USB support +# +CONFIG_USB=m +# CONFIG_USB_DEBUG is not set +CONFIG_USB_DEVICEFS=y +# CONFIG_USB_BANDWIDTH is not set +CONFIG_USB_EHCI_HCD=m 
+CONFIG_USB_UHCI=m +CONFIG_USB_UHCI_ALT=m +CONFIG_USB_OHCI=m +CONFIG_USB_AUDIO=m +# CONFIG_USB_EMI26 is not set +CONFIG_USB_MIDI=m +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_DATAFAB=y +CONFIG_USB_STORAGE_FREECOM=y +CONFIG_USB_STORAGE_ISD200=y +CONFIG_USB_STORAGE_DPCM=y +CONFIG_USB_STORAGE_HP8200e=y +CONFIG_USB_STORAGE_SDDR09=y +CONFIG_USB_STORAGE_SDDR55=y +CONFIG_USB_STORAGE_JUMPSHOT=y +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_HID=m +CONFIG_USB_HIDINPUT=y +CONFIG_USB_HIDDEV=y +# CONFIG_USB_KBD is not set +# CONFIG_USB_MOUSE is not set +CONFIG_USB_AIPTEK=m +CONFIG_USB_WACOM=m +CONFIG_USB_POWERMATE=m +# CONFIG_USB_DC2XX is not set +CONFIG_USB_MDC800=m +CONFIG_USB_SCANNER=m +CONFIG_USB_MICROTEK=m +CONFIG_USB_HPUSBSCSI=m +CONFIG_USB_IBMCAM=m +CONFIG_USB_OV511=m +CONFIG_USB_PWC=m +CONFIG_USB_SE401=m +CONFIG_USB_STV680=m +CONFIG_USB_VICAM=m +CONFIG_USB_DSBR=m +CONFIG_USB_DABUSB=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_KAWETH=m +CONFIG_USB_CATC=m +CONFIG_USB_CDCETHER=m +CONFIG_USB_USBNET=m +CONFIG_USB_USS720=m + +# +# USB Serial Converter support +# +CONFIG_USB_SERIAL=m +# CONFIG_USB_SERIAL_DEBUG is not set +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_WHITEHEAT=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +# CONFIG_USB_SERIAL_KEYSPAN_USA28 is not set +# CONFIG_USB_SERIAL_KEYSPAN_USA28X is not set +CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y +CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y +# CONFIG_USB_SERIAL_KEYSPAN_USA19 is not set +# CONFIG_USB_SERIAL_KEYSPAN_USA18X is not set +# CONFIG_USB_SERIAL_KEYSPAN_USA19W is not set +CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y +CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y +# CONFIG_USB_SERIAL_KEYSPAN_USA49W is not set 
+CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_RIO500=m +CONFIG_USB_AUERSWALD=m +CONFIG_USB_TIGL=m +CONFIG_USB_BRLVGER=m +CONFIG_USB_LCD=m + +# +# Additional device driver support +# +CONFIG_NET_BROADCOM=m +CONFIG_CIPE=m +CONFIG_CRYPTO_AEP=m +CONFIG_MEGARAC=m +CONFIG_FC_QLA2200=m +CONFIG_FC_QLA2300=m +CONFIG_SCSI_ISCSI=m + +# +# Bluetooth support +# +CONFIG_BLUEZ=m +CONFIG_BLUEZ_L2CAP=m +CONFIG_BLUEZ_SCO=m +CONFIG_BLUEZ_RFCOMM=m +CONFIG_BLUEZ_RFCOMM_TTY=y +CONFIG_BLUEZ_BNEP=m +CONFIG_BLUEZ_BNEP_MC_FILTER=y +CONFIG_BLUEZ_BNEP_PROTO_FILTER=y + +# +# Bluetooth device drivers +# +CONFIG_BLUEZ_HCIUSB=m +CONFIG_BLUEZ_USB_ZERO_PACKET=y +CONFIG_BLUEZ_HCIUART=m +CONFIG_BLUEZ_HCIUART_H4=y +CONFIG_BLUEZ_HCIUART_BCSP=y +CONFIG_BLUEZ_HCIUART_BCSP_TXCRC=y +CONFIG_BLUEZ_HCIDTL1=m +CONFIG_BLUEZ_HCIBT3C=m +CONFIG_BLUEZ_HCIBLUECARD=m +CONFIG_BLUEZ_HCIBTUART=m +CONFIG_BLUEZ_HCIVHCI=m + +# +# Profiling support +# +# CONFIG_PROFILING is not set + +# +# Kernel hacking +# +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_STACKOVERFLOW=y +# CONFIG_DEBUG_HIGHMEM is not set +# CONFIG_DEBUG_SLAB is not set +# CONFIG_DEBUG_IOVIRT is not set +CONFIG_MAGIC_SYSRQ=y +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_FRAME_POINTER is not set +CONFIG_MCL_COREDUMP=y +CONFIG_BOOTIMG=y + +# +# Library routines +# +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=y diff --git a/lustre/kernel_patches/kernel_configs/config-linux-2.4.20-uml b/lustre/kernel_patches/kernel_configs/config-linux-2.4.20-uml new file mode 100644 index 0000000..2d4a2d5 --- /dev/null +++ b/lustre/kernel_patches/kernel_configs/config-linux-2.4.20-uml @@ -0,0 +1,297 @@ +# +# Automatically generated make config: don't edit +# +CONFIG_USERMODE=y +# CONFIG_ISA is not set +# CONFIG_SBUS is not set +# CONFIG_PCI is not set +CONFIG_UID16=y +CONFIG_RWSEM_XCHGADD_ALGORITHM=y + +# +# Code maturity 
level options +# +CONFIG_EXPERIMENTAL=y + +# +# General Setup +# +# CONFIG_MODE_SKAS is not set +CONFIG_MODE_TT=y +CONFIG_MODE_TT=y +CONFIG_NET=y +CONFIG_SYSVIPC=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_SYSCTL=y +# CONFIG_BINFMT_AOUT is not set +CONFIG_BINFMT_ELF=y +CONFIG_BINFMT_MISC=y +CONFIG_HOSTFS=y +# CONFIG_HPPFS is not set +CONFIG_MCONSOLE=y +CONFIG_MAGIC_SYSRQ=y +# CONFIG_HOST_2G_2G is not set +# CONFIG_UML_SMP is not set +# CONFIG_SMP is not set +CONFIG_NEST_LEVEL=0 +CONFIG_KERNEL_HALF_GIGS=1 +# CONFIG_HIGHMEM is not set +# CONFIG_PROC_MM is not set +CONFIG_KERNEL_STACK_ORDER=2 +CONFIG_MODE_TT=y +# +# Loadable module support +# +CONFIG_MODULES=y +# CONFIG_KMOD is not set + +# +# Character Devices +# +CONFIG_STDIO_CONSOLE=y +CONFIG_SSL=y +CONFIG_FD_CHAN=y +# CONFIG_NULL_CHAN is not set +CONFIG_PORT_CHAN=y +CONFIG_PTY_CHAN=y +CONFIG_TTY_CHAN=y +CONFIG_XTERM_CHAN=y +CONFIG_CON_ZERO_CHAN="fd:0,fd:1" +CONFIG_CON_CHAN="xterm" +CONFIG_SSL_CHAN="pty" +CONFIG_UNIX98_PTYS=y +CONFIG_UNIX98_PTY_COUNT=256 +# CONFIG_WATCHDOG is not set +# CONFIG_WATCHDOG_NOWAYOUT is not set +# CONFIG_SOFT_WATCHDOG is not set +# CONFIG_UML_WATCHDOG is not set +# CONFIG_UML_SOUND is not set +# CONFIG_SOUND is not set +# CONFIG_HOSTAUDIO is not set +# CONFIG_TTY_LOG is not set + +# +# Block Devices +# +CONFIG_BLK_DEV_UBD=y +# CONFIG_BLK_DEV_UBD_SYNC is not set +CONFIG_BLK_DEV_LOOP=y +# CONFIG_BLK_DEV_NBD is not set +# CONFIG_BLK_DEV_RAM is not set +# CONFIG_BLK_DEV_INITRD is not set +# CONFIG_MMAPPER is not set +CONFIG_NETDEVICES=y + +# +# Network Devices +# +CONFIG_UML_NET=y +CONFIG_UML_NET_ETHERTAP=y +CONFIG_UML_NET_TUNTAP=y +CONFIG_UML_NET_SLIP=y +# CONFIG_UML_NET_SLIRP is not set +CONFIG_UML_NET_DAEMON=y +CONFIG_UML_NET_MCAST=y +# CONFIG_UML_NET_PCAP is not set +CONFIG_DUMMY=y +# CONFIG_BONDING is not set +# CONFIG_EQUALIZER is not set +CONFIG_TUN=y +CONFIG_PPP=y +# CONFIG_PPP_MULTILINK is not set +# CONFIG_PPP_FILTER is not set +# CONFIG_PPP_ASYNC is not set +# CONFIG_PPP_SYNC_TTY is not 
set +# CONFIG_PPP_DEFLATE is not set +# CONFIG_PPP_BSDCOMP is not set +# CONFIG_PPPOE is not set +CONFIG_SLIP=y +# CONFIG_SLIP_COMPRESSED is not set +# CONFIG_SLIP_SMART is not set +# CONFIG_SLIP_MODE_SLIP6 is not set + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_MMAP=y +# CONFIG_NETLINK_DEV is not set +# CONFIG_NETFILTER is not set +# CONFIG_FILTER is not set +CONFIG_UNIX=y +CONFIG_INET=y +# CONFIG_IP_MULTICAST is not set +# CONFIG_IP_ADVANCED_ROUTER is not set +# CONFIG_IP_PNP is not set +# CONFIG_NET_IPIP is not set +# CONFIG_NET_IPGRE is not set +# CONFIG_ARPD is not set +# CONFIG_INET_ECN is not set +# CONFIG_SYN_COOKIES is not set +# CONFIG_IPV6 is not set +# CONFIG_KHTTPD is not set +# CONFIG_ATM is not set +# CONFIG_VLAN_8021Q is not set + +# +# +# +# CONFIG_IPX is not set +# CONFIG_ATALK is not set + +# +# Appletalk devices +# +# CONFIG_DEV_APPLETALK is not set +# CONFIG_DECNET is not set +# CONFIG_BRIDGE is not set +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_LLC is not set +# CONFIG_NET_DIVERT is not set +# CONFIG_ECONET is not set +# CONFIG_WAN_ROUTER is not set +# CONFIG_NET_FASTROUTE is not set +# CONFIG_NET_HW_FLOWCONTROL is not set + +# +# QoS and/or fair queueing +# +# CONFIG_NET_SCHED is not set + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set + +# +# File systems +# +CONFIG_QUOTA=y +# CONFIG_AUTOFS_FS is not set +CONFIG_AUTOFS4_FS=m +CONFIG_REISERFS_FS=y +# CONFIG_REISERFS_CHECK is not set +# CONFIG_REISERFS_PROC_INFO is not set +# CONFIG_ADFS_FS is not set +# CONFIG_ADFS_FS_RW is not set +# CONFIG_AFFS_FS is not set +# CONFIG_HFS_FS is not set +# CONFIG_BEFS_FS is not set +# CONFIG_BEFS_DEBUG is not set +# CONFIG_BFS_FS is not set +CONFIG_EXT3_FS=y +CONFIG_EXT3_FS_XATTR=y +# CONFIG_EXT3_FS_XATTR_SHARING is not set +# CONFIG_EXT3_FS_XATTR_USER is not set +CONFIG_JBD=y +# CONFIG_JBD_DEBUG is not set +# CONFIG_FAT_FS is not set +# CONFIG_MSDOS_FS is not set +# CONFIG_UMSDOS_FS is not set +# CONFIG_VFAT_FS 
is not set +# CONFIG_EFS_FS is not set +# CONFIG_JFFS_FS is not set +# CONFIG_JFFS2_FS is not set +# CONFIG_CRAMFS is not set +CONFIG_TMPFS=y +CONFIG_RAMFS=y +# CONFIG_ISO9660_FS is not set +# CONFIG_JOLIET is not set +# CONFIG_ZISOFS is not set +# CONFIG_JFS_FS is not set +# CONFIG_JFS_DEBUG is not set +# CONFIG_JFS_STATISTICS is not set +# CONFIG_MINIX_FS is not set +# CONFIG_VXFS_FS is not set +# CONFIG_NTFS_FS is not set +# CONFIG_NTFS_RW is not set +# CONFIG_HPFS_FS is not set +CONFIG_PROC_FS=y +CONFIG_DEVFS_FS=y +CONFIG_DEVFS_MOUNT=y +# CONFIG_DEVFS_DEBUG is not set +CONFIG_DEVPTS_FS=y +# CONFIG_QNX4FS_FS is not set +# CONFIG_QNX4FS_RW is not set +# CONFIG_ROMFS_FS is not set +CONFIG_EXT2_FS=y +# CONFIG_EXT2_FS_XATTR is not set +# CONFIG_EXT2_FS_XATTR_SHARING is not set +# CONFIG_EXT2_FS_XATTR_USER is not set +# CONFIG_SYSV_FS is not set +# CONFIG_UDF_FS is not set +# CONFIG_UDF_RW is not set +# CONFIG_UFS_FS is not set +# CONFIG_UFS_FS_WRITE is not set + +# +# Network File Systems +# +# CONFIG_CODA_FS is not set +# CONFIG_INTERMEZZO_FS is not set +# CONFIG_NFS_FS is not set +# CONFIG_NFS_V3 is not set +# CONFIG_ROOT_NFS is not set +# CONFIG_NFSD is not set +# CONFIG_NFSD_V3 is not set +# CONFIG_NFSD_TCP is not set +# CONFIG_SUNRPC is not set +# CONFIG_LOCKD is not set +# CONFIG_SMB_FS is not set +# CONFIG_NCP_FS is not set +# CONFIG_NCPFS_PACKET_SIGNING is not set +# CONFIG_NCPFS_IOCTL_LOCKING is not set +# CONFIG_NCPFS_STRONG is not set +# CONFIG_NCPFS_NFS_NS is not set +# CONFIG_NCPFS_OS2_NS is not set +# CONFIG_NCPFS_SMALLDOS is not set +# CONFIG_NCPFS_NLS is not set +# CONFIG_NCPFS_EXTRAS is not set +# CONFIG_ZISOFS_FS is not set +CONFIG_FS_MBCACHE=y + +# +# Partition Types +# +# CONFIG_PARTITION_ADVANCED is not set +CONFIG_MSDOS_PARTITION=y +# CONFIG_SMB_NLS is not set +# CONFIG_NLS is not set + +# +# SCSI support +# +# CONFIG_SCSI is not set + +# +# Multi-device support (RAID and LVM) +# +# CONFIG_MD is not set +# CONFIG_BLK_DEV_MD is not set +# 
CONFIG_MD_LINEAR is not set +# CONFIG_MD_RAID0 is not set +# CONFIG_MD_RAID1 is not set +# CONFIG_MD_RAID5 is not set +# CONFIG_MD_MULTIPATH is not set +# CONFIG_BLK_DEV_LVM is not set + +# +# Memory Technology Devices (MTD) +# +# CONFIG_MTD is not set + +# +# Library routines +# +# CONFIG_ZLIB_INFLATE is not set +# CONFIG_ZLIB_DEFLATE is not set + +# +# Kernel hacking +# +CONFIG_DEBUG_SLAB=y +CONFIG_DEBUGSYM=y +CONFIG_PT_PROXY=y +# CONFIG_GPROF is not set +# CONFIG_GCOV is not set diff --git a/lustre/kernel_patches/kernel_configs/jdike-2.5.69-uml.config b/lustre/kernel_patches/kernel_configs/jdike-2.5.69-uml.config new file mode 100644 index 0000000..4aa8a2c --- /dev/null +++ b/lustre/kernel_patches/kernel_configs/jdike-2.5.69-uml.config @@ -0,0 +1,321 @@ +# +# Automatically generated make config: don't edit +# +CONFIG_USERMODE=y +CONFIG_MMU=y +CONFIG_UID16=y +CONFIG_RWSEM_GENERIC_SPINLOCK=y + +# +# Code maturity level options +# +CONFIG_EXPERIMENTAL=y + +# +# General setup +# +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_SYSCTL=y +CONFIG_LOG_BUF_SHIFT=14 + +# +# Loadable module support +# +# CONFIG_MODULES is not set + +# +# UML-specific options +# +CONFIG_MODE_TT=y +# CONFIG_MODE_SKAS is not set +CONFIG_NET=y +CONFIG_HOSTFS=y +# CONFIG_HPPFS is not set +CONFIG_MCONSOLE=y +CONFIG_MAGIC_SYSRQ=y +# CONFIG_HOST_2G_2G is not set +# CONFIG_UML_SMP is not set +# CONFIG_SMP is not set +CONFIG_NEST_LEVEL=0 +CONFIG_KERNEL_HALF_GIGS=1 +# CONFIG_HIGHMEM is not set +# CONFIG_PROC_MM is not set +CONFIG_KERNEL_STACK_ORDER=3 + +# +# Executable file formats +# +# CONFIG_BINFMT_AOUT is not set +CONFIG_BINFMT_ELF=y +CONFIG_BINFMT_MISC=y + +# +# Character Devices +# +CONFIG_STDIO_CONSOLE=y +CONFIG_SSL=y +CONFIG_FD_CHAN=y +# CONFIG_NULL_CHAN is not set +CONFIG_PORT_CHAN=y +CONFIG_PTY_CHAN=y +CONFIG_TTY_CHAN=y +CONFIG_XTERM_CHAN=y +CONFIG_CON_ZERO_CHAN="fd:0,fd:1" +CONFIG_CON_CHAN="xterm" +CONFIG_SSL_CHAN="pty" +CONFIG_UNIX98_PTYS=y +CONFIG_UNIX98_PTY_COUNT=256 
+# CONFIG_WATCHDOG is not set +# CONFIG_UML_SOUND is not set +# CONFIG_SOUND is not set +# CONFIG_HOSTAUDIO is not set + +# +# Block Devices +# +CONFIG_BLK_DEV_UBD=y +# CONFIG_BLK_DEV_UBD_SYNC is not set +CONFIG_BLK_DEV_LOOP=y +# CONFIG_BLK_DEV_NBD is not set +# CONFIG_BLK_DEV_RAM is not set +# CONFIG_MMAPPER is not set +CONFIG_NETDEVICES=y + +# +# Networking support +# + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_MMAP=y +# CONFIG_NETLINK_DEV is not set +# CONFIG_NETFILTER is not set +CONFIG_UNIX=y +# CONFIG_NET_KEY is not set +CONFIG_INET=y +# CONFIG_IP_MULTICAST is not set +# CONFIG_IP_ADVANCED_ROUTER is not set +# CONFIG_IP_PNP is not set +# CONFIG_NET_IPIP is not set +# CONFIG_NET_IPGRE is not set +# CONFIG_ARPD is not set +# CONFIG_INET_ECN is not set +# CONFIG_SYN_COOKIES is not set +# CONFIG_INET_AH is not set +# CONFIG_INET_ESP is not set +# CONFIG_INET_IPCOMP is not set +# CONFIG_IPV6 is not set +# CONFIG_XFRM_USER is not set + +# +# SCTP Configuration (EXPERIMENTAL) +# +CONFIG_IPV6_SCTP__=y +# CONFIG_IP_SCTP is not set +# CONFIG_ATM is not set +# CONFIG_VLAN_8021Q is not set +# CONFIG_LLC is not set +# CONFIG_DECNET is not set +# CONFIG_BRIDGE is not set +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_NET_DIVERT is not set +# CONFIG_ECONET is not set +# CONFIG_WAN_ROUTER is not set +# CONFIG_NET_FASTROUTE is not set +# CONFIG_NET_HW_FLOWCONTROL is not set + +# +# QoS and/or fair queueing +# +# CONFIG_NET_SCHED is not set + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set +CONFIG_DUMMY=y +# CONFIG_BONDING is not set +# CONFIG_EQUALIZER is not set +CONFIG_TUN=y +# CONFIG_ETHERTAP is not set + +# +# Ethernet (10 or 100Mbit) +# +# CONFIG_NET_ETHERNET is not set + +# +# Ethernet (1000 Mbit) +# + +# +# Ethernet (10000 Mbit) +# +CONFIG_PPP=y +# CONFIG_PPP_MULTILINK is not set +# CONFIG_PPP_FILTER is not set +# CONFIG_PPP_ASYNC is not set +# CONFIG_PPP_SYNC_TTY is not set +# CONFIG_PPP_DEFLATE is not set +# CONFIG_PPP_BSDCOMP 
is not set +# CONFIG_PPPOE is not set +CONFIG_SLIP=y +# CONFIG_SLIP_COMPRESSED is not set +# CONFIG_SLIP_SMART is not set +# CONFIG_SLIP_MODE_SLIP6 is not set + +# +# Wireless LAN (non-hamradio) +# +# CONFIG_NET_RADIO is not set + +# +# Token Ring devices (depends on LLC=y) +# +# CONFIG_SHAPER is not set + +# +# Wan interfaces +# +# CONFIG_WAN is not set + +# +# UML Network Devices +# +CONFIG_UML_NET=y +CONFIG_UML_NET_ETHERTAP=y +CONFIG_UML_NET_TUNTAP=y +CONFIG_UML_NET_SLIP=y +CONFIG_UML_NET_DAEMON=y +CONFIG_UML_NET_MCAST=y +# CONFIG_UML_NET_PCAP is not set +# CONFIG_UML_NET_SLIRP is not set + +# +# File systems +# +CONFIG_EXT2_FS=y +# CONFIG_EXT2_FS_XATTR is not set +CONFIG_EXT3_FS=y +CONFIG_EXT3_FS_XATTR=y +# CONFIG_EXT3_FS_POSIX_ACL is not set +CONFIG_JBD=y +# CONFIG_JBD_DEBUG is not set +CONFIG_FS_MBCACHE=y +CONFIG_REISERFS_FS=y +# CONFIG_REISERFS_CHECK is not set +# CONFIG_REISERFS_PROC_INFO is not set +# CONFIG_JFS_FS is not set +# CONFIG_XFS_FS is not set +# CONFIG_MINIX_FS is not set +# CONFIG_ROMFS_FS is not set +CONFIG_QUOTA=y +# CONFIG_QFMT_V1 is not set +# CONFIG_QFMT_V2 is not set +CONFIG_QUOTACTL=y +# CONFIG_AUTOFS_FS is not set +CONFIG_AUTOFS4_FS=y + +# +# CD-ROM/DVD Filesystems +# +# CONFIG_ISO9660_FS is not set +# CONFIG_UDF_FS is not set + +# +# DOS/FAT/NT Filesystems +# +# CONFIG_FAT_FS is not set +# CONFIG_NTFS_FS is not set + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_DEVFS_FS=y +CONFIG_DEVFS_MOUNT=y +# CONFIG_DEVFS_DEBUG is not set +CONFIG_DEVPTS_FS=y +CONFIG_TMPFS=y +CONFIG_RAMFS=y + +# +# Miscellaneous filesystems +# +# CONFIG_ADFS_FS is not set +# CONFIG_AFFS_FS is not set +# CONFIG_HFS_FS is not set +# CONFIG_BEFS_FS is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +# CONFIG_CRAMFS is not set +# CONFIG_VXFS_FS is not set +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_SYSV_FS is not set +# CONFIG_UFS_FS is not set + +# +# Network File Systems +# +# CONFIG_NFS_FS is not set +# CONFIG_NFSD 
is not set +# CONFIG_EXPORTFS is not set +# CONFIG_SMB_FS is not set +# CONFIG_CIFS is not set +# CONFIG_NCP_FS is not set +# CONFIG_CODA_FS is not set +# CONFIG_INTERMEZZO_FS is not set +# CONFIG_AFS_FS is not set + +# +# Partition Types +# +# CONFIG_PARTITION_ADVANCED is not set +CONFIG_MSDOS_PARTITION=y + +# +# Security options +# +# CONFIG_SECURITY is not set + +# +# Cryptographic options +# +# CONFIG_CRYPTO is not set + +# +# Library routines +# +CONFIG_CRC32=y + +# +# SCSI support +# +# CONFIG_SCSI is not set + +# +# Multi-device support (RAID and LVM) +# +# CONFIG_MD is not set + +# +# Memory Technology Devices (MTD) +# +# CONFIG_MTD is not set + +# +# Kernel hacking +# +CONFIG_DEBUG_SLAB=y +CONFIG_DEBUG_SPINLOCK=y +CONFIG_DEBUGSYM=y +CONFIG_FRAME_POINTER=y +CONFIG_PT_PROXY=y +# CONFIG_GPROF is not set +# CONFIG_GCOV is not set diff --git a/lustre/kernel_patches/patches/dev_read_only_2.4.20-rh.patch b/lustre/kernel_patches/patches/dev_read_only_2.4.20-rh.patch new file mode 100644 index 0000000..55057d9 --- /dev/null +++ b/lustre/kernel_patches/patches/dev_read_only_2.4.20-rh.patch @@ -0,0 +1,77 @@ + + + + drivers/block/blkpg.c | 35 +++++++++++++++++++++++++++++++++++ + drivers/block/loop.c | 3 +++ + drivers/ide/ide-disk.c | 5 ++++- + 3 files changed, 42 insertions(+), 1 deletion(-) + +--- rh-2.4.20/drivers/block/blkpg.c~dev_read_only_2.4.20 2003-04-11 14:05:03.000000000 +0800 ++++ rh-2.4.20-root/drivers/block/blkpg.c 2003-04-12 13:11:31.000000000 +0800 +@@ -297,3 +297,38 @@ int blk_ioctl(kdev_t dev, unsigned int c + } + + EXPORT_SYMBOL(blk_ioctl); ++ ++#define NUM_DEV_NO_WRITE 16 ++static int dev_no_write[NUM_DEV_NO_WRITE]; ++ ++/* ++ * Debug code for turning block devices "read-only" (will discard writes ++ * silently). This is for filesystem crash/recovery testing. 
++ */ ++void dev_set_rdonly(kdev_t dev, int no_write) ++{ ++ if (dev) { ++ printk(KERN_WARNING "Turning device %s read-only\n", ++ bdevname(dev)); ++ dev_no_write[no_write] = 0xdead0000 + dev; ++ } ++} ++ ++int dev_check_rdonly(kdev_t dev) { ++ int i; ++ ++ for (i = 0; i < NUM_DEV_NO_WRITE; i++) { ++ if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 && ++ dev == (dev_no_write[i] & 0xffff)) ++ return 1; ++ } ++ return 0; ++} ++ ++void dev_clear_rdonly(int no_write) { ++ dev_no_write[no_write] = 0; ++} ++ ++EXPORT_SYMBOL(dev_set_rdonly); ++EXPORT_SYMBOL(dev_check_rdonly); ++EXPORT_SYMBOL(dev_clear_rdonly); +--- rh-2.4.20/drivers/block/loop.c~dev_read_only_2.4.20 2003-04-11 14:05:08.000000000 +0800 ++++ rh-2.4.20-root/drivers/block/loop.c 2003-04-12 13:11:31.000000000 +0800 +@@ -491,6 +491,9 @@ static int loop_make_request(request_que + spin_unlock_irq(&lo->lo_lock); + + if (rw == WRITE) { ++ if (dev_check_rdonly(rbh->b_rdev)) ++ goto err; ++ + if (lo->lo_flags & LO_FLAGS_READ_ONLY) + goto err; + } else if (rw == READA) { +--- rh-2.4.20/drivers/ide/ide-disk.c~dev_read_only_2.4.20 2003-04-11 14:04:53.000000000 +0800 ++++ rh-2.4.20-root/drivers/ide/ide-disk.c 2003-04-12 13:14:48.000000000 +0800 +@@ -381,7 +381,10 @@ static ide_startstop_t do_rw_disk (ide_d + if (IS_PDC4030_DRIVE) + return promise_rw_disk(drive, rq, block); + #endif /* CONFIG_BLK_DEV_PDC4030 */ +- ++ if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) { ++ ide_end_request(1, HWGROUP(drive)); ++ return ide_stopped; ++ } + if (IDE_CONTROL_REG) + hwif->OUTB(drive->ctl, IDE_CONTROL_REG); + + +_ diff --git a/lustre/kernel_patches/patches/dev_read_only_hp.patch b/lustre/kernel_patches/patches/dev_read_only_2.4.20.patch similarity index 62% rename from lustre/kernel_patches/patches/dev_read_only_hp.patch rename to lustre/kernel_patches/patches/dev_read_only_2.4.20.patch index b2cf6f0..0d3476c 100644 --- a/lustre/kernel_patches/patches/dev_read_only_hp.patch +++ 
b/lustre/kernel_patches/patches/dev_read_only_2.4.20.patch @@ -1,14 +1,16 @@ - drivers/block/blkpg.c | 38 ++++++++++++++++++++++++++++++++++++++ - drivers/block/loop.c | 5 +++++ - drivers/ide/ide-disk.c | 6 ++++++ - 3 files changed, 49 insertions(+) + drivers/block/blkpg.c | 39 +++++++++++++++++++++++++++++++++++++++ + drivers/block/loop.c | 3 +++ + drivers/ide/ide-disk.c | 4 ++++ + 3 files changed, 46 insertions(+) ---- linux-2.4.19-hp2_pnnl2/drivers/block/blkpg.c~dev_read_only_hp Sun Jan 19 18:51:12 2003 -+++ linux-2.4.19-hp2_pnnl2-root/drivers/block/blkpg.c Sun Jan 19 18:52:28 2003 -@@ -310,6 +310,42 @@ int blk_ioctl(kdev_t dev, unsigned int c +--- linux-2.4.20/drivers/block/blkpg.c~dev_read_only_hp 2003-04-09 15:14:54.000000000 -0600 ++++ linux-2.4.20-braam/drivers/block/blkpg.c 2003-04-09 15:37:02.000000000 -0600 +@@ -296,3 +296,42 @@ int blk_ioctl(kdev_t dev, unsigned int c + } EXPORT_SYMBOL(blk_ioctl); - ++ ++ + +#define NUM_DEV_NO_WRITE 16 +static int dev_no_write[NUM_DEV_NO_WRITE]; @@ -45,11 +47,9 @@ +EXPORT_SYMBOL(dev_check_rdonly); +EXPORT_SYMBOL(dev_clear_rdonly); + - /** - * get_last_sector() - * ---- linux-2.4.19-hp2_pnnl2/drivers/block/loop.c~dev_read_only_hp Sun Jan 19 18:51:12 2003 -+++ linux-2.4.19-hp2_pnnl2-root/drivers/block/loop.c Sun Jan 19 18:51:12 2003 ++ +--- linux-2.4.20/drivers/block/loop.c~dev_read_only_hp 2003-04-09 15:14:54.000000000 -0600 ++++ linux-2.4.20-braam/drivers/block/loop.c 2003-04-09 15:37:02.000000000 -0600 @@ -474,6 +474,9 @@ static int loop_make_request(request_que spin_unlock_irq(&lo->lo_lock); @@ -60,9 +60,9 @@ if (lo->lo_flags & LO_FLAGS_READ_ONLY) goto err; } else if (rw == READA) { ---- linux-2.4.19-hp2_pnnl2/drivers/ide/ide-disk.c~dev_read_only_hp Sun Jan 19 18:51:12 2003 -+++ linux-2.4.19-hp2_pnnl2-root/drivers/ide/ide-disk.c Sun Jan 19 18:51:12 2003 -@@ -551,6 +551,10 @@ static ide_startstop_t lba_48_rw_disk (i +--- linux-2.4.20/drivers/ide/ide-disk.c~dev_read_only_hp 2003-04-09 15:14:54.000000000 -0600 ++++ 
linux-2.4.20-braam/drivers/ide/ide-disk.c 2003-04-09 15:37:02.000000000 -0600 +@@ -558,6 +558,10 @@ static ide_startstop_t lba_48_rw_disk (i */ static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) { diff --git a/lustre/kernel_patches/patches/dev_read_only_hp_2.4.20.patch b/lustre/kernel_patches/patches/dev_read_only_hp_2.4.20.patch new file mode 100644 index 0000000..60081db --- /dev/null +++ b/lustre/kernel_patches/patches/dev_read_only_hp_2.4.20.patch @@ -0,0 +1,77 @@ + drivers/block/blkpg.c | 36 ++++++++++++++++++++++++++++++++++++ + drivers/block/loop.c | 3 +++ + drivers/ide/ide-disk.c | 4 ++++ + 3 files changed, 43 insertions(+) + +--- linux/drivers/block/blkpg.c~dev_read_only_hp_2.4.20 Mon May 19 07:07:52 2003 ++++ linux-mmonroe/drivers/block/blkpg.c Mon May 19 07:37:22 2003 +@@ -310,6 +310,42 @@ int blk_ioctl(kdev_t dev, unsigned int c + + EXPORT_SYMBOL(blk_ioctl); + ++ ++#define NUM_DEV_NO_WRITE 16 ++static int dev_no_write[NUM_DEV_NO_WRITE]; ++/* ++ * Debug code for turning block devices "read-only" (will discard writes ++ * silently). This is for filesystem crash/recovery testing. 
++ */ ++void dev_set_rdonly(kdev_t dev, int no_write) ++{ ++ if (dev) { ++ printk(KERN_WARNING "Turning device %s read-only\n", ++ bdevname(dev)); ++ dev_no_write[no_write] = 0xdead0000 + dev; ++ } ++} ++ ++int dev_check_rdonly(kdev_t dev) { ++ int i; ++ ++ for (i = 0; i < NUM_DEV_NO_WRITE; i++) { ++ if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 && ++ dev == (dev_no_write[i] & 0xffff)) ++ return 1; ++ } ++ return 0; ++} ++ ++void dev_clear_rdonly(int no_write) { ++ dev_no_write[no_write] = 0; ++} ++ ++EXPORT_SYMBOL(dev_set_rdonly); ++EXPORT_SYMBOL(dev_check_rdonly); ++EXPORT_SYMBOL(dev_clear_rdonly); ++ ++ + /** + * get_last_sector() + * +--- linux/drivers/block/loop.c~dev_read_only_hp_2.4.20 Thu Nov 28 15:53:12 2002 ++++ linux-mmonroe/drivers/block/loop.c Mon May 19 07:28:29 2003 +@@ -474,6 +474,9 @@ static int loop_make_request(request_que + spin_unlock_irq(&lo->lo_lock); + + if (rw == WRITE) { ++ if (dev_check_rdonly(rbh->b_rdev)) ++ goto err; ++ + if (lo->lo_flags & LO_FLAGS_READ_ONLY) + goto err; + } else if (rw == READA) { +--- linux/drivers/ide/ide-disk.c~dev_read_only_hp_2.4.20 Thu Nov 28 15:53:13 2002 ++++ linux-mmonroe/drivers/ide/ide-disk.c Mon May 19 07:28:29 2003 +@@ -558,6 +558,10 @@ static ide_startstop_t lba_48_rw_disk (i + */ + static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) + { ++ if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) { ++ ide_end_request(1, HWGROUP(drive)); ++ return ide_stopped; ++ } + if (IDE_CONTROL_REG) + OUT_BYTE(drive->ctl,IDE_CONTROL_REG); + + +_ diff --git a/lustre/kernel_patches/patches/dsp.patch b/lustre/kernel_patches/patches/dsp.patch new file mode 100644 index 0000000..f2e5b93 --- /dev/null +++ b/lustre/kernel_patches/patches/dsp.patch @@ -0,0 +1,130 @@ + arch/i386/kernel/crash.c | 24 +++++++++++++++++------- + arch/i386/kernel/nmi.c | 2 +- + include/asm-i386/apic.h | 1 + + include/linux/crash.h | 2 +- + kernel/bootimg.c | 13 ++++++++++++- + kernel/bootimg_pic.c | 6 
++++-- + 6 files changed, 36 insertions(+), 12 deletions(-) + +--- linux-rh-2.4.20-8/kernel/bootimg.c~dsp 2003-05-07 19:30:47.000000000 +0800 ++++ linux-rh-2.4.20-8-root/kernel/bootimg.c 2003-05-07 19:31:12.000000000 +0800 +@@ -238,9 +238,20 @@ int boot_image() + int error = -ENOMEM; + + if (bootimg_checksum(__va(bootimg_dsc.page_dir),bootimg_dsc.pages) +- != bootimg_dsc.csum) ++ != bootimg_dsc.csum) { + printk("Checksum of kernel image failed. Rebooting via BIOS\n"); + ++ /* Before calling machine_restart(), make sure it will not ++ * simply call this function recursively. ++ */ ++ bootimg_dsc.page_dir = NULL; ++ machine_restart(NULL); ++ ++ /* We should never get here, but just in case... */ ++ for (; ; ) ++ __asm__ __volatile__ ("hlt"); ++ } ++ + code_page = get_identity_mapped_page(); + if (!code_page) goto out3; + code = (relocate_and_jump_t) virt_to_phys((void *) code_page); +--- linux-rh-2.4.20-8/kernel/bootimg_pic.c~dsp 2003-05-07 19:30:47.000000000 +0800 ++++ linux-rh-2.4.20-8-root/kernel/bootimg_pic.c 2003-05-07 19:31:12.000000000 +0800 +@@ -69,7 +69,8 @@ void __bootimg relocate_and_jump(void) + for (j = i+1; j < dsc.pages; j++) { + table = dsc.page_dir+FROM_TABLE(j); + if (((unsigned long) *table) == to) { +- copy_and_swap(*table,dsc.scratch); ++ copy_and_swap((unsigned long) (*table), ++ dsc.scratch); + break; + } + if ((*table)[PAGE_NR(j)] == to) { +@@ -79,7 +80,8 @@ void __bootimg relocate_and_jump(void) + } + table = dsc.page_dir+TO_TABLE(j); + if (((unsigned long) *table) == to) { +- copy_and_swap(*table,dsc.scratch); ++ copy_and_swap((unsigned long) (*table), ++ dsc.scratch); + break; + } + } +--- linux-rh-2.4.20-8/include/asm-i386/apic.h~dsp 2003-05-07 17:00:16.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/asm-i386/apic.h 2003-05-07 19:31:12.000000000 +0800 +@@ -86,6 +86,7 @@ extern struct pm_dev *apic_pm_register(p + extern void apic_pm_unregister(struct pm_dev*); + + extern int check_nmi_watchdog (void); ++extern void 
disable_apic_nmi_watchdog(void); + + extern unsigned int nmi_watchdog; + #define NMI_NONE 0 +--- linux-rh-2.4.20-8/include/linux/crash.h~dsp 2003-05-07 19:30:47.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/linux/crash.h 2003-05-07 19:31:12.000000000 +0800 +@@ -71,7 +71,7 @@ extern void stop_this_cpu(void *); + #define CRASH_ZALLOC_PAGES 16*5*2 /* 2 to handle crash in crash */ + #define CRASH_LOW_WATER_PAGES 100 + +-#define CRASH_CPU_TIMEOUT 5000 /* 5 sec wait for other cpus to stop */ ++#define CRASH_CPU_TIMEOUT 15000 /* 15 sec wait for other cpus to stop */ + + #define CRASH_MARK_RESERVED(addr) (set_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags)) + #define CRASH_CLEAR_RESERVED(addr) (clear_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags)) +--- linux-rh-2.4.20-8/arch/i386/kernel/crash.c~dsp 2003-05-07 19:30:47.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/i386/kernel/crash.c 2003-05-07 19:31:39.000000000 +0800 +@@ -9,6 +9,8 @@ + #include + #include + #include ++#include ++#include + + inline void crash_save_regs(void) { + static unsigned long regs[8]; +@@ -30,15 +32,23 @@ inline void crash_save_regs(void) { + */ + void crash_save_current_state(struct task_struct *tp) + { ++ if (tp != NULL) { ++ /* ++ * Here we save ebp instead of esp just in case the compiler ++ * decides to put an extra push in before we execute this ++ * instruction (thus invalidating our frame pointer). ++ */ ++ asm volatile("movl %%ebp,%0":"=m" (*(u_long *)&tp->thread.esp)); ++ tp->thread.eip = (u_long)crash_save_current_state; ++ panic_ksp[smp_processor_id()] = tp->thread.esp; ++ mb(); ++ } ++ + /* +- * Here we save ebp instead of esp just in case the compiler +- * decides to put an extra push in before we execute this +- * instruction (thus invalidating our frame pointer). ++ * Just to be safe, disable the NMI watchdog on the calling CPU so it ++ * doesn't get in the way while we are trying to save a dump. 
+ */ +- asm volatile("movl %%ebp,%0":"=m" (*(u_long *)&tp->thread.esp)); +- tp->thread.eip = (u_long)crash_save_current_state; +- panic_ksp[smp_processor_id()] = tp->thread.esp; +- mb(); ++ disable_apic_nmi_watchdog(); + + save_core(); + +--- linux-rh-2.4.20-8/arch/i386/kernel/nmi.c~dsp 2003-05-07 19:30:47.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/i386/kernel/nmi.c 2003-05-07 19:31:12.000000000 +0800 +@@ -138,7 +138,7 @@ __setup("nmi_watchdog=", setup_nmi_watch + + struct pm_dev *nmi_pmdev; + +-static void disable_apic_nmi_watchdog(void) ++void disable_apic_nmi_watchdog(void) + { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + +_ diff --git a/lustre/kernel_patches/patches/export-truncate-2.5.63.patch b/lustre/kernel_patches/patches/export-truncate-2.5.63.patch new file mode 100644 index 0000000..3d82572 --- /dev/null +++ b/lustre/kernel_patches/patches/export-truncate-2.5.63.patch @@ -0,0 +1,37 @@ + include/linux/mm.h | 2 ++ + mm/truncate.c | 4 ++-- + 2 files changed, 4 insertions(+), 2 deletions(-) + +--- linux-2.5.63/include/linux/mm.h~export-truncate-2.5.63 Mon May 5 18:08:15 2003 ++++ linux-2.5.63-root/include/linux/mm.h Mon May 5 18:08:58 2003 +@@ -540,6 +540,8 @@ can_vma_merge(struct vm_area_struct *vma + else + return 0; + } ++/* truncate.c */ ++extern void truncate_complete_page(struct page *); + + /* filemap.c */ + extern unsigned long page_unuse(struct page *); +--- linux-2.5.63/mm/truncate.c~export-truncate-2.5.63 Mon May 5 18:09:50 2003 ++++ linux-2.5.63-root/mm/truncate.c Mon May 5 18:11:29 2003 +@@ -41,7 +41,7 @@ static inline void truncate_partial_page + * its lock, b) when a concurrent invalidate_inode_pages got there first and + * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. 
+ */ +-static void ++void + truncate_complete_page(struct address_space *mapping, struct page *page) + { + if (page->mapping != mapping) +@@ -56,7 +56,7 @@ truncate_complete_page(struct address_sp + remove_from_page_cache(page); + page_cache_release(page); /* pagecache ref */ + } +- ++EXPORT_SYMBOL_GPL(truncate_complete_page); + /* + * This is for invalidate_inode_pages(). That function can be called at + * any time, and is not supposed to throw away dirty pages. But pages can + +_ diff --git a/lustre/kernel_patches/patches/export-truncate.patch b/lustre/kernel_patches/patches/export-truncate.patch new file mode 100644 index 0000000..2cd96b9 --- /dev/null +++ b/lustre/kernel_patches/patches/export-truncate.patch @@ -0,0 +1,35 @@ + include/linux/mm.h | 1 + + mm/filemap.c | 3 ++- + 2 files changed, 3 insertions(+), 1 deletion(-) + +--- linux-2.4.18-18/include/linux/mm.h~export-truncate 2003-04-05 02:54:55.000000000 -0700 ++++ linux-2.4.18-18-braam/include/linux/mm.h 2003-04-09 17:37:46.000000000 -0600 +@@ -650,6 +650,7 @@ struct zone_t; + /* filemap.c */ + extern void remove_inode_page(struct page *); + extern unsigned long page_unuse(struct page *); ++extern void truncate_complete_page(struct page *); + extern void truncate_inode_pages(struct address_space *, loff_t); + + /* generic vm_area_ops exported for stackable file systems */ +--- linux-2.4.18-18/mm/filemap.c~export-truncate 2003-04-05 02:54:55.000000000 -0700 ++++ linux-2.4.18-18-braam/mm/filemap.c 2003-04-09 17:37:46.000000000 -0600 +@@ -245,7 +245,7 @@ static inline void truncate_partial_page + do_flushpage(page, partial); + } + +-static void truncate_complete_page(struct page *page) ++void truncate_complete_page(struct page *page) + { + /* + * Leave it on the LRU if it gets converted into anonymous buffers +@@ -266,6 +266,7 @@ static void truncate_complete_page(struc + remove_inode_page(page); + page_cache_release(page); + } ++EXPORT_SYMBOL_GPL(truncate_complete_page); + + static int 
FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *)); + static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) + +_ diff --git a/lustre/kernel_patches/patches/exports.patch b/lustre/kernel_patches/patches/exports.patch index 716c156..33e0b6c 100644 --- a/lustre/kernel_patches/patches/exports.patch +++ b/lustre/kernel_patches/patches/exports.patch @@ -7,20 +7,20 @@ kernel/ksyms.c | 5 +++++ 4 files changed, 9 insertions(+), 1 deletion(-) ---- linux-2.4.19-hp2_pnnl2/fs/ext3/Makefile~exports Sun Jan 19 18:52:38 2003 -+++ linux-2.4.19-hp2_pnnl2-root/fs/ext3/Makefile Sun Jan 19 18:52:38 2003 +--- linux-2.4.18-18/fs/ext3/Makefile~exports Sat Apr 5 02:51:27 2003 ++++ linux-2.4.18-18-braam/fs/ext3/Makefile Sat Apr 5 02:54:45 2003 @@ -9,6 +9,8 @@ O_TARGET := ext3.o -+export-objs := super.o ++export-objs := super.o inode.o + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o obj-m := $(O_TARGET) ---- linux-2.4.19-hp2_pnnl2/fs/ext3/super.c~exports Sun Jan 19 18:52:38 2003 -+++ linux-2.4.19-hp2_pnnl2-root/fs/ext3/super.c Sun Jan 19 18:52:38 2003 -@@ -1744,7 +1744,7 @@ static void __exit exit_ext3_fs(void) +--- linux-2.4.18-18/fs/ext3/super.c~exports Sat Apr 5 02:51:27 2003 ++++ linux-2.4.18-18-braam/fs/ext3/super.c Sat Apr 5 02:54:28 2003 +@@ -1746,7 +1746,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); } @@ -29,9 +29,9 @@ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); ---- linux-2.4.19-hp2_pnnl2/include/linux/fs.h~exports Sun Jan 19 18:52:38 2003 -+++ linux-2.4.19-hp2_pnnl2-root/include/linux/fs.h Sun Jan 19 18:52:38 2003 -@@ -1020,6 +1020,7 @@ extern int unregister_filesystem(struct +--- linux-2.4.18-18/include/linux/fs.h~exports Sat Apr 5 02:51:27 2003 ++++ linux-2.4.18-18-braam/include/linux/fs.h Sat 
Apr 5 02:54:29 2003 +@@ -1046,6 +1046,7 @@ extern int unregister_filesystem(struct extern struct vfsmount *kern_mount(struct file_system_type *); extern int may_umount(struct vfsmount *); extern long do_mount(char *, char *, char *, unsigned long, void *); @@ -39,11 +39,11 @@ extern void umount_tree(struct vfsmount *); #define kern_umount mntput ---- linux-2.4.19-hp2_pnnl2/kernel/ksyms.c~exports Sun Jan 19 18:52:38 2003 -+++ linux-2.4.19-hp2_pnnl2-root/kernel/ksyms.c Sun Jan 19 18:52:38 2003 -@@ -308,6 +308,11 @@ EXPORT_SYMBOL(dcache_dir_fsync); - EXPORT_SYMBOL(dcache_readdir); - EXPORT_SYMBOL(dcache_dir_ops); +--- linux-2.4.18-18/kernel/ksyms.c~exports Sat Apr 5 02:51:27 2003 ++++ linux-2.4.18-18-braam/kernel/ksyms.c Sat Apr 5 02:54:29 2003 +@@ -306,6 +306,11 @@ EXPORT_SYMBOL_GPL(buffermem_pages); + EXPORT_SYMBOL_GPL(nr_free_pages); + EXPORT_SYMBOL_GPL(page_cache_size); +/* lustre */ +EXPORT_SYMBOL(panic_notifier_list); diff --git a/lustre/kernel_patches/patches/exports_hp.patch b/lustre/kernel_patches/patches/exports_2.4.20-rh-hp.patch similarity index 61% rename from lustre/kernel_patches/patches/exports_hp.patch rename to lustre/kernel_patches/patches/exports_2.4.20-rh-hp.patch index 0222b46..feaeec6 100644 --- a/lustre/kernel_patches/patches/exports_hp.patch +++ b/lustre/kernel_patches/patches/exports_2.4.20-rh-hp.patch @@ -1,26 +1,23 @@ - - - fs/ext3/Makefile | 2 ++ fs/ext3/super.c | 2 +- include/linux/fs.h | 1 + kernel/ksyms.c | 4 ++++ - 4 files changed, 9 insertions(+), 1 deletion(-) + 4 files changed, 8 insertions(+), 1 deletion(-) ---- linux-2.4.19-hp2_pnnl2/fs/ext3/Makefile~exports Sun Jan 19 18:52:38 2003 -+++ linux-2.4.19-hp2_pnnl2-root/fs/ext3/Makefile Sun Jan 19 18:52:38 2003 +--- linux/fs/ext3/Makefile~exports_2.4.20 Wed Apr 9 10:07:14 2003 ++++ linux-mmonroe/fs/ext3/Makefile Wed Apr 9 10:19:53 2003 @@ -9,6 +9,8 @@ O_TARGET := ext3.o -+export-objs := super.o ++export-objs := super.o inode.o + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o 
inode.o \ ioctl.o namei.o super.o symlink.o obj-m := $(O_TARGET) ---- linux-2.4.19-hp2_pnnl2/fs/ext3/super.c~exports Sun Jan 19 18:52:38 2003 -+++ linux-2.4.19-hp2_pnnl2-root/fs/ext3/super.c Sun Jan 19 18:52:38 2003 -@@ -1744,7 +1744,7 @@ static void __exit exit_ext3_fs(void) +--- linux/fs/ext3/super.c~exports_2.4.20 Wed Apr 9 10:07:14 2003 ++++ linux-mmonroe/fs/ext3/super.c Wed Apr 9 10:19:53 2003 +@@ -1769,7 +1769,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); } @@ -29,18 +26,18 @@ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); ---- linux-2.4.19-hp2_pnnl2/include/linux/fs.h~exports Sun Jan 19 18:52:38 2003 -+++ linux-2.4.19-hp2_pnnl2-root/include/linux/fs.h Sun Jan 19 18:52:38 2003 +--- linux/include/linux/fs.h~exports_2.4.20 Wed Apr 9 10:07:14 2003 ++++ linux-mmonroe/include/linux/fs.h Wed Apr 9 10:19:53 2003 @@ -1020,6 +1020,7 @@ extern int unregister_filesystem(struct extern struct vfsmount *kern_mount(struct file_system_type *); extern int may_umount(struct vfsmount *); extern long do_mount(char *, char *, char *, unsigned long, void *); +struct vfsmount *do_kern_mount(const char *type, int flags, char *name, void *data); - extern void umount_tree(struct vfsmount *); #define kern_umount mntput ---- linux-2.4.19-hp2_pnnl2/kernel/ksyms.c~exports Sun Jan 19 18:52:38 2003 -+++ linux-2.4.19-hp2_pnnl2-root/kernel/ksyms.c Sun Jan 19 18:52:38 2003 + +--- linux/kernel/ksyms.c~exports_2.4.20 Wed Apr 9 10:07:14 2003 ++++ linux-mmonroe/kernel/ksyms.c Wed Apr 9 10:19:53 2003 @@ -308,6 +308,10 @@ EXPORT_SYMBOL(dcache_dir_fsync); EXPORT_SYMBOL(dcache_readdir); EXPORT_SYMBOL(dcache_dir_ops); diff --git a/lustre/kernel_patches/patches/exports_2.4.20.patch b/lustre/kernel_patches/patches/exports_2.4.20.patch new file mode 100644 index 0000000..bed8693 --- /dev/null +++ 
b/lustre/kernel_patches/patches/exports_2.4.20.patch @@ -0,0 +1,57 @@ + + + + fs/ext3/Makefile | 4 +++- + fs/ext3/super.c | 2 +- + include/linux/fs.h | 1 + + kernel/ksyms.c | 5 +++++ + 4 files changed, 10 insertions(+), 2 deletions(-) + +--- linux-2.4.20/fs/ext3/Makefile~exports_hp Sat Apr 5 03:55:19 2003 ++++ linux-2.4.20-braam/fs/ext3/Makefile Sat Apr 5 03:56:03 2003 +@@ -9,6 +9,8 @@ + + O_TARGET := ext3.o + ++export-objs := super.o inode.o ++ + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o + obj-m := $(O_TARGET) +--- linux-2.4.20/fs/ext3/super.c~exports_hp Sat Apr 5 03:55:19 2003 ++++ linux-2.4.20-braam/fs/ext3/super.c Sat Apr 5 03:55:19 2003 +@@ -1769,7 +1769,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + } + +-EXPORT_NO_SYMBOLS; ++EXPORT_SYMBOL(ext3_bread); + + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); +--- linux-2.4.20/include/linux/fs.h~exports_hp Sat Apr 5 03:55:19 2003 ++++ linux-2.4.20-braam/include/linux/fs.h Sat Apr 5 03:55:19 2003 +@@ -1005,6 +1005,7 @@ extern int unregister_filesystem(struct + extern struct vfsmount *kern_mount(struct file_system_type *); + extern int may_umount(struct vfsmount *); + extern long do_mount(char *, char *, char *, unsigned long, void *); ++struct vfsmount *do_kern_mount(const char *type, int flags, char *name, void *data); + + #define kern_umount mntput + +--- linux-2.4.20/kernel/ksyms.c~exports_hp Sat Apr 5 03:55:19 2003 ++++ linux-2.4.20-braam/kernel/ksyms.c Sat Apr 5 03:55:19 2003 +@@ -284,6 +284,11 @@ EXPORT_SYMBOL(dcache_dir_fsync); + EXPORT_SYMBOL(dcache_readdir); + EXPORT_SYMBOL(dcache_dir_ops); + ++/* lustre */ ++EXPORT_SYMBOL(pagecache_lock_cacheline); ++EXPORT_SYMBOL(panic_notifier_list); ++EXPORT_SYMBOL(do_kern_mount); ++ + /* for stackable file systems (lofs, wrapfs, cryptfs, 
etc.) */ + EXPORT_SYMBOL(default_llseek); + EXPORT_SYMBOL(dentry_open); + +_ diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch new file mode 100644 index 0000000..b59cea2 --- /dev/null +++ b/lustre/kernel_patches/patches/ext-2.4-patch-1-chaos.patch @@ -0,0 +1,2527 @@ + fs/ext3/Makefile | 2 + fs/ext3/dir.c | 299 +++++++++ + fs/ext3/file.c | 3 + fs/ext3/hash.c | 215 ++++++ + fs/ext3/namei.c | 1388 ++++++++++++++++++++++++++++++++++++++++----- + fs/ext3/super.c | 7 + include/linux/ext3_fs.h | 85 ++ + include/linux/ext3_fs_sb.h | 2 + include/linux/ext3_jbd.h | 2 + include/linux/rbtree.h | 2 + lib/rbtree.c | 42 + + 11 files changed, 1887 insertions(+), 160 deletions(-) + +--- linux-chaos-2.4.20-6/fs/ext3/Makefile~ext-2.4-patch-1-chaos 2003-04-09 16:10:38.000000000 -0600 ++++ linux-chaos-2.4.20-6-braam/fs/ext3/Makefile 2003-04-09 16:18:55.000000000 -0600 +@@ -12,7 +12,7 @@ O_TARGET := ext3.o + export-objs := super.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o ++ ioctl.o namei.o super.o symlink.o hash.o + obj-m := $(O_TARGET) + + include $(TOPDIR)/Rules.make +--- linux-chaos-2.4.20-6/fs/ext3/dir.c~ext-2.4-patch-1-chaos 2002-05-07 15:53:46.000000000 -0600 ++++ linux-chaos-2.4.20-6-braam/fs/ext3/dir.c 2003-04-09 16:18:55.000000000 -0600 +@@ -21,12 +21,16 @@ + #include + #include + #include ++#include ++#include + + static unsigned char ext3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK + }; + + static int ext3_readdir(struct file *, void *, filldir_t); ++static int ext3_dx_readdir(struct file * filp, ++ void * dirent, filldir_t filldir); + + struct file_operations ext3_dir_operations = { + read: generic_read_dir, +@@ -35,6 +39,17 @@ struct file_operations ext3_dir_operatio + fsync: ext3_sync_file, /* BKL held */ + }; + ++ ++static unsigned char get_dtype(struct super_block *sb, int 
filetype) ++{ ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || ++ (filetype >= EXT3_FT_MAX)) ++ return DT_UNKNOWN; ++ ++ return (ext3_filetype_table[filetype]); ++} ++ ++ + int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, +@@ -79,6 +94,16 @@ static int ext3_readdir(struct file * fi + + sb = inode->i_sb; + ++ if (is_dx(inode)) { ++ err = ext3_dx_readdir(filp, dirent, filldir); ++ if (err != ERR_BAD_DX_DIR) ++ return err; ++ /* ++ * We don't set the inode dirty flag since it's not ++ * critical that it get flushed back to the disk. ++ */ ++ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; ++ } + stored = 0; + bh = NULL; + offset = filp->f_pos & (sb->s_blocksize - 1); +@@ -162,18 +187,12 @@ revalidate: + * during the copy operation. + */ + unsigned long version = filp->f_version; +- unsigned char d_type = DT_UNKNOWN; + +- if (EXT3_HAS_INCOMPAT_FEATURE(sb, +- EXT3_FEATURE_INCOMPAT_FILETYPE) +- && de->file_type < EXT3_FT_MAX) +- d_type = +- ext3_filetype_table[de->file_type]; + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), +- d_type); ++ get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) +@@ -188,3 +207,269 @@ revalidate: + UPDATE_ATIME(inode); + return 0; + } ++ ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * These functions convert from the major/minor hash to an f_pos ++ * value. ++ * ++ * Currently we only use major hash numer. This is unfortunate, but ++ * on 32-bit machines, the same VFS interface is used for lseek and ++ * llseek, so if we use the 64 bit offset, then the 32-bit versions of ++ * lseek/telldir/seekdir will blow out spectacularly, and from within ++ * the ext2 low-level routine, we don't know if we're being called by ++ * a 64-bit version of the system call or the 32-bit version of the ++ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir ++ * cookie. 
Sigh. ++ */ ++#define hash2pos(major, minor) (major >> 1) ++#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) ++#define pos2min_hash(pos) (0) ++ ++/* ++ * This structure holds the nodes of the red-black tree used to store ++ * the directory entry in hash order. ++ */ ++struct fname { ++ __u32 hash; ++ __u32 minor_hash; ++ rb_node_t rb_hash; ++ struct fname *next; ++ __u32 inode; ++ __u8 name_len; ++ __u8 file_type; ++ char name[0]; ++}; ++ ++/* ++ * This functoin implements a non-recursive way of freeing all of the ++ * nodes in the red-black tree. ++ */ ++static void free_rb_tree_fname(rb_root_t *root) ++{ ++ rb_node_t *n = root->rb_node; ++ rb_node_t *parent; ++ struct fname *fname; ++ ++ while (n) { ++ /* Do the node's children first */ ++ if ((n)->rb_left) { ++ n = n->rb_left; ++ continue; ++ } ++ if (n->rb_right) { ++ n = n->rb_right; ++ continue; ++ } ++ /* ++ * The node has no children; free it, and then zero ++ * out parent's link to it. Finally go to the ++ * beginning of the loop and try to free the parent ++ * node. ++ */ ++ parent = n->rb_parent; ++ fname = rb_entry(n, struct fname, rb_hash); ++ kfree(fname); ++ if (!parent) ++ root->rb_node = 0; ++ else if (parent->rb_left == n) ++ parent->rb_left = 0; ++ else if (parent->rb_right == n) ++ parent->rb_right = 0; ++ n = parent; ++ } ++ root->rb_node = 0; ++} ++ ++ ++struct dir_private_info *create_dir_info(loff_t pos) ++{ ++ struct dir_private_info *p; ++ ++ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); ++ if (!p) ++ return NULL; ++ p->root.rb_node = 0; ++ p->curr_node = 0; ++ p->extra_fname = 0; ++ p->last_pos = 0; ++ p->curr_hash = pos2maj_hash(pos); ++ p->curr_minor_hash = pos2min_hash(pos); ++ p->next_hash = 0; ++ return p; ++} ++ ++void ext3_htree_free_dir_info(struct dir_private_info *p) ++{ ++ free_rb_tree_fname(&p->root); ++ kfree(p); ++} ++ ++/* ++ * Given a directory entry, enter it into the fname rb tree. 
++ */ ++void ext3_htree_store_dirent(struct file *dir_file, __u32 hash, ++ __u32 minor_hash, ++ struct ext3_dir_entry_2 *dirent) ++{ ++ rb_node_t **p, *parent = NULL; ++ struct fname * fname, *new_fn; ++ struct dir_private_info *info; ++ int len; ++ ++ info = (struct dir_private_info *) dir_file->private_data; ++ p = &info->root.rb_node; ++ ++ /* Create and allocate the fname structure */ ++ len = sizeof(struct fname) + dirent->name_len + 1; ++ new_fn = kmalloc(len, GFP_KERNEL); ++ memset(new_fn, 0, len); ++ new_fn->hash = hash; ++ new_fn->minor_hash = minor_hash; ++ new_fn->inode = le32_to_cpu(dirent->inode); ++ new_fn->name_len = dirent->name_len; ++ new_fn->file_type = dirent->file_type; ++ memcpy(new_fn->name, dirent->name, dirent->name_len); ++ new_fn->name[dirent->name_len] = 0; ++ ++ while (*p) { ++ parent = *p; ++ fname = rb_entry(parent, struct fname, rb_hash); ++ ++ /* ++ * If the hash and minor hash match up, then we put ++ * them on a linked list. This rarely happens... ++ */ ++ if ((new_fn->hash == fname->hash) && ++ (new_fn->minor_hash == fname->minor_hash)) { ++ new_fn->next = fname->next; ++ fname->next = new_fn; ++ return; ++ } ++ ++ if (new_fn->hash < fname->hash) ++ p = &(*p)->rb_left; ++ else if (new_fn->hash > fname->hash) ++ p = &(*p)->rb_right; ++ else if (new_fn->minor_hash < fname->minor_hash) ++ p = &(*p)->rb_left; ++ else /* if (new_fn->minor_hash > fname->minor_hash) */ ++ p = &(*p)->rb_right; ++ } ++ ++ rb_link_node(&new_fn->rb_hash, parent, p); ++ rb_insert_color(&new_fn->rb_hash, &info->root); ++} ++ ++ ++ ++/* ++ * This is a helper function for ext3_dx_readdir. It calls filldir ++ * for all entries on the fname linked list. (Normally there is only ++ * one entry on the linked list, unless there are 62 bit hash collisions.)
++ */ ++static int call_filldir(struct file * filp, void * dirent, ++ filldir_t filldir, struct fname *fname) ++{ ++ struct dir_private_info *info = filp->private_data; ++ loff_t curr_pos; ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct super_block * sb; ++ int error; ++ ++ sb = inode->i_sb; ++ ++ if (!fname) { ++ printk("call_filldir: called with null fname?!?\n"); ++ return 0; ++ } ++ curr_pos = hash2pos(fname->hash, fname->minor_hash); ++ while (fname) { ++ error = filldir(dirent, fname->name, ++ fname->name_len, curr_pos, ++ fname->inode, ++ get_dtype(sb, fname->file_type)); ++ if (error) { ++ filp->f_pos = curr_pos; ++ info->extra_fname = fname->next; ++ return error; ++ } ++ fname = fname->next; ++ } ++ return 0; ++} ++ ++static int ext3_dx_readdir(struct file * filp, ++ void * dirent, filldir_t filldir) ++{ ++ struct dir_private_info *info = filp->private_data; ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct fname *fname; ++ int ret; ++ ++ if (!info) { ++ info = create_dir_info(filp->f_pos); ++ if (!info) ++ return -ENOMEM; ++ filp->private_data = info; ++ } ++ ++ /* Some one has messed with f_pos; reset the world */ ++ if (info->last_pos != filp->f_pos) { ++ free_rb_tree_fname(&info->root); ++ info->curr_node = 0; ++ info->extra_fname = 0; ++ info->curr_hash = pos2maj_hash(filp->f_pos); ++ info->curr_minor_hash = pos2min_hash(filp->f_pos); ++ } ++ ++ /* ++ * If there are any leftover names on the hash collision ++ * chain, return them first. ++ */ ++ if (info->extra_fname && ++ call_filldir(filp, dirent, filldir, info->extra_fname)) ++ goto finished; ++ ++ if (!info->curr_node) ++ info->curr_node = rb_get_first(&info->root); ++ ++ while (1) { ++ /* ++ * Fill the rbtree if we have no more entries, ++ * or the inode has changed since we last read in the ++ * cached entries. 
++ */ ++ if ((!info->curr_node) || ++ (filp->f_version != inode->i_version)) { ++ info->curr_node = 0; ++ free_rb_tree_fname(&info->root); ++ filp->f_version = inode->i_version; ++ ret = ext3_htree_fill_tree(filp, info->curr_hash, ++ info->curr_minor_hash, ++ &info->next_hash); ++ if (ret < 0) ++ return ret; ++ if (ret == 0) ++ break; ++ info->curr_node = rb_get_first(&info->root); ++ } ++ ++ fname = rb_entry(info->curr_node, struct fname, rb_hash); ++ info->curr_hash = fname->hash; ++ info->curr_minor_hash = fname->minor_hash; ++ if (call_filldir(filp, dirent, filldir, fname)) ++ break; ++ ++ info->curr_node = rb_get_next(info->curr_node); ++ if (!info->curr_node) { ++ info->curr_hash = info->next_hash; ++ info->curr_minor_hash = 0; ++ } ++ } ++finished: ++ info->last_pos = filp->f_pos; ++ UPDATE_ATIME(inode); ++ return 0; ++} ++#endif +--- linux-chaos-2.4.20-6/fs/ext3/file.c~ext-2.4-patch-1-chaos 2003-02-14 15:59:09.000000000 -0700 ++++ linux-chaos-2.4.20-6-braam/fs/ext3/file.c 2003-04-09 16:18:55.000000000 -0600 +@@ -35,6 +35,9 @@ static int ext3_release_file (struct ino + { + if (filp->f_mode & FMODE_WRITE) + ext3_discard_prealloc (inode); ++ if (is_dx(inode) && filp->private_data) ++ ext3_htree_free_dir_info(filp->private_data); ++ + return 0; + } + +--- /dev/null 2003-01-30 03:24:37.000000000 -0700 ++++ linux-chaos-2.4.20-6-braam/fs/ext3/hash.c 2003-04-09 16:18:55.000000000 -0600 +@@ -0,0 +1,215 @@ ++/* ++ * linux/fs/ext3/hash.c ++ * ++ * Copyright (C) 2002 by Theodore Ts'o ++ * ++ * This file is released under the GPL v2. ++ * ++ * This file may be redistributed under the terms of the GNU Public ++ * License. 
++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define DELTA 0x9E3779B9 ++ ++static void TEA_transform(__u32 buf[4], __u32 const in[]) ++{ ++ __u32 sum = 0; ++ __u32 b0 = buf[0], b1 = buf[1]; ++ __u32 a = in[0], b = in[1], c = in[2], d = in[3]; ++ int n = 16; ++ ++ do { ++ sum += DELTA; ++ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); ++ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); ++ } while(--n); ++ ++ buf[0] += b0; ++ buf[1] += b1; ++} ++ ++/* F, G and H are basic MD4 functions: selection, majority, parity */ ++#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) ++#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) ++#define H(x, y, z) ((x) ^ (y) ^ (z)) ++ ++/* ++ * The generic round function. The application is so specific that ++ * we don't bother protecting all the arguments with parens, as is generally ++ * good macro practice, in favor of extra legibility. ++ * Rotation is separate from addition to prevent recomputation ++ */ ++#define ROUND(f, a, b, c, d, x, s) \ ++ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s))) ++#define K1 0 ++#define K2 013240474631UL ++#define K3 015666365641UL ++ ++/* ++ * Basic cut-down MD4 transform. Returns only 32 bits of result. 
++ */ ++static void halfMD4Transform (__u32 buf[4], __u32 const in[]) ++{ ++ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; ++ ++ /* Round 1 */ ++ ROUND(F, a, b, c, d, in[0] + K1, 3); ++ ROUND(F, d, a, b, c, in[1] + K1, 7); ++ ROUND(F, c, d, a, b, in[2] + K1, 11); ++ ROUND(F, b, c, d, a, in[3] + K1, 19); ++ ROUND(F, a, b, c, d, in[4] + K1, 3); ++ ROUND(F, d, a, b, c, in[5] + K1, 7); ++ ROUND(F, c, d, a, b, in[6] + K1, 11); ++ ROUND(F, b, c, d, a, in[7] + K1, 19); ++ ++ /* Round 2 */ ++ ROUND(G, a, b, c, d, in[1] + K2, 3); ++ ROUND(G, d, a, b, c, in[3] + K2, 5); ++ ROUND(G, c, d, a, b, in[5] + K2, 9); ++ ROUND(G, b, c, d, a, in[7] + K2, 13); ++ ROUND(G, a, b, c, d, in[0] + K2, 3); ++ ROUND(G, d, a, b, c, in[2] + K2, 5); ++ ROUND(G, c, d, a, b, in[4] + K2, 9); ++ ROUND(G, b, c, d, a, in[6] + K2, 13); ++ ++ /* Round 3 */ ++ ROUND(H, a, b, c, d, in[3] + K3, 3); ++ ROUND(H, d, a, b, c, in[7] + K3, 9); ++ ROUND(H, c, d, a, b, in[2] + K3, 11); ++ ROUND(H, b, c, d, a, in[6] + K3, 15); ++ ROUND(H, a, b, c, d, in[1] + K3, 3); ++ ROUND(H, d, a, b, c, in[5] + K3, 9); ++ ROUND(H, c, d, a, b, in[0] + K3, 11); ++ ROUND(H, b, c, d, a, in[4] + K3, 15); ++ ++ buf[0] += a; ++ buf[1] += b; ++ buf[2] += c; ++ buf[3] += d; ++} ++ ++#undef ROUND ++#undef F ++#undef G ++#undef H ++#undef K1 ++#undef K2 ++#undef K3 ++ ++/* The old legacy hash */ ++static __u32 dx_hack_hash (const char *name, int len) ++{ ++ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; ++ while (len--) { ++ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); ++ ++ if (hash & 0x80000000) hash -= 0x7fffffff; ++ hash1 = hash0; ++ hash0 = hash; ++ } ++ return (hash0 << 1); ++} ++ ++static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) ++{ ++ __u32 pad, val; ++ int i; ++ ++ pad = (__u32)len | ((__u32)len << 8); ++ pad |= pad << 16; ++ ++ val = pad; ++ if (len > num*4) ++ len = num * 4; ++ for (i=0; i < len; i++) { ++ if ((i % 4) == 0) ++ val = pad; ++ val = msg[i] + (val << 8); ++ if ((i % 4) == 3) { ++ 
*buf++ = val; ++ val = pad; ++ num--; ++ } ++ } ++ if (--num >= 0) ++ *buf++ = val; ++ while (--num >= 0) ++ *buf++ = pad; ++} ++ ++/* ++ * Returns the hash of a filename. If len is 0 and name is NULL, then ++ * this function can be used to test whether or not a hash version is ++ * supported. ++ * ++ * The seed is an 4 longword (32 bits) "secret" which can be used to ++ * uniquify a hash. If the seed is all zero's, then some default seed ++ * may be used. ++ * ++ * A particular hash version specifies whether or not the seed is ++ * represented, and whether or not the returned hash is 32 bits or 64 ++ * bits. 32 bit hashes will return 0 for the minor hash. ++ */ ++int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) ++{ ++ __u32 hash; ++ __u32 minor_hash = 0; ++ const char *p; ++ int i; ++ __u32 in[8], buf[4]; ++ ++ /* Initialize the default seed for the hash checksum functions */ ++ buf[0] = 0x67452301; ++ buf[1] = 0xefcdab89; ++ buf[2] = 0x98badcfe; ++ buf[3] = 0x10325476; ++ ++ /* Check to see if the seed is all zero's */ ++ if (hinfo->seed) { ++ for (i=0; i < 4; i++) { ++ if (hinfo->seed[i]) ++ break; ++ } ++ if (i < 4) ++ memcpy(buf, hinfo->seed, sizeof(buf)); ++ } ++ ++ switch (hinfo->hash_version) { ++ case DX_HASH_LEGACY: ++ hash = dx_hack_hash(name, len); ++ break; ++ case DX_HASH_HALF_MD4: ++ p = name; ++ while (len > 0) { ++ str2hashbuf(p, len, in, 8); ++ halfMD4Transform(buf, in); ++ len -= 32; ++ p += 32; ++ } ++ minor_hash = buf[2]; ++ hash = buf[1]; ++ break; ++ case DX_HASH_TEA: ++ p = name; ++ while (len > 0) { ++ str2hashbuf(p, len, in, 4); ++ TEA_transform(buf, in); ++ len -= 16; ++ p += 16; ++ } ++ hash = buf[0]; ++ minor_hash = buf[1]; ++ break; ++ default: ++ hinfo->hash = 0; ++ return -1; ++ } ++ hinfo->hash = hash & ~1; ++ hinfo->minor_hash = minor_hash; ++ return 0; ++} +--- linux-chaos-2.4.20-6/fs/ext3/namei.c~ext-2.4-patch-1-chaos 2003-03-12 12:51:02.000000000 -0700 ++++ linux-chaos-2.4.20-6-braam/fs/ext3/namei.c 
2003-04-09 16:26:04.000000000 -0600 +@@ -16,6 +16,12 @@ + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 ++ * Hash Tree Directory indexing (c) ++ * Daniel Phillips, 2001 ++ * Hash Tree Directory indexing porting ++ * Christopher Li, 2002 ++ * Hash Tree Directory indexing cleanup ++ * Theodore Ts'o, 2002 + */ + + #include +@@ -38,6 +44,630 @@ + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + ++static struct buffer_head *ext3_append(handle_t *handle, ++ struct inode *inode, ++ u32 *block, int *err) ++{ ++ struct buffer_head *bh; ++ ++ *block = inode->i_size >> inode->i_sb->s_blocksize_bits; ++ ++ if ((bh = ext3_bread(handle, inode, *block, 1, err))) { ++ inode->i_size += inode->i_sb->s_blocksize; ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_journal_get_write_access(handle,bh); ++ } ++ return bh; ++} ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#ifndef swap ++#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) ++#endif ++ ++typedef struct { u32 v; } le_u32; ++typedef struct { u16 v; } le_u16; ++ ++#ifdef DX_DEBUG ++#define dxtrace(command) command ++#else ++#define dxtrace(command) ++#endif ++ ++struct fake_dirent ++{ ++ /*le*/u32 inode; ++ /*le*/u16 rec_len; ++ u8 name_len; ++ u8 file_type; ++}; ++ ++struct dx_countlimit ++{ ++ le_u16 limit; ++ le_u16 count; ++}; ++ ++struct dx_entry ++{ ++ le_u32 hash; ++ le_u32 block; ++}; ++ ++/* ++ * dx_root_info is laid out so that if it should somehow get overlaid by a ++ * dirent the two low bits of the hash version will be zero. Therefore, the ++ * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
++ */ ++ ++struct dx_root ++{ ++ struct fake_dirent dot; ++ char dot_name[4]; ++ struct fake_dirent dotdot; ++ char dotdot_name[4]; ++ struct dx_root_info ++ { ++ le_u32 reserved_zero; ++ u8 hash_version; ++ u8 info_length; /* 8 */ ++ u8 indirect_levels; ++ u8 unused_flags; ++ } ++ info; ++ struct dx_entry entries[0]; ++}; ++ ++struct dx_node ++{ ++ struct fake_dirent fake; ++ struct dx_entry entries[0]; ++}; ++ ++ ++struct dx_frame ++{ ++ struct buffer_head *bh; ++ struct dx_entry *entries; ++ struct dx_entry *at; ++}; ++ ++struct dx_map_entry ++{ ++ u32 hash; ++ u32 offs; ++}; ++ ++#ifdef CONFIG_EXT3_INDEX ++static inline unsigned dx_get_block (struct dx_entry *entry); ++static void dx_set_block (struct dx_entry *entry, unsigned value); ++static inline unsigned dx_get_hash (struct dx_entry *entry); ++static void dx_set_hash (struct dx_entry *entry, unsigned value); ++static unsigned dx_get_count (struct dx_entry *entries); ++static unsigned dx_get_limit (struct dx_entry *entries); ++static void dx_set_count (struct dx_entry *entries, unsigned value); ++static void dx_set_limit (struct dx_entry *entries, unsigned value); ++static unsigned dx_root_limit (struct inode *dir, unsigned infosize); ++static unsigned dx_node_limit (struct inode *dir); ++static struct dx_frame *dx_probe(struct dentry *dentry, ++ struct inode *dir, ++ struct dx_hash_info *hinfo, ++ struct dx_frame *frame, ++ int *err); ++static void dx_release (struct dx_frame *frames); ++static int dx_make_map (struct ext3_dir_entry_2 *de, int size, ++ struct dx_hash_info *hinfo, struct dx_map_entry map[]); ++static void dx_sort_map(struct dx_map_entry *map, unsigned count); ++static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, ++ struct dx_map_entry *offsets, int count); ++static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); ++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); ++static int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ 
struct dx_frame *frame, ++ struct dx_frame *frames, int *err, ++ __u32 *start_hash); ++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, ++ struct ext3_dir_entry_2 **res_dir, int *err); ++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode); ++ ++/* ++ * Future: use high four bits of block for coalesce-on-delete flags ++ * Mask them off for now. ++ */ ++ ++static inline unsigned dx_get_block (struct dx_entry *entry) ++{ ++ return le32_to_cpu(entry->block.v) & 0x00ffffff; ++} ++ ++static inline void dx_set_block (struct dx_entry *entry, unsigned value) ++{ ++ entry->block.v = cpu_to_le32(value); ++} ++ ++static inline unsigned dx_get_hash (struct dx_entry *entry) ++{ ++ return le32_to_cpu(entry->hash.v); ++} ++ ++static inline void dx_set_hash (struct dx_entry *entry, unsigned value) ++{ ++ entry->hash.v = cpu_to_le32(value); ++} ++ ++static inline unsigned dx_get_count (struct dx_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v); ++} ++ ++static inline unsigned dx_get_limit (struct dx_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v); ++} ++ ++static inline void dx_set_count (struct dx_entry *entries, unsigned value) ++{ ++ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value); ++} ++ ++static inline void dx_set_limit (struct dx_entry *entries, unsigned value) ++{ ++ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value); ++} ++ ++static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) ++{ ++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - ++ EXT3_DIR_REC_LEN(2) - infosize; ++ return 0? 20: entry_space / sizeof(struct dx_entry); ++} ++ ++static inline unsigned dx_node_limit (struct inode *dir) ++{ ++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); ++ return 0? 
22: entry_space / sizeof(struct dx_entry); ++} ++ ++/* ++ * Debug ++ */ ++#ifdef DX_DEBUG ++struct stats ++{ ++ unsigned names; ++ unsigned space; ++ unsigned bcount; ++}; ++ ++static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, ++ int size, int show_names) ++{ ++ unsigned names = 0, space = 0; ++ char *base = (char *) de; ++ struct dx_hash_info h = *hinfo; ++ ++ printk("names: "); ++ while ((char *) de < base + size) ++ { ++ if (de->inode) ++ { ++ if (show_names) ++ { ++ int len = de->name_len; ++ char *name = de->name; ++ while (len--) printk("%c", *name++); ++ ext3fs_dirhash(de->name, de->name_len, &h); ++ printk(":%x.%u ", h.hash, ++ ((char *) de - base)); ++ } ++ space += EXT3_DIR_REC_LEN(de->name_len); ++ names++; ++ } ++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ } ++ printk("(%i)\n", names); ++ return (struct stats) { names, space, 1 }; ++} ++ ++struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, ++ struct dx_entry *entries, int levels) ++{ ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count = dx_get_count (entries), names = 0, space = 0, i; ++ unsigned bcount = 0; ++ struct buffer_head *bh; ++ int err; ++ printk("%i indexed blocks...\n", count); ++ for (i = 0; i < count; i++, entries++) ++ { ++ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; ++ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; ++ struct stats stats; ++ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); ++ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; ++ stats = levels? 
++ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): ++ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); ++ names += stats.names; ++ space += stats.space; ++ bcount += stats.bcount; ++ brelse (bh); ++ } ++ if (bcount) ++ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", ++ names, space/bcount,(space/bcount)*100/blocksize); ++ return (struct stats) { names, space, bcount}; ++} ++#endif /* DX_DEBUG */ ++ ++/* ++ * Probe for a directory leaf block to search. ++ * ++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format ++ * error in the directory index, and the caller should fall back to ++ * searching the directory normally. The callers of dx_probe **MUST** ++ * check for this error code, and make sure it never gets reflected ++ * back to userspace. ++ */ ++static struct dx_frame * ++dx_probe(struct dentry *dentry, struct inode *dir, ++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) ++{ ++ unsigned count, indirect; ++ struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_root *root; ++ struct buffer_head *bh; ++ struct dx_frame *frame = frame_in; ++ u32 hash; ++ ++ frame->bh = NULL; ++ if (dentry) ++ dir = dentry->d_parent->d_inode; ++ if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) ++ goto fail; ++ root = (struct dx_root *) bh->b_data; ++ if (root->info.hash_version != DX_HASH_TEA && ++ root->info.hash_version != DX_HASH_HALF_MD4 && ++ root->info.hash_version != DX_HASH_LEGACY) { ++ ext3_warning(dir->i_sb, __FUNCTION__, ++ "Unrecognised inode hash code %d", ++ root->info.hash_version); ++ brelse(bh); ++ *err = ERR_BAD_DX_DIR; ++ goto fail; ++ } ++ hinfo->hash_version = root->info.hash_version; ++ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; ++ if (dentry) ++ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); ++ hash = hinfo->hash; ++ ++ if (root->info.unused_flags & 1) { ++ ext3_warning(dir->i_sb, __FUNCTION__, ++ "Unimplemented inode hash flags: 
%#06x", ++ root->info.unused_flags); ++ brelse(bh); ++ *err = ERR_BAD_DX_DIR; ++ goto fail; ++ } ++ ++ if ((indirect = root->info.indirect_levels) > 1) { ++ ext3_warning(dir->i_sb, __FUNCTION__, ++ "Unimplemented inode hash depth: %#06x", ++ root->info.indirect_levels); ++ brelse(bh); ++ *err = ERR_BAD_DX_DIR; ++ goto fail; ++ } ++ ++ entries = (struct dx_entry *) (((char *)&root->info) + ++ root->info.info_length); ++ assert(dx_get_limit(entries) == dx_root_limit(dir, ++ root->info.info_length)); ++ dxtrace (printk("Look up %x", hash)); ++ while (1) ++ { ++ count = dx_get_count(entries); ++ assert (count && count <= dx_get_limit(entries)); ++ p = entries + 1; ++ q = entries + count - 1; ++ while (p <= q) ++ { ++ m = p + (q - p)/2; ++ dxtrace(printk(".")); ++ if (dx_get_hash(m) > hash) ++ q = m - 1; ++ else ++ p = m + 1; ++ } ++ ++ if (0) // linear search cross check ++ { ++ unsigned n = count - 1; ++ at = entries; ++ while (n--) ++ { ++ dxtrace(printk(",")); ++ if (dx_get_hash(++at) > hash) ++ { ++ at--; ++ break; ++ } ++ } ++ assert (at == p - 1); ++ } ++ ++ at = p - 1; ++ dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); ++ frame->bh = bh; ++ frame->entries = entries; ++ frame->at = at; ++ if (!indirect--) return frame; ++ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) ++ goto fail2; ++ at = entries = ((struct dx_node *) bh->b_data)->entries; ++ assert (dx_get_limit(entries) == dx_node_limit (dir)); ++ frame++; ++ } ++fail2: ++ while (frame >= frame_in) { ++ brelse(frame->bh); ++ frame--; ++ } ++fail: ++ return NULL; ++} ++ ++static void dx_release (struct dx_frame *frames) ++{ ++ if (frames[0].bh == NULL) ++ return; ++ ++ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) ++ brelse(frames[1].bh); ++ brelse(frames[0].bh); ++} ++ ++/* ++ * This function increments the frame pointer to search the next leaf ++ * block, and reads in the necessary intervening nodes if the search ++ * should be necessary. Whether or not the search is necessary is ++ * controlled by the hash parameter. If the hash value is even, then ++ * the search is only continued if the next block starts with that ++ * hash value. This is used if we are searching for a specific file. ++ * ++ * If the hash value is HASH_NB_ALWAYS, then always go to the next block. ++ * ++ * This function returns 1 if the caller should continue to search, ++ * or 0 if it should not. If there is an error reading one of the ++ * index blocks, it will return -1. ++ * ++ * If start_hash is non-null, it will be filled in with the starting ++ * hash of the next page. ++ */ ++static int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ struct dx_frame *frame, ++ struct dx_frame *frames, int *err, ++ __u32 *start_hash) ++{ ++ struct dx_frame *p; ++ struct buffer_head *bh; ++ int num_frames = 0; ++ __u32 bhash; ++ ++ *err = ENOENT; ++ p = frame; ++ /* ++ * Find the next leaf page by incrementing the frame pointer. ++ * If we run out of entries in the interior node, loop around and ++ * increment pointer in the parent node. 
When we break out of ++ * this loop, num_frames indicates the number of interior ++ * nodes need to be read. ++ */ ++ while (1) { ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ break; ++ if (p == frames) ++ return 0; ++ num_frames++; ++ p--; ++ } ++ ++ /* ++ * If the hash is 1, then continue only if the next page has a ++ * continuation hash of any value. This is used for readdir ++ * handling. Otherwise, check to see if the hash matches the ++ * desired continuation hash. If it doesn't, return since ++ * there's no point to read in the successive index pages. ++ */ ++ bhash = dx_get_hash(p->at); ++ if (start_hash) ++ *start_hash = bhash; ++ if ((hash & 1) == 0) { ++ if ((bhash & ~1) != hash) ++ return 0; ++ } ++ /* ++ * If the hash is HASH_NB_ALWAYS, we always go to the next ++ * block so no check is necessary ++ */ ++ while (num_frames--) { ++ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), ++ 0, err))) ++ return -1; /* Failure */ ++ p++; ++ brelse (p->bh); ++ p->bh = bh; ++ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ } ++ return 1; ++} ++ ++ ++/* ++ * p is at least 6 bytes before the end of page ++ */ ++static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p) ++{ ++ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); ++} ++ ++/* ++ * This function fills a red-black tree with information from a ++ * directory. We start scanning the directory in hash order, starting ++ * at start_hash and start_minor_hash. ++ * ++ * This function returns the number of entries inserted into the tree, ++ * or a negative error code.
++ */ ++int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ __u32 start_minor_hash, __u32 *next_hash) ++{ ++ struct dx_hash_info hinfo; ++ struct buffer_head *bh; ++ struct ext3_dir_entry_2 *de, *top; ++ static struct dx_frame frames[2], *frame; ++ struct inode *dir; ++ int block, err; ++ int count = 0; ++ int ret; ++ __u32 hashval; ++ ++ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, ++ start_minor_hash)); ++ dir = dir_file->f_dentry->d_inode; ++ hinfo.hash = start_hash; ++ hinfo.minor_hash = 0; ++ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err); ++ if (!frame) ++ return err; ++ ++ while (1) { ++ block = dx_get_block(frame->at); ++ dxtrace(printk("Reading block %d\n", block)); ++ if (!(bh = ext3_bread (NULL, dir, block, 0, &err))) ++ goto errout; ++ ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - ++ EXT3_DIR_REC_LEN(0)); ++ for (; de < top; de = ext3_next_entry(de)) { ++ ext3fs_dirhash(de->name, de->name_len, &hinfo); ++ if ((hinfo.hash < start_hash) || ++ ((hinfo.hash == start_hash) && ++ (hinfo.minor_hash < start_minor_hash))) ++ continue; ++ ext3_htree_store_dirent(dir_file, hinfo.hash, ++ hinfo.minor_hash, de); ++ count++; ++ } ++ brelse (bh); ++ hashval = ~1; ++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, ++ frame, frames, &err, &hashval); ++ if (next_hash) ++ *next_hash = hashval; ++ if (ret == -1) ++ goto errout; ++ /* ++ * Stop if: (a) there are no more entries, or ++ * (b) we have inserted at least one entry and the ++ * next hash value is not a continuation ++ */ ++ if ((ret == 0) || ++ (count && ((hashval & 1) == 0))) ++ break; ++ } ++ dx_release(frames); ++ dxtrace(printk("Fill tree: returned %d entries\n", count)); ++ return count; ++errout: ++ dx_release(frames); ++ return (err); ++} ++ ++ ++/* ++ * Directory block splitting, compacting ++ */ ++ ++static int dx_make_map (struct ext3_dir_entry_2 *de, int 
size, ++ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) ++{ ++ int count = 0; ++ char *base = (char *) de; ++ struct dx_hash_info h = *hinfo; ++ ++ while ((char *) de < base + size) ++ { ++ if (de->name_len && de->inode) { ++ ext3fs_dirhash(de->name, de->name_len, &h); ++ map_tail--; ++ map_tail->hash = h.hash; ++ map_tail->offs = (u32) ((char *) de - base); ++ count++; ++ } ++ /* XXX: do we need to check rec_len == 0 case? -Chris */ ++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ } ++ return count; ++} ++ ++static void dx_sort_map (struct dx_map_entry *map, unsigned count) ++{ ++ struct dx_map_entry *p, *q, *top = map + count - 1; ++ int more; ++ /* Combsort until bubble sort doesn't suck */ ++ while (count > 2) ++ { ++ count = count*10/13; ++ if (count - 9 < 2) /* 9, 10 -> 11 */ ++ count = 11; ++ for (p = top, q = p - count; q >= map; p--, q--) ++ if (p->hash < q->hash) ++ swap(*p, *q); ++ } ++ /* Garden variety bubble sort */ ++ do { ++ more = 0; ++ q = top; ++ while (q-- > map) ++ { ++ if (q[1].hash >= q[0].hash) ++ continue; ++ swap(*(q+1), *q); ++ more = 1; ++ } ++ } while(more); ++} ++ ++static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) ++{ ++ struct dx_entry *entries = frame->entries; ++ struct dx_entry *old = frame->at, *new = old + 1; ++ int count = dx_get_count(entries); ++ ++ assert(count < dx_get_limit(entries)); ++ assert(old < entries + count); ++ memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); ++ dx_set_hash(new, hash); ++ dx_set_block(new, block); ++ dx_set_count(entries, count + 1); ++} ++#endif ++ ++ ++static void ext3_update_dx_flag(struct inode *inode) ++{ ++ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, ++ EXT3_FEATURE_COMPAT_DIR_INDEX)) ++ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; ++} ++ + /* + * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. 
+ * +@@ -94,6 +724,7 @@ static int inline search_dirblock(struct + return 0; + } + ++ + /* + * ext3_find_entry() + * +@@ -105,6 +736,8 @@ static int inline search_dirblock(struct + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ ++ ++ + static struct buffer_head * ext3_find_entry (struct dentry *dentry, + struct ext3_dir_entry_2 ** res_dir) + { +@@ -119,12 +752,32 @@ static struct buffer_head * ext3_find_en + int num = 0; + int nblocks, i, err; + struct inode *dir = dentry->d_parent->d_inode; ++ int namelen; ++ const u8 *name; ++ unsigned blocksize; + + *res_dir = NULL; + sb = dir->i_sb; +- ++ blocksize = sb->s_blocksize; ++ namelen = dentry->d_name.len; ++ name = dentry->d_name.name; ++ if (namelen > EXT3_NAME_LEN) ++ return NULL; ++#ifdef CONFIG_EXT3_INDEX ++ if (is_dx(dir)) { ++ bh = ext3_dx_find_entry(dentry, res_dir, &err); ++ /* ++ * On success, or if the error was file not found, ++ * return. Otherwise, fall back to doing a search the ++ * old fashioned way. 
++ */ ++ if (bh || (err != ERR_BAD_DX_DIR)) ++ return bh; ++ dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); ++ } ++#endif + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); +- start = dir->u.ext3_i.i_dir_start_lookup; ++ start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +@@ -166,7 +819,7 @@ restart: + i = search_dirblock(bh, dir, dentry, + block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { +- dir->u.ext3_i.i_dir_start_lookup = block; ++ EXT3_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { +@@ -197,6 +850,66 @@ cleanup_and_exit: + return ret; + } + ++#ifdef CONFIG_EXT3_INDEX ++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, ++ struct ext3_dir_entry_2 **res_dir, int *err) ++{ ++ struct super_block * sb; ++ struct dx_hash_info hinfo; ++ u32 hash; ++ struct dx_frame frames[2], *frame; ++ struct ext3_dir_entry_2 *de, *top; ++ struct buffer_head *bh; ++ unsigned long block; ++ int retval; ++ int namelen = dentry->d_name.len; ++ const u8 *name = dentry->d_name.name; ++ struct inode *dir = dentry->d_parent->d_inode; ++ ++ sb = dir->i_sb; ++ if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err))) ++ return NULL; ++ hash = hinfo.hash; ++ do { ++ block = dx_get_block(frame->at); ++ if (!(bh = ext3_bread (NULL,dir, block, 0, err))) ++ goto errout; ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - ++ EXT3_DIR_REC_LEN(0)); ++ for (; de < top; de = ext3_next_entry(de)) ++ if (ext3_match (namelen, name, de)) { ++ if (!ext3_check_dir_entry("ext3_find_entry", ++ dir, de, bh, ++ (block<b_data))) { ++ brelse (bh); ++ goto errout; ++ } ++ *res_dir = de; ++ dx_release (frames); ++ return bh; ++ } ++ brelse (bh); ++ /* Check to see if we should continue to search */ ++ retval = ext3_htree_next_block(dir, hash, frame, ++ frames, err, 0); ++ if (retval == -1) { ++ ext3_warning(sb, 
__FUNCTION__, ++ "error reading index page in directory #%lu", ++ dir->i_ino); ++ goto errout; ++ } ++ } while (retval == 1); ++ ++ *err = -ENOENT; ++errout: ++ dxtrace(printk("%s not found\n", name)); ++ dx_release (frames); ++ return NULL; ++} ++#endif ++ + static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) + { + struct inode * inode; +@@ -213,8 +926,9 @@ static struct dentry *ext3_lookup(struct + brelse (bh); + inode = iget(dir->i_sb, ino); + +- if (!inode) ++ if (!inode) { + return ERR_PTR(-EACCES); ++ } + } + d_add(dentry, inode); + return NULL; +@@ -238,6 +952,300 @@ static inline void ext3_set_de_type(stru + de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + } + ++#ifdef CONFIG_EXT3_INDEX ++static struct ext3_dir_entry_2 * ++dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) ++{ ++ unsigned rec_len = 0; ++ ++ while (count--) { ++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); ++ rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ memcpy (to, de, rec_len); ++ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ de->inode = 0; ++ map++; ++ to += rec_len; ++ } ++ return (struct ext3_dir_entry_2 *) (to - rec_len); ++} ++ ++static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) ++{ ++ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; ++ unsigned rec_len = 0; ++ ++ prev = to = de; ++ while ((char*)de < base + size) { ++ next = (struct ext3_dir_entry_2 *) ((char *) de + ++ le16_to_cpu(de->rec_len)); ++ if (de->inode && de->name_len) { ++ rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ if (de > to) ++ memmove(to, de, rec_len); ++ to->rec_len = rec_len; ++ prev = to; ++ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ } ++ de = next; ++ } ++ return prev; ++} ++ ++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, ++ struct buffer_head **bh,struct dx_frame *frame, ++ struct dx_hash_info *hinfo, 
int *error) ++{ ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count, continued; ++ struct buffer_head *bh2; ++ u32 newblock; ++ u32 hash2; ++ struct dx_map_entry *map; ++ char *data1 = (*bh)->b_data, *data2; ++ unsigned split; ++ struct ext3_dir_entry_2 *de = NULL, *de2; ++ int err; ++ ++ bh2 = ext3_append (handle, dir, &newblock, error); ++ if (!(bh2)) { ++ brelse(*bh); ++ *bh = NULL; ++ goto errout; ++ } ++ ++ BUFFER_TRACE(*bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, *bh); ++ if (err) { ++ journal_error: ++ brelse(*bh); ++ brelse(bh2); ++ *bh = NULL; ++ ext3_std_error(dir->i_sb, err); ++ goto errout; ++ } ++ BUFFER_TRACE(frame->bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, frame->bh); ++ if (err) ++ goto journal_error; ++ ++ data2 = bh2->b_data; ++ ++ /* create map in the end of data2 block */ ++ map = (struct dx_map_entry *) (data2 + blocksize); ++ count = dx_make_map ((struct ext3_dir_entry_2 *) data1, ++ blocksize, hinfo, map); ++ map -= count; ++ split = count/2; // need to adjust to actual middle ++ dx_sort_map (map, count); ++ hash2 = map[split].hash; ++ continued = hash2 == map[split - 1].hash; ++ dxtrace(printk("Split block %i at %x, %i/%i\n", ++ dx_get_block(frame->at), hash2, split, count-split)); ++ ++ /* Fancy dance to stay within two buffers */ ++ de2 = dx_move_dirents(data1, data2, map + split, count - split); ++ de = dx_pack_dirents(data1,blocksize); ++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); ++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); ++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ ++ /* Which block gets the new entry? 
*/ ++ if (hinfo->hash >= hash2) ++ { ++ swap(*bh, bh2); ++ de = de2; ++ } ++ dx_insert_block (frame, hash2 + continued, newblock); ++ err = ext3_journal_dirty_metadata (handle, bh2); ++ if (err) ++ goto journal_error; ++ err = ext3_journal_dirty_metadata (handle, frame->bh); ++ if (err) ++ goto journal_error; ++ brelse (bh2); ++ dxtrace(dx_show_index ("frame", frame->entries)); ++errout: ++ return de; ++} ++#endif ++ ++ ++/* ++ * Add a new entry into a directory (leaf) block. If de is non-NULL, ++ * it points to a directory entry which is guaranteed to be large ++ * enough for the new directory entry. If de is NULL, then ++ * add_dirent_to_buf will attempt to search the directory block for ++ * space. It will return -ENOSPC if no space is available, -EIO if an ++ * entry fails ext3_check_dir_entry(), and -EEXIST if the entry already exists. ++ * ++ * NOTE! bh is NOT released in the case where ENOSPC is returned. In ++ * all other cases bh is released. ++ */ ++static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct ext3_dir_entry_2 *de, ++ struct buffer_head * bh) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ const char *name = dentry->d_name.name; ++ int namelen = dentry->d_name.len; ++ unsigned long offset = 0; ++ unsigned short reclen; ++ int nlen, rlen, err; ++ char *top; ++ ++ reclen = EXT3_DIR_REC_LEN(namelen); ++ if (!de) { ++ de = (struct ext3_dir_entry_2 *)bh->b_data; ++ top = bh->b_data + dir->i_sb->s_blocksize - reclen; ++ while ((char *) de <= top) { ++ if (!ext3_check_dir_entry("ext3_add_entry", dir, de, ++ bh, offset)) { ++ brelse (bh); ++ return -EIO; ++ } ++ if (ext3_match (namelen, name, de)) { ++ brelse (bh); ++ return -EEXIST; ++ } ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if ((de->inode?
rlen - nlen: rlen) >= reclen) ++ break; ++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen); ++ offset += rlen; ++ } ++ if ((char *) de > top) ++ return -ENOSPC; ++ } ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) { ++ ext3_std_error(dir->i_sb, err); ++ brelse(bh); ++ return err; ++ } ++ ++ /* By now the buffer is marked for journaling */ ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if (de->inode) { ++ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ de1->rec_len = cpu_to_le16(rlen - nlen); ++ de->rec_len = cpu_to_le16(nlen); ++ de = de1; ++ } ++ de->file_type = EXT3_FT_UNKNOWN; ++ if (inode) { ++ de->inode = cpu_to_le32(inode->i_ino); ++ ext3_set_de_type(dir->i_sb, de, inode->i_mode); ++ } else ++ de->inode = 0; ++ de->name_len = namelen; ++ memcpy (de->name, name, namelen); ++ /* ++ * XXX shouldn't update any times until successful ++ * completion of syscall, but too many callers depend ++ * on this. ++ * ++ * XXX similarly, too many callers depend on ++ * ext3_new_inode() setting the times, but error ++ * recovery deletes the inode, so the worst that can ++ * happen is that the times are slightly out of date ++ * and/or different from the directory change time. ++ */ ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ ext3_update_dx_flag(dir); ++ dir->i_version = ++event; ++ ext3_mark_inode_dirty(handle, dir); ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ ext3_std_error(dir->i_sb, err); ++ brelse(bh); ++ return 0; ++} ++ ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * This converts a one block unindexed directory to a 3 block indexed ++ * directory, and adds the dentry to the indexed directory. 
++ */ ++static int make_indexed_dir(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct buffer_head *bh) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ const char *name = dentry->d_name.name; ++ int namelen = dentry->d_name.len; ++ struct buffer_head *bh2; ++ struct dx_root *root; ++ struct dx_frame frames[2], *frame; ++ struct dx_entry *entries; ++ struct ext3_dir_entry_2 *de, *de2; ++ char *data1, *top; ++ unsigned len; ++ int retval; ++ unsigned blocksize; ++ struct dx_hash_info hinfo; ++ u32 block; ++ ++ blocksize = dir->i_sb->s_blocksize; ++ dxtrace(printk("Creating index\n")); ++ retval = ext3_journal_get_write_access(handle, bh); ++ if (retval) { ++ ext3_std_error(dir->i_sb, retval); ++ brelse(bh); ++ return retval; ++ } ++ root = (struct dx_root *) bh->b_data; ++ ++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; ++ bh2 = ext3_append (handle, dir, &block, &retval); ++ if (!(bh2)) { ++ brelse(bh); ++ return retval; ++ } ++ data1 = bh2->b_data; ++ ++ /* The 0th block becomes the root, move the dirents out */ ++ de = (struct ext3_dir_entry_2 *) &root->info; ++ len = ((char *) root) + blocksize - (char *) de; ++ memcpy (data1, de, len); ++ de = (struct ext3_dir_entry_2 *) data1; ++ top = data1 + len; ++ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top) ++ de = de2; ++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); ++ /* Initialize the root; the dot dirents already exist */ ++ de = (struct ext3_dir_entry_2 *) (&root->dotdot); ++ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2)); ++ memset (&root->info, 0, sizeof(root->info)); ++ root->info.info_length = sizeof(root->info); ++ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version; ++ entries = root->entries; ++ dx_set_block (entries, 1); ++ dx_set_count (entries, 1); ++ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); ++ ++ /* Initialize as for dx_probe */ ++ hinfo.hash_version = root->info.hash_version; ++ hinfo.seed = 
dir->i_sb->u.ext3_sb.s_hash_seed; ++ ext3fs_dirhash(name, namelen, &hinfo); ++ frame = frames; ++ frame->entries = entries; ++ frame->at = entries; ++ frame->bh = bh; ++ bh = bh2; ++ de = do_split(handle,dir, &bh, frame, &hinfo, &retval); ++ dx_release (frames); ++ if (!(de)) ++ return retval; ++ ++ return add_dirent_to_buf(handle, dentry, inode, de, bh); ++} ++#endif ++ + /* + * ext3_add_entry() + * +@@ -248,127 +1256,198 @@ static inline void ext3_set_de_type(stru + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +- +-/* +- * AKPM: the journalling code here looks wrong on the error paths +- */ + static int ext3_add_entry (handle_t *handle, struct dentry *dentry, + struct inode *inode) + { + struct inode *dir = dentry->d_parent->d_inode; +- const char *name = dentry->d_name.name; +- int namelen = dentry->d_name.len; + unsigned long offset; +- unsigned short rec_len; + struct buffer_head * bh; +- struct ext3_dir_entry_2 * de, * de1; ++ struct ext3_dir_entry_2 *de; + struct super_block * sb; + int retval; ++#ifdef CONFIG_EXT3_INDEX ++ int dx_fallback=0; ++#endif ++ unsigned blocksize; ++ unsigned nlen, rlen; ++ u32 block, blocks; + + sb = dir->i_sb; +- +- if (!namelen) ++ blocksize = sb->s_blocksize; ++ if (!dentry->d_name.len) + return -EINVAL; +- bh = ext3_bread (handle, dir, 0, 0, &retval); ++#ifdef CONFIG_EXT3_INDEX ++ if (is_dx(dir)) { ++ retval = ext3_dx_add_entry(handle, dentry, inode); ++ if (!retval || (retval != ERR_BAD_DX_DIR)) ++ return retval; ++ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; ++ dx_fallback++; ++ ext3_mark_inode_dirty(handle, dir); ++ } ++#endif ++ blocks = dir->i_size >> sb->s_blocksize_bits; ++ for (block = 0, offset = 0; block < blocks; block++) { ++ bh = ext3_bread(handle, dir, block, 0, &retval); ++ if(!bh) ++ return retval; ++ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); ++ if (retval != -ENOSPC) ++ return retval; ++ ++#ifdef 
CONFIG_EXT3_INDEX ++ if (blocks == 1 && !dx_fallback && ++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) ++ return make_indexed_dir(handle, dentry, inode, bh); ++#endif ++ brelse(bh); ++ } ++ bh = ext3_append(handle, dir, &block, &retval); + if (!bh) + return retval; +- rec_len = EXT3_DIR_REC_LEN(namelen); +- offset = 0; + de = (struct ext3_dir_entry_2 *) bh->b_data; +- while (1) { +- if ((char *)de >= sb->s_blocksize + bh->b_data) { +- brelse (bh); +- bh = NULL; +- bh = ext3_bread (handle, dir, +- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); +- if (!bh) +- return retval; +- if (dir->i_size <= offset) { +- if (dir->i_size == 0) { +- brelse(bh); +- return -ENOENT; +- } ++ de->inode = 0; ++ de->rec_len = cpu_to_le16(rlen = blocksize); ++ nlen = 0; ++ return add_dirent_to_buf(handle, dentry, inode, de, bh); ++} + +- ext3_debug ("creating next block\n"); ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * Returns 0 for success, or a negative error value ++ */ ++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ struct dx_frame frames[2], *frame; ++ struct dx_entry *entries, *at; ++ struct dx_hash_info hinfo; ++ struct buffer_head * bh; ++ struct inode *dir = dentry->d_parent->d_inode; ++ struct super_block * sb = dir->i_sb; ++ struct ext3_dir_entry_2 *de; ++ int err; + +- BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); +- de = (struct ext3_dir_entry_2 *) bh->b_data; +- de->inode = 0; +- de->rec_len = le16_to_cpu(sb->s_blocksize); +- dir->u.ext3_i.i_disksize = +- dir->i_size = offset + sb->s_blocksize; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; +- ext3_mark_inode_dirty(handle, dir); +- } else { ++ frame = dx_probe(dentry, 0, &hinfo, frames, &err); ++ if (!frame) ++ return err; ++ entries = frame->entries; ++ at = frame->at; + +- ext3_debug ("skipping to next block\n"); ++ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) ++ goto cleanup; + +- de = (struct 
ext3_dir_entry_2 *) bh->b_data; +- } +- } +- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, +- offset)) { +- brelse (bh); +- return -ENOENT; +- } +- if (ext3_match (namelen, name, de)) { +- brelse (bh); +- return -EEXIST; ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto journal_error; ++ ++ err = add_dirent_to_buf(handle, dentry, inode, 0, bh); ++ if (err != -ENOSPC) { ++ bh = 0; ++ goto cleanup; ++ } ++ ++ /* Block full, should compress but for now just split */ ++ dxtrace(printk("using %u of %u node entries\n", ++ dx_get_count(entries), dx_get_limit(entries))); ++ /* Need to split index? */ ++ if (dx_get_count(entries) == dx_get_limit(entries)) { ++ u32 newblock; ++ unsigned icount = dx_get_count(entries); ++ int levels = frame - frames; ++ struct dx_entry *entries2; ++ struct dx_node *node2; ++ struct buffer_head *bh2; ++ ++ if (levels && (dx_get_count(frames->entries) == ++ dx_get_limit(frames->entries))) { ++ ext3_warning(sb, __FUNCTION__, ++ "Directory index full!\n"); ++ err = -ENOSPC; ++ goto cleanup; + } +- if ((le32_to_cpu(de->inode) == 0 && +- le16_to_cpu(de->rec_len) >= rec_len) || +- (le16_to_cpu(de->rec_len) >= +- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { +- BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); +- /* By now the buffer is marked for journaling */ +- offset += le16_to_cpu(de->rec_len); +- if (le32_to_cpu(de->inode)) { +- de1 = (struct ext3_dir_entry_2 *) ((char *) de + +- EXT3_DIR_REC_LEN(de->name_len)); +- de1->rec_len = +- cpu_to_le16(le16_to_cpu(de->rec_len) - +- EXT3_DIR_REC_LEN(de->name_len)); +- de->rec_len = cpu_to_le16( +- EXT3_DIR_REC_LEN(de->name_len)); +- de = de1; ++ bh2 = ext3_append (handle, dir, &newblock, &err); ++ if (!(bh2)) ++ goto cleanup; ++ node2 = (struct dx_node *)(bh2->b_data); ++ entries2 = node2->entries; ++ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); ++ node2->fake.inode = 0; ++ 
BUFFER_TRACE(frame->bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, frame->bh); ++ if (err) ++ goto journal_error; ++ if (levels) { ++ unsigned icount1 = icount/2, icount2 = icount - icount1; ++ unsigned hash2 = dx_get_hash(entries + icount1); ++ dxtrace(printk("Split index %i/%i\n", icount1, icount2)); ++ ++ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ ++ err = ext3_journal_get_write_access(handle, ++ frames[0].bh); ++ if (err) ++ goto journal_error; ++ ++ memcpy ((char *) entries2, (char *) (entries + icount1), ++ icount2 * sizeof(struct dx_entry)); ++ dx_set_count (entries, icount1); ++ dx_set_count (entries2, icount2); ++ dx_set_limit (entries2, dx_node_limit(dir)); ++ ++ /* Which index block gets the new entry? */ ++ if (at - entries >= icount1) { ++ frame->at = at = at - entries - icount1 + entries2; ++ frame->entries = entries = entries2; ++ swap(frame->bh, bh2); + } +- de->file_type = EXT3_FT_UNKNOWN; +- if (inode) { +- de->inode = cpu_to_le32(inode->i_ino); +- ext3_set_de_type(dir->i_sb, de, inode->i_mode); +- } else +- de->inode = 0; +- de->name_len = namelen; +- memcpy (de->name, name, namelen); +- /* +- * XXX shouldn't update any times until successful +- * completion of syscall, but too many callers depend +- * on this. +- * +- * XXX similarly, too many callers depend on +- * ext3_new_inode() setting the times, but error +- * recovery deletes the inode, so the worst that can +- * happen is that the times are slightly out of date +- * and/or different from the directory change time. 
+- */ +- dir->i_mtime = dir->i_ctime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; +- dir->i_version = ++event; +- ext3_mark_inode_dirty(handle, dir); +- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); +- ext3_journal_dirty_metadata(handle, bh); +- brelse(bh); +- return 0; ++ dx_insert_block (frames + 0, hash2, newblock); ++ dxtrace(dx_show_index ("node", frames[1].entries)); ++ dxtrace(dx_show_index ("node", ++ ((struct dx_node *) bh2->b_data)->entries)); ++ err = ext3_journal_dirty_metadata(handle, bh2); ++ if (err) ++ goto journal_error; ++ brelse (bh2); ++ } else { ++ dxtrace(printk("Creating second level index...\n")); ++ memcpy((char *) entries2, (char *) entries, ++ icount * sizeof(struct dx_entry)); ++ dx_set_limit(entries2, dx_node_limit(dir)); ++ ++ /* Set up root */ ++ dx_set_count(entries, 1); ++ dx_set_block(entries + 0, newblock); ++ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ ++ /* Add new access path frame */ ++ frame = frames + 1; ++ frame->at = at = at - entries + entries2; ++ frame->entries = entries = entries2; ++ frame->bh = bh2; ++ err = ext3_journal_get_write_access(handle, ++ frame->bh); ++ if (err) ++ goto journal_error; + } +- offset += le16_to_cpu(de->rec_len); +- de = (struct ext3_dir_entry_2 *) +- ((char *) de + le16_to_cpu(de->rec_len)); ++ ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- brelse (bh); +- return -ENOSPC; ++ de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ if (!de) ++ goto cleanup; ++ err = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ bh = 0; ++ goto cleanup; ++ ++journal_error: ++ ext3_std_error(dir->i_sb, err); ++cleanup: ++ if (bh) ++ brelse(bh); ++ dx_release(frames); ++ return err; + } ++#endif + + /* + * ext3_delete_entry deletes a directory entry by merging it with the +@@ -455,9 +1534,11 @@ static int ext3_create (struct inode * d + struct inode * inode; + int err; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if 
(IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -481,9 +1562,11 @@ static int ext3_mknod (struct inode * di + struct inode *inode; + int err; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -509,9 +1592,11 @@ static int ext3_mkdir(struct inode * dir + if (dir->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -523,7 +1608,7 @@ static int ext3_mkdir(struct inode * dir + + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; +- inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; ++ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; + inode->i_blocks = 0; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { +@@ -556,21 +1641,19 @@ static int ext3_mkdir(struct inode * dir + inode->i_mode |= S_ISGID; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); +- if (err) +- goto out_no_entry; ++ if (err) { ++ inode->i_nlink = 0; ++ ext3_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto out_stop; ++ } + dir->i_nlink++; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + d_instantiate(dentry, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; +- +-out_no_entry: +- inode->i_nlink = 
0; +- ext3_mark_inode_dirty(handle, inode); +- iput (inode); +- goto out_stop; + } + + /* +@@ -657,7 +1740,7 @@ int ext3_orphan_add(handle_t *handle, st + int err = 0, rc; + + lock_super(sb); +- if (!list_empty(&inode->u.ext3_i.i_orphan)) ++ if (!list_empty(&EXT3_I(inode)->i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks +@@ -698,7 +1781,7 @@ int ext3_orphan_add(handle_t *handle, st + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. */ + if (!err) +- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); ++ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); + jbd_debug(4, "orphan inode %ld will point to %d\n", +@@ -716,25 +1799,26 @@ out_unlock: + int ext3_orphan_del(handle_t *handle, struct inode *inode) + { + struct list_head *prev; ++ struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_sb_info *sbi; + unsigned long ino_next; + struct ext3_iloc iloc; + int err = 0; + + lock_super(inode->i_sb); +- if (list_empty(&inode->u.ext3_i.i_orphan)) { ++ if (list_empty(&ei->i_orphan)) { + unlock_super(inode->i_sb); + return 0; + } + + ino_next = NEXT_ORPHAN(inode); +- prev = inode->u.ext3_i.i_orphan.prev; ++ prev = ei->i_orphan.prev; + sbi = EXT3_SB(inode->i_sb); + + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + +- list_del(&inode->u.ext3_i.i_orphan); +- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); ++ list_del(&ei->i_orphan); ++ INIT_LIST_HEAD(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on +@@ -795,8 +1879,9 @@ static int ext3_rmdir (struct inode * di + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + retval = -ENOENT; + bh = ext3_find_entry (dentry, &de); +@@ -834,7 +1919,7 
@@ static int ext3_rmdir (struct inode * di + dir->i_nlink--; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + + end_rmdir: +@@ -852,8 +1937,9 @@ static int ext3_unlink(struct inode * di + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -880,7 +1966,7 @@ static int ext3_unlink(struct inode * di + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + inode->i_nlink--; + if (!inode->i_nlink) +@@ -906,9 +1992,11 @@ static int ext3_symlink (struct inode * + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -918,7 +2006,7 @@ static int ext3_symlink (struct inode * + if (IS_ERR(inode)) + goto out_stop; + +- if (l > sizeof (inode->u.ext3_i.i_data)) { ++ if (l > sizeof (EXT3_I(inode)->i_data)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + /* +@@ -927,24 +2015,23 @@ static int ext3_symlink (struct inode * + * i_size in generic_commit_write(). 
+ */ + err = block_symlink(inode, symname, l); +- if (err) +- goto out_no_entry; ++ if (err) { ++ ext3_dec_count(handle, inode); ++ ext3_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto out_stop; ++ } + } else { + inode->i_op = &ext3_fast_symlink_inode_operations; +- memcpy((char*)&inode->u.ext3_i.i_data,symname,l); ++ memcpy((char*)&EXT3_I(inode)->i_data,symname,l); + inode->i_size = l-1; + } +- inode->u.ext3_i.i_disksize = inode->i_size; ++ EXT3_I(inode)->i_disksize = inode->i_size; + err = ext3_add_nondir(handle, dentry, inode); ++ ext3_mark_inode_dirty(handle, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; +- +-out_no_entry: +- ext3_dec_count(handle, inode); +- ext3_mark_inode_dirty(handle, inode); +- iput (inode); +- goto out_stop; + } + + static int ext3_link (struct dentry * old_dentry, +@@ -957,12 +2044,15 @@ static int ext3_link (struct dentry * ol + if (S_ISDIR(inode->i_mode)) + return -EPERM; + +- if (inode->i_nlink >= EXT3_LINK_MAX) ++ if (inode->i_nlink >= EXT3_LINK_MAX) { + return -EMLINK; ++ } + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -995,9 +2085,11 @@ static int ext3_rename (struct inode * o + + old_bh = new_bh = dir_bh = NULL; + +- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) + handle->h_sync = 1; +@@ -1077,7 +2169,7 @@ static int ext3_rename (struct inode * o + new_inode->i_ctime = CURRENT_TIME; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; +- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ 
ext3_update_dx_flag(old_dir); + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); + ext3_journal_get_write_access(handle, dir_bh); +@@ -1089,7 +2181,7 @@ static int ext3_rename (struct inode * o + new_inode->i_nlink--; + } else { + new_dir->i_nlink++; +- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(new_dir); + ext3_mark_inode_dirty(handle, new_dir); + } + } +--- linux-chaos-2.4.20-6/fs/ext3/super.c~ext-2.4-patch-1-chaos 2003-04-09 16:10:38.000000000 -0600 ++++ linux-chaos-2.4.20-6-braam/fs/ext3/super.c 2003-04-09 16:18:55.000000000 -0600 +@@ -710,6 +710,7 @@ static int ext3_setup_super(struct super + es->s_mtime = cpu_to_le32(CURRENT_TIME); + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ++ + ext3_commit_super (sb, es, 1); + if (test_opt (sb, DEBUG)) + printk (KERN_INFO +@@ -720,6 +721,7 @@ static int ext3_setup_super(struct super + EXT3_BLOCKS_PER_GROUP(sb), + EXT3_INODES_PER_GROUP(sb), + sbi->s_mount_opt); ++ + printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", + bdevname(sb->s_dev)); + if (EXT3_SB(sb)->s_journal->j_inode == NULL) { +@@ -893,6 +895,7 @@ static loff_t ext3_max_size(int bits) + return res; + } + ++ + struct super_block * ext3_read_super (struct super_block * sb, void * data, + int silent) + { +@@ -1069,6 +1072,9 @@ struct super_block * ext3_read_super (st + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); ++ for (i=0; i < 4; i++) ++ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); ++ sbi->s_def_hash_version = es->s_def_hash_version; + + if (sbi->s_blocks_per_group > blocksize * 8) { + printk (KERN_ERR +@@ -1770,6 +1776,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + } + ++EXPORT_SYMBOL(ext3_force_commit); + EXPORT_SYMBOL(ext3_bread); + + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew 
Morton, Andreas Dilger, Theodore Ts'o and others"); +--- linux-chaos-2.4.20-6/include/linux/ext3_fs.h~ext-2.4-patch-1-chaos 2003-03-12 12:51:27.000000000 -0700 ++++ linux-chaos-2.4.20-6-braam/include/linux/ext3_fs.h 2003-04-09 16:18:55.000000000 -0600 +@@ -40,6 +40,11 @@ + #define EXT3FS_VERSION "2.4-0.9.19" + + /* ++ * Always enable hashed directories ++ */ ++#define CONFIG_EXT3_INDEX ++ ++/* + * Debug code + */ + #ifdef EXT3FS_DEBUG +@@ -437,8 +442,11 @@ struct ext3_super_block { + /*E0*/ __u32 s_journal_inum; /* inode number of journal file */ + __u32 s_journal_dev; /* device number of journal file */ + __u32 s_last_orphan; /* start of list of inodes to delete */ +- +-/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ ++ __u32 s_hash_seed[4]; /* HTREE hash seed */ ++ __u8 s_def_hash_version; /* Default hash version to use */ ++ __u8 s_reserved_char_pad; ++ __u16 s_reserved_word_pad; ++ __u32 s_reserved[192]; /* Padding to the end of the block */ + }; + + #ifdef __KERNEL__ +@@ -575,9 +583,46 @@ struct ext3_dir_entry_2 { + #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) + #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) ++/* ++ * Hash Tree Directory indexing ++ * (c) Daniel Phillips, 2001 ++ */ ++ ++#ifdef CONFIG_EXT3_INDEX ++ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ ++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ ++ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) ++#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) ++#else ++ #define is_dx(dir) 0 ++#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) ++#endif ++ ++/* Legal values for the dx_root hash_version field: */ ++ ++#define DX_HASH_LEGACY 0 ++#define DX_HASH_HALF_MD4 1 ++#define DX_HASH_TEA 2 ++ ++/* hash info structure used by the directory hash */ ++struct dx_hash_info ++{ ++ 
u32 hash; ++ u32 minor_hash; ++ int hash_version; ++ u32 *seed; ++}; + + #ifdef __KERNEL__ + /* ++ * Control parameters used by ext3_htree_next_block ++ */ ++#define HASH_NB_ALWAYS 1 ++ ++ ++/* + * Describe an inode's exact location on disk and in memory + */ + struct ext3_iloc +@@ -587,6 +632,27 @@ struct ext3_iloc + unsigned long block_group; + }; + ++ ++/* ++ * This structure is stuffed into the struct file's private_data field ++ * for directories. It is where we put information so that we can do ++ * readdir operations in hash tree order. ++ */ ++struct dir_private_info { ++ rb_root_t root; ++ rb_node_t *curr_node; ++ struct fname *extra_fname; ++ loff_t last_pos; ++ __u32 curr_hash; ++ __u32 curr_minor_hash; ++ __u32 next_hash; ++}; ++ ++/* ++ * Special error return code only used by dx_probe() and its callers. ++ */ ++#define ERR_BAD_DX_DIR -75000 ++ + /* + * Function prototypes + */ +@@ -614,11 +680,20 @@ extern struct ext3_group_desc * ext3_get + + /* dir.c */ + extern int ext3_check_dir_entry(const char *, struct inode *, +- struct ext3_dir_entry_2 *, struct buffer_head *, +- unsigned long); ++ struct ext3_dir_entry_2 *, ++ struct buffer_head *, unsigned long); ++extern void ext3_htree_store_dirent(struct file *dir_file, __u32 hash, ++ __u32 minor_hash, ++ struct ext3_dir_entry_2 *dirent); ++extern void ext3_htree_free_dir_info(struct dir_private_info *p); ++ + /* fsync.c */ + extern int ext3_sync_file (struct file *, struct dentry *, int); + ++/* hash.c */ ++extern int ext3fs_dirhash(const char *name, int len, struct ++ dx_hash_info *hinfo); ++ + /* ialloc.c */ + extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int); + extern void ext3_free_inode (handle_t *, struct inode *); +@@ -650,6 +725,8 @@ extern int ext3_ioctl (struct inode *, s + /* namei.c */ + extern int ext3_orphan_add(handle_t *, struct inode *); + extern int ext3_orphan_del(handle_t *, struct inode *); ++extern int ext3_htree_fill_tree(struct file *dir_file, __u32 
start_hash, ++ __u32 start_minor_hash, __u32 *next_hash); + + /* super.c */ + extern void ext3_error (struct super_block *, const char *, const char *, ...) +--- linux-chaos-2.4.20-6/include/linux/ext3_fs_sb.h~ext-2.4-patch-1-chaos 2003-03-12 12:51:27.000000000 -0700 ++++ linux-chaos-2.4.20-6-braam/include/linux/ext3_fs_sb.h 2003-04-09 16:18:55.000000000 -0600 +@@ -62,6 +62,8 @@ struct ext3_sb_info { + int s_inode_size; + int s_first_ino; + u32 s_next_generation; ++ u32 s_hash_seed[4]; ++ int s_def_hash_version; + + /* Journaling */ + struct inode * s_journal_inode; +--- linux-chaos-2.4.20-6/include/linux/ext3_jbd.h~ext-2.4-patch-1-chaos 2003-03-12 12:51:27.000000000 -0700 ++++ linux-chaos-2.4.20-6-braam/include/linux/ext3_jbd.h 2003-04-09 16:18:55.000000000 -0600 +@@ -63,6 +63,8 @@ extern int ext3_writepage_trans_blocks(s + + #define EXT3_RESERVE_TRANS_BLOCKS 12U + ++#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 ++ + int + ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, +--- linux-chaos-2.4.20-6/include/linux/rbtree.h~ext-2.4-patch-1-chaos 2002-05-07 15:53:47.000000000 -0600 ++++ linux-chaos-2.4.20-6-braam/include/linux/rbtree.h 2003-04-09 16:18:55.000000000 -0600 +@@ -120,6 +120,8 @@ rb_root_t; + + extern void rb_insert_color(rb_node_t *, rb_root_t *); + extern void rb_erase(rb_node_t *, rb_root_t *); ++extern rb_node_t *rb_get_first(rb_root_t *root); ++extern rb_node_t *rb_get_next(rb_node_t *n); + + static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link) + { +--- linux-chaos-2.4.20-6/lib/rbtree.c~ext-2.4-patch-1-chaos 2002-09-25 11:14:03.000000000 -0600 ++++ linux-chaos-2.4.20-6-braam/lib/rbtree.c 2003-04-09 16:18:55.000000000 -0600 +@@ -17,6 +17,8 @@ + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/lib/rbtree.c ++ ++ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002 + */ + + #include <linux/rbtree.h> +@@ -294,3 +296,43 @@ void rb_erase(rb_node_t * node, rb_root_ +
__rb_erase_color(child, parent, root); + } + EXPORT_SYMBOL(rb_erase); ++ ++/* ++ * This function returns the first node (in sort order) of the tree. ++ */ ++rb_node_t *rb_get_first(rb_root_t *root) ++{ ++ rb_node_t *n; ++ ++ n = root->rb_node; ++ if (!n) ++ return 0; ++ while (n->rb_left) ++ n = n->rb_left; ++ return n; ++} ++EXPORT_SYMBOL(rb_get_first); ++ ++/* ++ * Given a node, this function will return the next node in the tree. ++ */ ++rb_node_t *rb_get_next(rb_node_t *n) ++{ ++ rb_node_t *parent; ++ ++ if (n->rb_right) { ++ n = n->rb_right; ++ while (n->rb_left) ++ n = n->rb_left; ++ return n; ++ } else { ++ while ((parent = n->rb_parent)) { ++ if (n == parent->rb_left) ++ return parent; ++ n = parent; ++ } ++ return 0; ++ } ++} ++EXPORT_SYMBOL(rb_get_next); ++ + +_ diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-1.patch b/lustre/kernel_patches/patches/ext-2.4-patch-1.patch new file mode 100644 index 0000000..09caec1 --- /dev/null +++ b/lustre/kernel_patches/patches/ext-2.4-patch-1.patch @@ -0,0 +1,2527 @@ + fs/ext3/Makefile | 2 + fs/ext3/dir.c | 299 +++++++++ + fs/ext3/file.c | 3 + fs/ext3/hash.c | 215 ++++++ + fs/ext3/namei.c | 1387 ++++++++++++++++++++++++++++++++++++++++----- + fs/ext3/super.c | 7 + include/linux/ext3_fs.h | 85 ++ + include/linux/ext3_fs_sb.h | 2 + include/linux/ext3_jbd.h | 2 + include/linux/rbtree.h | 2 + lib/rbtree.c | 42 + + 11 files changed, 1886 insertions(+), 160 deletions(-) + +--- linux-2.4.20/fs/ext3/Makefile~ext-2.4-patch-1 Sat Apr 5 03:56:31 2003 ++++ linux-2.4.20-braam/fs/ext3/Makefile Sat Apr 5 03:57:05 2003 +@@ -12,7 +12,7 @@ O_TARGET := ext3.o + export-objs := super.o inode.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o ++ ioctl.o namei.o super.o symlink.o hash.o + obj-m := $(O_TARGET) + + include $(TOPDIR)/Rules.make +--- linux-2.4.20/fs/ext3/dir.c~ext-2.4-patch-1 Sat Apr 5 03:56:31 2003 ++++ linux-2.4.20-braam/fs/ext3/dir.c Sat Apr 5 03:56:31 2003 
+@@ -21,12 +21,16 @@ + #include + #include + #include ++#include ++#include + + static unsigned char ext3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK + }; + + static int ext3_readdir(struct file *, void *, filldir_t); ++static int ext3_dx_readdir(struct file * filp, ++ void * dirent, filldir_t filldir); + + struct file_operations ext3_dir_operations = { + read: generic_read_dir, +@@ -35,6 +39,17 @@ struct file_operations ext3_dir_operatio + fsync: ext3_sync_file, /* BKL held */ + }; + ++ ++static unsigned char get_dtype(struct super_block *sb, int filetype) ++{ ++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) || ++ (filetype >= EXT3_FT_MAX)) ++ return DT_UNKNOWN; ++ ++ return (ext3_filetype_table[filetype]); ++} ++ ++ + int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, +@@ -79,6 +94,16 @@ static int ext3_readdir(struct file * fi + + sb = inode->i_sb; + ++ if (is_dx(inode)) { ++ err = ext3_dx_readdir(filp, dirent, filldir); ++ if (err != ERR_BAD_DX_DIR) ++ return err; ++ /* ++ * We don't set the inode dirty flag since it's not ++ * critical that it get flushed back to the disk. ++ */ ++ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL; ++ } + stored = 0; + bh = NULL; + offset = filp->f_pos & (sb->s_blocksize - 1); +@@ -162,18 +187,12 @@ revalidate: + * during the copy operation. 
+ */ + unsigned long version = filp->f_version; +- unsigned char d_type = DT_UNKNOWN; + +- if (EXT3_HAS_INCOMPAT_FEATURE(sb, +- EXT3_FEATURE_INCOMPAT_FILETYPE) +- && de->file_type < EXT3_FT_MAX) +- d_type = +- ext3_filetype_table[de->file_type]; + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), +- d_type); ++ get_dtype(sb, de->file_type)); + if (error) + break; + if (version != filp->f_version) +@@ -188,3 +207,269 @@ revalidate: + UPDATE_ATIME(inode); + return 0; + } ++ ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * These functions convert from the major/minor hash to an f_pos ++ * value. ++ * ++ * Currently we only use major hash number. This is unfortunate, but ++ * on 32-bit machines, the same VFS interface is used for lseek and ++ * llseek, so if we use the 64 bit offset, then the 32-bit versions of ++ * lseek/telldir/seekdir will blow out spectacularly, and from within ++ * the ext2 low-level routine, we don't know if we're being called by ++ * a 64-bit version of the system call or the 32-bit version of the ++ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir ++ * cookie. Sigh. ++ */ ++#define hash2pos(major, minor) (major >> 1) ++#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) ++#define pos2min_hash(pos) (0) ++ ++/* ++ * This structure holds the nodes of the red-black tree used to store ++ * the directory entry in hash order. ++ */ ++struct fname { ++ __u32 hash; ++ __u32 minor_hash; ++ rb_node_t rb_hash; ++ struct fname *next; ++ __u32 inode; ++ __u8 name_len; ++ __u8 file_type; ++ char name[0]; ++}; ++ ++/* ++ * This function implements a non-recursive way of freeing all of the ++ * nodes in the red-black tree. 
++ */ ++static void free_rb_tree_fname(rb_root_t *root) ++{ ++ rb_node_t *n = root->rb_node; ++ rb_node_t *parent; ++ struct fname *fname; ++ ++ while (n) { ++ /* Do the node's children first */ ++ if ((n)->rb_left) { ++ n = n->rb_left; ++ continue; ++ } ++ if (n->rb_right) { ++ n = n->rb_right; ++ continue; ++ } ++ /* ++ * The node has no children; free it, and then zero ++ * out parent's link to it. Finally go to the ++ * beginning of the loop and try to free the parent ++ * node. ++ */ ++ parent = n->rb_parent; ++ fname = rb_entry(n, struct fname, rb_hash); ++ kfree(fname); ++ if (!parent) ++ root->rb_node = 0; ++ else if (parent->rb_left == n) ++ parent->rb_left = 0; ++ else if (parent->rb_right == n) ++ parent->rb_right = 0; ++ n = parent; ++ } ++ root->rb_node = 0; ++} ++ ++ ++struct dir_private_info *create_dir_info(loff_t pos) ++{ ++ struct dir_private_info *p; ++ ++ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); ++ if (!p) ++ return NULL; ++ p->root.rb_node = 0; ++ p->curr_node = 0; ++ p->extra_fname = 0; ++ p->last_pos = 0; ++ p->curr_hash = pos2maj_hash(pos); ++ p->curr_minor_hash = pos2min_hash(pos); ++ p->next_hash = 0; ++ return p; ++} ++ ++void ext3_htree_free_dir_info(struct dir_private_info *p) ++{ ++ free_rb_tree_fname(&p->root); ++ kfree(p); ++} ++ ++/* ++ * Given a directory entry, enter it into the fname rb tree. 
++ */ ++void ext3_htree_store_dirent(struct file *dir_file, __u32 hash, ++ __u32 minor_hash, ++ struct ext3_dir_entry_2 *dirent) ++{ ++ rb_node_t **p, *parent = NULL; ++ struct fname * fname, *new_fn; ++ struct dir_private_info *info; ++ int len; ++ ++ info = (struct dir_private_info *) dir_file->private_data; ++ p = &info->root.rb_node; ++ ++ /* Create and allocate the fname structure */ ++ len = sizeof(struct fname) + dirent->name_len + 1; ++ new_fn = kmalloc(len, GFP_KERNEL); ++ memset(new_fn, 0, len); ++ new_fn->hash = hash; ++ new_fn->minor_hash = minor_hash; ++ new_fn->inode = le32_to_cpu(dirent->inode); ++ new_fn->name_len = dirent->name_len; ++ new_fn->file_type = dirent->file_type; ++ memcpy(new_fn->name, dirent->name, dirent->name_len); ++ new_fn->name[dirent->name_len] = 0; ++ ++ while (*p) { ++ parent = *p; ++ fname = rb_entry(parent, struct fname, rb_hash); ++ ++ /* ++ * If the hash and minor hash match up, then we put ++ * them on a linked list. This rarely happens... ++ */ ++ if ((new_fn->hash == fname->hash) && ++ (new_fn->minor_hash == fname->minor_hash)) { ++ new_fn->next = fname->next; ++ fname->next = new_fn; ++ return; ++ } ++ ++ if (new_fn->hash < fname->hash) ++ p = &(*p)->rb_left; ++ else if (new_fn->hash > fname->hash) ++ p = &(*p)->rb_right; ++ else if (new_fn->minor_hash < fname->minor_hash) ++ p = &(*p)->rb_left; ++ else /* if (new_fn->minor_hash > fname->minor_hash) */ ++ p = &(*p)->rb_right; ++ } ++ ++ rb_link_node(&new_fn->rb_hash, parent, p); ++ rb_insert_color(&new_fn->rb_hash, &info->root); ++} ++ ++ ++ ++/* ++ * This is a helper function for ext3_dx_readdir. It calls filldir ++ * for all entries on the fname linked list. (Normally there is only ++ * one entry on the linked list, unless there are 62 bit hash collisions.) 
++ */ ++static int call_filldir(struct file * filp, void * dirent, ++ filldir_t filldir, struct fname *fname) ++{ ++ struct dir_private_info *info = filp->private_data; ++ loff_t curr_pos; ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct super_block * sb; ++ int error; ++ ++ sb = inode->i_sb; ++ ++ if (!fname) { ++ printk("call_filldir: called with null fname?!?\n"); ++ return 0; ++ } ++ curr_pos = hash2pos(fname->hash, fname->minor_hash); ++ while (fname) { ++ error = filldir(dirent, fname->name, ++ fname->name_len, curr_pos, ++ fname->inode, ++ get_dtype(sb, fname->file_type)); ++ if (error) { ++ filp->f_pos = curr_pos; ++ info->extra_fname = fname->next; ++ return error; ++ } ++ fname = fname->next; ++ } ++ return 0; ++} ++ ++static int ext3_dx_readdir(struct file * filp, ++ void * dirent, filldir_t filldir) ++{ ++ struct dir_private_info *info = filp->private_data; ++ struct inode *inode = filp->f_dentry->d_inode; ++ struct fname *fname; ++ int ret; ++ ++ if (!info) { ++ info = create_dir_info(filp->f_pos); ++ if (!info) ++ return -ENOMEM; ++ filp->private_data = info; ++ } ++ ++ /* Some one has messed with f_pos; reset the world */ ++ if (info->last_pos != filp->f_pos) { ++ free_rb_tree_fname(&info->root); ++ info->curr_node = 0; ++ info->extra_fname = 0; ++ info->curr_hash = pos2maj_hash(filp->f_pos); ++ info->curr_minor_hash = pos2min_hash(filp->f_pos); ++ } ++ ++ /* ++ * If there are any leftover names on the hash collision ++ * chain, return them first. ++ */ ++ if (info->extra_fname && ++ call_filldir(filp, dirent, filldir, info->extra_fname)) ++ goto finished; ++ ++ if (!info->curr_node) ++ info->curr_node = rb_get_first(&info->root); ++ ++ while (1) { ++ /* ++ * Fill the rbtree if we have no more entries, ++ * or the inode has changed since we last read in the ++ * cached entries. 
++ */ ++ if ((!info->curr_node) || ++ (filp->f_version != inode->i_version)) { ++ info->curr_node = 0; ++ free_rb_tree_fname(&info->root); ++ filp->f_version = inode->i_version; ++ ret = ext3_htree_fill_tree(filp, info->curr_hash, ++ info->curr_minor_hash, ++ &info->next_hash); ++ if (ret < 0) ++ return ret; ++ if (ret == 0) ++ break; ++ info->curr_node = rb_get_first(&info->root); ++ } ++ ++ fname = rb_entry(info->curr_node, struct fname, rb_hash); ++ info->curr_hash = fname->hash; ++ info->curr_minor_hash = fname->minor_hash; ++ if (call_filldir(filp, dirent, filldir, fname)) ++ break; ++ ++ info->curr_node = rb_get_next(info->curr_node); ++ if (!info->curr_node) { ++ info->curr_hash = info->next_hash; ++ info->curr_minor_hash = 0; ++ } ++ } ++finished: ++ info->last_pos = filp->f_pos; ++ UPDATE_ATIME(inode); ++ return 0; ++} ++#endif +--- linux-2.4.20/fs/ext3/file.c~ext-2.4-patch-1 Sat Apr 5 03:56:31 2003 ++++ linux-2.4.20-braam/fs/ext3/file.c Sat Apr 5 03:56:31 2003 +@@ -35,6 +35,9 @@ static int ext3_release_file (struct ino + { + if (filp->f_mode & FMODE_WRITE) + ext3_discard_prealloc (inode); ++ if (is_dx(inode) && filp->private_data) ++ ext3_htree_free_dir_info(filp->private_data); ++ + return 0; + } + +--- /dev/null Fri Aug 30 17:31:37 2002 ++++ linux-2.4.20-braam/fs/ext3/hash.c Sat Apr 5 03:56:31 2003 +@@ -0,0 +1,215 @@ ++/* ++ * linux/fs/ext3/hash.c ++ * ++ * Copyright (C) 2002 by Theodore Ts'o ++ * ++ * This file is released under the GPL v2. ++ * ++ * This file may be redistributed under the terms of the GNU Public ++ * License. 
++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define DELTA 0x9E3779B9 ++ ++static void TEA_transform(__u32 buf[4], __u32 const in[]) ++{ ++ __u32 sum = 0; ++ __u32 b0 = buf[0], b1 = buf[1]; ++ __u32 a = in[0], b = in[1], c = in[2], d = in[3]; ++ int n = 16; ++ ++ do { ++ sum += DELTA; ++ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); ++ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); ++ } while(--n); ++ ++ buf[0] += b0; ++ buf[1] += b1; ++} ++ ++/* F, G and H are basic MD4 functions: selection, majority, parity */ ++#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) ++#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z))) ++#define H(x, y, z) ((x) ^ (y) ^ (z)) ++ ++/* ++ * The generic round function. The application is so specific that ++ * we don't bother protecting all the arguments with parens, as is generally ++ * good macro practice, in favor of extra legibility. ++ * Rotation is separate from addition to prevent recomputation ++ */ ++#define ROUND(f, a, b, c, d, x, s) \ ++ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s))) ++#define K1 0 ++#define K2 013240474631UL ++#define K3 015666365641UL ++ ++/* ++ * Basic cut-down MD4 transform. Returns only 32 bits of result. 
++ */ ++static void halfMD4Transform (__u32 buf[4], __u32 const in[]) ++{ ++ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3]; ++ ++ /* Round 1 */ ++ ROUND(F, a, b, c, d, in[0] + K1, 3); ++ ROUND(F, d, a, b, c, in[1] + K1, 7); ++ ROUND(F, c, d, a, b, in[2] + K1, 11); ++ ROUND(F, b, c, d, a, in[3] + K1, 19); ++ ROUND(F, a, b, c, d, in[4] + K1, 3); ++ ROUND(F, d, a, b, c, in[5] + K1, 7); ++ ROUND(F, c, d, a, b, in[6] + K1, 11); ++ ROUND(F, b, c, d, a, in[7] + K1, 19); ++ ++ /* Round 2 */ ++ ROUND(G, a, b, c, d, in[1] + K2, 3); ++ ROUND(G, d, a, b, c, in[3] + K2, 5); ++ ROUND(G, c, d, a, b, in[5] + K2, 9); ++ ROUND(G, b, c, d, a, in[7] + K2, 13); ++ ROUND(G, a, b, c, d, in[0] + K2, 3); ++ ROUND(G, d, a, b, c, in[2] + K2, 5); ++ ROUND(G, c, d, a, b, in[4] + K2, 9); ++ ROUND(G, b, c, d, a, in[6] + K2, 13); ++ ++ /* Round 3 */ ++ ROUND(H, a, b, c, d, in[3] + K3, 3); ++ ROUND(H, d, a, b, c, in[7] + K3, 9); ++ ROUND(H, c, d, a, b, in[2] + K3, 11); ++ ROUND(H, b, c, d, a, in[6] + K3, 15); ++ ROUND(H, a, b, c, d, in[1] + K3, 3); ++ ROUND(H, d, a, b, c, in[5] + K3, 9); ++ ROUND(H, c, d, a, b, in[0] + K3, 11); ++ ROUND(H, b, c, d, a, in[4] + K3, 15); ++ ++ buf[0] += a; ++ buf[1] += b; ++ buf[2] += c; ++ buf[3] += d; ++} ++ ++#undef ROUND ++#undef F ++#undef G ++#undef H ++#undef K1 ++#undef K2 ++#undef K3 ++ ++/* The old legacy hash */ ++static __u32 dx_hack_hash (const char *name, int len) ++{ ++ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; ++ while (len--) { ++ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); ++ ++ if (hash & 0x80000000) hash -= 0x7fffffff; ++ hash1 = hash0; ++ hash0 = hash; ++ } ++ return (hash0 << 1); ++} ++ ++static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) ++{ ++ __u32 pad, val; ++ int i; ++ ++ pad = (__u32)len | ((__u32)len << 8); ++ pad |= pad << 16; ++ ++ val = pad; ++ if (len > num*4) ++ len = num * 4; ++ for (i=0; i < len; i++) { ++ if ((i % 4) == 0) ++ val = pad; ++ val = msg[i] + (val << 8); ++ if ((i % 4) == 3) { ++ 
*buf++ = val; ++ val = pad; ++ num--; ++ } ++ } ++ if (--num >= 0) ++ *buf++ = val; ++ while (--num >= 0) ++ *buf++ = pad; ++} ++ ++/* ++ * Returns the hash of a filename. If len is 0 and name is NULL, then ++ * this function can be used to test whether or not a hash version is ++ * supported. ++ * ++ * The seed is an 4 longword (32 bits) "secret" which can be used to ++ * uniquify a hash. If the seed is all zero's, then some default seed ++ * may be used. ++ * ++ * A particular hash version specifies whether or not the seed is ++ * represented, and whether or not the returned hash is 32 bits or 64 ++ * bits. 32 bit hashes will return 0 for the minor hash. ++ */ ++int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) ++{ ++ __u32 hash; ++ __u32 minor_hash = 0; ++ const char *p; ++ int i; ++ __u32 in[8], buf[4]; ++ ++ /* Initialize the default seed for the hash checksum functions */ ++ buf[0] = 0x67452301; ++ buf[1] = 0xefcdab89; ++ buf[2] = 0x98badcfe; ++ buf[3] = 0x10325476; ++ ++ /* Check to see if the seed is all zero's */ ++ if (hinfo->seed) { ++ for (i=0; i < 4; i++) { ++ if (hinfo->seed[i]) ++ break; ++ } ++ if (i < 4) ++ memcpy(buf, hinfo->seed, sizeof(buf)); ++ } ++ ++ switch (hinfo->hash_version) { ++ case DX_HASH_LEGACY: ++ hash = dx_hack_hash(name, len); ++ break; ++ case DX_HASH_HALF_MD4: ++ p = name; ++ while (len > 0) { ++ str2hashbuf(p, len, in, 8); ++ halfMD4Transform(buf, in); ++ len -= 32; ++ p += 32; ++ } ++ minor_hash = buf[2]; ++ hash = buf[1]; ++ break; ++ case DX_HASH_TEA: ++ p = name; ++ while (len > 0) { ++ str2hashbuf(p, len, in, 4); ++ TEA_transform(buf, in); ++ len -= 16; ++ p += 16; ++ } ++ hash = buf[0]; ++ minor_hash = buf[1]; ++ break; ++ default: ++ hinfo->hash = 0; ++ return -1; ++ } ++ hinfo->hash = hash & ~1; ++ hinfo->minor_hash = minor_hash; ++ return 0; ++} +--- linux-2.4.20/fs/ext3/namei.c~ext-2.4-patch-1 Sat Apr 5 03:56:31 2003 ++++ linux-2.4.20-braam/fs/ext3/namei.c Sat Apr 5 03:56:31 2003 +@@ -16,6 
+16,12 @@ + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 ++ * Hash Tree Directory indexing (c) ++ * Daniel Phillips, 2001 ++ * Hash Tree Directory indexing porting ++ * Christopher Li, 2002 ++ * Hash Tree Directory indexing cleanup ++ * Theodore Ts'o, 2002 + */ + + #include +@@ -38,6 +44,630 @@ + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + ++static struct buffer_head *ext3_append(handle_t *handle, ++ struct inode *inode, ++ u32 *block, int *err) ++{ ++ struct buffer_head *bh; ++ ++ *block = inode->i_size >> inode->i_sb->s_blocksize_bits; ++ ++ if ((bh = ext3_bread(handle, inode, *block, 1, err))) { ++ inode->i_size += inode->i_sb->s_blocksize; ++ EXT3_I(inode)->i_disksize = inode->i_size; ++ ext3_journal_get_write_access(handle,bh); ++ } ++ return bh; ++} ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#ifndef swap ++#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) ++#endif ++ ++typedef struct { u32 v; } le_u32; ++typedef struct { u16 v; } le_u16; ++ ++#ifdef DX_DEBUG ++#define dxtrace(command) command ++#else ++#define dxtrace(command) ++#endif ++ ++struct fake_dirent ++{ ++ /*le*/u32 inode; ++ /*le*/u16 rec_len; ++ u8 name_len; ++ u8 file_type; ++}; ++ ++struct dx_countlimit ++{ ++ le_u16 limit; ++ le_u16 count; ++}; ++ ++struct dx_entry ++{ ++ le_u32 hash; ++ le_u32 block; ++}; ++ ++/* ++ * dx_root_info is laid out so that if it should somehow get overlaid by a ++ * dirent the two low bits of the hash version will be zero. Therefore, the ++ * hash version mod 4 should never be 0. Sincerely, the paranoia department. 
++ */ ++ ++struct dx_root ++{ ++ struct fake_dirent dot; ++ char dot_name[4]; ++ struct fake_dirent dotdot; ++ char dotdot_name[4]; ++ struct dx_root_info ++ { ++ le_u32 reserved_zero; ++ u8 hash_version; ++ u8 info_length; /* 8 */ ++ u8 indirect_levels; ++ u8 unused_flags; ++ } ++ info; ++ struct dx_entry entries[0]; ++}; ++ ++struct dx_node ++{ ++ struct fake_dirent fake; ++ struct dx_entry entries[0]; ++}; ++ ++ ++struct dx_frame ++{ ++ struct buffer_head *bh; ++ struct dx_entry *entries; ++ struct dx_entry *at; ++}; ++ ++struct dx_map_entry ++{ ++ u32 hash; ++ u32 offs; ++}; ++ ++#ifdef CONFIG_EXT3_INDEX ++static inline unsigned dx_get_block (struct dx_entry *entry); ++static void dx_set_block (struct dx_entry *entry, unsigned value); ++static inline unsigned dx_get_hash (struct dx_entry *entry); ++static void dx_set_hash (struct dx_entry *entry, unsigned value); ++static unsigned dx_get_count (struct dx_entry *entries); ++static unsigned dx_get_limit (struct dx_entry *entries); ++static void dx_set_count (struct dx_entry *entries, unsigned value); ++static void dx_set_limit (struct dx_entry *entries, unsigned value); ++static unsigned dx_root_limit (struct inode *dir, unsigned infosize); ++static unsigned dx_node_limit (struct inode *dir); ++static struct dx_frame *dx_probe(struct dentry *dentry, ++ struct inode *dir, ++ struct dx_hash_info *hinfo, ++ struct dx_frame *frame, ++ int *err); ++static void dx_release (struct dx_frame *frames); ++static int dx_make_map (struct ext3_dir_entry_2 *de, int size, ++ struct dx_hash_info *hinfo, struct dx_map_entry map[]); ++static void dx_sort_map(struct dx_map_entry *map, unsigned count); ++static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to, ++ struct dx_map_entry *offsets, int count); ++static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size); ++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block); ++static int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ 
struct dx_frame *frame, ++ struct dx_frame *frames, int *err, ++ __u32 *start_hash); ++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, ++ struct ext3_dir_entry_2 **res_dir, int *err); ++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode); ++ ++/* ++ * Future: use high four bits of block for coalesce-on-delete flags ++ * Mask them off for now. ++ */ ++ ++static inline unsigned dx_get_block (struct dx_entry *entry) ++{ ++ return le32_to_cpu(entry->block.v) & 0x00ffffff; ++} ++ ++static inline void dx_set_block (struct dx_entry *entry, unsigned value) ++{ ++ entry->block.v = cpu_to_le32(value); ++} ++ ++static inline unsigned dx_get_hash (struct dx_entry *entry) ++{ ++ return le32_to_cpu(entry->hash.v); ++} ++ ++static inline void dx_set_hash (struct dx_entry *entry, unsigned value) ++{ ++ entry->hash.v = cpu_to_le32(value); ++} ++ ++static inline unsigned dx_get_count (struct dx_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v); ++} ++ ++static inline unsigned dx_get_limit (struct dx_entry *entries) ++{ ++ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v); ++} ++ ++static inline void dx_set_count (struct dx_entry *entries, unsigned value) ++{ ++ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value); ++} ++ ++static inline void dx_set_limit (struct dx_entry *entries, unsigned value) ++{ ++ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value); ++} ++ ++static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) ++{ ++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) - ++ EXT3_DIR_REC_LEN(2) - infosize; ++ return 0? 20: entry_space / sizeof(struct dx_entry); ++} ++ ++static inline unsigned dx_node_limit (struct inode *dir) ++{ ++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0); ++ return 0? 
22: entry_space / sizeof(struct dx_entry); ++} ++ ++/* ++ * Debug ++ */ ++#ifdef DX_DEBUG ++struct stats ++{ ++ unsigned names; ++ unsigned space; ++ unsigned bcount; ++}; ++ ++static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de, ++ int size, int show_names) ++{ ++ unsigned names = 0, space = 0; ++ char *base = (char *) de; ++ struct dx_hash_info h = *hinfo; ++ ++ printk("names: "); ++ while ((char *) de < base + size) ++ { ++ if (de->inode) ++ { ++ if (show_names) ++ { ++ int len = de->name_len; ++ char *name = de->name; ++ while (len--) printk("%c", *name++); ++ ext3fs_dirhash(de->name, de->name_len, &h); ++ printk(":%x.%u ", h.hash, ++ ((char *) de - base)); ++ } ++ space += EXT3_DIR_REC_LEN(de->name_len); ++ names++; ++ } ++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ } ++ printk("(%i)\n", names); ++ return (struct stats) { names, space, 1 }; ++} ++ ++struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, ++ struct dx_entry *entries, int levels) ++{ ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count = dx_get_count (entries), names = 0, space = 0, i; ++ unsigned bcount = 0; ++ struct buffer_head *bh; ++ int err; ++ printk("%i indexed blocks...\n", count); ++ for (i = 0; i < count; i++, entries++) ++ { ++ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0; ++ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; ++ struct stats stats; ++ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); ++ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue; ++ stats = levels? 
++ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): ++ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0); ++ names += stats.names; ++ space += stats.space; ++ bcount += stats.bcount; ++ brelse (bh); ++ } ++ if (bcount) ++ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", ++ names, space/bcount,(space/bcount)*100/blocksize); ++ return (struct stats) { names, space, bcount}; ++} ++#endif /* DX_DEBUG */ ++ ++/* ++ * Probe for a directory leaf block to search. ++ * ++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format ++ * error in the directory index, and the caller should fall back to ++ * searching the directory normally. The callers of dx_probe **MUST** ++ * check for this error code, and make sure it never gets reflected ++ * back to userspace. ++ */ ++static struct dx_frame * ++dx_probe(struct dentry *dentry, struct inode *dir, ++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) ++{ ++ unsigned count, indirect; ++ struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_root *root; ++ struct buffer_head *bh; ++ struct dx_frame *frame = frame_in; ++ u32 hash; ++ ++ frame->bh = NULL; ++ if (dentry) ++ dir = dentry->d_parent->d_inode; ++ if (!(bh = ext3_bread (NULL,dir, 0, 0, err))) ++ goto fail; ++ root = (struct dx_root *) bh->b_data; ++ if (root->info.hash_version != DX_HASH_TEA && ++ root->info.hash_version != DX_HASH_HALF_MD4 && ++ root->info.hash_version != DX_HASH_LEGACY) { ++ ext3_warning(dir->i_sb, __FUNCTION__, ++ "Unrecognised inode hash code %d", ++ root->info.hash_version); ++ brelse(bh); ++ *err = ERR_BAD_DX_DIR; ++ goto fail; ++ } ++ hinfo->hash_version = root->info.hash_version; ++ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed; ++ if (dentry) ++ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); ++ hash = hinfo->hash; ++ ++ if (root->info.unused_flags & 1) { ++ ext3_warning(dir->i_sb, __FUNCTION__, ++ "Unimplemented inode hash flags: 
%#06x", ++ root->info.unused_flags); ++ brelse(bh); ++ *err = ERR_BAD_DX_DIR; ++ goto fail; ++ } ++ ++ if ((indirect = root->info.indirect_levels) > 1) { ++ ext3_warning(dir->i_sb, __FUNCTION__, ++ "Unimplemented inode hash depth: %#06x", ++ root->info.indirect_levels); ++ brelse(bh); ++ *err = ERR_BAD_DX_DIR; ++ goto fail; ++ } ++ ++ entries = (struct dx_entry *) (((char *)&root->info) + ++ root->info.info_length); ++ assert(dx_get_limit(entries) == dx_root_limit(dir, ++ root->info.info_length)); ++ dxtrace (printk("Look up %x", hash)); ++ while (1) ++ { ++ count = dx_get_count(entries); ++ assert (count && count <= dx_get_limit(entries)); ++ p = entries + 1; ++ q = entries + count - 1; ++ while (p <= q) ++ { ++ m = p + (q - p)/2; ++ dxtrace(printk(".")); ++ if (dx_get_hash(m) > hash) ++ q = m - 1; ++ else ++ p = m + 1; ++ } ++ ++ if (0) // linear search cross check ++ { ++ unsigned n = count - 1; ++ at = entries; ++ while (n--) ++ { ++ dxtrace(printk(",")); ++ if (dx_get_hash(++at) > hash) ++ { ++ at--; ++ break; ++ } ++ } ++ assert (at == p - 1); ++ } ++ ++ at = p - 1; ++ dxtrace(printk(" %x->%u\n", at == entries? 
0: dx_get_hash(at), dx_get_block(at))); ++ frame->bh = bh; ++ frame->entries = entries; ++ frame->at = at; ++ if (!indirect--) return frame; ++ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err))) ++ goto fail2; ++ at = entries = ((struct dx_node *) bh->b_data)->entries; ++ assert (dx_get_limit(entries) == dx_node_limit (dir)); ++ frame++; ++ } ++fail2: ++ while (frame >= frame_in) { ++ brelse(frame->bh); ++ frame--; ++ } ++fail: ++ return NULL; ++} ++ ++static void dx_release (struct dx_frame *frames) ++{ ++ if (frames[0].bh == NULL) ++ return; ++ ++ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) ++ brelse(frames[1].bh); ++ brelse(frames[0].bh); ++} ++ ++/* ++ * This function increments the frame pointer to search the next leaf ++ * block, and reads in the necessary intervening nodes if the search ++ * should be necessary. Whether or not the search is necessary is ++ * controlled by the hash parameter. If the hash value is even, then ++ * the search is only continued if the next block starts with that ++ * hash value. This is used if we are searching for a specific file. ++ * ++ * If the hash value is HASH_NB_ALWAYS, then always go to the next block. ++ * ++ * This function returns 1 if the caller should continue to search, ++ * or 0 if it should not. If there is an error reading one of the ++ * index blocks, it will return -1. ++ * ++ * If start_hash is non-null, it will be filled in with the starting ++ * hash of the next page. ++ */ ++static int ext3_htree_next_block(struct inode *dir, __u32 hash, ++ struct dx_frame *frame, ++ struct dx_frame *frames, int *err, ++ __u32 *start_hash) ++{ ++ struct dx_frame *p; ++ struct buffer_head *bh; ++ int num_frames = 0; ++ __u32 bhash; ++ ++ *err = ENOENT; ++ p = frame; ++ /* ++ * Find the next leaf page by incrementing the frame pointer. ++ * If we run out of entries in the interior node, loop around and ++ * increment pointer in the parent node. 
When we break out of ++ * this loop, num_frames indicates the number of interior ++ * nodes that need to be read. ++ */ ++ while (1) { ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ break; ++ if (p == frames) ++ return 0; ++ num_frames++; ++ p--; ++ } ++ ++ /* ++ * If the hash is 1, then continue only if the next page has a ++ * continuation hash of any value. This is used for readdir ++ * handling. Otherwise, check to see if the hash matches the ++ * desired continuation hash. If it doesn't, return since ++ * there's no point to read in the successive index pages. ++ */ ++ bhash = dx_get_hash(p->at); ++ if (start_hash) ++ *start_hash = bhash; ++ if ((hash & 1) == 0) { ++ if ((bhash & ~1) != hash) ++ return 0; ++ } ++ /* ++ * If the hash is HASH_NB_ALWAYS, we always go to the next ++ * block so no check is necessary ++ */ ++ while (num_frames--) { ++ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at), ++ 0, err))) ++ return -1; /* Failure */ ++ p++; ++ brelse (p->bh); ++ p->bh = bh; ++ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; ++ } ++ return 1; ++} ++ ++ ++/* ++ * p is at least 6 bytes before the end of page ++ */ ++static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p) ++{ ++ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len)); ++} ++ ++/* ++ * This function fills a red-black tree with information from a ++ * directory. We start scanning the directory in hash order, starting ++ * at start_hash and start_minor_hash. ++ * ++ * This function returns the number of entries inserted into the tree, ++ * or a negative error code. 
++ */ ++int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ __u32 start_minor_hash, __u32 *next_hash) ++{ ++ struct dx_hash_info hinfo; ++ struct buffer_head *bh; ++ struct ext3_dir_entry_2 *de, *top; ++ struct dx_frame frames[2], *frame; /* per-call: we can sleep in ext3_bread */ ++ struct inode *dir; ++ int block, err; ++ int count = 0; ++ int ret; ++ __u32 hashval; ++ ++ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, ++ start_minor_hash)); ++ dir = dir_file->f_dentry->d_inode; ++ hinfo.hash = start_hash; ++ hinfo.minor_hash = 0; ++ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err); ++ if (!frame) ++ return err; ++ ++ while (1) { ++ block = dx_get_block(frame->at); ++ dxtrace(printk("Reading block %d\n", block)); ++ if (!(bh = ext3_bread (NULL, dir, block, 0, &err))) ++ goto errout; ++ ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - ++ EXT3_DIR_REC_LEN(0)); ++ for (; de < top; de = ext3_next_entry(de)) { ++ ext3fs_dirhash(de->name, de->name_len, &hinfo); ++ if ((hinfo.hash < start_hash) || ++ ((hinfo.hash == start_hash) && ++ (hinfo.minor_hash < start_minor_hash))) ++ continue; ++ ext3_htree_store_dirent(dir_file, hinfo.hash, ++ hinfo.minor_hash, de); ++ count++; ++ } ++ brelse (bh); ++ hashval = ~1; ++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS, ++ frame, frames, &err, &hashval); ++ if (next_hash) ++ *next_hash = hashval; ++ if (ret == -1) ++ goto errout; ++ /* ++ * Stop if: (a) there are no more entries, or ++ * (b) we have inserted at least one entry and the ++ * next hash value is not a continuation ++ */ ++ if ((ret == 0) || ++ (count && ((hashval & 1) == 0))) ++ break; ++ } ++ dx_release(frames); ++ dxtrace(printk("Fill tree: returned %d entries\n", count)); ++ return count; ++errout: ++ dx_release(frames); ++ return (err); ++} ++ ++ ++/* ++ * Directory block splitting, compacting ++ */ ++ ++static int dx_make_map (struct ext3_dir_entry_2 *de, int 
size, ++ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) ++{ ++ int count = 0; ++ char *base = (char *) de; ++ struct dx_hash_info h = *hinfo; ++ ++ while ((char *) de < base + size) ++ { ++ if (de->name_len && de->inode) { ++ ext3fs_dirhash(de->name, de->name_len, &h); ++ map_tail--; ++ map_tail->hash = h.hash; ++ map_tail->offs = (u32) ((char *) de - base); ++ count++; ++ } ++ /* XXX: do we need to check rec_len == 0 case? -Chris */ ++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); ++ } ++ return count; ++} ++ ++static void dx_sort_map (struct dx_map_entry *map, unsigned count) ++{ ++ struct dx_map_entry *p, *q, *top = map + count - 1; ++ int more; ++ /* Combsort until bubble sort doesn't suck */ ++ while (count > 2) ++ { ++ count = count*10/13; ++ if (count - 9 < 2) /* 9, 10 -> 11 */ ++ count = 11; ++ for (p = top, q = p - count; q >= map; p--, q--) ++ if (p->hash < q->hash) ++ swap(*p, *q); ++ } ++ /* Garden variety bubble sort */ ++ do { ++ more = 0; ++ q = top; ++ while (q-- > map) ++ { ++ if (q[1].hash >= q[0].hash) ++ continue; ++ swap(*(q+1), *q); ++ more = 1; ++ } ++ } while(more); ++} ++ ++static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block) ++{ ++ struct dx_entry *entries = frame->entries; ++ struct dx_entry *old = frame->at, *new = old + 1; ++ int count = dx_get_count(entries); ++ ++ assert(count < dx_get_limit(entries)); ++ assert(old < entries + count); ++ memmove(new + 1, new, (char *)(entries + count) - (char *)(new)); ++ dx_set_hash(new, hash); ++ dx_set_block(new, block); ++ dx_set_count(entries, count + 1); ++} ++#endif ++ ++ ++static void ext3_update_dx_flag(struct inode *inode) ++{ ++ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb, ++ EXT3_FEATURE_COMPAT_DIR_INDEX)) ++ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL; ++} ++ + /* + * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. 
+ * +@@ -94,6 +724,7 @@ static int inline search_dirblock(struct + return 0; + } + ++ + /* + * ext3_find_entry() + * +@@ -105,6 +736,8 @@ static int inline search_dirblock(struct + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ ++ ++ + static struct buffer_head * ext3_find_entry (struct dentry *dentry, + struct ext3_dir_entry_2 ** res_dir) + { +@@ -119,12 +752,32 @@ static struct buffer_head * ext3_find_en + int num = 0; + int nblocks, i, err; + struct inode *dir = dentry->d_parent->d_inode; ++ int namelen; ++ const u8 *name; ++ unsigned blocksize; + + *res_dir = NULL; + sb = dir->i_sb; +- ++ blocksize = sb->s_blocksize; ++ namelen = dentry->d_name.len; ++ name = dentry->d_name.name; ++ if (namelen > EXT3_NAME_LEN) ++ return NULL; ++#ifdef CONFIG_EXT3_INDEX ++ if (is_dx(dir)) { ++ bh = ext3_dx_find_entry(dentry, res_dir, &err); ++ /* ++ * On success, or if the error was file not found, ++ * return. Otherwise, fall back to doing a search the ++ * old fashioned way. 
++ */ ++ if (bh || (err != ERR_BAD_DX_DIR)) ++ return bh; ++ dxtrace(printk("ext3_find_entry: dx failed, falling back\n")); ++ } ++#endif + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); +- start = dir->u.ext3_i.i_dir_start_lookup; ++ start = EXT3_I(dir)->i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; +@@ -165,7 +818,7 @@ restart: + i = search_dirblock(bh, dir, dentry, + block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { +- dir->u.ext3_i.i_dir_start_lookup = block; ++ EXT3_I(dir)->i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { +@@ -196,6 +849,66 @@ cleanup_and_exit: + return ret; + } + ++#ifdef CONFIG_EXT3_INDEX ++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, ++ struct ext3_dir_entry_2 **res_dir, int *err) ++{ ++ struct super_block * sb; ++ struct dx_hash_info hinfo; ++ u32 hash; ++ struct dx_frame frames[2], *frame; ++ struct ext3_dir_entry_2 *de, *top; ++ struct buffer_head *bh; ++ unsigned long block; ++ int retval; ++ int namelen = dentry->d_name.len; ++ const u8 *name = dentry->d_name.name; ++ struct inode *dir = dentry->d_parent->d_inode; ++ ++ sb = dir->i_sb; ++ if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err))) ++ return NULL; ++ hash = hinfo.hash; ++ do { ++ block = dx_get_block(frame->at); ++ if (!(bh = ext3_bread (NULL,dir, block, 0, err))) ++ goto errout; ++ de = (struct ext3_dir_entry_2 *) bh->b_data; ++ top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize - ++ EXT3_DIR_REC_LEN(0)); ++ for (; de < top; de = ext3_next_entry(de)) ++ if (ext3_match (namelen, name, de)) { ++ if (!ext3_check_dir_entry("ext3_find_entry", ++ dir, de, bh, ++ (block<<EXT3_BLOCK_SIZE_BITS(sb)) ++ +((char *)de - bh->b_data))) { ++ brelse (bh); ++ goto errout; ++ } ++ *res_dir = de; ++ dx_release (frames); ++ return bh; ++ } ++ brelse (bh); ++ /* Check to see if we should continue to search */ ++ retval = ext3_htree_next_block(dir, hash, frame, ++ frames, err, 0); ++ if (retval == -1) { ++ ext3_warning(sb, 
__FUNCTION__, ++ "error reading index page in directory #%lu", ++ dir->i_ino); ++ goto errout; ++ } ++ } while (retval == 1); ++ ++ *err = -ENOENT; ++errout: ++ dxtrace(printk("%s not found\n", name)); ++ dx_release (frames); ++ return NULL; ++} ++#endif ++ + static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) + { + struct inode * inode; +@@ -212,8 +925,9 @@ static struct dentry *ext3_lookup(struct + brelse (bh); + inode = iget(dir->i_sb, ino); + +- if (!inode) ++ if (!inode) { + return ERR_PTR(-EACCES); ++ } + } + d_add(dentry, inode); + return NULL; +@@ -237,6 +951,300 @@ static inline void ext3_set_de_type(stru + de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + } + ++#ifdef CONFIG_EXT3_INDEX ++static struct ext3_dir_entry_2 * ++dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count) ++{ ++ unsigned rec_len = 0; ++ ++ while (count--) { ++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs); ++ rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ memcpy (to, de, rec_len); ++ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len; ++ de->inode = 0; ++ map++; ++ to += rec_len; ++ } ++ return (struct ext3_dir_entry_2 *) (to - rec_len); ++} ++ ++static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size) ++{ ++ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base; ++ unsigned rec_len = 0; ++ ++ prev = to = de; ++ while ((char*)de < base + size) { ++ next = (struct ext3_dir_entry_2 *) ((char *) de + ++ le16_to_cpu(de->rec_len)); ++ if (de->inode && de->name_len) { ++ rec_len = EXT3_DIR_REC_LEN(de->name_len); ++ if (de > to) ++ memmove(to, de, rec_len); ++ to->rec_len = rec_len; ++ prev = to; ++ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len); ++ } ++ de = next; ++ } ++ return prev; ++} ++ ++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, ++ struct buffer_head **bh,struct dx_frame *frame, ++ struct dx_hash_info *hinfo, 
int *error) ++{ ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ unsigned count, continued; ++ struct buffer_head *bh2; ++ u32 newblock; ++ u32 hash2; ++ struct dx_map_entry *map; ++ char *data1 = (*bh)->b_data, *data2; ++ unsigned split; ++ struct ext3_dir_entry_2 *de = NULL, *de2; ++ int err; ++ ++ bh2 = ext3_append (handle, dir, &newblock, error); ++ if (!(bh2)) { ++ brelse(*bh); ++ *bh = NULL; ++ goto errout; ++ } ++ ++ BUFFER_TRACE(*bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, *bh); ++ if (err) { ++ journal_error: ++ brelse(*bh); ++ brelse(bh2); ++ *bh = NULL; ++ ext3_std_error(dir->i_sb, err); ++ goto errout; ++ } ++ BUFFER_TRACE(frame->bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, frame->bh); ++ if (err) ++ goto journal_error; ++ ++ data2 = bh2->b_data; ++ ++ /* create map in the end of data2 block */ ++ map = (struct dx_map_entry *) (data2 + blocksize); ++ count = dx_make_map ((struct ext3_dir_entry_2 *) data1, ++ blocksize, hinfo, map); ++ map -= count; ++ split = count/2; // need to adjust to actual middle ++ dx_sort_map (map, count); ++ hash2 = map[split].hash; ++ continued = hash2 == map[split - 1].hash; ++ dxtrace(printk("Split block %i at %x, %i/%i\n", ++ dx_get_block(frame->at), hash2, split, count-split)); ++ ++ /* Fancy dance to stay within two buffers */ ++ de2 = dx_move_dirents(data1, data2, map + split, count - split); ++ de = dx_pack_dirents(data1,blocksize); ++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); ++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2); ++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1)); ++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1)); ++ ++ /* Which block gets the new entry? 
*/ ++ if (hinfo->hash >= hash2) ++ { ++ swap(*bh, bh2); ++ de = de2; ++ } ++ dx_insert_block (frame, hash2 + continued, newblock); ++ err = ext3_journal_dirty_metadata (handle, bh2); ++ if (err) ++ goto journal_error; ++ err = ext3_journal_dirty_metadata (handle, frame->bh); ++ if (err) ++ goto journal_error; ++ brelse (bh2); ++ dxtrace(dx_show_index ("frame", frame->entries)); ++errout: ++ return de; ++} ++#endif ++ ++ ++/* ++ * Add a new entry into a directory (leaf) block. If de is non-NULL, ++ * it points to a directory entry which is guaranteed to be large ++ * enough for the new directory entry. If de is NULL, then ++ * add_dirent_to_buf will attempt to search the directory block for ++ * space. It will return -ENOSPC if no space is available, -EIO if a ++ * directory entry fails validation, and -EEXIST if the entry already exists. ++ * ++ * NOTE! bh is NOT released in the case where ENOSPC is returned. In ++ * all other cases bh is released. ++ */ ++static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct ext3_dir_entry_2 *de, ++ struct buffer_head * bh) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ const char *name = dentry->d_name.name; ++ int namelen = dentry->d_name.len; ++ unsigned long offset = 0; ++ unsigned short reclen; ++ int nlen, rlen, err; ++ char *top; ++ ++ reclen = EXT3_DIR_REC_LEN(namelen); ++ if (!de) { ++ de = (struct ext3_dir_entry_2 *)bh->b_data; ++ top = bh->b_data + dir->i_sb->s_blocksize - reclen; ++ while ((char *) de <= top) { ++ if (!ext3_check_dir_entry("ext3_add_entry", dir, de, ++ bh, offset)) { ++ brelse (bh); ++ return -EIO; ++ } ++ if (ext3_match (namelen, name, de)) { ++ brelse (bh); ++ return -EEXIST; ++ } ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if ((de->inode? 
rlen - nlen: rlen) >= reclen) ++ break; ++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen); ++ offset += rlen; ++ } ++ if ((char *) de > top) ++ return -ENOSPC; ++ } ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) { ++ ext3_std_error(dir->i_sb, err); ++ brelse(bh); ++ return err; ++ } ++ ++ /* By now the buffer is marked for journaling */ ++ nlen = EXT3_DIR_REC_LEN(de->name_len); ++ rlen = le16_to_cpu(de->rec_len); ++ if (de->inode) { ++ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen); ++ de1->rec_len = cpu_to_le16(rlen - nlen); ++ de->rec_len = cpu_to_le16(nlen); ++ de = de1; ++ } ++ de->file_type = EXT3_FT_UNKNOWN; ++ if (inode) { ++ de->inode = cpu_to_le32(inode->i_ino); ++ ext3_set_de_type(dir->i_sb, de, inode->i_mode); ++ } else ++ de->inode = 0; ++ de->name_len = namelen; ++ memcpy (de->name, name, namelen); ++ /* ++ * XXX shouldn't update any times until successful ++ * completion of syscall, but too many callers depend ++ * on this. ++ * ++ * XXX similarly, too many callers depend on ++ * ext3_new_inode() setting the times, but error ++ * recovery deletes the inode, so the worst that can ++ * happen is that the times are slightly out of date ++ * and/or different from the directory change time. ++ */ ++ dir->i_mtime = dir->i_ctime = CURRENT_TIME; ++ ext3_update_dx_flag(dir); ++ dir->i_version = ++event; ++ ext3_mark_inode_dirty(handle, dir); ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) ++ ext3_std_error(dir->i_sb, err); ++ brelse(bh); ++ return 0; ++} ++ ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * This converts a one block unindexed directory to a 3 block indexed ++ * directory, and adds the dentry to the indexed directory. 
++ */ ++static int make_indexed_dir(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct buffer_head *bh) ++{ ++ struct inode *dir = dentry->d_parent->d_inode; ++ const char *name = dentry->d_name.name; ++ int namelen = dentry->d_name.len; ++ struct buffer_head *bh2; ++ struct dx_root *root; ++ struct dx_frame frames[2], *frame; ++ struct dx_entry *entries; ++ struct ext3_dir_entry_2 *de, *de2; ++ char *data1, *top; ++ unsigned len; ++ int retval; ++ unsigned blocksize; ++ struct dx_hash_info hinfo; ++ u32 block; ++ ++ blocksize = dir->i_sb->s_blocksize; ++ dxtrace(printk("Creating index\n")); ++ retval = ext3_journal_get_write_access(handle, bh); ++ if (retval) { ++ ext3_std_error(dir->i_sb, retval); ++ brelse(bh); ++ return retval; ++ } ++ root = (struct dx_root *) bh->b_data; ++ ++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; ++ bh2 = ext3_append (handle, dir, &block, &retval); ++ if (!(bh2)) { ++ brelse(bh); ++ return retval; ++ } ++ data1 = bh2->b_data; ++ ++ /* The 0th block becomes the root, move the dirents out */ ++ de = (struct ext3_dir_entry_2 *) &root->info; ++ len = ((char *) root) + blocksize - (char *) de; ++ memcpy (data1, de, len); ++ de = (struct ext3_dir_entry_2 *) data1; ++ top = data1 + len; ++ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top) ++ de = de2; ++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de); ++ /* Initialize the root; the dot dirents already exist */ ++ de = (struct ext3_dir_entry_2 *) (&root->dotdot); ++ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2)); ++ memset (&root->info, 0, sizeof(root->info)); ++ root->info.info_length = sizeof(root->info); ++ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version; ++ entries = root->entries; ++ dx_set_block (entries, 1); ++ dx_set_count (entries, 1); ++ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); ++ ++ /* Initialize as for dx_probe */ ++ hinfo.hash_version = root->info.hash_version; ++ hinfo.seed = 
dir->i_sb->u.ext3_sb.s_hash_seed; ++ ext3fs_dirhash(name, namelen, &hinfo); ++ frame = frames; ++ frame->entries = entries; ++ frame->at = entries; ++ frame->bh = bh; ++ bh = bh2; ++ de = do_split(handle,dir, &bh, frame, &hinfo, &retval); ++ dx_release (frames); ++ if (!(de)) ++ return retval; ++ ++ return add_dirent_to_buf(handle, dentry, inode, de, bh); ++} ++#endif ++ + /* + * ext3_add_entry() + * +@@ -247,127 +1255,198 @@ static inline void ext3_set_de_type(stru + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +- +-/* +- * AKPM: the journalling code here looks wrong on the error paths +- */ + static int ext3_add_entry (handle_t *handle, struct dentry *dentry, + struct inode *inode) + { + struct inode *dir = dentry->d_parent->d_inode; +- const char *name = dentry->d_name.name; +- int namelen = dentry->d_name.len; + unsigned long offset; +- unsigned short rec_len; + struct buffer_head * bh; +- struct ext3_dir_entry_2 * de, * de1; ++ struct ext3_dir_entry_2 *de; + struct super_block * sb; + int retval; ++#ifdef CONFIG_EXT3_INDEX ++ int dx_fallback=0; ++#endif ++ unsigned blocksize; ++ unsigned nlen, rlen; ++ u32 block, blocks; + + sb = dir->i_sb; +- +- if (!namelen) ++ blocksize = sb->s_blocksize; ++ if (!dentry->d_name.len) + return -EINVAL; +- bh = ext3_bread (handle, dir, 0, 0, &retval); ++#ifdef CONFIG_EXT3_INDEX ++ if (is_dx(dir)) { ++ retval = ext3_dx_add_entry(handle, dentry, inode); ++ if (!retval || (retval != ERR_BAD_DX_DIR)) ++ return retval; ++ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL; ++ dx_fallback++; ++ ext3_mark_inode_dirty(handle, dir); ++ } ++#endif ++ blocks = dir->i_size >> sb->s_blocksize_bits; ++ for (block = 0, offset = 0; block < blocks; block++) { ++ bh = ext3_bread(handle, dir, block, 0, &retval); ++ if(!bh) ++ return retval; ++ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh); ++ if (retval != -ENOSPC) ++ return retval; ++ ++#ifdef 
CONFIG_EXT3_INDEX ++ if (blocks == 1 && !dx_fallback && ++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) ++ return make_indexed_dir(handle, dentry, inode, bh); ++#endif ++ brelse(bh); ++ } ++ bh = ext3_append(handle, dir, &block, &retval); + if (!bh) + return retval; +- rec_len = EXT3_DIR_REC_LEN(namelen); +- offset = 0; + de = (struct ext3_dir_entry_2 *) bh->b_data; +- while (1) { +- if ((char *)de >= sb->s_blocksize + bh->b_data) { +- brelse (bh); +- bh = NULL; +- bh = ext3_bread (handle, dir, +- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); +- if (!bh) +- return retval; +- if (dir->i_size <= offset) { +- if (dir->i_size == 0) { +- brelse(bh); +- return -ENOENT; +- } ++ de->inode = 0; ++ de->rec_len = cpu_to_le16(rlen = blocksize); ++ nlen = 0; ++ return add_dirent_to_buf(handle, dentry, inode, de, bh); ++} + +- ext3_debug ("creating next block\n"); ++#ifdef CONFIG_EXT3_INDEX ++/* ++ * Returns 0 for success, or a negative error value ++ */ ++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ struct dx_frame frames[2], *frame; ++ struct dx_entry *entries, *at; ++ struct dx_hash_info hinfo; ++ struct buffer_head * bh; ++ struct inode *dir = dentry->d_parent->d_inode; ++ struct super_block * sb = dir->i_sb; ++ struct ext3_dir_entry_2 *de; ++ int err; + +- BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); +- de = (struct ext3_dir_entry_2 *) bh->b_data; +- de->inode = 0; +- de->rec_len = le16_to_cpu(sb->s_blocksize); +- dir->u.ext3_i.i_disksize = +- dir->i_size = offset + sb->s_blocksize; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; +- ext3_mark_inode_dirty(handle, dir); +- } else { ++ frame = dx_probe(dentry, 0, &hinfo, frames, &err); ++ if (!frame) ++ return err; ++ entries = frame->entries; ++ at = frame->at; + +- ext3_debug ("skipping to next block\n"); ++ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err))) ++ goto cleanup; + +- de = (struct 
ext3_dir_entry_2 *) bh->b_data; +- } +- } +- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, +- offset)) { +- brelse (bh); +- return -ENOENT; +- } +- if (ext3_match (namelen, name, de)) { +- brelse (bh); +- return -EEXIST; ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) ++ goto journal_error; ++ ++ err = add_dirent_to_buf(handle, dentry, inode, 0, bh); ++ if (err != -ENOSPC) { ++ bh = 0; ++ goto cleanup; ++ } ++ ++ /* Block full, should compress but for now just split */ ++ dxtrace(printk("using %u of %u node entries\n", ++ dx_get_count(entries), dx_get_limit(entries))); ++ /* Need to split index? */ ++ if (dx_get_count(entries) == dx_get_limit(entries)) { ++ u32 newblock; ++ unsigned icount = dx_get_count(entries); ++ int levels = frame - frames; ++ struct dx_entry *entries2; ++ struct dx_node *node2; ++ struct buffer_head *bh2; ++ ++ if (levels && (dx_get_count(frames->entries) == ++ dx_get_limit(frames->entries))) { ++ ext3_warning(sb, __FUNCTION__, ++ "Directory index full!\n"); ++ err = -ENOSPC; ++ goto cleanup; + } +- if ((le32_to_cpu(de->inode) == 0 && +- le16_to_cpu(de->rec_len) >= rec_len) || +- (le16_to_cpu(de->rec_len) >= +- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { +- BUFFER_TRACE(bh, "get_write_access"); +- ext3_journal_get_write_access(handle, bh); +- /* By now the buffer is marked for journaling */ +- offset += le16_to_cpu(de->rec_len); +- if (le32_to_cpu(de->inode)) { +- de1 = (struct ext3_dir_entry_2 *) ((char *) de + +- EXT3_DIR_REC_LEN(de->name_len)); +- de1->rec_len = +- cpu_to_le16(le16_to_cpu(de->rec_len) - +- EXT3_DIR_REC_LEN(de->name_len)); +- de->rec_len = cpu_to_le16( +- EXT3_DIR_REC_LEN(de->name_len)); +- de = de1; ++ bh2 = ext3_append (handle, dir, &newblock, &err); ++ if (!(bh2)) ++ goto cleanup; ++ node2 = (struct dx_node *)(bh2->b_data); ++ entries2 = node2->entries; ++ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize); ++ node2->fake.inode = 0; ++ 
BUFFER_TRACE(frame->bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, frame->bh); ++ if (err) ++ goto journal_error; ++ if (levels) { ++ unsigned icount1 = icount/2, icount2 = icount - icount1; ++ unsigned hash2 = dx_get_hash(entries + icount1); ++ dxtrace(printk("Split index %i/%i\n", icount1, icount2)); ++ ++ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ ++ err = ext3_journal_get_write_access(handle, ++ frames[0].bh); ++ if (err) ++ goto journal_error; ++ ++ memcpy ((char *) entries2, (char *) (entries + icount1), ++ icount2 * sizeof(struct dx_entry)); ++ dx_set_count (entries, icount1); ++ dx_set_count (entries2, icount2); ++ dx_set_limit (entries2, dx_node_limit(dir)); ++ ++ /* Which index block gets the new entry? */ ++ if (at - entries >= icount1) { ++ frame->at = at = at - entries - icount1 + entries2; ++ frame->entries = entries = entries2; ++ swap(frame->bh, bh2); + } +- de->file_type = EXT3_FT_UNKNOWN; +- if (inode) { +- de->inode = cpu_to_le32(inode->i_ino); +- ext3_set_de_type(dir->i_sb, de, inode->i_mode); +- } else +- de->inode = 0; +- de->name_len = namelen; +- memcpy (de->name, name, namelen); +- /* +- * XXX shouldn't update any times until successful +- * completion of syscall, but too many callers depend +- * on this. +- * +- * XXX similarly, too many callers depend on +- * ext3_new_inode() setting the times, but error +- * recovery deletes the inode, so the worst that can +- * happen is that the times are slightly out of date +- * and/or different from the directory change time. 
+- */ +- dir->i_mtime = dir->i_ctime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; +- dir->i_version = ++event; +- ext3_mark_inode_dirty(handle, dir); +- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); +- ext3_journal_dirty_metadata(handle, bh); +- brelse(bh); +- return 0; ++ dx_insert_block (frames + 0, hash2, newblock); ++ dxtrace(dx_show_index ("node", frames[1].entries)); ++ dxtrace(dx_show_index ("node", ++ ((struct dx_node *) bh2->b_data)->entries)); ++ err = ext3_journal_dirty_metadata(handle, bh2); ++ if (err) ++ goto journal_error; ++ brelse (bh2); ++ } else { ++ dxtrace(printk("Creating second level index...\n")); ++ memcpy((char *) entries2, (char *) entries, ++ icount * sizeof(struct dx_entry)); ++ dx_set_limit(entries2, dx_node_limit(dir)); ++ ++ /* Set up root */ ++ dx_set_count(entries, 1); ++ dx_set_block(entries + 0, newblock); ++ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; ++ ++ /* Add new access path frame */ ++ frame = frames + 1; ++ frame->at = at = at - entries + entries2; ++ frame->entries = entries = entries2; ++ frame->bh = bh2; ++ err = ext3_journal_get_write_access(handle, ++ frame->bh); ++ if (err) ++ goto journal_error; + } +- offset += le16_to_cpu(de->rec_len); +- de = (struct ext3_dir_entry_2 *) +- ((char *) de + le16_to_cpu(de->rec_len)); ++ ext3_journal_dirty_metadata(handle, frames[0].bh); + } +- brelse (bh); +- return -ENOSPC; ++ de = do_split(handle, dir, &bh, frame, &hinfo, &err); ++ if (!de) ++ goto cleanup; ++ err = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ bh = 0; ++ goto cleanup; ++ ++journal_error: ++ ext3_std_error(dir->i_sb, err); ++cleanup: ++ if (bh) ++ brelse(bh); ++ dx_release(frames); ++ return err; + } ++#endif + + /* + * ext3_delete_entry deletes a directory entry by merging it with the +@@ -451,9 +1530,11 @@ static int ext3_create (struct inode * d + struct inode * inode; + int err; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if 
(IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -478,9 +1559,11 @@ static int ext3_mknod (struct inode * di + struct inode *inode; + int err; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -507,9 +1590,11 @@ static int ext3_mkdir(struct inode * dir + if (dir->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -521,7 +1606,7 @@ static int ext3_mkdir(struct inode * dir + + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; +- inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; ++ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; + inode->i_blocks = 0; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { +@@ -554,21 +1639,19 @@ static int ext3_mkdir(struct inode * dir + inode->i_mode |= S_ISGID; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); +- if (err) +- goto out_no_entry; ++ if (err) { ++ inode->i_nlink = 0; ++ ext3_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto out_stop; ++ } + dir->i_nlink++; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + d_instantiate(dentry, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; +- +-out_no_entry: +- inode->i_nlink = 
0; +- ext3_mark_inode_dirty(handle, inode); +- iput (inode); +- goto out_stop; + } + + /* +@@ -655,7 +1738,7 @@ int ext3_orphan_add(handle_t *handle, st + int err = 0, rc; + + lock_super(sb); +- if (!list_empty(&inode->u.ext3_i.i_orphan)) ++ if (!list_empty(&EXT3_I(inode)->i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks +@@ -696,7 +1779,7 @@ int ext3_orphan_add(handle_t *handle, st + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. */ + if (!err) +- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); ++ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); + jbd_debug(4, "orphan inode %ld will point to %d\n", +@@ -714,25 +1797,26 @@ out_unlock: + int ext3_orphan_del(handle_t *handle, struct inode *inode) + { + struct list_head *prev; ++ struct ext3_inode_info *ei = EXT3_I(inode); + struct ext3_sb_info *sbi; + ino_t ino_next; + struct ext3_iloc iloc; + int err = 0; + + lock_super(inode->i_sb); +- if (list_empty(&inode->u.ext3_i.i_orphan)) { ++ if (list_empty(&ei->i_orphan)) { + unlock_super(inode->i_sb); + return 0; + } + + ino_next = NEXT_ORPHAN(inode); +- prev = inode->u.ext3_i.i_orphan.prev; ++ prev = ei->i_orphan.prev; + sbi = EXT3_SB(inode->i_sb); + + jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); + +- list_del(&inode->u.ext3_i.i_orphan); +- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); ++ list_del(&ei->i_orphan); ++ INIT_LIST_HEAD(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on +@@ -793,8 +1877,9 @@ static int ext3_rmdir (struct inode * di + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + retval = -ENOENT; + bh = ext3_find_entry (dentry, &de); +@@ -832,7 +1917,7 @@ 
static int ext3_rmdir (struct inode * di + dir->i_nlink--; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + + end_rmdir: +@@ -850,8 +1935,9 @@ static int ext3_unlink(struct inode * di + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -878,7 +1964,7 @@ static int ext3_unlink(struct inode * di + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; +- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(dir); + ext3_mark_inode_dirty(handle, dir); + inode->i_nlink--; + if (!inode->i_nlink) +@@ -904,9 +1990,11 @@ static int ext3_symlink (struct inode * + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -916,7 +2004,7 @@ static int ext3_symlink (struct inode * + if (IS_ERR(inode)) + goto out_stop; + +- if (l > sizeof (inode->u.ext3_i.i_data)) { ++ if (l > sizeof (EXT3_I(inode)->i_data)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + /* +@@ -925,25 +2013,23 @@ static int ext3_symlink (struct inode * + * i_size in generic_commit_write(). 
+ */ + err = block_symlink(inode, symname, l); +- if (err) +- goto out_no_entry; ++ if (err) { ++ ext3_dec_count(handle, inode); ++ ext3_mark_inode_dirty(handle, inode); ++ iput (inode); ++ goto out_stop; ++ } + } else { + inode->i_op = &ext3_fast_symlink_inode_operations; +- memcpy((char*)&inode->u.ext3_i.i_data,symname,l); ++ memcpy((char*)&EXT3_I(inode)->i_data,symname,l); + inode->i_size = l-1; + } +- inode->u.ext3_i.i_disksize = inode->i_size; ++ EXT3_I(inode)->i_disksize = inode->i_size; + err = ext3_add_nondir(handle, dentry, inode); + ext3_mark_inode_dirty(handle, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; +- +-out_no_entry: +- ext3_dec_count(handle, inode); +- ext3_mark_inode_dirty(handle, inode); +- iput (inode); +- goto out_stop; + } + + static int ext3_link (struct dentry * old_dentry, +@@ -956,12 +2042,15 @@ static int ext3_link (struct dentry * ol + if (S_ISDIR(inode->i_mode)) + return -EPERM; + +- if (inode->i_nlink >= EXT3_LINK_MAX) ++ if (inode->i_nlink >= EXT3_LINK_MAX) { + return -EMLINK; ++ } + +- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(dir)) + handle->h_sync = 1; +@@ -995,9 +2084,11 @@ static int ext3_rename (struct inode * o + + old_bh = new_bh = dir_bh = NULL; + +- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); +- if (IS_ERR(handle)) ++ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + ++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); ++ if (IS_ERR(handle)) { + return PTR_ERR(handle); ++ } + + if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) + handle->h_sync = 1; +@@ -1077,7 +2168,7 @@ static int ext3_rename (struct inode * o + new_inode->i_ctime = CURRENT_TIME; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; +- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ 
ext3_update_dx_flag(old_dir); + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); + ext3_journal_get_write_access(handle, dir_bh); +@@ -1089,7 +2180,7 @@ static int ext3_rename (struct inode * o + new_inode->i_nlink--; + } else { + new_dir->i_nlink++; +- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; ++ ext3_update_dx_flag(new_dir); + ext3_mark_inode_dirty(handle, new_dir); + } + } +--- linux-2.4.20/fs/ext3/super.c~ext-2.4-patch-1 Sat Apr 5 03:56:31 2003 ++++ linux-2.4.20-braam/fs/ext3/super.c Sat Apr 5 03:56:31 2003 +@@ -707,6 +707,7 @@ static int ext3_setup_super(struct super + es->s_mtime = cpu_to_le32(CURRENT_TIME); + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ++ + ext3_commit_super (sb, es, 1); + if (test_opt (sb, DEBUG)) + printk (KERN_INFO +@@ -717,6 +718,7 @@ static int ext3_setup_super(struct super + EXT3_BLOCKS_PER_GROUP(sb), + EXT3_INODES_PER_GROUP(sb), + sbi->s_mount_opt); ++ + printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", + bdevname(sb->s_dev)); + if (EXT3_SB(sb)->s_journal->j_inode == NULL) { +@@ -890,6 +892,7 @@ static loff_t ext3_max_size(int bits) + return res; + } + ++ + struct super_block * ext3_read_super (struct super_block * sb, void * data, + int silent) + { +@@ -1066,6 +1069,9 @@ struct super_block * ext3_read_super (st + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); ++ for (i=0; i < 4; i++) ++ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); ++ sbi->s_def_hash_version = es->s_def_hash_version; + + if (sbi->s_blocks_per_group > blocksize * 8) { + printk (KERN_ERR +@@ -1769,6 +1775,7 @@ static void __exit exit_ext3_fs(void) + unregister_filesystem(&ext3_fs_type); + } + ++EXPORT_SYMBOL(ext3_force_commit); + EXPORT_SYMBOL(ext3_bread); + + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and 
others"); +--- linux-2.4.20/include/linux/ext3_fs.h~ext-2.4-patch-1 Sat Apr 5 03:56:31 2003 ++++ linux-2.4.20-braam/include/linux/ext3_fs.h Sat Apr 5 03:56:31 2003 +@@ -40,6 +40,11 @@ + #define EXT3FS_VERSION "2.4-0.9.19" + + /* ++ * Always enable hashed directories ++ */ ++#define CONFIG_EXT3_INDEX ++ ++/* + * Debug code + */ + #ifdef EXT3FS_DEBUG +@@ -437,8 +442,11 @@ struct ext3_super_block { + /*E0*/ __u32 s_journal_inum; /* inode number of journal file */ + __u32 s_journal_dev; /* device number of journal file */ + __u32 s_last_orphan; /* start of list of inodes to delete */ +- +-/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ ++ __u32 s_hash_seed[4]; /* HTREE hash seed */ ++ __u8 s_def_hash_version; /* Default hash version to use */ ++ __u8 s_reserved_char_pad; ++ __u16 s_reserved_word_pad; ++ __u32 s_reserved[192]; /* Padding to the end of the block */ + }; + + #ifdef __KERNEL__ +@@ -575,9 +583,46 @@ struct ext3_dir_entry_2 { + #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) + #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) ++/* ++ * Hash Tree Directory indexing ++ * (c) Daniel Phillips, 2001 ++ */ ++ ++#ifdef CONFIG_EXT3_INDEX ++ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \ ++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \ ++ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) ++#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) ++#else ++ #define is_dx(dir) 0 ++#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX) ++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2) ++#endif ++ ++/* Legal values for the dx_root hash_version field: */ ++ ++#define DX_HASH_LEGACY 0 ++#define DX_HASH_HALF_MD4 1 ++#define DX_HASH_TEA 2 ++ ++/* hash info structure used by the directory hash */ ++struct dx_hash_info ++{ ++ u32 hash; ++ u32 minor_hash; ++ int hash_version; ++ u32 *seed; ++}; + + #ifdef 
__KERNEL__ + /* ++ * Control parameters used by ext3_htree_next_block ++ */ ++#define HASH_NB_ALWAYS 1 ++ ++ ++/* + * Describe an inode's exact location on disk and in memory + */ + struct ext3_iloc +@@ -587,6 +632,27 @@ struct ext3_iloc + unsigned long block_group; + }; + ++ ++/* ++ * This structure is stuffed into the struct file's private_data field ++ * for directories. It is where we put information so that we can do ++ * readdir operations in hash tree order. ++ */ ++struct dir_private_info { ++ rb_root_t root; ++ rb_node_t *curr_node; ++ struct fname *extra_fname; ++ loff_t last_pos; ++ __u32 curr_hash; ++ __u32 curr_minor_hash; ++ __u32 next_hash; ++}; ++ ++/* ++ * Special error return code only used by dx_probe() and its callers. ++ */ ++#define ERR_BAD_DX_DIR -75000 ++ + /* + * Function prototypes + */ +@@ -614,11 +680,20 @@ extern struct ext3_group_desc * ext3_get + + /* dir.c */ + extern int ext3_check_dir_entry(const char *, struct inode *, +- struct ext3_dir_entry_2 *, struct buffer_head *, +- unsigned long); ++ struct ext3_dir_entry_2 *, ++ struct buffer_head *, unsigned long); ++extern void ext3_htree_store_dirent(struct file *dir_file, __u32 hash, ++ __u32 minor_hash, ++ struct ext3_dir_entry_2 *dirent); ++extern void ext3_htree_free_dir_info(struct dir_private_info *p); ++ + /* fsync.c */ + extern int ext3_sync_file (struct file *, struct dentry *, int); + ++/* hash.c */ ++extern int ext3fs_dirhash(const char *name, int len, struct ++ dx_hash_info *hinfo); ++ + /* ialloc.c */ + extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int); + extern void ext3_free_inode (handle_t *, struct inode *); +@@ -650,6 +725,8 @@ extern int ext3_ioctl (struct inode *, s + /* namei.c */ + extern int ext3_orphan_add(handle_t *, struct inode *); + extern int ext3_orphan_del(handle_t *, struct inode *); ++extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, ++ __u32 start_minor_hash, __u32 *next_hash); + + /* super.c */ + 
extern void ext3_error (struct super_block *, const char *, const char *, ...) +--- linux-2.4.20/include/linux/ext3_fs_sb.h~ext-2.4-patch-1 Sat Apr 5 03:56:31 2003 ++++ linux-2.4.20-braam/include/linux/ext3_fs_sb.h Sat Apr 5 03:56:31 2003 +@@ -62,6 +62,8 @@ struct ext3_sb_info { + int s_inode_size; + int s_first_ino; + u32 s_next_generation; ++ u32 s_hash_seed[4]; ++ int s_def_hash_version; + + /* Journaling */ + struct inode * s_journal_inode; +--- linux-2.4.20/include/linux/ext3_jbd.h~ext-2.4-patch-1 Sat Apr 5 03:56:31 2003 ++++ linux-2.4.20-braam/include/linux/ext3_jbd.h Sat Apr 5 03:56:31 2003 +@@ -63,6 +63,8 @@ extern int ext3_writepage_trans_blocks(s + + #define EXT3_RESERVE_TRANS_BLOCKS 12 + ++#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8 ++ + int + ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, +--- linux-2.4.20/include/linux/rbtree.h~ext-2.4-patch-1 Sat Apr 5 03:56:31 2003 ++++ linux-2.4.20-braam/include/linux/rbtree.h Sat Apr 5 03:56:31 2003 +@@ -120,6 +120,8 @@ rb_root_t; + + extern void rb_insert_color(rb_node_t *, rb_root_t *); + extern void rb_erase(rb_node_t *, rb_root_t *); ++extern rb_node_t *rb_get_first(rb_root_t *root); ++extern rb_node_t *rb_get_next(rb_node_t *n); + + static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link) + { +--- linux-2.4.20/lib/rbtree.c~ext-2.4-patch-1 Sat Apr 5 03:56:31 2003 ++++ linux-2.4.20-braam/lib/rbtree.c Sat Apr 5 03:56:31 2003 +@@ -17,6 +17,8 @@ + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/lib/rbtree.c ++ ++ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002 + */ + + #include +@@ -294,3 +296,43 @@ void rb_erase(rb_node_t * node, rb_root_ + __rb_erase_color(child, parent, root); + } + EXPORT_SYMBOL(rb_erase); ++ ++/* ++ * This function returns the first node (in sort order) of the tree. 
++ */ ++rb_node_t *rb_get_first(rb_root_t *root) ++{ ++ rb_node_t *n; ++ ++ n = root->rb_node; ++ if (!n) ++ return 0; ++ while (n->rb_left) ++ n = n->rb_left; ++ return n; ++} ++EXPORT_SYMBOL(rb_get_first); ++ ++/* ++ * Given a node, this function will return the next node in the tree. ++ */ ++rb_node_t *rb_get_next(rb_node_t *n) ++{ ++ rb_node_t *parent; ++ ++ if (n->rb_right) { ++ n = n->rb_right; ++ while (n->rb_left) ++ n = n->rb_left; ++ return n; ++ } else { ++ while ((parent = n->rb_parent)) { ++ if (n == parent->rb_left) ++ return parent; ++ n = parent; ++ } ++ return 0; ++ } ++} ++EXPORT_SYMBOL(rb_get_next); ++ + +_ diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-2.patch b/lustre/kernel_patches/patches/ext-2.4-patch-2.patch new file mode 100644 index 0000000..689d33b --- /dev/null +++ b/lustre/kernel_patches/patches/ext-2.4-patch-2.patch @@ -0,0 +1,34 @@ +# This is a BitKeeper generated patch for the following project: +# Project Name: Linux kernel tree +# +# namei.c | 9 +++++++++ +# 1 files changed, 9 insertions(+) +# +# The following is the BitKeeper ChangeSet Log +# -------------------------------------------- +# 02/11/07 tytso@snap.thunk.org 1.777 +# Add '.' and '..' entries to be returned by readdir of htree directories +# +# This patch from Chris Li adds '.' and '..' to the rbtree so that they +# are properly returned by readdir. +# -------------------------------------------- +# +diff -Nru a/fs/ext3/namei.c b/fs/ext3/namei.c +--- a/fs/ext3/namei.c Thu Nov 7 10:57:30 2002 ++++ b/fs/ext3/namei.c Thu Nov 7 10:57:30 2002 +@@ -546,6 +546,15 @@ + if (!frame) + return err; + ++ /* Add '.' and '..' 
from the htree header */ ++ if (!start_hash && !start_minor_hash) { ++ de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; ++ ext3_htree_store_dirent(dir_file, 0, 0, de); ++ de = ext3_next_entry(de); ++ ext3_htree_store_dirent(dir_file, 0, 0, de); ++ count += 2; ++ } ++ + while (1) { + block = dx_get_block(frame->at); + dxtrace(printk("Reading block %d\n", block)); diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-3.patch b/lustre/kernel_patches/patches/ext-2.4-patch-3.patch new file mode 100644 index 0000000..2600b02 --- /dev/null +++ b/lustre/kernel_patches/patches/ext-2.4-patch-3.patch @@ -0,0 +1,96 @@ +# This is a BitKeeper generated patch for the following project: +# Project Name: Linux kernel tree +# +# fs/ext3/dir.c | 7 +++++-- +# fs/ext3/namei.c | 11 +++++++---- +# include/linux/ext3_fs.h | 2 +- +# 3 files changed, 13 insertions(+), 7 deletions(-) +# +# The following is the BitKeeper ChangeSet Log +# -------------------------------------------- +# 02/11/07 tytso@snap.thunk.org 1.778 +# Check for failed kmalloc() in ext3_htree_store_dirent() +# +# This patch checks for a failed kmalloc() in ext3_htree_store_dirent(), +# and passes the error up to its caller, ext3_htree_fill_tree(). +# -------------------------------------------- +# +diff -Nru a/fs/ext3/dir.c b/fs/ext3/dir.c +--- a/fs/ext3/dir.c Thu Nov 7 10:57:34 2002 ++++ b/fs/ext3/dir.c Thu Nov 7 10:57:34 2002 +@@ -308,7 +308,7 @@ + /* + * Given a directory entry, enter it into the fname rb tree. 
+ */ +-void ext3_htree_store_dirent(struct file *dir_file, __u32 hash, ++int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3_dir_entry_2 *dirent) + { +@@ -323,6 +323,8 @@ + /* Create and allocate the fname structure */ + len = sizeof(struct fname) + dirent->name_len + 1; + new_fn = kmalloc(len, GFP_KERNEL); ++ if (!new_fn) ++ return -ENOMEM; + memset(new_fn, 0, len); + new_fn->hash = hash; + new_fn->minor_hash = minor_hash; +@@ -344,7 +346,7 @@ + (new_fn->minor_hash == fname->minor_hash)) { + new_fn->next = fname->next; + fname->next = new_fn; +- return; ++ return 0; + } + + if (new_fn->hash < fname->hash) +@@ -359,6 +361,7 @@ + + rb_link_node(&new_fn->rb_hash, parent, p); + rb_insert_color(&new_fn->rb_hash, &info->root); ++ return 0; + } + + +diff -Nru a/fs/ext3/namei.c b/fs/ext3/namei.c +--- a/fs/ext3/namei.c Thu Nov 7 10:57:34 2002 ++++ b/fs/ext3/namei.c Thu Nov 7 10:57:34 2002 +@@ -549,9 +549,11 @@ + /* Add '.' and '..' from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data; +- ext3_htree_store_dirent(dir_file, 0, 0, de); ++ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) ++ goto errout; + de = ext3_next_entry(de); +- ext3_htree_store_dirent(dir_file, 0, 0, de); ++ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0) ++ goto errout; + count += 2; + } + +@@ -570,8 +572,9 @@ + ((hinfo.hash == start_hash) && + (hinfo.minor_hash < start_minor_hash))) + continue; +- ext3_htree_store_dirent(dir_file, hinfo.hash, +- hinfo.minor_hash, de); ++ if ((err = ext3_htree_store_dirent(dir_file, ++ hinfo.hash, hinfo.minor_hash, de)) != 0) ++ goto errout; + count++; + } + brelse (bh); +diff -Nru a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h +--- a/include/linux/ext3_fs.h Thu Nov 7 10:57:34 2002 ++++ b/include/linux/ext3_fs.h Thu Nov 7 10:57:34 2002 +@@ -682,7 +682,7 @@ + extern int ext3_check_dir_entry(const char *, struct inode *, + 
struct ext3_dir_entry_2 *, + struct buffer_head *, unsigned long); +-extern void ext3_htree_store_dirent(struct file *dir_file, __u32 hash, ++extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext3_dir_entry_2 *dirent); + extern void ext3_htree_free_dir_info(struct dir_private_info *p); diff --git a/lustre/kernel_patches/patches/ext-2.4-patch-4.patch b/lustre/kernel_patches/patches/ext-2.4-patch-4.patch new file mode 100644 index 0000000..67f5afa --- /dev/null +++ b/lustre/kernel_patches/patches/ext-2.4-patch-4.patch @@ -0,0 +1,48 @@ +# This is a BitKeeper generated patch for the following project: +# Project Name: Linux kernel tree +# +# namei.c | 21 ++++++++++++++++++++- +# 1 files changed, 20 insertions(+), 1 deletion(-) +# +# The following is the BitKeeper ChangeSet Log +# -------------------------------------------- +# 02/11/07 tytso@snap.thunk.org 1.779 +# Fix ext3 htree rename bug. +# +# This fixes an ext3 htree bug pointed out by Christopher Li; if +# adding the new name to the directory causes a split, this can cause +# the directory entry containing the old name to move to another +# block, and then the removal of the old name will fail. +# -------------------------------------------- +# +diff -Nru a/fs/ext3/namei.c b/fs/ext3/namei.c +--- a/fs/ext3/namei.c Thu Nov 7 10:57:49 2002 ++++ b/fs/ext3/namei.c Thu Nov 7 10:57:49 2002 +@@ -2173,7 +2173,26 @@ + /* + * ok, that's it + */ +- ext3_delete_entry(handle, old_dir, old_de, old_bh); ++ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh); ++ if (retval == -ENOENT) { ++ /* ++ * old_de could have moved out from under us. 
++ */ ++ struct buffer_head *old_bh2; ++ struct ext3_dir_entry_2 *old_de2; ++ ++ old_bh2 = ext3_find_entry(old_dentry, &old_de2); ++ if (old_bh2) { ++ retval = ext3_delete_entry(handle, old_dir, ++ old_de2, old_bh2); ++ brelse(old_bh2); ++ } ++ } ++ if (retval) { ++ ext3_warning(old_dir->i_sb, "ext3_rename", ++ "Deleting old file (%lu), %d, error=%d", ++ old_dir->i_ino, old_dir->i_nlink, retval); ++ } + + if (new_inode) { + new_inode->i_nlink--; diff --git a/lustre/extN/ext3-2.4-ino_t.diff b/lustre/kernel_patches/patches/ext3-2.4-ino_t.patch similarity index 73% rename from lustre/extN/ext3-2.4-ino_t.diff rename to lustre/kernel_patches/patches/ext3-2.4-ino_t.patch index ce1bd88..1786d0f 100644 --- a/lustre/extN/ext3-2.4-ino_t.diff +++ b/lustre/kernel_patches/patches/ext3-2.4-ino_t.patch @@ -1,6 +1,11 @@ ---- linux/fs/ext3/ialloc.c.orig Sat Oct 19 11:42:23 2002 -+++ linux/fs/ext3/ialloc.c Sat Jan 4 12:14:18 2003 -@@ -64,8 +64,8 @@ static int read_inode_bitmap (struct sup + fs/ext3/ialloc.c | 20 ++++++++++---------- + fs/ext3/namei.c | 16 ++++++++-------- + include/linux/ext3_fs.h | 2 +- + 3 files changed, 19 insertions(+), 19 deletions(-) + +--- linux-2.4.20/fs/ext3/ialloc.c~ext3-2.4-ino_t 2003-04-08 23:35:24.000000000 -0600 ++++ linux-2.4.20-braam/fs/ext3/ialloc.c 2003-04-08 23:35:24.000000000 -0600 +@@ -65,8 +65,8 @@ static int read_inode_bitmap (struct sup if (!bh) { ext3_error (sb, "read_inode_bitmap", "Cannot read inode bitmap - " @@ -11,7 +16,7 @@ retval = -EIO; } /* -@@ -531,19 +532,19 @@ out: +@@ -533,19 +533,19 @@ out: } /* Verify that we are loading a valid orphan from disk */ @@ -35,7 +40,7 @@ return NULL; } -@@ -552,7 +553,7 @@ struct inode *ext3_orphan_get (struct su +@@ -554,7 +554,7 @@ struct inode *ext3_orphan_get (struct su if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 || !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) { ext3_warning(sb, __FUNCTION__, @@ -44,7 +49,7 @@ return NULL; } -@@ -563,7 +564,7 @@ struct inode *ext3_orphan_get 
(struct su +@@ -565,16 +565,16 @@ struct inode *ext3_orphan_get (struct su if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) || is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) { ext3_warning(sb, __FUNCTION__, @@ -53,7 +58,6 @@ printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n", bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data)); printk(KERN_NOTICE "inode=%p\n", inode); -@@ -570,9 +571,9 @@ struct inode *ext3_orphan_get (struct su if (inode) { printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); @@ -65,11 +69,11 @@ } /* Avoid freeing blocks if we got a bad deleted inode */ if (inode && inode->i_nlink == 0) ---- linux/fs/ext3/namei.c.orig Sat Oct 19 11:42:45 2002 -+++ linux/fs/ext3/namei.c Sat Jan 4 12:13:27 2003 -@@ -716,10 +716,10 @@ int ext3_orphan_del(handle_t *handle, st - { +--- linux-2.4.20/fs/ext3/namei.c~ext3-2.4-ino_t 2003-04-08 23:35:24.000000000 -0600 ++++ linux-2.4.20-braam/fs/ext3/namei.c 2003-04-08 23:35:24.000000000 -0600 +@@ -1808,10 +1808,10 @@ int ext3_orphan_del(handle_t *handle, st struct list_head *prev; + struct ext3_inode_info *ei = EXT3_I(inode); struct ext3_sb_info *sbi; - ino_t ino_next; + unsigned long ino_next; @@ -78,18 +82,18 @@ - + lock_super(inode->i_sb); - if (list_empty(&inode->u.ext3_i.i_orphan)) { + if (list_empty(&ei->i_orphan)) { unlock_super(inode->i_sb); -@@ -730,7 +730,7 @@ int ext3_orphan_del(handle_t *handle, st - prev = inode->u.ext3_i.i_orphan.prev; +@@ -1822,7 +1822,7 @@ int ext3_orphan_del(handle_t *handle, st + prev = ei->i_orphan.prev; sbi = EXT3_SB(inode->i_sb); - jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); - list_del(&inode->u.ext3_i.i_orphan); - INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); -@@ -741,13 +741,13 @@ int ext3_orphan_del(handle_t *handle, st + list_del(&ei->i_orphan); + INIT_LIST_HEAD(&ei->i_orphan); +@@ -1833,13 +1833,13 @@ int ext3_orphan_del(handle_t *handle, st * 
list in memory. */ if (!handle) goto out; @@ -105,7 +109,7 @@ BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext3_journal_get_write_access(handle, sbi->s_sbh); if (err) -@@ -758,8 +758,8 @@ int ext3_orphan_del(handle_t *handle, st +@@ -1850,8 +1850,8 @@ int ext3_orphan_del(handle_t *handle, st struct ext3_iloc iloc2; struct inode *i_prev = list_entry(prev, struct inode, u.ext3_i.i_orphan); @@ -116,7 +120,7 @@ i_prev->i_ino, ino_next); err = ext3_reserve_inode_write(handle, i_prev, &iloc2); if (err) -@@ -774,7 +774,7 @@ int ext3_orphan_del(handle_t *handle, st +@@ -1866,7 +1866,7 @@ int ext3_orphan_del(handle_t *handle, st if (err) goto out_brelse; @@ -125,9 +129,9 @@ ext3_std_error(inode->i_sb, err); out: unlock_super(inode->i_sb); ---- linux/include/linux/ext3_fs.h.orig Thu Jan 2 16:10:24 2003 -+++ linux/include/linux/ext3_fs.h Sat Jan 4 12:25:41 2003 -@@ -622,7 +622,7 @@ extern int ext3_sync_file (struct file * +--- linux-2.4.20/include/linux/ext3_fs.h~ext3-2.4-ino_t 2003-04-08 23:35:24.000000000 -0600 ++++ linux-2.4.20-braam/include/linux/ext3_fs.h 2003-04-08 23:35:24.000000000 -0600 +@@ -673,7 +673,7 @@ extern int ext3fs_dirhash(const char *na /* ialloc.c */ extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int); extern void ext3_free_inode (handle_t *, struct inode *); @@ -136,3 +140,5 @@ extern unsigned long ext3_count_free_inodes (struct super_block *); extern void ext3_check_inodes_bitmap (struct super_block *); extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + +_ diff --git a/lustre/extN/ext3-2.4.18-fixes.diff b/lustre/kernel_patches/patches/ext3-2.4.18-fixes.patch similarity index 100% rename from lustre/extN/ext3-2.4.18-fixes.diff rename to lustre/kernel_patches/patches/ext3-2.4.18-fixes.patch diff --git a/lustre/extN/ext3-2.4.18-ino_sb_macro.diff b/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro.patch similarity index 99% rename from lustre/extN/ext3-2.4.18-ino_sb_macro.diff rename to 
lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro.patch index cc47588..2ddff7d 100644 --- a/lustre/extN/ext3-2.4.18-ino_sb_macro.diff +++ b/lustre/kernel_patches/patches/ext3-2.4.18-ino_sb_macro.patch @@ -422,7 +422,7 @@ struct ext3_group_desc * gdp; struct ext3_group_desc * tmp; struct ext3_super_block * es; -@@ -318,19 +320,21 @@ struct inode * ext3_new_inode (handle_t +@@ -318,7 +320,9 @@ struct inode * ext3_new_inode (handle_t inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -432,11 +432,8 @@ + init_rwsem(&ei->truncate_sem); lock_super (sb); -- es = sb->u.ext3_sb.s_es; -+ es = sbi->s_es; - repeat: - gdp = NULL; - i = 0; + es = sb->u.ext3_sb.s_es; +@@ -328,9 +332,9 @@ struct inode * ext3_new_inode (handle_t if (S_ISDIR(mode)) { avefreei = le32_to_cpu(es->s_free_inodes_count) / @@ -499,17 +496,6 @@ BUFFER_TRACE(bh, "get_write_access"); err = ext3_journal_get_write_access(handle, bh); if (err) goto fail; -@@ -436,8 +440,8 @@ repeat: - } - goto repeat; - } -- j += i * EXT3_INODES_PER_GROUP(sb) + 1; -- if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) { -+ j += i * sbi->s_inodes_per_group + 1; -+ if (j < sbi->s_first_ino || j > le32_to_cpu(es->s_inodes_count)) { - ext3_error (sb, "ext3_new_inode", - "reserved inode or inode > inodes count - " - "block_group = %d,inode=%d", i, j); @@ -457,13 +461,13 @@ repeat: err = ext3_journal_dirty_metadata(handle, bh2); if (err) goto fail; diff --git a/lustre/extN/patch-2.4.18-chaos22 b/lustre/kernel_patches/patches/ext3-2.4.20-fixes.patch similarity index 60% rename from lustre/extN/patch-2.4.18-chaos22 rename to lustre/kernel_patches/patches/ext3-2.4.20-fixes.patch index c40d4ea..5f566de 100644 --- a/lustre/extN/patch-2.4.18-chaos22 +++ b/lustre/kernel_patches/patches/ext3-2.4.20-fixes.patch @@ -1,7 +1,12 @@ -diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c ---- lum-2.4.18-um30/fs/ext3/balloc.c Mon Feb 25 12:38:08 2002 -+++ uml-2.4.18-12.5/fs/ext3/balloc.c Thu Sep 19 
13:40:11 2002 -@@ -276,7 +276,8 @@ + + + + fs/ext3/balloc.c | 53 +++++++++++++++++++++++++++++++---------------------- + 1 files changed, 31 insertions(+), 22 deletions(-) + +--- linux-2.4.20/fs/ext3/balloc.c~ext3-2.4.20-fixes 2003-04-08 23:35:17.000000000 -0600 ++++ linux-2.4.20-braam/fs/ext3/balloc.c 2003-04-08 23:35:17.000000000 -0600 +@@ -276,7 +276,8 @@ void ext3_free_blocks (handle_t *handle, } lock_super (sb); es = sb->u.ext3_sb.s_es; @@ -11,7 +16,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c (block + count) > le32_to_cpu(es->s_blocks_count)) { ext3_error (sb, "ext3_free_blocks", "Freeing blocks not in datazone - " -@@ -309,17 +310,6 @@ +@@ -309,17 +310,6 @@ do_more: if (!gdp) goto error_return; @@ -29,7 +34,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c /* * We are about to start releasing blocks in the bitmap, * so we need undo access. -@@ -345,14 +335,24 @@ +@@ -345,14 +335,24 @@ do_more: if (err) goto error_return; @@ -38,7 +43,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c + if (block == le32_to_cpu(gdp->bg_block_bitmap) || + block == le32_to_cpu(gdp->bg_inode_bitmap) || + in_range(block, le32_to_cpu(gdp->bg_inode_table), -+ sb->u.ext2_sb.s_itb_per_group)) { ++ EXT3_SB(sb)->s_itb_per_group)) { + ext3_error(sb, __FUNCTION__, + "Freeing block in system zone - block = %lu", + block); @@ -56,7 +61,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c if (debug_bh) { BUFFER_TRACE(debug_bh, "Deleted!"); if (!bh2jh(bitmap_bh)->b_committed_data) -@@ -365,9 +365,8 @@ +@@ -365,9 +365,8 @@ do_more: #endif BUFFER_TRACE(bitmap_bh, "clear bit"); if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) { @@ -68,7 +73,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c BUFFER_TRACE(bitmap_bh, "bit already cleared"); } else { dquot_freed_blocks++; -@@ -415,7 +417,6 @@ +@@ -415,7 +414,6 @@ do_more: if (!err) err = ret; if 
(overflow && !err) { @@ -76,7 +81,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c count = overflow; goto do_more; } -@@ -575,6 +577,7 @@ +@@ -576,6 +574,7 @@ int ext3_new_block (handle_t *handle, st ext3_debug ("goal=%lu.\n", goal); @@ -84,7 +89,7 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c /* * First, test whether the goal block is free. */ -@@ -684,10 +686,21 @@ +@@ -684,10 +683,20 @@ got_block: if (tmp == le32_to_cpu(gdp->bg_block_bitmap) || tmp == le32_to_cpu(gdp->bg_inode_bitmap) || in_range (tmp, le32_to_cpu(gdp->bg_inode_table), @@ -106,60 +111,8 @@ diff -ru lum-2.4.18-um30/fs/ext3/balloc.c uml-2.4.18-12.5/fs/ext3/balloc.c + ext3_set_bit(j, bh->b_data); + goto repeat; + } -+ /* The superblock lock should guard against anybody else beating * us to this point! */ -diff -ru lum-2.4.18-um30/fs/ext3/namei.c uml-2.4.18-12.5/fs/ext3/namei.c ---- lum-2.4.18-um30/fs/ext3/namei.c Fri Nov 9 15:25:04 2001 -+++ uml-2.4.18-12.5/fs/ext3/namei.c Thu Sep 19 13:40:11 2002 -@@ -354,8 +355,8 @@ - */ - dir->i_mtime = dir->i_ctime = CURRENT_TIME; - dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; -- ext3_mark_inode_dirty(handle, dir); - dir->i_version = ++event; -+ ext3_mark_inode_dirty(handle, dir); - BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, bh); - brelse(bh); -@@ -464,8 +465,8 @@ - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - inode->i_mapping->a_ops = &ext3_aops; -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); -+ ext3_mark_inode_dirty(handle, inode); - } - ext3_journal_stop(handle, dir); - return err; -@@ -489,8 +490,8 @@ - err = PTR_ERR(inode); - if (!IS_ERR(inode)) { - init_special_inode(inode, mode, rdev); -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); -+ ext3_mark_inode_dirty(handle, inode); - } - ext3_journal_stop(handle, dir); - return err; -@@ 
-933,8 +934,8 @@ - inode->i_size = l-1; - } - inode->u.ext3_i.i_disksize = inode->i_size; -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); -+ ext3_mark_inode_dirty(handle, inode); - out_stop: - ext3_journal_stop(handle, dir); - return err; -@@ -970,8 +971,8 @@ - ext3_inc_count(handle, inode); - atomic_inc(&inode->i_count); - -- ext3_mark_inode_dirty(handle, inode); - err = ext3_add_nondir(handle, dentry, inode); -+ ext3_mark_inode_dirty(handle, inode); - ext3_journal_stop(handle, dir); - return err; - } + +_ diff --git a/lustre/extN/ext3-2.5-noread.diff b/lustre/kernel_patches/patches/ext3-2.5-noread.patch similarity index 99% rename from lustre/extN/ext3-2.5-noread.diff rename to lustre/kernel_patches/patches/ext3-2.5-noread.patch index f1c611f..1aa2578 100644 --- a/lustre/extN/ext3-2.5-noread.diff +++ b/lustre/kernel_patches/patches/ext3-2.5-noread.patch @@ -204,7 +204,7 @@ + if (block_end > itable_end) + block_end = itable_end; + -+ for (; block < block_end; block++) { ++ for (++block; block < block_end; block++) { + bh[count] = sb_getblk(sb, block); + if (count && (buffer_uptodate(bh[count]) || + buffer_locked(bh[count]))) { diff --git a/lustre/kernel_patches/patches/ext3-2.5.63.patch b/lustre/kernel_patches/patches/ext3-2.5.63.patch new file mode 100644 index 0000000..fd28cd8 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-2.5.63.patch @@ -0,0 +1,150 @@ + fs/ext3/inode.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- + fs/ext3/super.c | 5 ++- + fs/ext3/xattr.c | 5 +++ + fs/ext3/xattr.h | 2 - + 4 files changed, 92 insertions(+), 4 deletions(-) + +--- linux-2.5.63-nointent/fs/ext3/xattr.c~ext3-2.5.63 Fri Mar 21 18:47:19 2003 ++++ linux-2.5.63-nointent-root/fs/ext3/xattr.c Fri Mar 21 18:47:19 2003 +@@ -1181,3 +1181,8 @@ exit_ext3_xattr(void) + ext3_xattr_unregister(EXT3_XATTR_INDEX_USER, + &ext3_xattr_user_handler); + } ++ ++EXPORT_SYMBOL(ext3_xattr_get); ++EXPORT_SYMBOL(ext3_xattr_set); 
++EXPORT_SYMBOL(ext3_xattr_set_handle); ++ +--- linux-2.5.63-nointent/fs/ext3/inode.c~ext3-2.5.63 Fri Mar 21 18:47:19 2003 ++++ linux-2.5.63-nointent-root/fs/ext3/inode.c Fri Mar 21 18:47:19 2003 +@@ -1019,7 +1019,7 @@ struct buffer_head *ext3_bread(handle_t + *err = -EIO; + return NULL; + } +- ++EXPORT_SYMBOL(ext3_bread); + static int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, +@@ -2870,3 +2870,85 @@ int ext3_change_inode_journal_flag(struc + * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we + * need to extend" test in ext3_prepare_write() succeeds. + */ ++ ++/* for each block: 1 ind + 1 dind + 1 tind ++ * for each block: 3 bitmap blocks ++ * for each block: 3 group descriptor blocks ++ * 1 inode block ++ * 1 superblock ++ * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files ++ * ((1+1+1) * 3 * nblocks) + 1 + 1 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS ++ * ++ * XXX assuming: ++ * (1) fs logical block size == page size ++ * (2) ext3 in writeback mode ++ */ ++static inline int ext3_san_write_trans_blocks(int nblocks) ++{ ++ int ret; ++ ++ ret = (1 + 1 + 1) * 3 * nblocks + 1 + 1; ++ ++#ifdef CONFIG_QUOTA ++ ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return ret; ++} ++ ++/* Allocate blocks for an inode without creating any buffer/page ++ * for data I/O; set the inode size if the file is extended. 
++ * ++ * @inode: target inode ++ * @blocks: array of logic block number ++ * @nblocks: how many blocks need be alloced ++ * @newsize: new filesize we should set ++ * ++ * return: 0 success, otherwise failed ++ * (*blocks) contains physical block number alloced ++ * ++ * XXX this assume the fs block size == page size ++ */ ++int ext3_prep_san_write(struct inode *inode, long *blocks, ++ int nblocks, loff_t newsize) ++{ ++ handle_t *handle; ++ struct buffer_head bh_tmp; ++ int needed_blocks; ++ int i, ret = 0, ret2; ++ ++ needed_blocks = ext3_san_write_trans_blocks(nblocks); ++ ++ lock_kernel(); ++ handle = ext3_journal_start(inode, needed_blocks); ++ if (IS_ERR(handle)) { ++ unlock_kernel(); ++ return PTR_ERR(handle); ++ } ++ unlock_kernel(); ++ ++ /* alloc blocks one by one */ ++ for (i = 0; i < nblocks; i++) { ++ ret = ext3_get_block_handle(handle, inode, blocks[i], ++ &bh_tmp, 1, 1); ++ if (ret) ++ break; ++ ++ blocks[i] = bh_tmp.b_blocknr; ++ } ++ ++ /* set inode size if needed */ ++ if (!ret && (newsize > inode->i_size)) { ++ inode->i_size = newsize; ++ ext3_mark_inode_dirty(handle, inode); ++ } ++ ++ lock_kernel(); ++ ret2 = ext3_journal_stop(handle, inode); ++ unlock_kernel(); ++ ++ if (!ret) ++ ret = ret2; ++ return ret; ++} ++EXPORT_SYMBOL(ext3_prep_san_write); +--- linux-2.5.63-nointent/fs/ext3/super.c~ext3-2.5.63 Fri Mar 21 18:47:19 2003 ++++ linux-2.5.63-nointent-root/fs/ext3/super.c Fri Mar 21 18:47:19 2003 +@@ -1492,10 +1492,10 @@ static journal_t *ext3_get_dev_journal(s + printk(KERN_ERR "EXT3-fs: I/O error on journal device\n"); + goto out_journal; + } +- if (ntohl(journal->j_superblock->s_nr_users) != 1) { ++ if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { + printk(KERN_ERR "EXT3-fs: External journal has more than one " + "user (unsupported) - %d\n", +- ntohl(journal->j_superblock->s_nr_users)); ++ be32_to_cpu(journal->j_superblock->s_nr_users)); + goto out_journal; + } + EXT3_SB(sb)->journal_bdev = bdev; +@@ -1703,6 +1703,7 @@ int 
ext3_force_commit(struct super_block + unlock_kernel(); + return ret; + } ++EXPORT_SYMBOL(ext3_force_commit); + + /* + * Ext3 always journals updates to the superblock itself, so we don't +--- linux-2.5.63-nointent/fs/ext3/xattr.h~ext3-2.5.63 Fri Mar 21 18:47:19 2003 ++++ linux-2.5.63-nointent-root/fs/ext3/xattr.h Fri Mar 21 18:47:19 2003 +@@ -5,7 +5,7 @@ + + (C) 2001 Andreas Gruenbacher, + */ +- ++#include + #include + #include + + +_ diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18.patch new file mode 100644 index 0000000..6b9a348 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.18.patch @@ -0,0 +1,302 @@ + 0 files changed + +--- linux-2.4.18-chaos52/fs/ext3/super.c~ext3-delete_thread-2.4.18 2003-06-01 03:24:13.000000000 +0800 ++++ linux-2.4.18-chaos52-root/fs/ext3/super.c 2003-06-03 17:01:49.000000000 +0800 +@@ -398,6 +398,210 @@ static void dump_orphan_list(struct supe + } + } + ++#ifdef EXT3_DELETE_THREAD ++/* ++ * Delete inodes in a loop until there are no more to be deleted. ++ * Normally, we run in the background doing the deletes and sleeping again, ++ * and clients just add new inodes to be deleted onto the end of the list. ++ * If someone is concerned about free space (e.g. block allocation or similar) ++ * then they can sleep on s_delete_waiter_queue and be woken up when space ++ * has been freed. 
++ */ ++int ext3_delete_thread(void *data) ++{ ++ struct super_block *sb = data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct task_struct *tsk = current; ++ ++ /* Almost like daemonize, but not quite */ ++ exit_mm(current); ++ tsk->session = 1; ++ tsk->pgrp = 1; ++ tsk->tty = NULL; ++ exit_files(current); ++ reparent_to_init(); ++ ++ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); ++ sigfillset(&tsk->blocked); ++ ++ /*tsk->flags |= PF_KERNTHREAD;*/ ++ ++ INIT_LIST_HEAD(&sbi->s_delete_list); ++ wake_up(&sbi->s_delete_waiter_queue); ++ ext3_debug("EXT3-fs: delete thread on %s started\n", ++ kdevname(sb->s_dev)); ++ ++ /* main loop */ ++ for (;;) { ++ sleep_on(&sbi->s_delete_thread_queue); ++ ext3_debug("%s woken up: %lu inodes, %lu blocks\n", ++ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); ++ ++ spin_lock(&sbi->s_delete_lock); ++ if (list_empty(&sbi->s_delete_list)) { ++ memset(&sbi->s_delete_list, 0, ++ sizeof(sbi->s_delete_list)); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("ext3 delete thread on %s exiting\n", ++ kdevname(sb->s_dev)); ++ wake_up(&sbi->s_delete_waiter_queue); ++ break; ++ } ++ ++ while (!list_empty(&sbi->s_delete_list)) { ++ struct inode *inode=list_entry(sbi->s_delete_list.next, ++ struct inode, i_dentry); ++ unsigned long blocks = inode->i_blocks >> ++ (inode->i_blkbits - 9); ++ ++ list_del_init(&inode->i_dentry); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("%s delete ino %lu blk %lu\n", ++ tsk->comm, inode->i_ino, blocks); ++ ++ iput(inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ sbi->s_delete_blocks -= blocks; ++ sbi->s_delete_inodes--; ++ } ++ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) ++ ext3_warning(sb, __FUNCTION__, ++ "%lu blocks, %lu inodes on list?\n", ++ sbi->s_delete_blocks,sbi->s_delete_inodes); ++ sbi->s_delete_blocks = 0; ++ sbi->s_delete_inodes = 0; ++ spin_unlock(&sbi->s_delete_lock); ++ wake_up(&sbi->s_delete_waiter_queue); ++ } ++ ++ return 0; ++} ++ ++static void 
ext3_start_delete_thread(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int rc; ++ ++ spin_lock_init(&sbi->s_delete_lock); ++ memset(&sbi->s_delete_list, 0, sizeof(sbi->s_delete_list)); ++ init_waitqueue_head(&sbi->s_delete_thread_queue); ++ init_waitqueue_head(&sbi->s_delete_waiter_queue); ++ sbi->s_delete_blocks = 0; ++ sbi->s_delete_inodes = 0; ++ ++ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); ++ if (rc < 0) ++ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", ++ rc); ++ else ++ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); ++} ++ ++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) ++{ ++ wake_up(&sbi->s_delete_thread_queue); ++ wait_event(sbi->s_delete_waiter_queue, list_empty(&sbi->s_delete_list)); ++} ++ ++/* Instead of playing games with the inode flags, destruction, etc we just ++ * create a new inode locally and put it on a list for the truncate thread. ++ * We need large parts of the inode struct in order to complete the ++ * truncate and unlink, so we may as well just have a real inode to do it. ++ * ++ * If we have any problem deferring the delete, just delete it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * truncate thread when we run out of space. ++ * ++ * In 2.5 this can be done much more cleanly by just registering a "drop" ++ * method in the super_operations struct. 
++ */ ++static void ext3_delete_inode_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct inode *new_inode; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (is_bad_inode(old_inode)) { ++ clear_inode(old_inode); ++ return; ++ } ++ ++ if (!test_opt (old_inode->i_sb, ASYNCDEL)) { ++ ext3_delete_inode(old_inode); ++ return; ++ } ++ ++ /* We may want to delete the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || ++ !sbi->s_delete_list.next) { ++ ext3_delete_inode(old_inode); ++ return; ++ } ++ ++ if (EXT3_I(old_inode)->i_state & EXT3_STATE_DELETE) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ ext3_delete_inode(old_inode); ++ return; ++ } ++ ++ /* We can iget this inode again here, because our caller has unhashed ++ * old_inode, so new_inode will be in a different inode struct. ++ * ++ * We need to ensure that the i_orphan pointers in the other inodes ++ * point at the new inode copy instead of the old one so the orphan ++ * list doesn't get corrupted when the old orphan inode is freed. ++ */ ++ down(&sbi->s_orphan_lock); ++ ++ EXT3_SB(old_inode->i_sb)->s_mount_state |= EXT3_ORPHAN_FS; ++ new_inode = iget(old_inode->i_sb, old_inode->i_ino); ++ EXT3_SB(old_inode->i_sb)->s_mount_state &= ~EXT3_ORPHAN_FS; ++ if (is_bad_inode(new_inode)) { ++ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino); ++ iput(new_inode); ++ new_inode = NULL; ++ } ++ if (!new_inode) { ++ up(&sbi->s_orphan_lock); ++ ext3_debug(KERN_DEBUG "delete inode %lu directly (bad read)\n", ++ old_inode->i_ino); ++ ext3_delete_inode(old_inode); ++ return; ++ } ++ J_ASSERT(new_inode != old_inode); ++ ++ J_ASSERT(!list_empty(&EXT3_I(old_inode)->i_orphan)); ++ /* Ugh. 
We need to insert new_inode into the same spot on the list ++ * as old_inode was, to ensure the in-memory orphan list is still ++ * in the same order as the on-disk orphan list (badness otherwise). ++ */ ++ EXT3_I(new_inode)->i_orphan = EXT3_I(old_inode)->i_orphan; ++ EXT3_I(new_inode)->i_orphan.next->prev = &EXT3_I(new_inode)->i_orphan; ++ EXT3_I(new_inode)->i_orphan.prev->next = &EXT3_I(new_inode)->i_orphan; ++ EXT3_I(new_inode)->i_state |= EXT3_STATE_DELETE; ++ up(&sbi->s_orphan_lock); ++ ++ clear_inode(old_inode); ++ ++ ext3_debug("delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_dentry)); ++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++} ++#else ++#define ext3_start_delete_thread(sbi) do {} while(0) ++#define ext3_stop_delete_thread(sbi) do {} while(0) ++#endif /* EXT3_DELETE_THREAD */ ++ + void ext3_put_super (struct super_block * sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); +@@ -405,6 +609,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_stop_delete_thread(sbi); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -453,7 +658,11 @@ static struct super_operations ext3_sops + write_inode: ext3_write_inode, /* BKL not held. Don't need */ + dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ + put_inode: ext3_put_inode, /* BKL not held. Don't need */ ++#ifdef EXT3_DELETE_THREAD ++ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */ ++#else + delete_inode: ext3_delete_inode, /* BKL not held. 
We take it */ ++#endif + put_super: ext3_put_super, /* BKL held */ + write_super: ext3_write_super, /* BKL held */ + sync_fs: ext3_sync_fs, +@@ -514,6 +723,12 @@ static int parse_options (char * options + this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; ++#ifdef EXT3_DELETE_THREAD ++ if (!strcmp(this_char, "asyncdel")) ++ set_opt(*mount_options, ASYNCDEL); ++ else ++#endif ++ + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -1209,6 +1424,7 @@ struct super_block * ext3_read_super (st + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); ++ ext3_start_delete_thread(sb); + /* + * akpm: core read_super() calls in here with the superblock locked. + * That deadlocks, because orphan cleanup needs to lock the superblock +--- linux-2.4.18-chaos52/include/linux/ext3_fs.h~ext3-delete_thread-2.4.18 2003-06-01 03:24:11.000000000 +0800 ++++ linux-2.4.18-chaos52-root/include/linux/ext3_fs.h 2003-06-03 17:03:28.000000000 +0800 +@@ -190,6 +190,7 @@ struct ext3_group_desc + */ + #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ + #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ ++#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ + + /* + * ioctl commands +@@ -317,6 +318,7 @@ struct ext3_inode { + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + #define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */ ++#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +--- linux-2.4.18-chaos52/include/linux/ext3_fs_sb.h~ext3-delete_thread-2.4.18 2003-06-01 03:24:13.000000000 +0800 ++++ linux-2.4.18-chaos52-root/include/linux/ext3_fs_sb.h 2003-06-03 16:59:24.000000000 +0800 +@@ -29,6 +29,8 @@ + + #define 
EXT3_MAX_GROUP_LOADED 32 + ++#define EXT3_DELETE_THREAD ++ + /* + * third extended-fs super-block data in memory + */ +@@ -74,6 +76,14 @@ struct ext3_sb_info { + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif ++#ifdef EXT3_DELETE_THREAD ++ spinlock_t s_delete_lock; ++ struct list_head s_delete_list; ++ unsigned long s_delete_blocks; ++ unsigned long s_delete_inodes; ++ wait_queue_head_t s_delete_thread_queue; ++ wait_queue_head_t s_delete_waiter_queue; ++#endif + }; + + #endif /* _LINUX_EXT3_FS_SB */ + +_ diff --git a/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch new file mode 100644 index 0000000..be2723c --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-delete_thread-2.4.20.patch @@ -0,0 +1,300 @@ +diff -puNr origin/fs/ext3/super.c linux/fs/ext3/super.c +--- origin/fs/ext3/super.c 2003-05-04 17:23:52.000000000 +0400 ++++ linux/fs/ext3/super.c 2003-05-04 17:09:20.000000000 +0400 +@@ -398,6 +398,210 @@ static void dump_orphan_list(struct supe + } + } + ++#ifdef EXT3_DELETE_THREAD ++/* ++ * Delete inodes in a loop until there are no more to be deleted. ++ * Normally, we run in the background doing the deletes and sleeping again, ++ * and clients just add new inodes to be deleted onto the end of the list. ++ * If someone is concerned about free space (e.g. block allocation or similar) ++ * then they can sleep on s_delete_waiter_queue and be woken up when space ++ * has been freed. 
++ */ ++int ext3_delete_thread(void *data) ++{ ++ struct super_block *sb = data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct task_struct *tsk = current; ++ ++ /* Almost like daemonize, but not quite */ ++ exit_mm(current); ++ tsk->session = 1; ++ tsk->pgrp = 1; ++ tsk->tty = NULL; ++ exit_files(current); ++ reparent_to_init(); ++ ++ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); ++ sigfillset(&tsk->blocked); ++ ++ /*tsk->flags |= PF_KERNTHREAD;*/ ++ ++ INIT_LIST_HEAD(&sbi->s_delete_list); ++ wake_up(&sbi->s_delete_waiter_queue); ++ ext3_debug("EXT3-fs: delete thread on %s started\n", ++ kdevname(sb->s_dev)); ++ ++ /* main loop */ ++ for (;;) { ++ sleep_on(&sbi->s_delete_thread_queue); ++ ext3_debug("%s woken up: %lu inodes, %lu blocks\n", ++ tsk->comm,sbi->s_delete_inodes,sbi->s_delete_blocks); ++ ++ spin_lock(&sbi->s_delete_lock); ++ if (list_empty(&sbi->s_delete_list)) { ++ memset(&sbi->s_delete_list, 0, ++ sizeof(sbi->s_delete_list)); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("ext3 delete thread on %s exiting\n", ++ kdevname(sb->s_dev)); ++ wake_up(&sbi->s_delete_waiter_queue); ++ break; ++ } ++ ++ while (!list_empty(&sbi->s_delete_list)) { ++ struct inode *inode=list_entry(sbi->s_delete_list.next, ++ struct inode, i_dentry); ++ unsigned long blocks = inode->i_blocks >> ++ (inode->i_blkbits - 9); ++ ++ list_del_init(&inode->i_dentry); ++ spin_unlock(&sbi->s_delete_lock); ++ ext3_debug("%s delete ino %lu blk %lu\n", ++ tsk->comm, inode->i_ino, blocks); ++ ++ iput(inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ sbi->s_delete_blocks -= blocks; ++ sbi->s_delete_inodes--; ++ } ++ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) ++ ext3_warning(sb, __FUNCTION__, ++ "%lu blocks, %lu inodes on list?\n", ++ sbi->s_delete_blocks,sbi->s_delete_inodes); ++ sbi->s_delete_blocks = 0; ++ sbi->s_delete_inodes = 0; ++ spin_unlock(&sbi->s_delete_lock); ++ wake_up(&sbi->s_delete_waiter_queue); ++ } ++ ++ return 0; ++} ++ ++static void 
ext3_start_delete_thread(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int rc; ++ ++ spin_lock_init(&sbi->s_delete_lock); ++ memset(&sbi->s_delete_list, 0, sizeof(sbi->s_delete_list)); ++ init_waitqueue_head(&sbi->s_delete_thread_queue); ++ init_waitqueue_head(&sbi->s_delete_waiter_queue); ++ sbi->s_delete_blocks = 0; ++ sbi->s_delete_inodes = 0; ++ ++ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); ++ if (rc < 0) ++ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", ++ rc); ++ else ++ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); ++} ++ ++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) ++{ ++ wake_up(&sbi->s_delete_thread_queue); ++ wait_event(sbi->s_delete_waiter_queue, list_empty(&sbi->s_delete_list)); ++} ++ ++/* Instead of playing games with the inode flags, destruction, etc we just ++ * create a new inode locally and put it on a list for the truncate thread. ++ * We need large parts of the inode struct in order to complete the ++ * truncate and unlink, so we may as well just have a real inode to do it. ++ * ++ * If we have any problem deferring the delete, just delete it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * truncate thread when we run out of space. ++ * ++ * In 2.5 this can be done much more cleanly by just registering a "drop" ++ * method in the super_operations struct. 
++ */ ++static void ext3_delete_inode_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct inode *new_inode; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (is_bad_inode(old_inode)) { ++ clear_inode(old_inode); ++ return; ++ } ++ ++ if (!test_opt (old_inode->i_sb, ASYNCDEL)) { ++ ext3_delete_inode(old_inode); ++ return; ++ } ++ ++ /* We may want to delete the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || ++ !sbi->s_delete_list.next) { ++ ext3_delete_inode(old_inode); ++ return; ++ } ++ ++ if (EXT3_I(old_inode)->i_state & EXT3_STATE_DELETE) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ ext3_delete_inode(old_inode); ++ return; ++ } ++ ++ /* We can iget this inode again here, because our caller has unhashed ++ * old_inode, so new_inode will be in a different inode struct. ++ * ++ * We need to ensure that the i_orphan pointers in the other inodes ++ * point at the new inode copy instead of the old one so the orphan ++ * list doesn't get corrupted when the old orphan inode is freed. ++ */ ++ down(&sbi->s_orphan_lock); ++ ++ EXT3_SB(old_inode->i_sb)->s_mount_state |= EXT3_ORPHAN_FS; ++ new_inode = iget(old_inode->i_sb, old_inode->i_ino); ++ EXT3_SB(old_inode->i_sb)->s_mount_state &= ~EXT3_ORPHAN_FS; ++ if (is_bad_inode(new_inode)) { ++ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino); ++ iput(new_inode); ++ new_inode = NULL; ++ } ++ if (!new_inode) { ++ up(&sbi->s_orphan_lock); ++ ext3_debug(KERN_DEBUG "delete inode %lu directly (bad read)\n", ++ old_inode->i_ino); ++ ext3_delete_inode(old_inode); ++ return; ++ } ++ J_ASSERT(new_inode != old_inode); ++ ++ J_ASSERT(!list_empty(&EXT3_I(old_inode)->i_orphan)); ++ /* Ugh. 
We need to insert new_inode into the same spot on the list ++ * as old_inode was, to ensure the in-memory orphan list is still ++ * in the same order as the on-disk orphan list (badness otherwise). ++ */ ++ EXT3_I(new_inode)->i_orphan = EXT3_I(old_inode)->i_orphan; ++ EXT3_I(new_inode)->i_orphan.next->prev = &EXT3_I(new_inode)->i_orphan; ++ EXT3_I(new_inode)->i_orphan.prev->next = &EXT3_I(new_inode)->i_orphan; ++ EXT3_I(new_inode)->i_state |= EXT3_STATE_DELETE; ++ up(&sbi->s_orphan_lock); ++ ++ clear_inode(old_inode); ++ ++ ext3_debug("delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_dentry)); ++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++} ++#else ++#define ext3_start_delete_thread(sbi) do {} while(0) ++#define ext3_stop_delete_thread(sbi) do {} while(0) ++#endif /* EXT3_DELETE_THREAD */ ++ + void ext3_put_super (struct super_block * sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); +@@ -405,6 +611,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_stop_delete_thread(sbi); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -453,7 +660,11 @@ static struct super_operations ext3_sops + write_inode: ext3_write_inode, /* BKL not held. Don't need */ + dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ + put_inode: ext3_put_inode, /* BKL not held. Don't need */ ++#ifdef EXT3_DELETE_THREAD ++ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */ ++#else + delete_inode: ext3_delete_inode, /* BKL not held. We take it */ ++#endif + put_super: ext3_put_super, /* BKL held */ + write_super: ext3_write_super, /* BKL held */ + write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. 
Take it */ +@@ -514,6 +725,11 @@ static int parse_options (char * options + this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; ++#ifdef EXT3_DELETE_THREAD ++ if (!strcmp(this_char, "asyncdel")) ++ set_opt(*mount_options, ASYNCDEL); ++ else ++#endif + #ifdef CONFIG_EXT3_FS_XATTR_USER + if (!strcmp (this_char, "user_xattr")) + set_opt (*mount_options, XATTR_USER); +@@ -1220,6 +1436,7 @@ struct super_block * ext3_read_super (st + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); ++ ext3_start_delete_thread(sb); + /* + * akpm: core read_super() calls in here with the superblock locked. + * That deadlocks, because orphan cleanup needs to lock the superblock +diff -puNr origin/include/linux/ext3_fs.h linux/include/linux/ext3_fs.h +--- origin/include/linux/ext3_fs.h 2003-05-04 17:22:49.000000000 +0400 ++++ linux/include/linux/ext3_fs.h 2003-05-04 15:06:10.000000000 +0400 +@@ -193,6 +193,7 @@ struct ext3_group_desc + */ + #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ + #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ ++#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ + + /* + * ioctl commands +@@ -321,6 +322,7 @@ struct ext3_inode { + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ ++#define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +diff -puNr origin/include/linux/ext3_fs_sb.h linux/include/linux/ext3_fs_sb.h +--- origin/include/linux/ext3_fs_sb.h 2003-05-04 17:23:52.000000000 +0400 ++++ linux/include/linux/ext3_fs_sb.h 2003-05-04 11:37:04.000000000 +0400 +@@ -29,6 +29,8 @@ + + #define EXT3_MAX_GROUP_LOADED 8 + ++#define EXT3_DELETE_THREAD ++ + /* + * third extended-fs super-block data in memory + 
*/ +@@ -76,6 +78,14 @@ struct ext3_sb_info { + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif ++#ifdef EXT3_DELETE_THREAD ++ spinlock_t s_delete_lock; ++ struct list_head s_delete_list; ++ unsigned long s_delete_blocks; ++ unsigned long s_delete_inodes; ++ wait_queue_head_t s_delete_thread_queue; ++ wait_queue_head_t s_delete_waiter_queue; ++#endif + }; + + #endif /* _LINUX_EXT3_FS_SB */ diff --git a/lustre/kernel_patches/patches/ext3-largefile.patch b/lustre/kernel_patches/patches/ext3-largefile.patch new file mode 100644 index 0000000..aa7a2f2 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-largefile.patch @@ -0,0 +1,16 @@ + fs/ext3/inode.c | 2 +- + 1 files changed, 1 insertion(+), 1 deletion(-) + +--- linux-2.4.20/fs/ext3/inode.c~ext3-largefile 2003-04-08 23:35:36.000000000 -0600 ++++ linux-2.4.20-braam/fs/ext3/inode.c 2003-04-08 23:35:36.000000000 -0600 +@@ -2562,7 +2562,7 @@ void ext3_dirty_inode(struct inode *inod + handle_t *handle; + + lock_kernel(); +- handle = ext3_journal_start(inode, 1); ++ handle = ext3_journal_start(inode, 2); + if (IS_ERR(handle)) + goto out; + if (current_handle && + +_ diff --git a/lustre/kernel_patches/patches/ext3-noread-2.4.20.patch b/lustre/kernel_patches/patches/ext3-noread-2.4.20.patch new file mode 100644 index 0000000..b14b869 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-noread-2.4.20.patch @@ -0,0 +1,218 @@ + fs/ext3/ialloc.c | 47 ++++++++++++++++++++++- + fs/ext3/inode.c | 96 +++++++++++++++++++++++++++++++++++++----------- + include/linux/ext3_fs.h | 2 + + 3 files changed, 121 insertions(+), 24 deletions(-) + +--- linux-2.4.20/fs/ext3/ialloc.c~ext3-noread-2.4.20 2003-05-16 12:21:39.000000000 +0800 ++++ linux-2.4.20-root/fs/ext3/ialloc.c 2003-05-16 12:21:46.000000000 +0800 +@@ -289,6 +289,37 @@ error_return: + } + + /* ++ * @block_group: block group of inode ++ * @offset: 
relative offset of inode within @block_group ++ * ++ * Check whether any of the inodes in this disk block are in use. ++ * ++ * Caller must be holding superblock lock (group/bitmap read lock in future). ++ */ ++int ext3_itable_block_used(struct super_block *sb, unsigned int block_group, ++ int offset) ++{ ++ int bitmap_nr = load_inode_bitmap(sb, block_group); ++ int inodes_per_block; ++ unsigned long inum, iend; ++ struct buffer_head *ibitmap; ++ ++ if (bitmap_nr < 0) ++ return 1; ++ ++ inodes_per_block = sb->s_blocksize / EXT3_SB(sb)->s_inode_size; ++ inum = offset & ~(inodes_per_block - 1); ++ iend = inum + inodes_per_block; ++ ibitmap = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr]; ++ for (; inum < iend; inum++) { ++ if (inum != offset && ext3_test_bit(inum, ibitmap->b_data)) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of +@@ -310,6 +341,7 @@ struct inode * ext3_new_inode (handle_t + struct ext3_group_desc * gdp; + struct ext3_group_desc * tmp; + struct ext3_super_block * es; ++ struct ext3_iloc iloc; + int err = 0; + + /* Cannot create files in a deleted directory */ +@@ -510,8 +542,19 @@ repeat: + inode->i_generation = sb->u.ext3_sb.s_next_generation++; + + inode->u.ext3_i.i_state = EXT3_STATE_NEW; +- err = ext3_mark_inode_dirty(handle, inode); +- if (err) goto fail; ++ err = ext3_get_inode_loc_new(inode, &iloc, 1); ++ if (err) goto fail; ++ BUFFER_TRACE(iloc->bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, iloc.bh); ++ if (err) { ++ brelse(iloc.bh); ++ iloc.bh = NULL; ++ goto fail; ++ } ++ err = ext3_mark_iloc_dirty(handle, inode, &iloc); ++ if (err) goto fail; ++ ++ + + unlock_super (sb); + if(DQUOT_ALLOC_INODE(inode)) { +--- linux-2.4.20/fs/ext3/inode.c~ext3-noread-2.4.20 2003-05-16 12:21:41.000000000 +0800 ++++ 
linux-2.4.20-root/fs/ext3/inode.c 2003-05-16 12:22:15.000000000 +0800 +@@ -2013,14 +2013,19 @@ out_stop: + ext3_journal_stop(handle, inode); + } + +-/* +- * ext3_get_inode_loc returns with an extra refcount against the +- * inode's underlying buffer_head on success. +- */ +- +-int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc) ++#define NUM_INODE_PREREAD 16 ++ ++/* ++ * ext3_get_inode_loc returns with an extra refcount against the inode's ++ * underlying buffer_head on success. If this is for a new inode allocation ++ * (new is non-zero) then we may be able to optimize away the read if there ++ * are no other in-use inodes in this inode table block. If we need to do ++ * a read, then read in a whole chunk of blocks to avoid blocking again soon ++ * if we are doing lots of creates/updates. ++ */ ++int ext3_get_inode_loc_new(struct inode *inode, struct ext3_iloc *iloc, int new) + { +- struct buffer_head *bh = 0; ++ struct buffer_head *bh[NUM_INODE_PREREAD]; + unsigned long block; + unsigned long block_group; + unsigned long group_desc; +@@ -2045,31 +2050,73 @@ int ext3_get_inode_loc (struct inode *in + } + group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb); + desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1); +- bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc]; +- if (!bh) { ++ if (!(inode->i_sb->u.ext3_sb.s_group_desc[group_desc])) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "Descriptor not loaded"); + goto bad_inode; + } + +- gdp = (struct ext3_group_desc *) bh->b_data; ++ gdp = (struct ext3_group_desc *)(inode->i_sb->u.ext3_sb.s_group_desc[group_desc]->b_data); + /* + * Figure out the offset within the block group inode table + */ +- offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) * +- EXT3_INODE_SIZE(inode->i_sb); ++ offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)); ++ + block = le32_to_cpu(gdp[desc].bg_inode_table) + +- (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb)); +- 
if (!(bh = sb_bread(inode->i_sb, block))) { +- ext3_error (inode->i_sb, "ext3_get_inode_loc", +- "unable to read inode block - " +- "inode=%lu, block=%lu", inode->i_ino, block); +- goto bad_inode; +- } +- offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1); ++ (offset * EXT3_INODE_SIZE(inode->i_sb) >> EXT3_BLOCK_SIZE_BITS(inode->i_sb)); + +- iloc->bh = bh; +- iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset); ++ bh[0] = sb_getblk(inode->i_sb, block); ++ if (buffer_uptodate(bh[0])) ++ goto done; ++ ++ /* If we don't really need to read this block, and it isn't already ++ * in memory, then we just zero it out. Otherwise, we keep the ++ * current block contents (deleted inode data) for posterity. ++ */ ++ if (new && !ext3_itable_block_used(inode->i_sb, block_group, offset)) { ++ lock_buffer(bh[0]); ++ memset(bh[0]->b_data, 0, bh[0]->b_size); ++ mark_buffer_uptodate(bh[0], 1); ++ unlock_buffer(bh[0]); ++ } else { ++ unsigned long block_end, itable_end; ++ int count = 1; ++ ++ itable_end = le32_to_cpu(gdp[desc].bg_inode_table) + ++ inode->i_sb->u.ext3_sb.s_itb_per_group; ++ block_end = block + NUM_INODE_PREREAD; ++ if (block_end > itable_end) ++ block_end = itable_end; ++ ++ for (++block; block < block_end; block++) { ++ bh[count] = sb_getblk(inode->i_sb, block); ++ if (count && (buffer_uptodate(bh[count]) || ++ buffer_locked(bh[count]))) { ++ __brelse(bh[count]); ++ } else ++ count++; ++ } ++ ++ ll_rw_block(READ, count, bh); ++ ++ /* Release all but the block we actually need (bh[0]) */ ++ while (--count > 0) ++ __brelse(bh[count]); ++ ++ wait_on_buffer(bh[0]); ++ if (!buffer_uptodate(bh[0])) { ++ ext3_error(inode->i_sb, __FUNCTION__, ++ "unable to read inode block - " ++ "inode=%lu, block=%lu", inode->i_ino, ++ bh[0]->b_blocknr); ++ goto bad_inode; ++ } ++ } ++ done: ++ offset = (offset * EXT3_INODE_SIZE(inode->i_sb)) & (EXT3_BLOCK_SIZE(inode->i_sb) - 1); ++ ++ iloc->bh = bh[0]; ++ iloc->raw_inode = (struct ext3_inode *)(bh[0]->b_data + offset); + 
iloc->block_group = block_group; + + return 0; +@@ -2078,6 +2125,11 @@ int ext3_get_inode_loc (struct inode *in + return -EIO; + } + ++int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) ++{ ++ return ext3_get_inode_loc_new(inode, iloc, 0); ++} ++ + void ext3_read_inode(struct inode * inode) + { + struct ext3_iloc iloc; +--- linux-2.4.20/include/linux/ext3_fs.h~ext3-noread-2.4.20 2003-05-16 12:21:39.000000000 +0800 ++++ linux-2.4.20-root/include/linux/ext3_fs.h 2003-05-16 12:21:46.000000000 +0800 +@@ -683,6 +683,8 @@ extern int ext3_forget(handle_t *, int, + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); + ++extern int ext3_itable_block_used(struct super_block *sb, unsigned int, int); ++extern int ext3_get_inode_loc_new(struct inode *, struct ext3_iloc *, int); + extern int ext3_get_inode_loc (struct inode *, struct ext3_iloc *); + extern void ext3_read_inode (struct inode *); + extern void ext3_write_inode (struct inode *, int); + +_ diff --git a/lustre/kernel_patches/patches/ext3-orphan_lock.patch b/lustre/kernel_patches/patches/ext3-orphan_lock.patch new file mode 100644 index 0000000..d1e5c8d --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-orphan_lock.patch @@ -0,0 +1,79 @@ +--- linux/fs/ext3/namei.c.orig Fri Mar 14 14:11:58 2003 ++++ linux/fs/ext3/namei.c Fri Mar 14 14:39:48 2003 +@@ -1406,8 +1409,8 @@ + struct super_block *sb = inode->i_sb; + struct ext3_iloc iloc; + int err = 0, rc; +- +- lock_super(sb); ++ ++ down(&EXT3_SB(sb)->s_orphan_lock); + if (!list_empty(&EXT3_I(inode)->i_orphan)) + goto out_unlock; + +@@ -1455,7 +1458,7 @@ + jbd_debug(4, "orphan inode %ld will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); + out_unlock: +- unlock_super(sb); ++ up(&EXT3_SB(sb)->s_orphan_lock); + ext3_std_error(inode->i_sb, err); + return err; + } +@@ -1468,20 +1471,19 @@ + { + struct list_head *prev; + struct 
ext3_inode_info *ei = EXT3_I(inode); +- struct ext3_sb_info *sbi; ++ struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb); + unsigned long ino_next; + struct ext3_iloc iloc; + int err = 0; + +- lock_super(inode->i_sb); ++ down(&sbi->s_orphan_lock); + if (list_empty(&ei->i_orphan)) { +- unlock_super(inode->i_sb); ++ up(&sbi->s_orphan_lock); + return 0; + } + + ino_next = NEXT_ORPHAN(inode); + prev = ei->i_orphan.prev; +- sbi = EXT3_SB(inode->i_sb); + + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + +@@ -1525,10 +1527,10 @@ + if (err) + goto out_brelse; + +-out_err: ++out_err: + ext3_std_error(inode->i_sb, err); + out: +- unlock_super(inode->i_sb); ++ up(&sbi->s_orphan_lock); + return err; + + out_brelse: +--- linux/fs/ext3/super.c.orig Fri Mar 14 14:11:58 2003 ++++ linux/fs/ext3/super.c Fri Mar 14 14:36:00 2003 +@@ -1134,6 +1314,7 @@ + */ + sb->s_op = &ext3_sops; + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ ++ sema_init(&sbi->s_orphan_lock, 1); + + sb->s_root = 0; + +--- linux/include/linux/ext3_fs_sb.h.orig Tue Feb 11 16:34:33 2003 ++++ linux/include/linux/ext3_fs_sb.h Fri Mar 14 14:30:11 2003 +@@ -67,6 +69,7 @@ + struct inode * s_journal_inode; + struct journal_s * s_journal; + struct list_head s_orphan; ++ struct semaphore s_orphan_lock; + unsigned long s_commit_interval; + struct block_device *journal_bdev; + #ifdef CONFIG_JBD_DEBUG diff --git a/lustre/kernel_patches/patches/ext3-san-2.4.20.patch b/lustre/kernel_patches/patches/ext3-san-2.4.20.patch new file mode 100644 index 0000000..148f4e3 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-san-2.4.20.patch @@ -0,0 +1,117 @@ + fs/ext3/ext3-exports.c | 9 ++++- + fs/ext3/inode.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 89 insertions(+), 1 deletion(-) + +--- linux/fs/ext3/inode.c~ext3-san-2.4.20-hp Tue Apr 29 11:01:52 2003 ++++ linux-mmonroe/fs/ext3/inode.c Tue Apr 29 11:01:53 2003 +@@ -2734,3 +2734,84 @@ int 
ext3_change_inode_journal_flag(struc + * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we + * need to extend" test in ext3_prepare_write() succeeds. + */ ++ ++/* for each block: 1 ind + 1 dind + 1 tind ++ * for each block: 3 bitmap blocks ++ * for each block: 3 group descriptor blocks ++ * 1 inode block ++ * 1 superblock ++ * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files ++ * ((1+1+1) * 3 * nblocks) + 1 + 1 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS ++ * ++ * XXX assuming: ++ * (1) fs logical block size == page size ++ * (2) ext3 in writeback mode ++ */ ++static inline int ext3_san_write_trans_blocks(int nblocks) ++{ ++ int ret; ++ ++ ret = (1 + 1 + 1) * 3 * nblocks + 1 + 1; ++ ++#ifdef CONFIG_QUOTA ++ ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return ret; ++} ++ ++/* Allocate blocks for an inode without creating any buffer/page ++ * for data I/O; set the inode size if the file is extended. ++ * ++ * @inode: target inode ++ * @blocks: array of logical block numbers ++ * @nblocks: how many blocks need to be allocated ++ * @newsize: new file size we should set ++ * ++ * return: 0 on success, nonzero on failure ++ * (*blocks) contains the physical block numbers allocated ++ * ++ * XXX this assumes the fs block size == page size ++ */ ++int ext3_prep_san_write(struct inode *inode, long *blocks, ++ int nblocks, loff_t newsize) ++{ ++ handle_t *handle; ++ struct buffer_head bh_tmp; ++ int needed_blocks; ++ int i, ret = 0, ret2; ++ ++ needed_blocks = ext3_san_write_trans_blocks(nblocks); ++ ++ lock_kernel(); ++ handle = ext3_journal_start(inode, needed_blocks); ++ if (IS_ERR(handle)) { ++ unlock_kernel(); ++ return PTR_ERR(handle); ++ } ++ unlock_kernel(); ++ ++ /* alloc blocks one by one */ ++ for (i = 0; i < nblocks; i++) { ++ ret = ext3_get_block_handle(handle, inode, blocks[i], ++ &bh_tmp, 1); ++ if (ret) ++ break; ++ ++ blocks[i] = bh_tmp.b_blocknr; ++ } ++ ++ /* set inode size if needed */ ++ if (!ret && (newsize > inode->i_size)) { ++ inode->i_size = 
newsize; ++ ext3_mark_inode_dirty(handle, inode); ++ } ++ ++ lock_kernel(); ++ ret2 = ext3_journal_stop(handle, inode); ++ unlock_kernel(); ++ ++ if (!ret) ++ ret = ret2; ++ return ret; ++} +--- linux/fs/ext3/ext3-exports.c~ext3-san-2.4.20-hp Tue Apr 29 11:01:51 2003 ++++ linux-mmonroe/fs/ext3/ext3-exports.c Tue Apr 29 11:07:19 2003 +@@ -1,9 +1,15 @@ + #include + #include +-#include ++#include ++#include ++#include + #include ++#include + #include + ++int ext3_prep_san_write(struct inode *inode, long *blocks, ++ int nblocks, loff_t newsize); ++ + EXPORT_SYMBOL(ext3_force_commit); + EXPORT_SYMBOL(ext3_bread); + EXPORT_SYMBOL(ext3_xattr_register); +@@ -11,3 +17,4 @@ EXPORT_SYMBOL(ext3_xattr_unregister); + EXPORT_SYMBOL(ext3_xattr_get); + EXPORT_SYMBOL(ext3_xattr_list); + EXPORT_SYMBOL(ext3_xattr_set); ++EXPORT_SYMBOL(ext3_prep_san_write); + +_ diff --git a/lustre/kernel_patches/patches/ext3-truncate_blocks-chaos.patch.patch b/lustre/kernel_patches/patches/ext3-truncate_blocks-chaos.patch.patch new file mode 100644 index 0000000..ce3928d --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-truncate_blocks-chaos.patch.patch @@ -0,0 +1,92 @@ +--- ./fs/ext3/inode.c.orig Wed Mar 12 02:44:06 2003 ++++ ./fs/ext3/inode.c Wed Mar 12 11:55:20 2003 +@@ -99,7 +99,35 @@ int ext3_forget(handle_t *handle, int is + return err; + } + +-/* ++/* ++ * Work out how many blocks we need to progress with the next chunk of a ++ * truncate transaction. ++ */ ++ ++static unsigned long blocks_for_truncate(struct inode *inode) ++{ ++ unsigned long needed; ++ ++ needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); ++ ++ /* Give ourselves just enough room to cope with inodes in which ++ * i_blocks is corrupt: we've seen disk corruptions in the past ++ * which resulted in random data in an inode which looked enough ++ * like a regular file for ext3 to try to delete it. Things ++ * will go a bit crazy if that happens, but at least we should ++ * try not to panic the whole kernel. 
*/ ++ if (needed < 2) ++ needed = 2; ++ ++ /* But we need to bound the transaction so we don't overflow the ++ * journal. */ ++ if (needed > EXT3_MAX_TRANS_DATA) ++ needed = EXT3_MAX_TRANS_DATA; ++ ++ return EXT3_DATA_TRANS_BLOCKS + needed; ++} ++ ++/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. +@@ -110,19 +138,14 @@ int ext3_forget(handle_t *handle, int is + * transaction in the top-level truncate loop. --sct + */ + +-static handle_t *start_transaction(struct inode *inode) ++static handle_t *start_transaction(struct inode *inode) + { +- long needed; + handle_t *result; +- +- needed = inode->i_blocks; +- if (needed > EXT3_MAX_TRANS_DATA) +- needed = EXT3_MAX_TRANS_DATA; +- +- result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed); ++ ++ result = ext3_journal_start(inode, blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; +- ++ + ext3_std_error(inode->i_sb, PTR_ERR(result)); + return result; + } +@@ -135,14 +158,9 @@ static handle_t *start_transaction(struc + */ + static int try_to_extend_transaction(handle_t *handle, struct inode *inode) + { +- long needed; +- + if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) + return 0; +- needed = inode->i_blocks; +- if (needed > EXT3_MAX_TRANS_DATA) +- needed = EXT3_MAX_TRANS_DATA; +- if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed)) ++ if (!ext3_journal_extend(handle, blocks_for_truncate(inode))) + return 0; + return 1; + } +@@ -154,11 +172,8 @@ static int try_to_extend_transaction(han + */ + static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) + { +- long needed = inode->i_blocks; +- if (needed > EXT3_MAX_TRANS_DATA) +- needed = EXT3_MAX_TRANS_DATA; + jbd_debug(2, "restarting handle %p\n", handle); +- return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed); ++ return 
ext3_journal_restart(handle, blocks_for_truncate(inode)); + } + + /* diff --git a/lustre/kernel_patches/patches/ext3-truncate_blocks.patch b/lustre/kernel_patches/patches/ext3-truncate_blocks.patch new file mode 100644 index 0000000..ce3928d --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-truncate_blocks.patch @@ -0,0 +1,92 @@ +--- ./fs/ext3/inode.c.orig Wed Mar 12 02:44:06 2003 ++++ ./fs/ext3/inode.c Wed Mar 12 11:55:20 2003 +@@ -99,7 +99,35 @@ int ext3_forget(handle_t *handle, int is + return err; + } + +-/* ++/* ++ * Work out how many blocks we need to progress with the next chunk of a ++ * truncate transaction. ++ */ ++ ++static unsigned long blocks_for_truncate(struct inode *inode) ++{ ++ unsigned long needed; ++ ++ needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); ++ ++ /* Give ourselves just enough room to cope with inodes in which ++ * i_blocks is corrupt: we've seen disk corruptions in the past ++ * which resulted in random data in an inode which looked enough ++ * like a regular file for ext3 to try to delete it. Things ++ * will go a bit crazy if that happens, but at least we should ++ * try not to panic the whole kernel. */ ++ if (needed < 2) ++ needed = 2; ++ ++ /* But we need to bound the transaction so we don't overflow the ++ * journal. */ ++ if (needed > EXT3_MAX_TRANS_DATA) ++ needed = EXT3_MAX_TRANS_DATA; ++ ++ return EXT3_DATA_TRANS_BLOCKS + needed; ++} ++ ++/* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. +@@ -110,19 +138,14 @@ int ext3_forget(handle_t *handle, int is + * transaction in the top-level truncate loop. 
--sct + */ + +-static handle_t *start_transaction(struct inode *inode) ++static handle_t *start_transaction(struct inode *inode) + { +- long needed; + handle_t *result; +- +- needed = inode->i_blocks; +- if (needed > EXT3_MAX_TRANS_DATA) +- needed = EXT3_MAX_TRANS_DATA; +- +- result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed); ++ ++ result = ext3_journal_start(inode, blocks_for_truncate(inode)); + if (!IS_ERR(result)) + return result; +- ++ + ext3_std_error(inode->i_sb, PTR_ERR(result)); + return result; + } +@@ -135,14 +158,9 @@ static handle_t *start_transaction(struc + */ + static int try_to_extend_transaction(handle_t *handle, struct inode *inode) + { +- long needed; +- + if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) + return 0; +- needed = inode->i_blocks; +- if (needed > EXT3_MAX_TRANS_DATA) +- needed = EXT3_MAX_TRANS_DATA; +- if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed)) ++ if (!ext3_journal_extend(handle, blocks_for_truncate(inode))) + return 0; + return 1; + } +@@ -154,11 +172,8 @@ static int try_to_extend_transaction(han + */ + static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) + { +- long needed = inode->i_blocks; +- if (needed > EXT3_MAX_TRANS_DATA) +- needed = EXT3_MAX_TRANS_DATA; + jbd_debug(2, "restarting handle %p\n", handle); +- return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed); ++ return ext3_journal_restart(handle, blocks_for_truncate(inode)); + } + + /* diff --git a/lustre/kernel_patches/patches/ext3-unmount_sync.patch b/lustre/kernel_patches/patches/ext3-unmount_sync.patch new file mode 100644 index 0000000..c57903c --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-unmount_sync.patch @@ -0,0 +1,21 @@ + fs/ext3/super.c | 7 ++++++- + 1 files changed, 6 insertions(+), 1 deletion(-) + +--- linux-2.4.20/fs/ext3/super.c~ext3-unmount_sync 2003-04-08 23:35:44.000000000 -0600 ++++ linux-2.4.20-braam/fs/ext3/super.c 2003-04-08 23:35:44.000000000 -0600 
+@@ -1612,7 +1612,12 @@ void ext3_write_super (struct super_bloc + sb->s_dirt = 0; + target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); + +- if (do_sync_supers) { ++ /* ++ * Tricky --- if we are unmounting, the write really does need ++ * to be synchronous. We can detect that by looking for NULL in ++ * sb->s_root. ++ */ ++ if (do_sync_supers || !sb->s_root) { + unlock_super(sb); + log_wait_commit(EXT3_SB(sb)->s_journal, target); + lock_super(sb); + +_ diff --git a/lustre/extN/ext3-use-after-free.diff b/lustre/kernel_patches/patches/ext3-use-after-free.patch similarity index 56% rename from lustre/extN/ext3-use-after-free.diff rename to lustre/kernel_patches/patches/ext3-use-after-free.patch index 8cd673f..dd999bf 100644 --- a/lustre/extN/ext3-use-after-free.diff +++ b/lustre/kernel_patches/patches/ext3-use-after-free.patch @@ -1,21 +1,9 @@ - - -If ext3_add_nondir() fails it will do an iput() of the inode. But we -continue to run ext3_mark_inode_dirty() against the potentially-freed -inode. This oopses when slab poisoning is enabled. - -Fix it so that we only run ext3_mark_inode_dirty() if the inode was -successfully instantiated. - -This bug was added in 2.4.20-pre9. - - - fs/ext3/namei.c | 11 +++++------ + ./fs/ext3/namei.c | 11 +++++------ 1 files changed, 5 insertions(+), 6 deletions(-) ---- 24/fs/ext3/namei.c~ext3-use-after-free Sun Dec 15 11:27:50 2002 -+++ 24-akpm/fs/ext3/namei.c Sun Dec 15 11:27:50 2002 -@@ -429,8 +429,11 @@ static int ext3_add_nondir(handle_t *han +--- linux-2.4.20/./fs/ext3/namei.c~ext3-use-after-free 2003-04-08 23:35:51.000000000 -0600 ++++ linux-2.4.20-braam/./fs/ext3/namei.c 2003-04-08 23:35:51.000000000 -0600 +@@ -1521,8 +1521,11 @@ static int ext3_add_nondir(handle_t *han { int err = ext3_add_entry(handle, dentry, inode); if (!err) { @@ -29,7 +17,7 @@ This bug was added in 2.4.20-pre9. 
} ext3_dec_count(handle, inode); iput(inode); -@@ -465,7 +468,6 @@ static int ext3_create (struct inode * d +@@ -1559,7 +1562,6 @@ static int ext3_create (struct inode * d inode->i_fop = &ext3_file_operations; inode->i_mapping->a_ops = &ext3_aops; err = ext3_add_nondir(handle, dentry, inode); @@ -37,7 +25,7 @@ This bug was added in 2.4.20-pre9. } ext3_journal_stop(handle, dir); return err; -@@ -490,7 +492,6 @@ static int ext3_mknod (struct inode * di +@@ -1586,7 +1588,6 @@ static int ext3_mknod (struct inode * di if (!IS_ERR(inode)) { init_special_inode(inode, mode, rdev); err = ext3_add_nondir(handle, dentry, inode); @@ -45,15 +33,15 @@ This bug was added in 2.4.20-pre9. } ext3_journal_stop(handle, dir); return err; -@@ -934,7 +935,6 @@ static int ext3_symlink (struct inode * +@@ -2035,7 +2036,6 @@ static int ext3_symlink (struct inode * } - inode->u.ext3_i.i_disksize = inode->i_size; + EXT3_I(inode)->i_disksize = inode->i_size; err = ext3_add_nondir(handle, dentry, inode); - ext3_mark_inode_dirty(handle, inode); out_stop: ext3_journal_stop(handle, dir); return err; -@@ -971,7 +971,6 @@ static int ext3_link (struct dentry * ol +@@ -2069,7 +2069,6 @@ static int ext3_link (struct dentry * ol atomic_inc(&inode->i_count); err = ext3_add_nondir(handle, dentry, inode); diff --git a/lustre/kernel_patches/patches/ext3-xattr-2.5.patch b/lustre/kernel_patches/patches/ext3-xattr-2.5.patch deleted file mode 100644 index 4179839..0000000 --- a/lustre/kernel_patches/patches/ext3-xattr-2.5.patch +++ /dev/null @@ -1,2690 +0,0 @@ -# This is a BitKeeper generated patch for the following project: -# Project Name: Linux kernel tree -# This patch format is intended for GNU patch command version 2.5 or higher. 
-# This patch includes the following deltas: -# ChangeSet 1.809 -> 1.810 -# fs/ext3/Makefile 1.4 -> 1.5 -# include/linux/ext3_jbd.h 1.5 -> 1.6 -# fs/ext3/ialloc.c 1.17 -> 1.18 -# fs/ext3/symlink.c 1.3 -> 1.4 -# fs/Makefile 1.42 -> 1.43 -# fs/ext3/namei.c 1.22 -> 1.23 -# include/linux/ext3_fs.h 1.11 -> 1.12 -# fs/Config.in 1.39 -> 1.40 -# fs/ext3/inode.c 1.42 -> 1.43 -# fs/Config.help 1.21 -> 1.22 -# fs/ext3/super.c 1.33 -> 1.34 -# fs/ext3/file.c 1.9 -> 1.10 -# (new) -> 1.1 fs/ext3/xattr.h -# (new) -> 1.1 include/linux/mbcache.h -# (new) -> 1.1 fs/ext3/xattr.c -# (new) -> 1.1 fs/mbcache.c -# (new) -> 1.1 fs/ext3/xattr_user.c -# -# The following is the BitKeeper ChangeSet Log -# -------------------------------------------- -# 02/10/20 braam@clusterfs.com 1.810 -# xattrs for UML bk repository -# -------------------------------------------- -# -diff -Nru a/fs/Config.help b/fs/Config.help ---- a/fs/Config.help Sun Dec 8 02:49:56 2002 -+++ b/fs/Config.help Sun Dec 8 02:49:56 2002 -@@ -154,6 +154,13 @@ - of your root partition (the one containing the directory /) cannot - be compiled as a module, and so this may be dangerous. - -+CONFIG_EXT3_FS_XATTR -+ Extended attributes are name:value pairs associated with inodes by -+ the kernel or by users (see the attr(5) manual page, or visit -+ for details). -+ -+ If unsure, say N. -+ - CONFIG_JBD - This is a generic journaling layer for block devices. 
It is - currently used by the ext3 file system, but it could also be used to -diff -Nru a/fs/Config.in b/fs/Config.in ---- a/fs/Config.in Sun Dec 8 02:49:56 2002 -+++ b/fs/Config.in Sun Dec 8 02:49:56 2002 -@@ -27,6 +27,7 @@ - dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL - - tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS -+dep_mbool ' Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS - # CONFIG_JBD could be its own option (even modular), but until there are - # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS - # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS -@@ -180,6 +181,17 @@ - define_tristate CONFIG_ZISOFS_FS $CONFIG_ISO9660_FS - else - define_tristate CONFIG_ZISOFS_FS n -+fi -+ -+# Meta block cache for Extended Attributes (ext2/ext3) -+if [ "$CONFIG_EXT2_FS_XATTR" = "y" -o "$CONFIG_EXT3_FS_XATTR" = "y" ]; then -+ if [ "$CONFIG_EXT2_FS" = "y" -o "$CONFIG_EXT3_FS" = "y" ]; then -+ define_tristate CONFIG_FS_MBCACHE y -+ else -+ if [ "$CONFIG_EXT2_FS" = "m" -o "$CONFIG_EXT3_FS" = "m" ]; then -+ define_tristate CONFIG_FS_MBCACHE m -+ fi -+ fi - fi - - mainmenu_option next_comment -diff -Nru a/fs/Makefile b/fs/Makefile ---- a/fs/Makefile Sun Dec 8 02:49:56 2002 -+++ b/fs/Makefile Sun Dec 8 02:49:56 2002 -@@ -6,7 +6,7 @@ - # - - export-objs := open.o dcache.o buffer.o bio.o inode.o dquot.o mpage.o aio.o \ -- fcntl.o read_write.o dcookies.o -+ fcntl.o read_write.o dcookies.o mbcache.o - - obj-y := open.o read_write.o devices.o file_table.o buffer.o \ - bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \ -@@ -29,6 +29,8 @@ - obj-y += binfmt_script.o - - obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o -+ -+obj-$(CONFIG_FS_MBCACHE) += mbcache.o - - obj-$(CONFIG_QUOTA) += dquot.o - obj-$(CONFIG_QFMT_V1) += quota_v1.o -diff -Nru a/fs/ext3/Makefile b/fs/ext3/Makefile ---- a/fs/ext3/Makefile Sun Dec 8 02:49:56 2002 -+++ 
b/fs/ext3/Makefile Sun Dec 8 02:49:56 2002 -@@ -7,4 +7,10 @@ - ext3-objs := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o hash.o - -+export-objs += xattr.o -+ -+ifeq ($(CONFIG_EXT3_FS_XATTR),y) -+ext3-objs += xattr.o xattr_user.o -+endif -+ - include $(TOPDIR)/Rules.make -diff -Nru a/fs/ext3/file.c b/fs/ext3/file.c ---- a/fs/ext3/file.c Sun Dec 8 02:49:56 2002 -+++ b/fs/ext3/file.c Sun Dec 8 02:49:56 2002 -@@ -23,7 +23,7 @@ - #include - #include - #include --#include -+#include "xattr.h" - - /* - * Called when an inode is released. Note that this is different -@@ -98,5 +98,9 @@ - struct inode_operations ext3_file_inode_operations = { - .truncate = ext3_truncate, - .setattr = ext3_setattr, -+ .setxattr = ext3_setxattr, -+ .getxattr = ext3_getxattr, -+ .listxattr = ext3_listxattr, -+ .removexattr = ext3_removexattr, - }; - -diff -Nru a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c ---- a/fs/ext3/ialloc.c Sun Dec 8 02:49:56 2002 -+++ b/fs/ext3/ialloc.c Sun Dec 8 02:49:56 2002 -@@ -25,6 +25,8 @@ - #include - #include - -+#include "xattr.h" -+ - /* - * ialloc.c contains the inodes allocation and deallocation routines - */ -@@ -118,6 +120,7 @@ - * as writing the quota to disk may need the lock as well. - */ - DQUOT_INIT(inode); -+ ext3_xattr_delete_inode(handle, inode); - DQUOT_FREE_INODE(inode); - DQUOT_DROP(inode); - -diff -Nru a/fs/ext3/inode.c b/fs/ext3/inode.c ---- a/fs/ext3/inode.c Sun Dec 8 02:49:56 2002 -+++ b/fs/ext3/inode.c Sun Dec 8 02:49:56 2002 -@@ -42,6 +42,18 @@ - */ - #undef SEARCH_FROM_ZERO - -+/* -+ * Test whether an inode is a fast symlink. -+ */ -+static inline int ext3_inode_is_fast_symlink(struct inode *inode) -+{ -+ int ea_blocks = EXT3_I(inode)->i_file_acl ? -+ (inode->i_sb->s_blocksize >> 9) : 0; -+ -+ return (S_ISLNK(inode->i_mode) && -+ inode->i_blocks - ea_blocks == 0); -+} -+ - /* The ext3 forget function must perform a revoke if we are freeing data - * which has been journaled. Metadata (eg. 
indirect blocks) must be - * revoked in all cases. -@@ -51,7 +63,7 @@ - * still needs to be revoked. - */ - --static int ext3_forget(handle_t *handle, int is_metadata, -+int ext3_forget(handle_t *handle, int is_metadata, - struct inode *inode, struct buffer_head *bh, - int blocknr) - { -@@ -167,9 +179,7 @@ - { - handle_t *handle; - -- if (is_bad_inode(inode) || -- inode->i_ino == EXT3_ACL_IDX_INO || -- inode->i_ino == EXT3_ACL_DATA_INO) -+ if (is_bad_inode(inode)) - goto no_delete; - - lock_kernel(); -@@ -1979,6 +1989,8 @@ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; -+ if (ext3_inode_is_fast_symlink(inode)) -+ return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - -@@ -2130,8 +2142,6 @@ - struct ext3_group_desc * gdp; - - if ((inode->i_ino != EXT3_ROOT_INO && -- inode->i_ino != EXT3_ACL_IDX_INO && -- inode->i_ino != EXT3_ACL_DATA_INO && - inode->i_ino != EXT3_JOURNAL_INO && - inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || - inode->i_ino > le32_to_cpu( -@@ -2263,10 +2273,7 @@ - - brelse (iloc.bh); - -- if (inode->i_ino == EXT3_ACL_IDX_INO || -- inode->i_ino == EXT3_ACL_DATA_INO) -- /* Nothing to do */ ; -- else if (S_ISREG(inode->i_mode)) { -+ if (S_ISREG(inode->i_mode)) { - inode->i_op = &ext3_file_inode_operations; - inode->i_fop = &ext3_file_operations; - if (ext3_should_writeback_data(inode)) -@@ -2277,18 +2284,20 @@ - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; - } else if (S_ISLNK(inode->i_mode)) { -- if (!inode->i_blocks) -+ if (ext3_inode_is_fast_symlink(inode)) - inode->i_op = &ext3_fast_symlink_inode_operations; - else { -- inode->i_op = &page_symlink_inode_operations; -+ inode->i_op = &ext3_symlink_inode_operations; - if (ext3_should_writeback_data(inode)) - inode->i_mapping->a_ops = &ext3_writeback_aops; - else - inode->i_mapping->a_ops = &ext3_aops; - } -- } else -+ } else { -+ inode->i_op = &ext3_special_inode_operations; - init_special_inode(inode, 
inode->i_mode, - le32_to_cpu(iloc.raw_inode->i_block[0])); -+ } - if (ei->i_flags & EXT3_SYNC_FL) - inode->i_flags |= S_SYNC; - if (ei->i_flags & EXT3_APPEND_FL) -diff -Nru a/fs/ext3/namei.c b/fs/ext3/namei.c ---- a/fs/ext3/namei.c Sun Dec 8 02:49:56 2002 -+++ b/fs/ext3/namei.c Sun Dec 8 02:49:56 2002 -@@ -36,6 +36,7 @@ - #include - #include - #include -+#include "xattr.h" - - - /* -@@ -1654,7 +1655,7 @@ - if (IS_DIRSYNC(dir)) - handle->h_sync = 1; - -- inode = ext3_new_inode (handle, dir, S_IFDIR); -+ inode = ext3_new_inode (handle, dir, S_IFDIR | mode); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; -@@ -1662,7 +1663,6 @@ - inode->i_op = &ext3_dir_inode_operations; - inode->i_fop = &ext3_dir_operations; - inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; -- inode->i_blocks = 0; - dir_block = ext3_bread (handle, inode, 0, 1, &err); - if (!dir_block) { - inode->i_nlink--; /* is this nlink == 0? */ -@@ -1689,9 +1689,6 @@ - BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); - ext3_journal_dirty_metadata(handle, dir_block); - brelse (dir_block); -- inode->i_mode = S_IFDIR | mode; -- if (dir->i_mode & S_ISGID) -- inode->i_mode |= S_ISGID; - ext3_mark_inode_dirty(handle, inode); - err = ext3_add_entry (handle, dentry, inode); - if (err) { -@@ -2068,7 +2065,7 @@ - goto out_stop; - - if (l > sizeof (EXT3_I(inode)->i_data)) { -- inode->i_op = &page_symlink_inode_operations; -+ inode->i_op = &ext3_symlink_inode_operations; - if (ext3_should_writeback_data(inode)) - inode->i_mapping->a_ops = &ext3_writeback_aops; - else -@@ -2284,4 +2281,17 @@ - .rmdir = ext3_rmdir, - .mknod = ext3_mknod, - .rename = ext3_rename, -+ .setxattr = ext3_setxattr, -+ .getxattr = ext3_getxattr, -+ .listxattr = ext3_listxattr, -+ .removexattr = ext3_removexattr, - }; -+ -+struct inode_operations ext3_special_inode_operations = { -+ .setxattr = ext3_setxattr, -+ .getxattr = ext3_getxattr, -+ .listxattr = ext3_listxattr, -+ .removexattr = ext3_removexattr, 
-+}; -+ -+ -diff -Nru a/fs/ext3/super.c b/fs/ext3/super.c ---- a/fs/ext3/super.c Sun Dec 8 02:49:56 2002 -+++ b/fs/ext3/super.c Sun Dec 8 02:49:56 2002 -@@ -30,6 +30,7 @@ - #include - #include - #include -+#include "xattr.h" - - #ifdef CONFIG_JBD_DEBUG - static int ext3_ro_after; /* Make fs read-only after this many jiffies */ -@@ -405,6 +406,7 @@ - struct ext3_super_block *es = sbi->s_es; - int i; - -+ ext3_xattr_put_super(sb); - journal_destroy(sbi->s_journal); - if (!(sb->s_flags & MS_RDONLY)) { - EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -@@ -554,6 +556,7 @@ - int is_remount) - { - unsigned long *mount_options = &sbi->s_mount_opt; -+ - uid_t *resuid = &sbi->s_resuid; - gid_t *resgid = &sbi->s_resgid; - char * this_char; -@@ -566,6 +569,13 @@ - continue; - if ((value = strchr (this_char, '=')) != NULL) - *value++ = 0; -+#ifdef CONFIG_EXT3_FS_XATTR -+ if (!strcmp (this_char, "user_xattr")) -+ set_opt (*mount_options, XATTR_USER); -+ else if (!strcmp (this_char, "nouser_xattr")) -+ clear_opt (*mount_options, XATTR_USER); -+ else -+#endif - if (!strcmp (this_char, "bsddf")) - clear_opt (*mount_options, MINIX_DF); - else if (!strcmp (this_char, "nouid32")) { -@@ -982,6 +992,12 @@ - sbi->s_mount_opt = 0; - sbi->s_resuid = EXT3_DEF_RESUID; - sbi->s_resgid = EXT3_DEF_RESGID; -+ -+ /* Default extended attribute flags */ -+#ifdef CONFIG_EXT3_FS_XATTR -+ set_opt(sbi->s_mount_opt, XATTR_USER); -+#endif -+ - if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) - goto out_fail; - -@@ -1820,7 +1836,10 @@ - - static int __init init_ext3_fs(void) - { -- int err = init_inodecache(); -+ int err = init_ext3_xattr(); -+ if (err) -+ return err; -+ err = init_inodecache(); - if (err) - goto out1; - err = register_filesystem(&ext3_fs_type); -@@ -1830,6 +1849,7 @@ - out: - destroy_inodecache(); - out1: -+ exit_ext3_xattr(); - return err; - } - -@@ -1837,6 +1857,7 @@ - { - unregister_filesystem(&ext3_fs_type); - destroy_inodecache(); -+ 
exit_ext3_xattr(); - } - - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -diff -Nru a/fs/ext3/symlink.c b/fs/ext3/symlink.c ---- a/fs/ext3/symlink.c Sun Dec 8 02:49:56 2002 -+++ b/fs/ext3/symlink.c Sun Dec 8 02:49:56 2002 -@@ -20,6 +20,7 @@ - #include - #include - #include -+#include "xattr.h" - - static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) - { -@@ -33,7 +34,20 @@ - return vfs_follow_link(nd, (char*)ei->i_data); - } - -+struct inode_operations ext3_symlink_inode_operations = { -+ .readlink = page_readlink, -+ .follow_link = page_follow_link, -+ .setxattr = ext3_setxattr, -+ .getxattr = ext3_getxattr, -+ .listxattr = ext3_listxattr, -+ .removexattr = ext3_removexattr, -+}; -+ - struct inode_operations ext3_fast_symlink_inode_operations = { -- .readlink = ext3_readlink, /* BKL not held. Don't need */ -+ .readlink = ext3_readlink, /* BKL not held. Don't need */ - .follow_link = ext3_follow_link, /* BKL not held. Don't need */ -+ .setxattr = ext3_setxattr, -+ .getxattr = ext3_getxattr, -+ .listxattr = ext3_listxattr, -+ .removexattr = ext3_removexattr, - }; -diff -Nru a/fs/ext3/xattr.c b/fs/ext3/xattr.c ---- /dev/null Wed Dec 31 16:00:00 1969 -+++ b/fs/ext3/xattr.c Sun Dec 8 02:49:56 2002 -@@ -0,0 +1,1127 @@ -+/* -+ * linux/fs/ext3/xattr.c -+ * -+ * Copyright (C) 2001 by Andreas Gruenbacher, -+ * -+ * Fix by Harrison Xing . -+ * Ext3 code with a lot of help from Eric Jarman . -+ * Extended attributes for symlinks and special files added per -+ * suggestion of Luka Renko . -+ */ -+ -+/* -+ * Extended attributes are stored on disk blocks allocated outside of -+ * any inode. The i_file_acl field is then made to point to this allocated -+ * block. If all extended attributes of an inode are identical, these -+ * inodes may share the same extended attribute block. 
Such situations -+ * are automatically detected by keeping a cache of recent attribute block -+ * numbers and hashes over the block's contents in memory. -+ * -+ * -+ * Extended attribute block layout: -+ * -+ * +------------------+ -+ * | header | -+ * ¦ entry 1 | | -+ * | entry 2 | | growing downwards -+ * | entry 3 | v -+ * | four null bytes | -+ * | . . . | -+ * | value 1 | ^ -+ * | value 3 | | growing upwards -+ * | value 2 | | -+ * +------------------+ -+ * -+ * The block header is followed by multiple entry descriptors. These entry -+ * descriptors are variable in size, and alligned to EXT3_XATTR_PAD -+ * byte boundaries. The entry descriptors are sorted by attribute name, -+ * so that two extended attribute blocks can be compared efficiently. -+ * -+ * Attribute values are aligned to the end of the block, stored in -+ * no specific order. They are also padded to EXT3_XATTR_PAD byte -+ * boundaries. No additional gaps are left between them. -+ * -+ * Locking strategy -+ * ---------------- -+ * The VFS holdsinode->i_sem semaphore when any of the xattr inode -+ * operations are called, so we are guaranteed that only one -+ * processes accesses extended attributes of an inode at any time. -+ * -+ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that -+ * only a single process is modifying an extended attribute block, even -+ * if the block is shared among inodes. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "xattr.h" -+ -+#define EXT3_EA_USER "user." -+ -+#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) -+#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) -+#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) -+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) -+ -+#ifdef EXT3_XATTR_DEBUG -+# define ea_idebug(inode, f...) 
do { \ -+ printk(KERN_DEBUG "inode %s:%ld: ", \ -+ kdevname(inode->i_dev), inode->i_ino); \ -+ printk(f); \ -+ printk("\n"); \ -+ } while (0) -+# define ea_bdebug(bh, f...) do { \ -+ printk(KERN_DEBUG "block %s:%ld: ", \ -+ kdevname(bh->b_dev), bh->b_blocknr); \ -+ printk(f); \ -+ printk("\n"); \ -+ } while (0) -+#else -+# define ea_idebug(f...) -+# define ea_bdebug(f...) -+#endif -+ -+static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *, -+ struct ext3_xattr_header *); -+ -+static int ext3_xattr_cache_insert(struct buffer_head *); -+static struct buffer_head *ext3_xattr_cache_find(struct inode *, -+ struct ext3_xattr_header *); -+static void ext3_xattr_cache_remove(struct buffer_head *); -+static void ext3_xattr_rehash(struct ext3_xattr_header *, -+ struct ext3_xattr_entry *); -+ -+static struct mb_cache *ext3_xattr_cache; -+ -+/* -+ * If a file system does not share extended attributes among inodes, -+ * we should not need the ext3_xattr_sem semaphore. However, the -+ * filesystem may still contain shared blocks, so we always take -+ * the lock. 
-+ */ -+ -+static DECLARE_MUTEX(ext3_xattr_sem); -+static struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX]; -+static rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED; -+ -+int -+ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler) -+{ -+ int error = -EINVAL; -+ -+ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { -+ write_lock(&ext3_handler_lock); -+ if (!ext3_xattr_handlers[name_index-1]) { -+ ext3_xattr_handlers[name_index-1] = handler; -+ error = 0; -+ } -+ write_unlock(&ext3_handler_lock); -+ } -+ return error; -+} -+ -+void -+ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler) -+{ -+ if (name_index > 0 || name_index <= EXT3_XATTR_INDEX_MAX) { -+ write_lock(&ext3_handler_lock); -+ ext3_xattr_handlers[name_index-1] = NULL; -+ write_unlock(&ext3_handler_lock); -+ } -+} -+ -+static inline const char * -+strcmp_prefix(const char *a, const char *a_prefix) -+{ -+ while (*a_prefix && *a == *a_prefix) { -+ a++; -+ a_prefix++; -+ } -+ return *a_prefix ? NULL : a; -+} -+ -+/* -+ * Decode the extended attribute name, and translate it into -+ * the name_index and name suffix. 
-+ */ -+static inline struct ext3_xattr_handler * -+ext3_xattr_resolve_name(const char **name) -+{ -+ struct ext3_xattr_handler *handler = NULL; -+ int i; -+ -+ if (!*name) -+ return NULL; -+ read_lock(&ext3_handler_lock); -+ for (i=0; iprefix); -+ if (n) { -+ handler = ext3_xattr_handlers[i]; -+ *name = n; -+ break; -+ } -+ } -+ } -+ read_unlock(&ext3_handler_lock); -+ return handler; -+} -+ -+static inline struct ext3_xattr_handler * -+ext3_xattr_handler(int name_index) -+{ -+ struct ext3_xattr_handler *handler = NULL; -+ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { -+ read_lock(&ext3_handler_lock); -+ handler = ext3_xattr_handlers[name_index-1]; -+ read_unlock(&ext3_handler_lock); -+ } -+ return handler; -+} -+ -+/* -+ * Inode operation getxattr() -+ * -+ * dentry->d_inode->i_sem down -+ */ -+ssize_t -+ext3_getxattr(struct dentry *dentry, const char *name, -+ void *buffer, size_t size) -+{ -+ struct ext3_xattr_handler *handler; -+ struct inode *inode = dentry->d_inode; -+ -+ handler = ext3_xattr_resolve_name(&name); -+ if (!handler) -+ return -EOPNOTSUPP; -+ return handler->get(inode, name, buffer, size); -+} -+ -+/* -+ * Inode operation listxattr() -+ * -+ * dentry->d_inode->i_sem down -+ */ -+ssize_t -+ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) -+{ -+ return ext3_xattr_list(dentry->d_inode, buffer, size); -+} -+ -+/* -+ * Inode operation setxattr() -+ * -+ * dentry->d_inode->i_sem down -+ */ -+int -+ext3_setxattr(struct dentry *dentry, const char *name, -+ void *value, size_t size, int flags) -+{ -+ struct ext3_xattr_handler *handler; -+ struct inode *inode = dentry->d_inode; -+ -+ if (size == 0) -+ value = ""; /* empty EA, do not remove */ -+ handler = ext3_xattr_resolve_name(&name); -+ if (!handler) -+ return -EOPNOTSUPP; -+ return handler->set(inode, name, value, size, flags); -+} -+ -+/* -+ * Inode operation removexattr() -+ * -+ * dentry->d_inode->i_sem down -+ */ -+int -+ext3_removexattr(struct dentry *dentry, const 
char *name) -+{ -+ struct ext3_xattr_handler *handler; -+ struct inode *inode = dentry->d_inode; -+ -+ handler = ext3_xattr_resolve_name(&name); -+ if (!handler) -+ return -EOPNOTSUPP; -+ return handler->set(inode, name, NULL, 0, XATTR_REPLACE); -+} -+ -+/* -+ * ext3_xattr_get() -+ * -+ * Copy an extended attribute into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. -+ */ -+int -+ext3_xattr_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t buffer_size) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext3_xattr_entry *entry; -+ unsigned int block, size; -+ char *end; -+ int name_len, error; -+ -+ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", -+ name_index, name, buffer, (long)buffer_size); -+ -+ if (name == NULL) -+ return -EINVAL; -+ if (!EXT3_I(inode)->i_file_acl) -+ return -ENODATA; -+ block = EXT3_I(inode)->i_file_acl; -+ ea_idebug(inode, "reading block %d", block); -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) -+ return -EIO; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ end = bh->b_data + bh->b_size; -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* find named attribute */ -+ name_len = strlen(name); -+ -+ error = -ERANGE; -+ if (name_len > 255) -+ goto cleanup; -+ entry = FIRST_ENTRY(bh); -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext3_xattr_entry *next = -+ EXT3_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (name_index == entry->e_name_index && -+ name_len == entry->e_name_len && -+ memcmp(name, entry->e_name, name_len) == 0) -+ 
goto found; -+ entry = next; -+ } -+ /* Check the remaining name entries */ -+ while (!IS_LAST_ENTRY(entry)) { -+ struct ext3_xattr_entry *next = -+ EXT3_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ entry = next; -+ } -+ if (ext3_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ error = -ENODATA; -+ goto cleanup; -+found: -+ /* check the buffer size */ -+ if (entry->e_value_block != 0) -+ goto bad_block; -+ size = le32_to_cpu(entry->e_value_size); -+ if (size > inode->i_sb->s_blocksize || -+ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) -+ goto bad_block; -+ -+ if (ext3_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ if (buffer) { -+ error = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ /* return value of attribute */ -+ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), -+ size); -+ } -+ error = size; -+ -+cleanup: -+ brelse(bh); -+ -+ return error; -+} -+ -+/* -+ * ext3_xattr_list() -+ * -+ * Copy a list of attribute names into the buffer -+ * provided, or compute the buffer size required. -+ * Buffer is NULL to compute the size of the buffer required. -+ * -+ * Returns a negative error number on failure, or the number of bytes -+ * used / required on success. 
-+ */ -+int -+ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) -+{ -+ struct buffer_head *bh = NULL; -+ struct ext3_xattr_entry *entry; -+ unsigned int block, size = 0; -+ char *buf, *end; -+ int error; -+ -+ ea_idebug(inode, "buffer=%p, buffer_size=%ld", -+ buffer, (long)buffer_size); -+ -+ if (!EXT3_I(inode)->i_file_acl) -+ return 0; -+ block = EXT3_I(inode)->i_file_acl; -+ ea_idebug(inode, "reading block %d", block); -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) -+ return -EIO; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); -+ end = bh->b_data + bh->b_size; -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+bad_block: ext3_error(inode->i_sb, "ext3_xattr_list", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* compute the size required for the list of attribute names */ -+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); -+ entry = EXT3_XATTR_NEXT(entry)) { -+ struct ext3_xattr_handler *handler; -+ struct ext3_xattr_entry *next = -+ EXT3_XATTR_NEXT(entry); -+ if ((char *)next >= end) -+ goto bad_block; -+ -+ handler = ext3_xattr_handler(entry->e_name_index); -+ if (handler) { -+ size += handler->list(NULL, inode, entry->e_name, -+ entry->e_name_len) + 1; -+ } -+ } -+ -+ if (ext3_xattr_cache_insert(bh)) -+ ea_idebug(inode, "cache insert failed"); -+ if (!buffer) { -+ error = size; -+ goto cleanup; -+ } else { -+ error = -ERANGE; -+ if (size > buffer_size) -+ goto cleanup; -+ } -+ -+ /* list the attribute names */ -+ buf = buffer; -+ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); -+ entry = EXT3_XATTR_NEXT(entry)) { -+ struct ext3_xattr_handler *handler; -+ -+ handler = ext3_xattr_handler(entry->e_name_index); -+ if (handler) { -+ buf += handler->list(buf, inode, entry->e_name, -+ entry->e_name_len); -+ *buf++ = '\0'; -+ } -+ } -+ error = size; -+ -+cleanup: -+ 
brelse(bh); -+ -+ return error; -+} -+ -+/* -+ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is -+ * not set, set it. -+ */ -+static void ext3_xattr_update_super_block(handle_t *handle, -+ struct super_block *sb) -+{ -+ if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) -+ return; -+ -+ lock_super(sb); -+ ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); -+ EXT3_SB(sb)->s_es->s_feature_compat |= -+ cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR); -+ sb->s_dirt = 1; -+ ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); -+ unlock_super(sb); -+} -+ -+/* -+ * ext3_xattr_set() -+ * -+ * Create, replace or remove an extended attribute for this inode. Buffer -+ * is NULL to remove an existing extended attribute, and non-NULL to -+ * either replace an existing extended attribute, or create a new extended -+ * attribute. The flags XATTR_REPLACE and XATTR_CREATE -+ * specify that an extended attribute must exist and must not exist -+ * previous to the call, respectively. -+ * -+ * Returns 0, or a negative error number on failure. -+ */ -+int -+ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t value_len, int flags) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct buffer_head *bh = NULL; -+ struct ext3_xattr_header *header = NULL; -+ struct ext3_xattr_entry *here, *last; -+ unsigned int name_len; -+ int min_offs = sb->s_blocksize, not_found = 1, free, error; -+ char *end; -+ -+ /* -+ * header -- Points either into bh, or to a temporarily -+ * allocated buffer. -+ * here -- The named entry found, or the place for inserting, within -+ * the block pointed to by header. -+ * last -- Points right after the last named entry within the block -+ * pointed to by header. -+ * min_offs -- The offset of the first value (values are aligned -+ * towards the end of the block). -+ * end -- Points right after the block pointed to by header. 
-+ */ -+ -+ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", -+ name_index, name, value, (long)value_len); -+ -+ if (IS_RDONLY(inode)) -+ return -EROFS; -+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) -+ return -EPERM; -+ if (value == NULL) -+ value_len = 0; -+ if (name == NULL) -+ return -EINVAL; -+ name_len = strlen(name); -+ if (name_len > 255 || value_len > sb->s_blocksize) -+ return -ERANGE; -+ down(&ext3_xattr_sem); -+ -+ if (EXT3_I(inode)->i_file_acl) { -+ /* The inode already has an extended attribute block. */ -+ int block = EXT3_I(inode)->i_file_acl; -+ -+ bh = sb_bread(sb, block); -+ error = -EIO; -+ if (!bh) -+ goto cleanup; -+ ea_bdebug(bh, "b_count=%d, refcount=%d", -+ atomic_read(&(bh->b_count)), -+ le32_to_cpu(HDR(bh)->h_refcount)); -+ header = HDR(bh); -+ end = bh->b_data + bh->b_size; -+ if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ header->h_blocks != cpu_to_le32(1)) { -+bad_block: ext3_error(sb, "ext3_xattr_set", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ error = -EIO; -+ goto cleanup; -+ } -+ /* Find the named attribute. */ -+ here = FIRST_ENTRY(bh); -+ while (!IS_LAST_ENTRY(here)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (!here->e_value_block && here->e_value_size) { -+ int offs = le16_to_cpu(here->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ not_found = name_index - here->e_name_index; -+ if (!not_found) -+ not_found = name_len - here->e_name_len; -+ if (!not_found) -+ not_found = memcmp(name, here->e_name,name_len); -+ if (not_found <= 0) -+ break; -+ here = next; -+ } -+ last = here; -+ /* We still need to compute min_offs and last. 
*/ -+ while (!IS_LAST_ENTRY(last)) { -+ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); -+ if ((char *)next >= end) -+ goto bad_block; -+ if (!last->e_value_block && last->e_value_size) { -+ int offs = le16_to_cpu(last->e_value_offs); -+ if (offs < min_offs) -+ min_offs = offs; -+ } -+ last = next; -+ } -+ -+ /* Check whether we have enough space left. */ -+ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); -+ } else { -+ /* We will use a new extended attribute block. */ -+ free = sb->s_blocksize - -+ sizeof(struct ext3_xattr_header) - sizeof(__u32); -+ here = last = NULL; /* avoid gcc uninitialized warning. */ -+ } -+ -+ if (not_found) { -+ /* Request to remove a nonexistent attribute? */ -+ error = -ENODATA; -+ if (flags & XATTR_REPLACE) -+ goto cleanup; -+ error = 0; -+ if (value == NULL) -+ goto cleanup; -+ else -+ free -= EXT3_XATTR_LEN(name_len); -+ } else { -+ /* Request to create an existing attribute? */ -+ error = -EEXIST; -+ if (flags & XATTR_CREATE) -+ goto cleanup; -+ if (!here->e_value_block && here->e_value_size) { -+ unsigned int size = le32_to_cpu(here->e_value_size); -+ -+ if (le16_to_cpu(here->e_value_offs) + size > -+ sb->s_blocksize || size > sb->s_blocksize) -+ goto bad_block; -+ free += EXT3_XATTR_SIZE(size); -+ } -+ } -+ free -= EXT3_XATTR_SIZE(value_len); -+ error = -ENOSPC; -+ if (free < 0) -+ goto cleanup; -+ -+ /* Here we know that we can set the new attribute. 
*/ -+ -+ if (header) { -+ if (header->h_refcount == cpu_to_le32(1)) { -+ ea_bdebug(bh, "modifying in-place"); -+ ext3_xattr_cache_remove(bh); -+ error = ext3_journal_get_write_access(handle, bh); -+ if (error) -+ goto cleanup; -+ } else { -+ int offset; -+ -+ ea_bdebug(bh, "cloning"); -+ header = kmalloc(bh->b_size, GFP_KERNEL); -+ error = -ENOMEM; -+ if (header == NULL) -+ goto cleanup; -+ memcpy(header, HDR(bh), bh->b_size); -+ header->h_refcount = cpu_to_le32(1); -+ offset = (char *)header - bh->b_data; -+ here = ENTRY((char *)here + offset); -+ last = ENTRY((char *)last + offset); -+ } -+ } else { -+ /* Allocate a buffer where we construct the new block. */ -+ header = kmalloc(sb->s_blocksize, GFP_KERNEL); -+ error = -ENOMEM; -+ if (header == NULL) -+ goto cleanup; -+ memset(header, 0, sb->s_blocksize); -+ end = (char *)header + sb->s_blocksize; -+ header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); -+ header->h_blocks = header->h_refcount = cpu_to_le32(1); -+ last = here = ENTRY(header+1); -+ } -+ -+ if (not_found) { -+ /* Insert the new name. */ -+ int size = EXT3_XATTR_LEN(name_len); -+ int rest = (char *)last - (char *)here; -+ memmove((char *)here + size, here, rest); -+ memset(here, 0, size); -+ here->e_name_index = name_index; -+ here->e_name_len = name_len; -+ memcpy(here->e_name, name, name_len); -+ } else { -+ /* Remove the old value. */ -+ if (!here->e_value_block && here->e_value_size) { -+ char *first_val = (char *)header + min_offs; -+ int offs = le16_to_cpu(here->e_value_offs); -+ char *val = (char *)header + offs; -+ size_t size = EXT3_XATTR_SIZE( -+ le32_to_cpu(here->e_value_size)); -+ memmove(first_val + size, first_val, val - first_val); -+ memset(first_val, 0, size); -+ here->e_value_offs = 0; -+ min_offs += size; -+ -+ /* Adjust all value offsets. 
*/ -+ last = ENTRY(header+1); -+ while (!IS_LAST_ENTRY(last)) { -+ int o = le16_to_cpu(last->e_value_offs); -+ if (!last->e_value_block && o < offs) -+ last->e_value_offs = -+ cpu_to_le16(o + size); -+ last = EXT3_XATTR_NEXT(last); -+ } -+ } -+ if (value == NULL) { -+ /* Remove this attribute. */ -+ if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) { -+ /* This block is now empty. */ -+ error = ext3_xattr_set2(handle, inode, bh,NULL); -+ goto cleanup; -+ } else { -+ /* Remove the old name. */ -+ int size = EXT3_XATTR_LEN(name_len); -+ last = ENTRY((char *)last - size); -+ memmove(here, (char*)here + size, -+ (char*)last - (char*)here); -+ memset(last, 0, size); -+ } -+ } -+ } -+ -+ if (value != NULL) { -+ /* Insert the new value. */ -+ here->e_value_size = cpu_to_le32(value_len); -+ if (value_len) { -+ size_t size = EXT3_XATTR_SIZE(value_len); -+ char *val = (char *)header + min_offs - size; -+ here->e_value_offs = -+ cpu_to_le16((char *)val - (char *)header); -+ memset(val + size - EXT3_XATTR_PAD, 0, -+ EXT3_XATTR_PAD); /* Clear the pad bytes. */ -+ memcpy(val, value, value_len); -+ } -+ } -+ ext3_xattr_rehash(header, here); -+ -+ error = ext3_xattr_set2(handle, inode, bh, header); -+ -+cleanup: -+ brelse(bh); -+ if (!(bh && header == HDR(bh))) -+ kfree(header); -+ up(&ext3_xattr_sem); -+ -+ return error; -+} -+ -+/* -+ * Second half of ext3_xattr_set(): Update the file system. -+ */ -+static int -+ext3_xattr_set2(handle_t *handle, struct inode *inode, -+ struct buffer_head *old_bh, struct ext3_xattr_header *header) -+{ -+ struct super_block *sb = inode->i_sb; -+ struct buffer_head *new_bh = NULL; -+ int error; -+ -+ if (header) { -+ new_bh = ext3_xattr_cache_find(inode, header); -+ if (new_bh) { -+ /* -+ * We found an identical block in the cache. -+ * The old block will be released after updating -+ * the inode. 
-+ */ -+ ea_bdebug(old_bh, "reusing block %ld", -+ new_bh->b_blocknr); -+ -+ error = -EDQUOT; -+ if (DQUOT_ALLOC_BLOCK(inode, 1)) -+ goto cleanup; -+ -+ error = ext3_journal_get_write_access(handle, new_bh); -+ if (error) -+ goto cleanup; -+ HDR(new_bh)->h_refcount = cpu_to_le32( -+ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); -+ ea_bdebug(new_bh, "refcount now=%d", -+ le32_to_cpu(HDR(new_bh)->h_refcount)); -+ } else if (old_bh && header == HDR(old_bh)) { -+ /* Keep this block. */ -+ new_bh = old_bh; -+ ext3_xattr_cache_insert(new_bh); -+ } else { -+ /* We need to allocate a new block */ -+ int block; -+ int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + -+ EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb); -+ -+ block = ext3_new_block(handle, inode, goal, 0, -+ 0, &error); -+ if (error) -+ goto cleanup; -+ ea_idebug(inode, "creating block %d", block); -+ -+ new_bh = sb_getblk(sb, block); -+ if (!new_bh) { -+getblk_failed: -+ ext3_free_blocks(handle, inode, block, 1); -+ error = -EIO; -+ goto cleanup; -+ } -+ lock_buffer(new_bh); -+ error = ext3_journal_get_create_access(handle, new_bh); -+ if (error) { -+ unlock_buffer(new_bh); -+ goto getblk_failed; -+ } -+ memcpy(new_bh->b_data, header, new_bh->b_size); -+ set_buffer_uptodate(new_bh); -+ unlock_buffer(new_bh); -+ ext3_xattr_cache_insert(new_bh); -+ -+ ext3_xattr_update_super_block(handle, sb); -+ } -+ error = ext3_journal_dirty_metadata(handle, new_bh); -+ if (error) -+ goto cleanup; -+ } -+ -+ /* Update the inode. */ -+ EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; -+ inode->i_ctime = CURRENT_TIME; -+ ext3_mark_inode_dirty(handle, inode); -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ -+ error = 0; -+ if (old_bh && old_bh != new_bh) { -+ /* -+ * If there was an old block, and we are not still using it, -+ * we now release the old block. 
-+ */ -+ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); -+ -+ error = ext3_journal_get_write_access(handle, old_bh); -+ if (error) -+ goto cleanup; -+ if (refcount == 1) { -+ /* Free the old block. */ -+ ea_bdebug(old_bh, "freeing"); -+ ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1); -+ -+ /* ext3_forget() calls bforget() for us, but we -+ let our caller release old_bh, so we need to -+ duplicate the handle before. */ -+ get_bh(old_bh); -+ ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr); -+ } else { -+ /* Decrement the refcount only. */ -+ refcount--; -+ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); -+ DQUOT_FREE_BLOCK(inode, 1); -+ ext3_journal_dirty_metadata(handle, old_bh); -+ ea_bdebug(old_bh, "refcount now=%d", refcount); -+ } -+ } -+ -+cleanup: -+ if (old_bh != new_bh) -+ brelse(new_bh); -+ -+ return error; -+} -+ -+/* -+ * ext3_xattr_delete_inode() -+ * -+ * Free extended attribute resources associated with this inode. This -+ * is called immediately before an inode is freed. 
-+ */ -+void -+ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) -+{ -+ struct buffer_head *bh; -+ unsigned int block = EXT3_I(inode)->i_file_acl; -+ -+ if (!block) -+ return; -+ down(&ext3_xattr_sem); -+ -+ bh = sb_bread(inode->i_sb, block); -+ if (!bh) { -+ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", -+ "inode %ld: block %d read error", inode->i_ino, block); -+ goto cleanup; -+ } -+ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); -+ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || -+ HDR(bh)->h_blocks != cpu_to_le32(1)) { -+ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", -+ "inode %ld: bad block %d", inode->i_ino, block); -+ goto cleanup; -+ } -+ ext3_journal_get_write_access(handle, bh); -+ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); -+ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { -+ ext3_xattr_cache_remove(bh); -+ ext3_free_blocks(handle, inode, block, 1); -+ ext3_forget(handle, 1, inode, bh, block); -+ bh = NULL; -+ } else { -+ HDR(bh)->h_refcount = cpu_to_le32( -+ le32_to_cpu(HDR(bh)->h_refcount) - 1); -+ ext3_journal_dirty_metadata(handle, bh); -+ if (IS_SYNC(inode)) -+ handle->h_sync = 1; -+ DQUOT_FREE_BLOCK(inode, 1); -+ } -+ EXT3_I(inode)->i_file_acl = 0; -+ -+cleanup: -+ brelse(bh); -+ up(&ext3_xattr_sem); -+} -+ -+/* -+ * ext3_xattr_put_super() -+ * -+ * This is called when a file system is unmounted. -+ */ -+void -+ext3_xattr_put_super(struct super_block *sb) -+{ -+ mb_cache_shrink(ext3_xattr_cache, sb->s_bdev); -+} -+ -+/* -+ * ext3_xattr_cache_insert() -+ * -+ * Create a new entry in the extended attribute cache, and insert -+ * it unless such an entry is already in the cache. -+ * -+ * Returns 0, or a negative error number on failure. 
-+ */ -+static int -+ext3_xattr_cache_insert(struct buffer_head *bh) -+{ -+ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); -+ struct mb_cache_entry *ce; -+ int error; -+ -+ ce = mb_cache_entry_alloc(ext3_xattr_cache); -+ if (!ce) -+ return -ENOMEM; -+ error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, &hash); -+ if (error) { -+ mb_cache_entry_free(ce); -+ if (error == -EBUSY) { -+ ea_bdebug(bh, "already in cache (%d cache entries)", -+ atomic_read(&ext3_xattr_cache->c_entry_count)); -+ error = 0; -+ } -+ } else { -+ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, -+ atomic_read(&ext3_xattr_cache->c_entry_count)); -+ mb_cache_entry_release(ce); -+ } -+ return error; -+} -+ -+/* -+ * ext3_xattr_cmp() -+ * -+ * Compare two extended attribute blocks for equality. -+ * -+ * Returns 0 if the blocks are equal, 1 if they differ, and -+ * a negative error number on errors. -+ */ -+static int -+ext3_xattr_cmp(struct ext3_xattr_header *header1, -+ struct ext3_xattr_header *header2) -+{ -+ struct ext3_xattr_entry *entry1, *entry2; -+ -+ entry1 = ENTRY(header1+1); -+ entry2 = ENTRY(header2+1); -+ while (!IS_LAST_ENTRY(entry1)) { -+ if (IS_LAST_ENTRY(entry2)) -+ return 1; -+ if (entry1->e_hash != entry2->e_hash || -+ entry1->e_name_len != entry2->e_name_len || -+ entry1->e_value_size != entry2->e_value_size || -+ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) -+ return 1; -+ if (entry1->e_value_block != 0 || entry2->e_value_block != 0) -+ return -EIO; -+ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), -+ (char *)header2 + le16_to_cpu(entry2->e_value_offs), -+ le32_to_cpu(entry1->e_value_size))) -+ return 1; -+ -+ entry1 = EXT3_XATTR_NEXT(entry1); -+ entry2 = EXT3_XATTR_NEXT(entry2); -+ } -+ if (!IS_LAST_ENTRY(entry2)) -+ return 1; -+ return 0; -+} -+ -+/* -+ * ext3_xattr_cache_find() -+ * -+ * Find an identical extended attribute block. 
-+ * -+ * Returns a pointer to the block found, or NULL if such a block was -+ * not found or an error occurred. -+ */ -+static struct buffer_head * -+ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header) -+{ -+ __u32 hash = le32_to_cpu(header->h_hash); -+ struct mb_cache_entry *ce; -+ -+ if (!header->h_hash) -+ return NULL; /* never share */ -+ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); -+ ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_bdev, hash); -+ while (ce) { -+ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); -+ -+ if (!bh) { -+ ext3_error(inode->i_sb, "ext3_xattr_cache_find", -+ "inode %ld: block %ld read error", -+ inode->i_ino, (unsigned long) ce->e_block); -+ } else if (le32_to_cpu(HDR(bh)->h_refcount) > -+ EXT3_XATTR_REFCOUNT_MAX) { -+ ea_idebug(inode, "block %ld refcount %d>%d", -+ (unsigned long) ce->e_block, -+ le32_to_cpu(HDR(bh)->h_refcount), -+ EXT3_XATTR_REFCOUNT_MAX); -+ } else if (!ext3_xattr_cmp(header, HDR(bh))) { -+ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); -+ mb_cache_entry_release(ce); -+ return bh; -+ } -+ brelse(bh); -+ ce = mb_cache_entry_find_next(ce, 0, inode->i_bdev, hash); -+ } -+ return NULL; -+} -+ -+/* -+ * ext3_xattr_cache_remove() -+ * -+ * Remove the cache entry of a block from the cache. Called when a -+ * block becomes invalid. -+ */ -+static void -+ext3_xattr_cache_remove(struct buffer_head *bh) -+{ -+ struct mb_cache_entry *ce; -+ -+ ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_bdev, -+ bh->b_blocknr); -+ if (ce) { -+ ea_bdebug(bh, "removing (%d cache entries remaining)", -+ atomic_read(&ext3_xattr_cache->c_entry_count)-1); -+ mb_cache_entry_free(ce); -+ } else -+ ea_bdebug(bh, "no cache entry"); -+} -+ -+#define NAME_HASH_SHIFT 5 -+#define VALUE_HASH_SHIFT 16 -+ -+/* -+ * ext3_xattr_hash_entry() -+ * -+ * Compute the hash of an extended attribute. 
-+ */ -+static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, -+ struct ext3_xattr_entry *entry) -+{ -+ __u32 hash = 0; -+ char *name = entry->e_name; -+ int n; -+ -+ for (n=0; n < entry->e_name_len; n++) { -+ hash = (hash << NAME_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ -+ *name++; -+ } -+ -+ if (entry->e_value_block == 0 && entry->e_value_size != 0) { -+ __u32 *value = (__u32 *)((char *)header + -+ le16_to_cpu(entry->e_value_offs)); -+ for (n = (le32_to_cpu(entry->e_value_size) + -+ EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { -+ hash = (hash << VALUE_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ -+ le32_to_cpu(*value++); -+ } -+ } -+ entry->e_hash = cpu_to_le32(hash); -+} -+ -+#undef NAME_HASH_SHIFT -+#undef VALUE_HASH_SHIFT -+ -+#define BLOCK_HASH_SHIFT 16 -+ -+/* -+ * ext3_xattr_rehash() -+ * -+ * Re-compute the extended attribute hash value after an entry has changed. -+ */ -+static void ext3_xattr_rehash(struct ext3_xattr_header *header, -+ struct ext3_xattr_entry *entry) -+{ -+ struct ext3_xattr_entry *here; -+ __u32 hash = 0; -+ -+ ext3_xattr_hash_entry(header, entry); -+ here = ENTRY(header+1); -+ while (!IS_LAST_ENTRY(here)) { -+ if (!here->e_hash) { -+ /* Block is not shared if an entry's hash value == 0 */ -+ hash = 0; -+ break; -+ } -+ hash = (hash << BLOCK_HASH_SHIFT) ^ -+ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ -+ le32_to_cpu(here->e_hash); -+ here = EXT3_XATTR_NEXT(here); -+ } -+ header->h_hash = cpu_to_le32(hash); -+} -+ -+#undef BLOCK_HASH_SHIFT -+ -+int __init -+init_ext3_xattr(void) -+{ -+ int err; -+ -+ err = ext3_xattr_register(EXT3_XATTR_INDEX_USER, &ext3_xattr_user_handler); -+ if (err) -+ return err; -+ ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, -+ sizeof(struct mb_cache_entry) + -+ sizeof(struct mb_cache_entry_index), 1, 6); -+ if (!ext3_xattr_cache) { -+ ext3_xattr_unregister(EXT3_XATTR_INDEX_USER, &ext3_xattr_user_handler); -+ return -ENOMEM; -+ } 
-+ -+ return 0; -+} -+ -+void -+exit_ext3_xattr(void) -+{ -+ if (ext3_xattr_cache) -+ mb_cache_destroy(ext3_xattr_cache); -+ ext3_xattr_cache = NULL; -+ ext3_xattr_unregister(EXT3_XATTR_INDEX_USER, &ext3_xattr_user_handler); -+} -+ -diff -Nru a/fs/ext3/xattr.h b/fs/ext3/xattr.h ---- /dev/null Wed Dec 31 16:00:00 1969 -+++ b/fs/ext3/xattr.h Sun Dec 8 02:49:56 2002 -@@ -0,0 +1,133 @@ -+/* -+ File: fs/ext3/xattr.h -+ -+ On-disk format of extended attributes for the ext3 filesystem. -+ -+ (C) 2001 Andreas Gruenbacher, -+*/ -+ -+#include -+#include -+ -+/* Magic value in attribute blocks */ -+#define EXT3_XATTR_MAGIC 0xEA020000 -+ -+/* Maximum number of references to one attribute block */ -+#define EXT3_XATTR_REFCOUNT_MAX 1024 -+ -+/* Name indexes */ -+#define EXT3_XATTR_INDEX_MAX 10 -+#define EXT3_XATTR_INDEX_USER 1 -+#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2 -+#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3 -+ -+struct ext3_xattr_header { -+ __u32 h_magic; /* magic number for identification */ -+ __u32 h_refcount; /* reference count */ -+ __u32 h_blocks; /* number of disk blocks used */ -+ __u32 h_hash; /* hash value of all attributes */ -+ __u32 h_reserved[4]; /* zero right now */ -+}; -+ -+struct ext3_xattr_entry { -+ __u8 e_name_len; /* length of name */ -+ __u8 e_name_index; /* attribute name index */ -+ __u16 e_value_offs; /* offset in disk block of value */ -+ __u32 e_value_block; /* disk block attribute is stored on (n/i) */ -+ __u32 e_value_size; /* size of attribute value */ -+ __u32 e_hash; /* hash value of name and value */ -+ char e_name[0]; /* attribute name */ -+}; -+ -+/* -+ * Entry descriptors and values are padded to EXT3_XATTR_PAD bytes. -+ * EXT3_XATTR_LEN() is the padded size of a descriptor with a name of -+ * name_len bytes; EXT3_XATTR_NEXT() steps to the descriptor after entry; -+ * EXT3_XATTR_SIZE() is the padded size of a value. -+ */ -+#define EXT3_XATTR_PAD_BITS 2 -+#define EXT3_XATTR_PAD (1<<EXT3_XATTR_PAD_BITS) -+#define EXT3_XATTR_ROUND (EXT3_XATTR_PAD-1) -+#define EXT3_XATTR_LEN(name_len) \ -+ (((name_len) + EXT3_XATTR_ROUND + \ -+ sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND) -+#define EXT3_XATTR_NEXT(entry) \ -+ ( (struct ext3_xattr_entry *)( \ -+ (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) ) -+#define EXT3_XATTR_SIZE(size) \ -+ (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND) -+ -+# ifdef CONFIG_EXT3_FS_XATTR -+ -+struct ext3_xattr_handler { -+ char *prefix; -+ size_t (*list)(char *list, struct inode *inode, const char *name, -+ int name_len); -+ int (*get)(struct inode *inode, const char *name, void
*buffer, -+ size_t size); -+ int (*set)(struct inode *inode, const char *name, const void *buffer, -+ size_t size, int flags); -+}; -+ -+extern int ext3_xattr_register(int, struct ext3_xattr_handler *); -+extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *); -+ -+extern int ext3_setxattr(struct dentry *, const char *, void *, size_t, int); -+extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t); -+extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); -+extern int ext3_removexattr(struct dentry *, const char *); -+ -+extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); -+extern int ext3_xattr_list(struct inode *, char *, size_t); -+extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int); -+ -+extern void ext3_xattr_delete_inode(handle_t *, struct inode *); -+extern void ext3_xattr_put_super(struct super_block *); -+ -+extern int init_ext3_xattr(void); -+extern void exit_ext3_xattr(void); -+ -+# else /* CONFIG_EXT3_FS_XATTR */ -+# define ext3_setxattr NULL -+# define ext3_getxattr NULL -+# define ext3_listxattr NULL -+# define ext3_removexattr NULL -+ -+static inline int -+ext3_xattr_get(struct inode *inode, int name_index, const char *name, -+ void *buffer, size_t size, int flags) -+{ -+ return -EOPNOTSUPP; -+} -+ -+static inline int -+ext3_xattr_list(struct inode *inode, void *buffer, size_t size, int flags) -+{ -+ return -EOPNOTSUPP; -+} -+ -+static inline int -+ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, -+ const char *name, const void *value, size_t size, int flags) -+{ -+ return -EOPNOTSUPP; -+} -+ -+static inline void -+ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) -+{ -+} -+ -+static inline void -+ext3_xattr_put_super(struct super_block *sb) -+{ -+} -+ -+static inline int -+init_ext3_xattr(void) -+{ -+ return 0; -+} -+ -+static inline void -+exit_ext3_xattr(void) -+{ -+} -+ -+# endif /* 
CONFIG_EXT3_FS_XATTR */ -+ -+extern struct ext3_xattr_handler ext3_xattr_user_handler; -diff -Nru a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c ---- /dev/null Wed Dec 31 16:00:00 1969 -+++ b/fs/ext3/xattr_user.c Sun Dec 8 02:49:56 2002 -@@ -0,0 +1,99 @@ -+/* -+ * linux/fs/ext3/xattr_user.c -+ * Handler for extended user attributes. -+ * -+ * Copyright (C) 2001 by Andreas Gruenbacher, -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "xattr.h" -+ -+#ifdef CONFIG_EXT3_FS_POSIX_ACL -+# include -+#endif -+ -+#define XATTR_USER_PREFIX "user." -+ -+static size_t -+ext3_xattr_user_list(char *list, struct inode *inode, -+ const char *name, int name_len) -+{ -+ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1; -+ -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return 0; -+ -+ if (list) { -+ memcpy(list, XATTR_USER_PREFIX, prefix_len); -+ memcpy(list+prefix_len, name, name_len); -+ } -+ return prefix_len + name_len; -+} -+ -+static int -+ext3_xattr_user_get(struct inode *inode, const char *name, -+ void *buffer, size_t size) -+{ -+ int error; -+ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return -EOPNOTSUPP; -+#ifdef CONFIG_EXT3_FS_POSIX_ACL -+ error = ext3_permission_locked(inode, MAY_READ); -+#else -+ error = permission(inode, MAY_READ); -+#endif -+ if (error) -+ return error; -+ -+ return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, -+ buffer, size); -+} -+ -+static int -+ext3_xattr_user_set(struct inode *inode, const char *name, -+ const void *value, size_t size, int flags) -+{ -+ handle_t *handle; -+ int error; -+ -+ if (strcmp(name, "") == 0) -+ return -EINVAL; -+ if (!test_opt(inode->i_sb, XATTR_USER)) -+ return -EOPNOTSUPP; -+ if ( !S_ISREG(inode->i_mode) && -+ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) -+ return -EPERM; -+#ifdef CONFIG_EXT3_FS_POSIX_ACL -+ error = ext3_permission_locked(inode, MAY_WRITE); -+#else -+ error = permission(inode, MAY_WRITE); -+#endif 
-+ if (error) -+ return error; -+ -+ lock_kernel(); -+ handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS); -+ if (IS_ERR(handle)) -+ return PTR_ERR(handle); -+ error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name, -+ value, size, flags); -+ ext3_journal_stop(handle, inode); -+ unlock_kernel(); -+ -+ return error; -+} -+ -+struct ext3_xattr_handler ext3_xattr_user_handler = { -+ prefix: XATTR_USER_PREFIX, -+ list: ext3_xattr_user_list, -+ get: ext3_xattr_user_get, -+ set: ext3_xattr_user_set, -+}; -diff -Nru a/fs/mbcache.c b/fs/mbcache.c ---- /dev/null Wed Dec 31 16:00:00 1969 -+++ b/fs/mbcache.c Sun Dec 8 02:49:56 2002 -@@ -0,0 +1,702 @@ -+/* -+ * linux/fs/mbcache.c -+ * (C) 2001-2002 Andreas Gruenbacher, -+ */ -+ -+/* -+ * Filesystem Meta Information Block Cache (mbcache) -+ * -+ * The mbcache caches blocks of block devices that need to be located -+ * by their device/block number, as well as by other criteria (such -+ * as the block's contents). -+ * -+ * There can only be one cache entry in a cache per device and block number. -+ * Additional indexes need not be unique in this sense. The number of -+ * additional indexes (=other criteria) can be hardwired (at compile time) -+ * or specified at cache create time. -+ * -+ * Each cache entry is of fixed size. An entry may be `valid' or `invalid' -+ * in the cache. A valid entry is in the main hash tables of the cache, -+ * and may also be in the lru list. An invalid entry is not in any hashes -+ * or lists. -+ * -+ * A valid cache entry is only in the lru list if no handles refer to it. -+ * Invalid cache entries will be freed when the last handle to the cache -+ * entry is released. -+ */ -+ -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+#ifdef MB_CACHE_DEBUG -+# define mb_debug(f...) 
do { \ -+ printk(KERN_DEBUG f); \ -+ printk("\n"); \ -+ } while (0) -+#define mb_assert(c) do { if (!(c)) \ -+ printk(KERN_ERR "assertion " #c " failed\n"); \ -+ } while(0) -+#else -+# define mb_debug(f...) do { } while(0) -+# define mb_assert(c) do { } while(0) -+#endif -+#define mb_error(f...) do { \ -+ printk(KERN_ERR f); \ -+ printk("\n"); \ -+ } while(0) -+ -+MODULE_AUTHOR("Andreas Gruenbacher "); -+MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); -+MODULE_LICENSE("GPL"); -+ -+EXPORT_SYMBOL(mb_cache_create); -+EXPORT_SYMBOL(mb_cache_shrink); -+EXPORT_SYMBOL(mb_cache_destroy); -+EXPORT_SYMBOL(mb_cache_entry_alloc); -+EXPORT_SYMBOL(mb_cache_entry_insert); -+EXPORT_SYMBOL(mb_cache_entry_release); -+EXPORT_SYMBOL(mb_cache_entry_takeout); -+EXPORT_SYMBOL(mb_cache_entry_free); -+EXPORT_SYMBOL(mb_cache_entry_dup); -+EXPORT_SYMBOL(mb_cache_entry_get); -+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) -+EXPORT_SYMBOL(mb_cache_entry_find_first); -+EXPORT_SYMBOL(mb_cache_entry_find_next); -+#endif -+ -+ -+/* -+ * Global data: list of all mbcache's, lru list, and a spinlock for -+ * accessing cache data structures on SMP machines. (The lru list is -+ * global across all mbcaches.) -+ */ -+ -+static LIST_HEAD(mb_cache_list); -+static LIST_HEAD(mb_cache_lru_list); -+static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED; -+static struct shrinker *mb_shrinker; -+ -+static inline int -+mb_cache_indexes(struct mb_cache *cache) -+{ -+#ifdef MB_CACHE_INDEXES_COUNT -+ return MB_CACHE_INDEXES_COUNT; -+#else -+ return cache->c_indexes_count; -+#endif -+} -+ -+/* -+ * What the mbcache registers as to get shrunk dynamically. 
-+ */ -+ -+static int mb_cache_shrink_fn(int nr_to_scan, unsigned int gfp_mask); -+ -+static inline void -+__mb_cache_entry_takeout_lru(struct mb_cache_entry *ce) -+{ -+ if (!list_empty(&ce->e_lru_list)) -+ list_del_init(&ce->e_lru_list); -+} -+ -+ -+static inline void -+__mb_cache_entry_into_lru(struct mb_cache_entry *ce) -+{ -+ list_add(&ce->e_lru_list, &mb_cache_lru_list); -+} -+ -+ -+static inline int -+__mb_cache_entry_in_lru(struct mb_cache_entry *ce) -+{ -+ return (!list_empty(&ce->e_lru_list)); -+} -+ -+ -+/* -+ * Insert the cache entry into all hashes. -+ */ -+static inline void -+__mb_cache_entry_link(struct mb_cache_entry *ce) -+{ -+ struct mb_cache *cache = ce->e_cache; -+ unsigned int bucket; -+ int n; -+ -+ bucket = hash_long((unsigned long)ce->e_bdev + -+ (ce->e_block & 0xffffff), cache->c_bucket_bits); -+ list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); -+ for (n=0; ne_indexes[n].o_key, -+ cache->c_bucket_bits); -+ list_add(&ce->e_indexes[n].o_list, -+ &cache->c_indexes_hash[n][bucket]); -+ } -+} -+ -+ -+/* -+ * Remove the cache entry from all hashes. 
-+ */ -+static inline void -+__mb_cache_entry_unlink(struct mb_cache_entry *ce) -+{ -+ int n; -+ -+ list_del_init(&ce->e_block_list); -+ for (n = 0; n < mb_cache_indexes(ce->e_cache); n++) -+ list_del(&ce->e_indexes[n].o_list); -+} -+ -+ -+static inline int -+__mb_cache_entry_is_linked(struct mb_cache_entry *ce) -+{ -+ return (!list_empty(&ce->e_block_list)); -+} -+ -+ -+static inline struct mb_cache_entry * -+__mb_cache_entry_read(struct mb_cache_entry *ce) -+{ -+ __mb_cache_entry_takeout_lru(ce); -+ atomic_inc(&ce->e_used); -+ return ce; -+} -+ -+ -+static inline void -+__mb_cache_entry_forget(struct mb_cache_entry *ce) -+{ -+ struct mb_cache *cache = ce->e_cache; -+ -+ mb_assert(atomic_read(&ce->e_used) == 0); -+ atomic_dec(&cache->c_entry_count); -+ if (cache->c_op.free) -+ cache->c_op.free(ce); -+ kmem_cache_free(cache->c_entry_cache, ce); -+} -+ -+ -+static inline void -+__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) -+{ -+ if (atomic_dec_and_test(&ce->e_used)) { -+ if (!__mb_cache_entry_is_linked(ce)) -+ goto forget; -+ __mb_cache_entry_into_lru(ce); -+ } -+ spin_unlock(&mb_cache_spinlock); -+ return; -+forget: -+ spin_unlock(&mb_cache_spinlock); -+ __mb_cache_entry_forget(ce); -+} -+ -+ -+/* -+ * mb_cache_shrink_fn() memory pressure callback -+ * -+ * This function is called by the kernel memory management when memory -+ * gets low. -+ * -+ * @nr_to_scan: Number of objects to scan -+ * @gfp_mask: (ignored) -+ * -+ * Returns the number of objects which are present in the cache. 
-+ */ -+static int -+mb_cache_shrink_fn(int nr_to_scan, unsigned int gfp_mask) -+{ -+ LIST_HEAD(free_list); -+ struct list_head *l; -+ int count = 0; -+ -+ spin_lock(&mb_cache_spinlock); -+ list_for_each_prev(l, &mb_cache_list) { -+ struct mb_cache *cache = -+ list_entry(l, struct mb_cache, c_cache_list); -+ mb_debug("cache %s (%d)", cache->c_name, -+ atomic_read(&cache->c_entry_count)); -+ count += atomic_read(&cache->c_entry_count); -+ } -+ mb_debug("trying to free %d entries", nr_to_scan); -+ if (nr_to_scan == 0) { -+ spin_unlock(&mb_cache_spinlock); -+ goto out; -+ } -+ while (nr_to_scan && !list_empty(&mb_cache_lru_list)) { -+ struct mb_cache_entry *ce = -+ list_entry(mb_cache_lru_list.prev, -+ struct mb_cache_entry, e_lru_list); -+ list_move(&ce->e_lru_list, &free_list); -+ if (__mb_cache_entry_is_linked(ce)) -+ __mb_cache_entry_unlink(ce); -+ nr_to_scan--; -+ } -+ spin_unlock(&mb_cache_spinlock); -+ l = free_list.prev; -+ while (l != &free_list) { -+ struct mb_cache_entry *ce = list_entry(l, -+ struct mb_cache_entry, e_lru_list); -+ l = l->prev; -+ __mb_cache_entry_forget(ce); -+ count--; -+ } -+out: -+ mb_debug("%d remaining entries ", count); -+ return count; -+} -+ -+ -+/* -+ * mb_cache_create() create a new cache -+ * -+ * All entries in one cache are equal size. Cache entries may be from -+ * multiple devices. If this is the first mbcache created, registers -+ * the cache with kernel memory management. Returns NULL if no more -+ * memory was available. -+ * -+ * @name: name of the cache (informal) -+ * @cache_op: contains the callback called when freeing a cache entry -+ * @entry_size: The size of a cache entry, including -+ * struct mb_cache_entry -+ * @indexes_count: number of additional indexes in the cache. Must equal -+ * MB_CACHE_INDEXES_COUNT if the number of indexes is -+ * hardwired. 
-+ * @bucket_bits: log2(number of hash buckets) -+ */ -+struct mb_cache * -+mb_cache_create(const char *name, struct mb_cache_op *cache_op, -+ size_t entry_size, int indexes_count, int bucket_bits) -+{ -+ int m=0, n, bucket_count = 1 << bucket_bits; -+ struct mb_cache *cache = NULL; -+ -+ if(entry_size < sizeof(struct mb_cache_entry) + -+ indexes_count * sizeof(struct mb_cache_entry_index)) -+ return NULL; -+ -+ cache = kmalloc(sizeof(struct mb_cache) + -+ indexes_count * sizeof(struct list_head), GFP_KERNEL); -+ if (!cache) -+ goto fail; -+ cache->c_name = name; -+ if (cache_op) -+ cache->c_op.free = cache_op->free; -+ else -+ cache->c_op.free = NULL; -+ atomic_set(&cache->c_entry_count, 0); -+ cache->c_bucket_bits = bucket_bits; -+#ifdef MB_CACHE_INDEXES_COUNT -+ mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT); -+#else -+ cache->c_indexes_count = indexes_count; -+#endif -+ cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), -+ GFP_KERNEL); -+ if (!cache->c_block_hash) -+ goto fail; -+ for (n=0; nc_block_hash[n]); -+ for (m=0; mc_indexes_hash[m] = kmalloc(bucket_count * -+ sizeof(struct list_head), -+ GFP_KERNEL); -+ if (!cache->c_indexes_hash[m]) -+ goto fail; -+ for (n=0; nc_indexes_hash[m][n]); -+ } -+ cache->c_entry_cache = kmem_cache_create(name, entry_size, 0, -+ 0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL); -+ if (!cache->c_entry_cache) -+ goto fail; -+ -+ spin_lock(&mb_cache_spinlock); -+ if (list_empty(&mb_cache_list)) { -+ if (mb_shrinker) { -+ printk(KERN_ERR "%s: already have a shrinker!\n", -+ __FUNCTION__); -+ remove_shrinker(mb_shrinker); -+ } -+ mb_shrinker = set_shrinker(DEFAULT_SEEKS, mb_cache_shrink_fn); -+ } -+ list_add(&cache->c_cache_list, &mb_cache_list); -+ spin_unlock(&mb_cache_spinlock); -+ return cache; -+ -+fail: -+ if (cache) { -+ while (--m >= 0) -+ kfree(cache->c_indexes_hash[m]); -+ if (cache->c_block_hash) -+ kfree(cache->c_block_hash); -+ kfree(cache); -+ } -+ return NULL; -+} -+ -+ -+/* -+ * 
mb_cache_shrink() -+ * -+ * Removes all cache entires of a device from the cache. All cache entries -+ * currently in use cannot be freed, and thus remain in the cache. All others -+ * are freed. -+ * -+ * @cache: which cache to shrink -+ * @bdev: which device's cache entries to shrink -+ */ -+void -+mb_cache_shrink(struct mb_cache *cache, struct block_device *bdev) -+{ -+ LIST_HEAD(free_list); -+ struct list_head *l; -+ -+ spin_lock(&mb_cache_spinlock); -+ l = mb_cache_lru_list.prev; -+ while (l != &mb_cache_lru_list) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, e_lru_list); -+ l = l->prev; -+ if (ce->e_bdev == bdev) { -+ list_move(&ce->e_lru_list, &free_list); -+ if (__mb_cache_entry_is_linked(ce)) -+ __mb_cache_entry_unlink(ce); -+ } -+ } -+ spin_unlock(&mb_cache_spinlock); -+ l = free_list.prev; -+ while (l != &free_list) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, e_lru_list); -+ l = l->prev; -+ __mb_cache_entry_forget(ce); -+ } -+} -+ -+ -+/* -+ * mb_cache_destroy() -+ * -+ * Shrinks the cache to its minimum possible size (hopefully 0 entries), -+ * and then destroys it. If this was the last mbcache, un-registers the -+ * mbcache from kernel memory management. 
-+ */ -+void -+mb_cache_destroy(struct mb_cache *cache) -+{ -+ LIST_HEAD(free_list); -+ struct list_head *l; -+ int n; -+ -+ spin_lock(&mb_cache_spinlock); -+ l = mb_cache_lru_list.prev; -+ while (l != &mb_cache_lru_list) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, e_lru_list); -+ l = l->prev; -+ if (ce->e_cache == cache) { -+ list_move(&ce->e_lru_list, &free_list); -+ if (__mb_cache_entry_is_linked(ce)) -+ __mb_cache_entry_unlink(ce); -+ } -+ } -+ list_del(&cache->c_cache_list); -+ if (list_empty(&mb_cache_list) && mb_shrinker) { -+ remove_shrinker(mb_shrinker); -+ mb_shrinker = 0; -+ } -+ spin_unlock(&mb_cache_spinlock); -+ -+ l = free_list.prev; -+ while (l != &free_list) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, e_lru_list); -+ l = l->prev; -+ __mb_cache_entry_forget(ce); -+ } -+ -+ if (atomic_read(&cache->c_entry_count) > 0) { -+ mb_error("cache %s: %d orphaned entries", -+ cache->c_name, -+ atomic_read(&cache->c_entry_count)); -+ } -+ -+ kmem_cache_destroy(cache->c_entry_cache); -+ -+ for (n=0; n < mb_cache_indexes(cache); n++) -+ kfree(cache->c_indexes_hash[n]); -+ kfree(cache->c_block_hash); -+ -+ kfree(cache); -+} -+ -+ -+/* -+ * mb_cache_entry_alloc() -+ * -+ * Allocates a new cache entry. The new entry will not be valid initially, -+ * and thus cannot be looked up yet. It should be filled with data, and -+ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL -+ * if no more memory was available. 
-+ */ -+struct mb_cache_entry * -+mb_cache_entry_alloc(struct mb_cache *cache) -+{ -+ struct mb_cache_entry *ce; -+ -+ atomic_inc(&cache->c_entry_count); -+ ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL); -+ if (ce) { -+ INIT_LIST_HEAD(&ce->e_lru_list); -+ INIT_LIST_HEAD(&ce->e_block_list); -+ ce->e_cache = cache; -+ atomic_set(&ce->e_used, 1); -+ } -+ return ce; -+} -+ -+ -+/* -+ * mb_cache_entry_insert() -+ * -+ * Inserts an entry that was allocated using mb_cache_entry_alloc() into -+ * the cache. After this, the cache entry can be looked up, but is not yet -+ * in the lru list as the caller still holds a handle to it. Returns 0 on -+ * success, or -EBUSY if a cache entry for that device + inode exists -+ * already (this may happen after a failed lookup, but when another process -+ * has inserted the same cache entry in the meantime). -+ * -+ * @bdev: device the cache entry belongs to -+ * @block: block number -+ * @keys: array of additional keys. There must be indexes_count entries -+ * in the array (as specified when creating the cache). -+ */ -+int -+mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, -+ sector_t block, unsigned int keys[]) -+{ -+ struct mb_cache *cache = ce->e_cache; -+ unsigned int bucket; -+ struct list_head *l; -+ int error = -EBUSY, n; -+ -+ bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), -+ cache->c_bucket_bits); -+ spin_lock(&mb_cache_spinlock); -+ list_for_each_prev(l, &cache->c_block_hash[bucket]) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, e_block_list); -+ if (ce->e_bdev == bdev && ce->e_block == block) -+ goto out; -+ } -+ mb_assert(!__mb_cache_entry_is_linked(ce)); -+ ce->e_bdev = bdev; -+ ce->e_block = block; -+ for (n=0; ne_indexes[n].o_key = keys[n]; -+ __mb_cache_entry_link(ce); -+out: -+ spin_unlock(&mb_cache_spinlock); -+ return error; -+} -+ -+ -+/* -+ * mb_cache_entry_release() -+ * -+ * Release a handle to a cache entry. 
When the last handle to a cache entry -+ * is released it is either freed (if it is invalid) or otherwise inserted -+ * in to the lru list. -+ */ -+void -+mb_cache_entry_release(struct mb_cache_entry *ce) -+{ -+ spin_lock(&mb_cache_spinlock); -+ __mb_cache_entry_release_unlock(ce); -+} -+ -+ -+/* -+ * mb_cache_entry_takeout() -+ * -+ * Take a cache entry out of the cache, making it invalid. The entry can later -+ * be re-inserted using mb_cache_entry_insert(), or released using -+ * mb_cache_entry_release(). -+ */ -+void -+mb_cache_entry_takeout(struct mb_cache_entry *ce) -+{ -+ spin_lock(&mb_cache_spinlock); -+ mb_assert(!__mb_cache_entry_in_lru(ce)); -+ if (__mb_cache_entry_is_linked(ce)) -+ __mb_cache_entry_unlink(ce); -+ spin_unlock(&mb_cache_spinlock); -+} -+ -+ -+/* -+ * mb_cache_entry_free() -+ * -+ * This is equivalent to the sequence mb_cache_entry_takeout() -- -+ * mb_cache_entry_release(). -+ */ -+void -+mb_cache_entry_free(struct mb_cache_entry *ce) -+{ -+ spin_lock(&mb_cache_spinlock); -+ mb_assert(!__mb_cache_entry_in_lru(ce)); -+ if (__mb_cache_entry_is_linked(ce)) -+ __mb_cache_entry_unlink(ce); -+ __mb_cache_entry_release_unlock(ce); -+} -+ -+ -+/* -+ * mb_cache_entry_dup() -+ * -+ * Duplicate a handle to a cache entry (does not duplicate the cache entry -+ * itself). After the call, both the old and the new handle must be released. -+ */ -+struct mb_cache_entry * -+mb_cache_entry_dup(struct mb_cache_entry *ce) -+{ -+ atomic_inc(&ce->e_used); -+ return ce; -+} -+ -+ -+/* -+ * mb_cache_entry_get() -+ * -+ * Get a cache entry by device / block number. (There can only be one entry -+ * in the cache per device and block.) Returns NULL if no such cache entry -+ * exists. 
-+ */ -+struct mb_cache_entry * -+mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, -+ sector_t block) -+{ -+ unsigned int bucket; -+ struct list_head *l; -+ struct mb_cache_entry *ce; -+ -+ bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), -+ cache->c_bucket_bits); -+ spin_lock(&mb_cache_spinlock); -+ list_for_each(l, &cache->c_block_hash[bucket]) { -+ ce = list_entry(l, struct mb_cache_entry, e_block_list); -+ if (ce->e_bdev == bdev && ce->e_block == block) { -+ ce = __mb_cache_entry_read(ce); -+ goto cleanup; -+ } -+ } -+ ce = NULL; -+ -+cleanup: -+ spin_unlock(&mb_cache_spinlock); -+ return ce; -+} -+ -+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) -+ -+static struct mb_cache_entry * -+__mb_cache_entry_find(struct list_head *l, struct list_head *head, -+ int index, struct block_device *bdev, unsigned int key) -+{ -+ while (l != head) { -+ struct mb_cache_entry *ce = -+ list_entry(l, struct mb_cache_entry, -+ e_indexes[index].o_list); -+ if (ce->e_bdev == bdev && -+ ce->e_indexes[index].o_key == key) { -+ ce = __mb_cache_entry_read(ce); -+ if (ce) -+ return ce; -+ } -+ l = l->next; -+ } -+ return NULL; -+} -+ -+ -+/* -+ * mb_cache_entry_find_first() -+ * -+ * Find the first cache entry on a given device with a certain key in -+ * an additional index. Additonal matches can be found with -+ * mb_cache_entry_find_next(). Returns NULL if no match was found. 
-+ * -+ * @cache: the cache to search -+ * @index: the number of the additonal index to search (0<=indexc_bucket_bits); -+ struct list_head *l; -+ struct mb_cache_entry *ce; -+ -+ mb_assert(index < mb_cache_indexes(cache)); -+ spin_lock(&mb_cache_spinlock); -+ l = cache->c_indexes_hash[index][bucket].next; -+ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], -+ index, bdev, key); -+ spin_unlock(&mb_cache_spinlock); -+ return ce; -+} -+ -+ -+/* -+ * mb_cache_entry_find_next() -+ * -+ * Find the next cache entry on a given device with a certain key in an -+ * additional index. Returns NULL if no match could be found. The previous -+ * entry is atomatically released, so that mb_cache_entry_find_next() can -+ * be called like this: -+ * -+ * entry = mb_cache_entry_find_first(); -+ * while (entry) { -+ * ... -+ * entry = mb_cache_entry_find_next(entry, ...); -+ * } -+ * -+ * @prev: The previous match -+ * @index: the number of the additonal index to search (0<=indexe_cache; -+ unsigned int bucket = hash_long(key, cache->c_bucket_bits); -+ struct list_head *l; -+ struct mb_cache_entry *ce; -+ -+ mb_assert(index < mb_cache_indexes(cache)); -+ spin_lock(&mb_cache_spinlock); -+ l = prev->e_indexes[index].o_list.next; -+ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], -+ index, bdev, key); -+ __mb_cache_entry_release_unlock(prev); -+ return ce; -+} -+ -+#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ -diff -Nru a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h ---- a/include/linux/ext3_fs.h Sun Dec 8 02:49:56 2002 -+++ b/include/linux/ext3_fs.h Sun Dec 8 02:49:56 2002 -@@ -64,8 +64,6 @@ - */ - #define EXT3_BAD_INO 1 /* Bad blocks inode */ - #define EXT3_ROOT_INO 2 /* Root inode */ --#define EXT3_ACL_IDX_INO 3 /* ACL inode */ --#define EXT3_ACL_DATA_INO 4 /* ACL inode */ - #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ - #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ - #define 
EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ -@@ -95,7 +93,6 @@ - #else - # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) - #endif --#define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry)) - #define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) - #ifdef __KERNEL__ - # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) -@@ -130,28 +127,6 @@ - #endif - - /* -- * ACL structures -- */ --struct ext3_acl_header /* Header of Access Control Lists */ --{ -- __u32 aclh_size; -- __u32 aclh_file_count; -- __u32 aclh_acle_count; -- __u32 aclh_first_acle; --}; -- --struct ext3_acl_entry /* Access Control List Entry */ --{ -- __u32 acle_size; -- __u16 acle_perms; /* Access permissions */ -- __u16 acle_type; /* Type of entry */ -- __u16 acle_tag; /* User or group identity */ -- __u16 acle_pad1; -- __u32 acle_next; /* Pointer on next entry for the */ -- /* same inode or on next free entry */ --}; -- --/* - * Structure of a blocks group descriptor - */ - struct ext3_group_desc -@@ -347,6 +322,7 @@ - #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ - #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ - #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ -+#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ - - /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - #ifndef _LINUX_EXT2_FS_H -@@ -529,7 +505,7 @@ - #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ - #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ - --#define EXT3_FEATURE_COMPAT_SUPP 0 -+#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR - #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ - EXT3_FEATURE_INCOMPAT_RECOVER) - #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ -@@ -713,6 +689,7 @@ - - - /* inode.c */ -+extern int ext3_forget(handle_t *, int, 
struct inode *, struct buffer_head *, int); - extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); - extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); - -@@ -781,8 +758,10 @@ - - /* namei.c */ - extern struct inode_operations ext3_dir_inode_operations; -+extern struct inode_operations ext3_special_inode_operations; - - /* symlink.c */ -+extern struct inode_operations ext3_symlink_inode_operations; - extern struct inode_operations ext3_fast_symlink_inode_operations; - - -diff -Nru a/include/linux/ext3_jbd.h b/include/linux/ext3_jbd.h ---- a/include/linux/ext3_jbd.h Sun Dec 8 02:49:56 2002 -+++ b/include/linux/ext3_jbd.h Sun Dec 8 02:49:56 2002 -@@ -30,13 +30,19 @@ - - #define EXT3_SINGLEDATA_TRANS_BLOCKS 8 - -+/* Extended attributes may touch two data buffers, two bitmap buffers, -+ * and two group and summaries. */ -+ -+#define EXT3_XATTR_TRANS_BLOCKS 8 -+ - /* Define the minimum size for a transaction which modifies data. This - * needs to take into account the fact that we may end up modifying two - * quota files too (one for the group, one for the user quota). The - * superblock only gets updated once, of course, so don't bother - * counting that again for the quota updates. 
*/ - --#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2) -+#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \ -+ EXT3_XATTR_TRANS_BLOCKS - 2) - - extern int ext3_writepage_trans_blocks(struct inode *inode); - -diff -Nru a/include/linux/mbcache.h b/include/linux/mbcache.h ---- /dev/null Wed Dec 31 16:00:00 1969 -+++ b/include/linux/mbcache.h Sun Dec 8 02:49:56 2002 -@@ -0,0 +1,72 @@ -+/* -+ File: linux/mbcache.h -+ -+ (C) 2001 by Andreas Gruenbacher, -+*/ -+ -+/* Hardwire the number of additional indexes */ -+#define MB_CACHE_INDEXES_COUNT 1 -+ -+struct mb_cache_entry; -+ -+struct mb_cache_op { -+ void (*free)(struct mb_cache_entry *); -+}; -+ -+struct mb_cache { -+ struct list_head c_cache_list; -+ const char *c_name; -+ struct mb_cache_op c_op; -+ atomic_t c_entry_count; -+ int c_bucket_bits; -+#ifndef MB_CACHE_INDEXES_COUNT -+ int c_indexes_count; -+#endif -+ kmem_cache_t *c_entry_cache; -+ struct list_head *c_block_hash; -+ struct list_head *c_indexes_hash[0]; -+}; -+ -+struct mb_cache_entry_index { -+ struct list_head o_list; -+ unsigned int o_key; -+}; -+ -+struct mb_cache_entry { -+ struct list_head e_lru_list; -+ struct mb_cache *e_cache; -+ atomic_t e_used; -+ struct block_device *e_bdev; -+ sector_t e_block; -+ struct list_head e_block_list; -+ struct mb_cache_entry_index e_indexes[0]; -+}; -+ -+/* Functions on caches */ -+ -+struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t, -+ int, int); -+void mb_cache_shrink(struct mb_cache *, struct block_device *); -+void mb_cache_destroy(struct mb_cache *); -+ -+/* Functions on cache entries */ -+ -+struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *); -+int mb_cache_entry_insert(struct mb_cache_entry *, struct block_device *, -+ sector_t, unsigned int[]); -+void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]); -+void mb_cache_entry_release(struct mb_cache_entry *); -+void mb_cache_entry_takeout(struct mb_cache_entry *); -+void 
mb_cache_entry_free(struct mb_cache_entry *); -+struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *); -+struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, -+ struct block_device *, -+ sector_t); -+#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) -+struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int, -+ struct block_device *, -+ unsigned int); -+struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int, -+ struct block_device *, -+ unsigned int); -+#endif diff --git a/lustre/kernel_patches/patches/ext3_orphan_lock-2.4.20-rh.patch b/lustre/kernel_patches/patches/ext3_orphan_lock-2.4.20-rh.patch new file mode 100644 index 0000000..d029650 --- /dev/null +++ b/lustre/kernel_patches/patches/ext3_orphan_lock-2.4.20-rh.patch @@ -0,0 +1,82 @@ + fs/ext3/namei.c | 15 +++++++-------- + fs/ext3/super.c | 1 + + include/linux/ext3_fs_sb.h | 1 + + 3 files changed, 9 insertions(+), 8 deletions(-) + +--- linux-rh-2.4.20-8/fs/ext3/namei.c~ext3_orphan_lock-2.4.20-rh 2003-05-05 19:49:15.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext3/namei.c 2003-05-05 20:01:28.000000000 +0800 +@@ -1747,8 +1747,8 @@ int ext3_orphan_add(handle_t *handle, st + struct super_block *sb = inode->i_sb; + struct ext3_iloc iloc; + int err = 0, rc; +- +- lock_super(sb); ++ ++ down(&EXT3_SB(sb)->s_orphan_lock); + if (!list_empty(&EXT3_I(inode)->i_orphan)) + goto out_unlock; + +@@ -1796,7 +1796,7 @@ int ext3_orphan_add(handle_t *handle, st + jbd_debug(4, "orphan inode %ld will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); + out_unlock: +- unlock_super(sb); ++ up(&EXT3_SB(sb)->s_orphan_lock); + ext3_std_error(inode->i_sb, err); + return err; + } +@@ -1809,20 +1809,19 @@ int ext3_orphan_del(handle_t *handle, st + { + struct list_head *prev; + struct ext3_inode_info *ei = EXT3_I(inode); +- struct ext3_sb_info *sbi; ++ struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb); + unsigned long ino_next; + struct ext3_iloc iloc; + 
int err = 0; + +- lock_super(inode->i_sb); ++ down(&sbi->s_orphan_lock); + if (list_empty(&ei->i_orphan)) { +- unlock_super(inode->i_sb); ++ up(&sbi->s_orphan_lock); + return 0; + } + + ino_next = NEXT_ORPHAN(inode); + prev = ei->i_orphan.prev; +- sbi = EXT3_SB(inode->i_sb); + + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + +@@ -1871,7 +1870,7 @@ int ext3_orphan_del(handle_t *handle, st + out_err: + ext3_std_error(inode->i_sb, err); + out: +- unlock_super(inode->i_sb); ++ up(&sbi->s_orphan_lock); + return err; + + out_brelse: +--- linux-rh-2.4.20-8/fs/ext3/super.c~ext3_orphan_lock-2.4.20-rh 2003-05-05 19:49:15.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext3/super.c 2003-05-05 19:54:09.000000000 +0800 +@@ -1151,6 +1151,7 @@ struct super_block * ext3_read_super (st + */ + sb->s_op = &ext3_sops; + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ ++ sema_init(&sbi->s_orphan_lock, 1); + + sb->s_root = 0; + +--- linux-rh-2.4.20-8/include/linux/ext3_fs_sb.h~ext3_orphan_lock-2.4.20-rh 2003-05-05 19:49:07.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/linux/ext3_fs_sb.h 2003-05-05 19:54:09.000000000 +0800 +@@ -69,6 +69,7 @@ struct ext3_sb_info { + struct inode * s_journal_inode; + struct journal_s * s_journal; + struct list_head s_orphan; ++ struct semaphore s_orphan_lock; + unsigned long s_commit_interval; + struct block_device *journal_bdev; + #ifdef CONFIG_JBD_DEBUG + +_ diff --git a/lustre/kernel_patches/patches/extN-2.4.18-ino_sb_fixup.patch b/lustre/kernel_patches/patches/extN-2.4.18-ino_sb_fixup.patch new file mode 100644 index 0000000..df46643 --- /dev/null +++ b/lustre/kernel_patches/patches/extN-2.4.18-ino_sb_fixup.patch @@ -0,0 +1,33 @@ +--- ./include/linux/ext3_fs.h.orig Tue May 7 17:06:03 2002 ++++ ./include/linux/ext3_fs.h Tue May 7 17:07:11 2002 +@@ -17,6 +17,8 @@ + #define _LINUX_EXT3_FS_H + + #include ++#include ++#include + + /* + * The second extended filesystem constants/structures +@@ -86,8 +88,8 @@ + #define 
EXT3_MIN_BLOCK_LOG_SIZE 10 + + #ifdef __KERNEL__ +-#define EXT3_SB(sb) (&((sb)->u.ext3_sb)) +-#define EXT3_I(inode) (&((inode)->u.ext3_i)) ++#define EXT3_SB(sb) ((struct ext3_sb_info *)&((sb)->u.generic_sbp)) ++#define EXT3_I(inode) ((struct ext3_inode_info *)&((inode)->u.generic_ip)) + + #define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) + #define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +@@ -447,7 +447,9 @@ + #define NEXT_ORPHAN(inode) EXT3_I(inode)->i_dtime + static inline struct inode *orphan_list_entry(struct list_head *l) + { +- return list_entry(l, struct inode, u.ext3_i.i_orphan); ++ return ((struct inode *)((char *)l - ++ (unsigned long)(offsetof(struct inode, u.generic_ip) + ++ offsetof(struct ext3_inode_info, i_orphan)))); + } + + /* diff --git a/lustre/kernel_patches/patches/extN-delete_thread.patch b/lustre/kernel_patches/patches/extN-delete_thread.patch new file mode 100644 index 0000000..4248b5c --- /dev/null +++ b/lustre/kernel_patches/patches/extN-delete_thread.patch @@ -0,0 +1,278 @@ + 0 files changed + +--- linux-2.4.18-p4smp-61chaos/include/linux/ext3_fs.h~extN-delete_thread 2003-05-29 10:19:15.000000000 +0800 ++++ linux-2.4.18-p4smp-61chaos-root/include/linux/ext3_fs.h 2003-05-29 10:50:04.000000000 +0800 +@@ -190,6 +190,7 @@ struct ext3_group_desc + */ + #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ + #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ ++#define EXT3_STATE_DELETE 0x00000010 /* deferred delete inode */ + + /* + * ioctl commands +--- linux-2.4.18-p4smp-61chaos/include/linux/ext3_fs_sb.h~extN-delete_thread 2003-05-29 10:19:15.000000000 +0800 ++++ linux-2.4.18-p4smp-61chaos-root/include/linux/ext3_fs_sb.h 2003-05-29 10:50:04.000000000 +0800 +@@ -29,6 +29,8 @@ + + #define EXT3_MAX_GROUP_LOADED 32 + ++#define EXT3_DELETE_THREAD ++ + /* + * third extended-fs super-block data in memory + */ +@@ -74,6 +76,14 @@ struct ext3_sb_info { + struct timer_list turn_ro_timer; /* For turning read-only (crash 
simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif ++#ifdef EXT3_DELETE_THREAD ++ spinlock_t s_delete_lock; ++ struct list_head s_delete_list; ++ unsigned long s_delete_blocks; ++ unsigned long s_delete_inodes; ++ wait_queue_head_t s_delete_thread_queue; ++ wait_queue_head_t s_delete_waiter_queue; ++#endif + }; + + #endif /* _LINUX_EXT3_FS_SB */ +--- linux-2.4.18-p4smp-61chaos/fs/ext3/super.c~extN-delete_thread 2003-05-29 10:19:15.000000000 +0800 ++++ linux-2.4.18-p4smp-61chaos-root/fs/ext3/super.c 2003-05-29 10:50:04.000000000 +0800 +@@ -398,6 +398,207 @@ static void dump_orphan_list(struct supe + } + } + ++#ifdef EXT3_DELETE_THREAD ++/* ++ * Delete inodes in a loop until there are no more to be deleted. ++ * Normally, we run in the background doing the deletes and sleeping again, ++ * and clients just add new inodes to be deleted onto the end of the list. ++ * If someone is concerned about free space (e.g. block allocation or similar) ++ * then they can sleep on s_delete_waiter_queue and be woken up when space ++ * has been freed. 
++ */ ++int ext3_delete_thread(void *data) ++{ ++ struct super_block *sb = data; ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ struct task_struct *tsk = current; ++ ++ /* Almost like daemonize, but not quite */ ++ exit_mm(current); ++ tsk->session = 1; ++ tsk->pgrp = 1; ++ tsk->tty = NULL; ++ exit_files(current); ++ reparent_to_init(); ++ ++ sprintf(tsk->comm, "kdelext3-%s", kdevname(sb->s_dev)); ++ sigfillset(&tsk->blocked); ++ ++ tsk->flags |= PF_KERNTHREAD; ++ ++ INIT_LIST_HEAD(&sbi->s_delete_list); ++ wake_up(&sbi->s_delete_waiter_queue); ++ printk(KERN_INFO "EXT3-fs: delete thread on %s started\n", ++ kdevname(sb->s_dev)); ++ ++ /* main loop */ ++ for (;;) { ++ sleep_on(&sbi->s_delete_thread_queue); ++ printk(KERN_DEBUG "%s woken up: %lu inodes, %lu blocks\n", ++ tsk->comm, sbi->s_delete_inodes, sbi->s_delete_blocks); ++ ++ spin_lock(&sbi->s_delete_lock); ++ if (list_empty(&sbi->s_delete_list)) { ++ memset(&sbi->s_delete_list, 0, ++ sizeof(sbi->s_delete_list)); ++ spin_unlock(&sbi->s_delete_lock); ++ printk(KERN_DEBUG "ext3 delete thread on %s exiting\n", ++ kdevname(sb->s_dev)); ++ wake_up(&sbi->s_delete_waiter_queue); ++ break; ++ } ++ ++ while (!list_empty(&sbi->s_delete_list)) { ++ struct inode *inode=list_entry(sbi->s_delete_list.next, ++ struct inode, i_dentry); ++ unsigned long blocks = inode->i_blocks >> ++ (inode->i_blkbits - 9); ++ ++ list_del_init(&inode->i_dentry); ++ spin_unlock(&sbi->s_delete_lock); ++ printk(KERN_DEBUG "%s delete ino %lu blk %lu\n", ++ tsk->comm, inode->i_ino, blocks); ++ ++ iput(inode); ++ ++ spin_lock(&sbi->s_delete_lock); ++ sbi->s_delete_blocks -= blocks; ++ sbi->s_delete_inodes--; ++ } ++ if (sbi->s_delete_blocks != 0 || sbi->s_delete_inodes != 0) ++ printk(KERN_WARNING ++ "%lu blocks and %lu left on list?\n", ++ sbi->s_delete_blocks, sbi->s_delete_inodes); ++ sbi->s_delete_blocks = 0; ++ sbi->s_delete_inodes = 0; ++ spin_unlock(&sbi->s_delete_lock); ++ wake_up(&sbi->s_delete_waiter_queue); ++ } ++ ++ return 0; ++} ++ 
++static void ext3_start_delete_thread(struct super_block *sb) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(sb); ++ int rc; ++ ++ spin_lock_init(&sbi->s_delete_lock); ++ memset(&sbi->s_delete_list, 0, sizeof(sbi->s_delete_list)); ++ init_waitqueue_head(&sbi->s_delete_thread_queue); ++ init_waitqueue_head(&sbi->s_delete_waiter_queue); ++ sbi->s_delete_blocks = 0; ++ sbi->s_delete_inodes = 0; ++ rc = kernel_thread(ext3_delete_thread, sb, CLONE_VM | CLONE_FILES); ++ if (rc < 0) ++ printk(KERN_ERR "EXT3-fs: cannot start delete thread: rc %d\n", ++ rc); ++ else ++ wait_event(sbi->s_delete_waiter_queue, sbi->s_delete_list.next); ++} ++ ++static void ext3_stop_delete_thread(struct ext3_sb_info *sbi) ++{ ++ wake_up(&sbi->s_delete_thread_queue); ++ wait_event(sbi->s_delete_waiter_queue, list_empty(&sbi->s_delete_list)); ++} ++ ++/* Instead of playing games with the inode flags, destruction, etc we just ++ * duplicate the inode data locally and put it on a list for the truncate ++ * thread. We need large parts of the inode struct in order to complete ++ * the truncate and unlink, so we may as well just copy the whole thing. ++ * ++ * If we have any problem deferring the delete, just delete it right away. ++ * If we defer it, we also mark how many blocks it would free, so that we ++ * can keep the statfs data correct, and we know if we should sleep on the ++ * truncate thread when we run out of space. ++ * ++ * One shouldn't consider this duplicate an "inode", as it isn't really ++ * visible to the VFS, but rather a data struct that holds truncate data. ++ * ++ * In 2.5 this can be done much more cleanly by just registering a "drop" ++ * method in the super_operations struct. 
++ */ ++static void ext3_delete_inode_thread(struct inode *old_inode) ++{ ++ struct ext3_sb_info *sbi = EXT3_SB(old_inode->i_sb); ++ struct inode *new_inode; ++ unsigned long blocks = old_inode->i_blocks >> (old_inode->i_blkbits-9); ++ ++ if (is_bad_inode(old_inode)) { ++ clear_inode(old_inode); ++ return; ++ } ++ ++ /* We may want to delete the inode immediately and not defer it */ ++ if (IS_SYNC(old_inode) || blocks <= EXT3_NDIR_BLOCKS || ++ !sbi->s_delete_list.next) { ++ ext3_delete_inode(old_inode); ++ return; ++ } ++ ++ if (EXT3_I(old_inode)->i_state & EXT3_STATE_DELETE) { ++ ext3_debug("doing deferred inode %lu delete (%lu blocks)\n", ++ old_inode->i_ino, blocks); ++ ext3_delete_inode(old_inode); ++ return; ++ } ++ ++ /* We can iget this inode again here, because our caller has unhashed ++ * old_inode, so new_inode will be in a different inode struct. ++ * ++ * We need to ensure that the i_orphan pointers in the other inodes ++ * point at the new inode copy instead of the old one so the orphan ++ * list doesn't get corrupted when the old orphan inode is freed. ++ */ ++ down(&sbi->s_orphan_lock); ++ ++ EXT3_SB(old_inode->i_sb)->s_mount_state |= EXT3_ORPHAN_FS; ++ new_inode = iget(old_inode->i_sb, old_inode->i_ino); ++ EXT3_SB(old_inode->i_sb)->s_mount_state &= ~EXT3_ORPHAN_FS; ++ if (is_bad_inode(new_inode)) { ++ printk(KERN_WARNING "read bad inode %lu\n", old_inode->i_ino); ++ iput(new_inode); ++ new_inode = NULL; ++ } ++ if (!new_inode) { ++ up(&sbi->s_orphan_lock); ++ ext3_debug(KERN_DEBUG "delete inode %lu directly (bad read)\n", ++ old_inode->i_ino); ++ ext3_delete_inode(old_inode); ++ return; ++ } ++ J_ASSERT(new_inode != old_inode); ++ ++ J_ASSERT(!list_empty(&EXT3_I(old_inode)->i_orphan)); ++ /* Ugh. We need to insert new_inode into the same spot on the list ++ * as old_inode was, to ensure the in-memory orphan list is still ++ * the same as the on-disk orphan list. 
++ */ ++ EXT3_I(new_inode)->i_orphan = EXT3_I(old_inode)->i_orphan; ++ EXT3_I(new_inode)->i_orphan.next->prev = &EXT3_I(new_inode)->i_orphan; ++ EXT3_I(new_inode)->i_orphan.prev->next = &EXT3_I(new_inode)->i_orphan; ++ EXT3_I(new_inode)->i_state |= EXT3_STATE_DELETE; ++ up(&sbi->s_orphan_lock); ++ ++ clear_inode(old_inode); ++ ++ printk(KERN_DEBUG "delete inode %lu (%lu blocks) by thread\n", ++ new_inode->i_ino, blocks); ++ spin_lock(&sbi->s_delete_lock); ++ J_ASSERT(list_empty(&new_inode->i_dentry)); ++ list_add_tail(&new_inode->i_dentry, &sbi->s_delete_list); ++ sbi->s_delete_blocks += blocks; ++ sbi->s_delete_inodes++; ++ spin_unlock(&sbi->s_delete_lock); ++ ++ wake_up(&sbi->s_delete_thread_queue); ++} ++#else ++#define ext3_start_delete_thread(sbi) do {} while(0) ++#define ext3_stop_delete_thread(sbi) do {} while(0) ++#endif /* EXT3_DELETE_THREAD */ ++ + void ext3_put_super (struct super_block * sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); +@@ -405,6 +606,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_stop_delete_thread(sbi); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -453,7 +655,11 @@ static struct super_operations ext3_sops + write_inode: ext3_write_inode, /* BKL not held. Don't need */ + dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ + put_inode: ext3_put_inode, /* BKL not held. Don't need */ ++#ifdef EXT3_DELETE_THREAD ++ delete_inode: ext3_delete_inode_thread,/* BKL not held. We take it */ ++#else + delete_inode: ext3_delete_inode, /* BKL not held. We take it */ ++#endif + put_super: ext3_put_super, /* BKL held */ + write_super: ext3_write_super, /* BKL held */ + sync_fs: ext3_sync_fs, +@@ -1209,6 +1415,7 @@ struct super_block * ext3_read_super (st + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); ++ ext3_start_delete_thread(sb); + /* + * akpm: core read_super() calls in here with the superblock locked. 
+ * That deadlocks, because orphan cleanup needs to lock the superblock + +_ diff --git a/lustre/extN/extN-iget-debug.diff b/lustre/kernel_patches/patches/extN-iget-debug.patch similarity index 78% rename from lustre/extN/extN-iget-debug.diff rename to lustre/kernel_patches/patches/extN-iget-debug.patch index 9714e35..dbe90c8 100644 --- a/lustre/extN/extN-iget-debug.diff +++ b/lustre/kernel_patches/patches/extN-iget-debug.patch @@ -4,7 +4,7 @@ return ret; } -+static int extN_find_inode(struct inode *inode, unsigned long ino, ++static int ext3_find_inode(struct inode *inode, unsigned long ino, + void *opaque) +{ + const char *name = NULL; @@ -22,15 +22,15 @@ + return 1; +} + - static struct dentry *extN_lookup(struct inode * dir, struct dentry *dentry) + static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) { struct inode * inode; -@@ -724,7 +742,7 @@ +@@ -724,8 +742,8 @@ if (bh) { unsigned long ino = le32_to_cpu(de->inode); brelse (bh); - inode = iget(dir->i_sb, ino); -+ inode = iget4(dir->i_sb, ino, extN_find_inode, dentry); ++ inode = iget4(dir->i_sb, ino, ext3_find_inode, dentry); if (!inode) return ERR_PTR(-EACCES); @@ -38,11 +38,11 @@ +++ linux/fs/ext3/inode.c Sat Feb 1 00:34:45 2003 @@ -166,6 +166,9 @@ */ - void extN_put_inode (struct inode * inode) + void ext3_put_inode (struct inode * inode) { + printk(KERN_INFO "putting inode %s:%lu (%p) count %d\n", + kdevname(inode->i_dev), inode->i_ino, inode, + atomic_read(&inode->i_count)); - extN_discard_prealloc (inode); + ext3_discard_prealloc (inode); } diff --git a/lustre/extN/extN-misc-fixup.diff b/lustre/kernel_patches/patches/extN-misc-fixup.patch similarity index 58% rename from lustre/extN/extN-misc-fixup.diff rename to lustre/kernel_patches/patches/extN-misc-fixup.patch index db0bc0f..06ea72a 100644 --- a/lustre/extN/extN-misc-fixup.diff +++ b/lustre/kernel_patches/patches/extN-misc-fixup.patch @@ -1,23 +1,23 @@ ---- linux-2.4.17/fs/extN/super.c.orig Fri Dec 21 10:41:55 2001 -+++ 
linux-2.4.17/fs/extN/super.c Fri Mar 22 11:00:41 2002 +--- linux-2.4.17/fs/ext3/super.c.orig Fri Dec 21 10:41:55 2001 ++++ linux-2.4.17/fs/ext3/super.c Fri Mar 22 11:00:41 2002 @@ -1344,10 +1342,10 @@ - printk(KERN_ERR "EXTN-fs: I/O error on journal device\n"); + printk(KERN_ERR "EXT3-fs: I/O error on journal device\n"); goto out_journal; } - if (ntohl(journal->j_superblock->s_nr_users) != 1) { + if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { - printk(KERN_ERR "EXTN-fs: External journal has more than one " + printk(KERN_ERR "EXT3-fs: External journal has more than one " "user (unsupported) - %d\n", - ntohl(journal->j_superblock->s_nr_users)); + be32_to_cpu(journal->j_superblock->s_nr_users)); goto out_journal; } - EXTN_SB(sb)->journal_bdev = bdev; + EXT3_SB(sb)->journal_bdev = bdev; @@ -1560,6 +1560,7 @@ unlock_kernel(); return ret; } -+EXPORT_SYMBOL(extN_force_commit); /* here to avoid potential patch collisions */ ++EXPORT_SYMBOL(ext3_force_commit); /* here to avoid potential patch collisions */ /* * Ext3 always journals updates to the superblock itself, so we don't diff --git a/lustre/extN/extN-noread.diff b/lustre/kernel_patches/patches/extN-noread.patch similarity index 54% rename from lustre/extN/extN-noread.diff rename to lustre/kernel_patches/patches/extN-noread.patch index 56220e2..63f4463 100644 --- a/lustre/extN/extN-noread.diff +++ b/lustre/kernel_patches/patches/extN-noread.patch @@ -1,7 +1,11 @@ -diff -ru lustre-head/fs/extN/ialloc.c lustre/fs/extN/ialloc.c ---- lustre-head/fs/extN/ialloc.c Mon Dec 23 10:02:58 2002 -+++ lustre/fs/extN/ialloc.c Mon Dec 23 09:46:20 2002 -@@ -289,6 +289,37 @@ + fs/ext3/ialloc.c | 47 +++++++++++++++++++++- + fs/ext3/inode.c | 99 ++++++++++++++++++++++++++++++++++++------------ + include/linux/ext3_fs.h | 2 + 3 files changed, 122 insertions(+), 26 deletions(-) + +--- linux-2.4.18-chaos52/fs/ext3/ialloc.c~extN-noread 2003-05-16 12:26:29.000000000 +0800 ++++ linux-2.4.18-chaos52-root/fs/ext3/ialloc.c 2003-05-16 
12:26:31.000000000 +0800 +@@ -289,6 +289,37 @@ error_return: } /* @@ -12,7 +16,7 @@ diff -ru lustre-head/fs/extN/ialloc.c lustre/fs/extN/ialloc.c + * + * Caller must be holding superblock lock (group/bitmap read lock in future). + */ -+int extN_itable_block_used(struct super_block *sb, unsigned int block_group, ++int ext3_itable_block_used(struct super_block *sb, unsigned int block_group, + int offset) +{ + int bitmap_nr = load_inode_bitmap(sb, block_group); @@ -23,12 +27,12 @@ diff -ru lustre-head/fs/extN/ialloc.c lustre/fs/extN/ialloc.c + if (bitmap_nr < 0) + return 1; + -+ inodes_per_block = sb->s_blocksize / EXTN_SB(sb)->s_inode_size; ++ inodes_per_block = sb->s_blocksize / EXT3_SB(sb)->s_inode_size; + inum = offset & ~(inodes_per_block - 1); + iend = inum + inodes_per_block; -+ ibitmap = EXTN_SB(sb)->s_inode_bitmap[bitmap_nr]; ++ ibitmap = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr]; + for (; inum < iend; inum++) { -+ if (inum != offset && extN_test_bit(inum, ibitmap->b_data)) ++ if (inum != offset && ext3_test_bit(inum, ibitmap->b_data)) + return 1; + } + @@ -39,70 +43,69 @@ diff -ru lustre-head/fs/extN/ialloc.c lustre/fs/extN/ialloc.c * There are two policies for allocating an inode. 
If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of -@@ -312,6 +343,7 @@ - struct extN_group_desc * gdp; - struct extN_group_desc * tmp; - struct extN_super_block * es; -+ struct extN_iloc iloc; +@@ -312,6 +343,7 @@ struct inode * ext3_new_inode (handle_t + struct ext3_group_desc * gdp; + struct ext3_group_desc * tmp; + struct ext3_super_block * es; ++ struct ext3_iloc iloc; int err = 0; /* Cannot create files in a deleted directory */ -@@ -505,7 +538,7 @@ +@@ -505,7 +537,7 @@ repeat: ei->i_prealloc_count = 0; #endif ei->i_block_group = i; - + - if (ei->i_flags & EXTN_SYNC_FL) + if (ei->i_flags & EXT3_SYNC_FL) inode->i_flags |= S_SYNC; if (IS_SYNC(inode)) -@@ -514,9 +547,18 @@ +@@ -514,9 +546,18 @@ repeat: inode->i_generation = sbi->s_next_generation++; - ei->i_state = EXTN_STATE_NEW; -- err = extN_mark_inode_dirty(handle, inode); -+ err = extN_get_inode_loc_new(inode, &iloc, 1); + ei->i_state = EXT3_STATE_NEW; +- err = ext3_mark_inode_dirty(handle, inode); ++ err = ext3_get_inode_loc_new(inode, &iloc, 1); if (err) goto fail; - + BUFFER_TRACE(iloc->bh, "get_write_access"); -+ err = extN_journal_get_write_access(handle, iloc.bh); ++ err = ext3_journal_get_write_access(handle, iloc.bh); + if (err) { + brelse(iloc.bh); + iloc.bh = NULL; + goto fail; + } -+ err = extN_mark_iloc_dirty(handle, inode, &iloc); ++ err = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (err) goto fail; + unlock_super (sb); if(DQUOT_ALLOC_INODE(inode)) { DQUOT_DROP(inode); -diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c ---- lustre-head/fs/extN/inode.c Mon Dec 23 10:02:58 2002 -+++ lustre/fs/extN/inode.c Mon Dec 23 09:50:25 2002 -@@ -2011,23 +1994,28 @@ - extN_journal_stop(handle, inode); +--- linux-2.4.18-chaos52/fs/ext3/inode.c~extN-noread 2003-05-16 12:26:29.000000000 +0800 ++++ linux-2.4.18-chaos52-root/fs/ext3/inode.c 2003-05-16 12:27:06.000000000 +0800 +@@ 
-2011,23 +2011,28 @@ out_stop: + ext3_journal_stop(handle, inode); } -/* -- * extN_get_inode_loc returns with an extra refcount against the +- * ext3_get_inode_loc returns with an extra refcount against the - * inode's underlying buffer_head on success. - */ +#define NUM_INODE_PREREAD 16 --int extN_get_inode_loc (struct inode *inode, struct extN_iloc *iloc) +-int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc) +/* -+ * extN_get_inode_loc returns with an extra refcount against the inode's ++ * ext3_get_inode_loc returns with an extra refcount against the inode's + * underlying buffer_head on success. If this is for a new inode allocation + * (new is non-zero) then we may be able to optimize away the read if there + * are no other in-use inodes in this inode table block. If we need to do + * a read, then read in a whole chunk of blocks to avoid blocking again soon + * if we are doing lots of creates/updates. + */ -+int extN_get_inode_loc_new(struct inode *inode, struct extN_iloc *iloc, int new) ++int ext3_get_inode_loc_new(struct inode *inode, struct ext3_iloc *iloc, int new) { struct super_block *sb = inode->i_sb; - struct extN_sb_info *sbi = EXTN_SB(sb); + struct ext3_sb_info *sbi = EXT3_SB(sb); - struct buffer_head *bh = 0; + struct buffer_head *bh[NUM_INODE_PREREAD]; unsigned long block; @@ -110,25 +113,25 @@ diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c unsigned long group_desc; unsigned long desc; unsigned long offset; - struct extN_group_desc * gdp; + struct ext3_group_desc * gdp; - + - if ((inode->i_ino != EXTN_ROOT_INO && - inode->i_ino != EXTN_JOURNAL_INO && - inode->i_ino < EXTN_FIRST_INO(sb)) || -@@ -2042,38 +2034,86 @@ + if ((inode->i_ino != EXT3_ROOT_INO && + inode->i_ino != EXT3_JOURNAL_INO && + inode->i_ino < EXT3_FIRST_INO(sb)) || +@@ -2042,38 +2047,86 @@ int ext3_get_inode_loc (struct inode *in } group_desc = block_group >> sbi->s_desc_per_block_bits; desc = block_group & (sbi->s_desc_per_block - 1); - bh = 
sbi->s_group_desc[group_desc]; - if (!bh) { + if (!sbi->s_group_desc[group_desc]) { - extN_error(sb, __FUNCTION__, "Descriptor not loaded"); + ext3_error(sb, __FUNCTION__, "Descriptor not loaded"); goto bad_inode; } -- gdp = (struct extN_group_desc *) bh->b_data; -+ gdp = (struct extN_group_desc *)(sbi->s_group_desc[group_desc]->b_data); +- gdp = (struct ext3_group_desc *) bh->b_data; ++ gdp = (struct ext3_group_desc *)(sbi->s_group_desc[group_desc]->b_data); + /* * Figure out the offset within the block group inode table @@ -138,13 +141,13 @@ diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c + offset = ((inode->i_ino - 1) % sbi->s_inodes_per_group); + block = le32_to_cpu(gdp[desc].bg_inode_table) + -- (offset >> EXTN_BLOCK_SIZE_BITS(sb)); +- (offset >> EXT3_BLOCK_SIZE_BITS(sb)); - if (!(bh = sb_bread(sb, block))) { -- extN_error (sb, __FUNCTION__, +- ext3_error (sb, __FUNCTION__, - "unable to read inode block - " - "inode=%lu, block=%lu", inode->i_ino, block); - goto bad_inode; -+ (offset * sbi->s_inode_size >> EXTN_BLOCK_SIZE_BITS(sb)); ++ (offset * sbi->s_inode_size >> EXT3_BLOCK_SIZE_BITS(sb)); + + bh[0] = sb_getblk(sb, block); + if (buffer_uptodate(bh[0])) @@ -154,7 +157,7 @@ diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c + * in memory, then we just zero it out. Otherwise, we keep the + * current block contents (deleted inode data) for posterity. 
+ */ -+ if (new && !extN_itable_block_used(sb, block_group, offset)) { ++ if (new && !ext3_itable_block_used(sb, block_group, offset)) { + lock_buffer(bh[0]); + memset(bh[0]->b_data, 0, bh[0]->b_size); + mark_buffer_uptodate(bh[0], 1); @@ -169,7 +172,7 @@ diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c + if (block_end > itable_end) + block_end = itable_end; + -+ for (; block < block_end; block++) { ++ for (++block; block < block_end; block++) { + bh[count] = sb_getblk(sb, block); + if (count && (buffer_uptodate(bh[count]) || + buffer_locked(bh[count]))) { @@ -186,21 +189,21 @@ diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c + + wait_on_buffer(bh[0]); + if (!buffer_uptodate(bh[0])) { -+ extN_error(sb, __FUNCTION__, ++ ext3_error(sb, __FUNCTION__, + "unable to read inode block - " + "inode=%lu, block=%lu", inode->i_ino, + bh[0]->b_blocknr); + goto bad_inode; + } } -- offset &= (EXTN_BLOCK_SIZE(sb) - 1); +- offset &= (EXT3_BLOCK_SIZE(sb) - 1); + done: -+ offset = (offset * sbi->s_inode_size) & (EXTN_BLOCK_SIZE(sb) - 1); ++ offset = (offset * sbi->s_inode_size) & (EXT3_BLOCK_SIZE(sb) - 1); - iloc->bh = bh; -- iloc->raw_inode = (struct extN_inode *) (bh->b_data + offset); +- iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset); + iloc->bh = bh[0]; -+ iloc->raw_inode = (struct extN_inode *)(bh[0]->b_data + offset); ++ iloc->raw_inode = (struct ext3_inode *)(bh[0]->b_data + offset); iloc->block_group = block_group; - + @@ -211,23 +214,24 @@ diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c return -EIO; } -+int extN_get_inode_loc(struct inode *inode, struct extN_iloc *iloc) ++int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) +{ -+ return extN_get_inode_loc_new(inode, iloc, 0); ++ return ext3_get_inode_loc_new(inode, iloc, 0); +} + - void extN_read_inode(struct inode * inode) + void ext3_read_inode(struct inode * inode) { - struct extN_iloc iloc; -diff -ru include/linux/extN_fs.h.orig include/linux/extN_fs.h ---- 
lustre/include/linux/extN_fs.h.orig Sat Mar 8 01:23:09 2003 -+++ lustre/include/linux/extN_fs.h Sat Mar 8 01:24:31 2003 -@@ -642,6 +646,8 @@ - extern struct buffer_head * extN_getblk (handle_t *, struct inode *, long, int, int *); - extern struct buffer_head * extN_bread (handle_t *, struct inode *, int, int, int *); + struct ext3_iloc iloc; +--- linux-2.4.18-chaos52/include/linux/ext3_fs.h~extN-noread 2003-05-16 12:26:29.000000000 +0800 ++++ linux-2.4.18-chaos52-root/include/linux/ext3_fs.h 2003-05-16 12:26:31.000000000 +0800 +@@ -640,6 +640,8 @@ extern int ext3_forget(handle_t *, int, + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); -+extern int extN_itable_block_used(struct super_block *sb, unsigned int, int); -+extern int extN_get_inode_loc_new(struct inode *, struct extN_iloc *, int); - extern int extN_get_inode_loc (struct inode *, struct extN_iloc *); - extern void extN_read_inode (struct inode *); - extern void extN_write_inode (struct inode *, int); ++extern int ext3_itable_block_used(struct super_block *sb, unsigned int, int); ++extern int ext3_get_inode_loc_new(struct inode *, struct ext3_iloc *, int); + extern int ext3_get_inode_loc (struct inode *, struct ext3_iloc *); + extern void ext3_read_inode (struct inode *); + extern void ext3_write_inode (struct inode *, int); + +_ diff --git a/lustre/kernel_patches/patches/extN-san.patch b/lustre/kernel_patches/patches/extN-san.patch new file mode 100644 index 0000000..d58fe8c --- /dev/null +++ b/lustre/kernel_patches/patches/extN-san.patch @@ -0,0 +1,106 @@ + fs/ext3/inode.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/super.c | 4 ++ + 2 files changed, 85 insertions(+) + +--- linux-2.4.18-18.8.0-l18/fs/ext3/inode.c~extN-san Sun May 18 12:58:13 2003 ++++ linux-2.4.18-18.8.0-l18-phil/fs/ext3/inode.c Sun May 18 13:24:49 2003 +@@ -2781,3 +2781,84 @@ int 
ext3_change_inode_journal_flag(struc + * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we + * need to extend" test in ext3_prepare_write() succeeds. + */ ++ ++/* for each block: 1 ind + 1 dind + 1 tind ++ * for each block: 3 bitmap blocks ++ * for each block: 3 group descriptor blocks ++ * i inode block ++ * 1 superblock ++ * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files ++ * ((1+1+1) * 3 * nblocks) + 1 + 1 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS ++ * ++ * XXX assuming: ++ * (1) fs logic block size == page size ++ * (2) ext3 in writeback mode ++ */ ++static inline int ext3_san_write_trans_blocks(int nblocks) ++{ ++ int ret; ++ ++ ret = (1 + 1 + 1) * 3 * nblocks + 1 + 1; ++ ++#ifdef CONFIG_QUOTA ++ ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ ++ return ret; ++} ++ ++/* Alloc blocks for an inode, while don't create any buffer/page ++ * for data I/O; set the inode size if file is extended. ++ * ++ * @inode: target inode ++ * @blocks: array of logic block number ++ * @nblocks: how many blocks need be alloced ++ * @newsize: new filesize we should set ++ * ++ * return: 0 success, otherwise failed ++ * (*blocks) contains physical block number alloced ++ * ++ * XXX this assume the fs block size == page size ++ */ ++int ext3_prep_san_write(struct inode *inode, long *blocks, ++ int nblocks, loff_t newsize) ++{ ++ handle_t *handle; ++ struct buffer_head bh_tmp; ++ int needed_blocks; ++ int i, ret = 0, ret2; ++ ++ needed_blocks = ext3_san_write_trans_blocks(nblocks); ++ ++ lock_kernel(); ++ handle = ext3_journal_start(inode, needed_blocks); ++ if (IS_ERR(handle)) { ++ unlock_kernel(); ++ return PTR_ERR(handle); ++ } ++ unlock_kernel(); ++ ++ /* alloc blocks one by one */ ++ for (i = 0; i < nblocks; i++) { ++ ret = ext3_get_block_handle(handle, inode, blocks[i], ++ &bh_tmp, 1); ++ if (ret) ++ break; ++ ++ blocks[i] = bh_tmp.b_blocknr; ++ } ++ ++ /* set inode size if needed */ ++ if (!ret && (newsize > inode->i_size)) { ++ inode->i_size = 
newsize; ++ ext3_mark_inode_dirty(handle, inode); ++ } ++ ++ lock_kernel(); ++ ret2 = ext3_journal_stop(handle, inode); ++ unlock_kernel(); ++ ++ if (!ret) ++ ret = ret2; ++ return ret; ++} +--- linux-2.4.18-18.8.0-l18/fs/ext3/super.c~extN-san Sun May 18 13:24:35 2003 ++++ linux-2.4.18-18.8.0-l18-phil/fs/ext3/super.c Sun May 18 13:24:55 2003 +@@ -1774,6 +1774,10 @@ static int __init init_ext3_fs(void) + + EXPORT_SYMBOL(ext3_bread); + ++int ext3_prep_san_write(struct inode *inode, long *blocks, ++ int nblocks, loff_t newsize); ++EXPORT_SYMBOL(ext3_prep_san_write); ++ + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); + MODULE_LICENSE("GPL"); + +_ diff --git a/lustre/kernel_patches/patches/extN-wantedi.patch b/lustre/kernel_patches/patches/extN-wantedi.patch new file mode 100644 index 0000000..fc74c6b --- /dev/null +++ b/lustre/kernel_patches/patches/extN-wantedi.patch @@ -0,0 +1,171 @@ + fs/ext3/ialloc.c | 38 ++++++++++++++++++++++++++++++++++++-- + fs/ext3/ioctl.c | 25 +++++++++++++++++++++++++ + fs/ext3/namei.c | 12 ++++++++---- + include/linux/ext3_fs.h | 5 ++++- + 4 files changed, 73 insertions(+), 7 deletions(-) + +--- linux-2.4.20/fs/ext3/namei.c~extN-wantedi 2003-04-08 23:35:55.000000000 -0600 ++++ linux-2.4.20-braam/fs/ext3/namei.c 2003-04-08 23:35:55.000000000 -0600 +@@ -1555,7 +1555,8 @@ static int ext3_create (struct inode * d + if (IS_SYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, mode); ++ inode = ext3_new_inode (handle, dir, mode, ++ (unsigned long)dentry->d_fsdata); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; +@@ -1583,7 +1584,8 @@ static int ext3_mknod (struct inode * di + if (IS_SYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, mode); ++ inode = ext3_new_inode (handle, dir, mode, ++ (unsigned long)dentry->d_fsdata); + 
err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, mode, rdev); +@@ -1613,7 +1615,8 @@ static int ext3_mkdir(struct inode * dir + if (IS_SYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFDIR | mode); ++ inode = ext3_new_inode (handle, dir, S_IFDIR | mode, ++ (unsigned long)dentry->d_fsdata); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -2009,7 +2012,8 @@ static int ext3_symlink (struct inode * + if (IS_SYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); ++ inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO, ++ (unsigned long)dentry->d_fsdata); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +--- linux-2.4.20/fs/ext3/ialloc.c~extN-wantedi 2003-04-08 23:35:55.000000000 -0600 ++++ linux-2.4.20-braam/fs/ext3/ialloc.c 2003-04-08 23:35:55.000000000 -0600 +@@ -299,7 +299,8 @@ error_return: + * group to find a free inode. + */ + struct inode * ext3_new_inode (handle_t *handle, +- const struct inode * dir, int mode) ++ const struct inode * dir, int mode, ++ unsigned long goal) + { + struct super_block * sb; + struct buffer_head * bh; +@@ -323,7 +324,39 @@ struct inode * ext3_new_inode (handle_t + init_rwsem(&inode->u.ext3_i.truncate_sem); + + lock_super (sb); +- es = sb->u.ext3_sb.s_es; ++ es = EXT3_SB(sb)->s_es; ++ ++ if (goal) { ++ i = (goal - 1) / EXT3_INODES_PER_GROUP(sb); ++ j = (goal - 1) % EXT3_INODES_PER_GROUP(sb); ++ gdp = ext3_get_group_desc(sb, i, &bh2); ++ ++ bitmap_nr = load_inode_bitmap (sb, i); ++ if (bitmap_nr < 0) ++ goto fail; ++ ++ bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr]; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = ext3_journal_get_write_access(handle, bh); ++ if (err) goto fail; ++ ++ if (ext3_set_bit(j, bh->b_data)) { ++ printk(KERN_ERR "goal inode %lu unavailable\n", goal); ++ /* Oh well, we tried. 
*/ ++ goto repeat; ++ } ++ ++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); ++ err = ext3_journal_dirty_metadata(handle, bh); ++ if (err) goto fail; ++ ++ /* We've shortcircuited the allocation system successfully, ++ * now finish filling in the inode. ++ */ ++ goto have_bit_and_group; ++ } ++ + repeat: + gdp = NULL; + i = 0; +@@ -438,6 +471,7 @@ repeat: + } + goto repeat; + } ++ have_bit_and_group: + j += i * EXT3_INODES_PER_GROUP(sb) + 1; + if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_new_inode", +--- linux-2.4.20/fs/ext3/ioctl.c~extN-wantedi 2003-04-08 23:35:55.000000000 -0600 ++++ linux-2.4.20-braam/fs/ext3/ioctl.c 2003-04-08 23:35:55.000000000 -0600 +@@ -23,6 +23,31 @@ int ext3_ioctl (struct inode * inode, st + ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { ++ case EXT3_IOC_CREATE_INUM: { ++ char name[32]; ++ struct dentry *dchild, *dparent; ++ int rc = 0; ++ ++ dparent = list_entry(inode->i_dentry.next, struct dentry, ++ d_alias); ++ snprintf(name, sizeof name, "%lu", arg); ++ dchild = lookup_one_len(name, dparent, strlen(name)); ++ if (dchild->d_inode) { ++ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n", ++ dparent->d_name.len, dparent->d_name.name, arg, ++ dchild->d_inode->i_ino); ++ rc = -EEXIST; ++ } else { ++ dchild->d_fsdata = (void *)arg; ++ rc = vfs_create(inode, dchild, 0644); ++ if (rc) ++ printk(KERN_ERR "vfs_create: %d\n", rc); ++ else if (dchild->d_inode->i_ino != arg) ++ rc = -EEXIST; ++ } ++ dput(dchild); ++ return rc; ++ } + case EXT3_IOC_GETFLAGS: + flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE; + return put_user(flags, (int *) arg); +--- linux-2.4.20/include/linux/ext3_fs.h~extN-wantedi 2003-04-08 23:35:55.000000000 -0600 ++++ linux-2.4.20-braam/include/linux/ext3_fs.h 2003-04-08 23:35:55.000000000 -0600 +@@ -201,6 +201,7 @@ struct ext3_group_desc + #define EXT3_IOC_SETFLAGS _IOW('f', 2, long) + #define EXT3_IOC_GETVERSION _IOR('f', 3, long) + #define 
EXT3_IOC_SETVERSION _IOW('f', 4, long) ++/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */ + #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) + #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) + #ifdef CONFIG_JBD_DEBUG +@@ -671,7 +672,8 @@ extern int ext3fs_dirhash(const char *na + dx_hash_info *hinfo); + + /* ialloc.c */ +-extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int); ++extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int, ++ unsigned long); + extern void ext3_free_inode (handle_t *, struct inode *); + extern struct inode * ext3_orphan_get (struct super_block *, unsigned long); + extern unsigned long ext3_count_free_inodes (struct super_block *); +@@ -757,4 +759,5 @@ extern struct inode_operations ext3_fast + + #endif /* __KERNEL__ */ + ++#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long) + #endif /* _LINUX_EXT3_FS_H */ + +_ diff --git a/lustre/extN/htree-ext3-2.4.18.diff b/lustre/kernel_patches/patches/htree-ext3-2.4.18.patch similarity index 99% rename from lustre/extN/htree-ext3-2.4.18.diff rename to lustre/kernel_patches/patches/htree-ext3-2.4.18.patch index 4251251..a54e9ca 100644 --- a/lustre/extN/htree-ext3-2.4.18.diff +++ b/lustre/kernel_patches/patches/htree-ext3-2.4.18.patch @@ -13,7 +13,7 @@ else if (!strcmp (this_char, "debug")) set_opt (*mount_options, DEBUG); else if (!strcmp (this_char, "errors")) { -@@ -702,6 +708,12 @@ static int ext3_setup_super(struct super +@@ -702,6 +708,12 @@ es->s_mtime = cpu_to_le32(CURRENT_TIME); ext3_update_dynamic_rev(sb); EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); diff --git a/lustre/kernel_patches/patches/invalidate_show-2.4.20-rh.patch b/lustre/kernel_patches/patches/invalidate_show-2.4.20-rh.patch new file mode 100644 index 0000000..6e7d920 --- /dev/null +++ b/lustre/kernel_patches/patches/invalidate_show-2.4.20-rh.patch @@ -0,0 +1,114 @@ + fs/inode.c | 23 +++++++++++++++-------- + fs/smbfs/inode.c | 2 +- + fs/super.c 
| 4 ++-- + include/linux/fs.h | 2 +- + 4 files changed, 19 insertions(+), 12 deletions(-) + +--- kernel-2.4.20/fs/inode.c~invalidate_show-2.4.20-rh 2003-05-24 01:56:40.000000000 -0400 ++++ kernel-2.4.20-root/fs/inode.c 2003-06-02 00:35:37.000000000 -0400 +@@ -628,7 +628,8 @@ static void dispose_list(struct list_hea + /* + * Invalidate all inodes for a device. + */ +-static int invalidate_list(struct list_head *head, struct super_block * sb, struct list_head * dispose) ++static int invalidate_list(struct list_head *head, struct super_block * sb, ++ struct list_head * dispose, int show) + { + struct list_head *next; + int busy = 0, count = 0; +@@ -653,6 +654,11 @@ static int invalidate_list(struct list_h + count++; + continue; + } ++ if (show) ++ printk(KERN_ERR ++ "inode busy: dev %s:%lu (%p) mode %o count %u\n", ++ kdevname(sb->s_dev), inode->i_ino, inode, ++ inode->i_mode, atomic_read(&inode->i_count)); + busy = 1; + } + /* only unused inodes may be cached with i_count zero */ +@@ -671,23 +677,24 @@ static int invalidate_list(struct list_h + /** + * invalidate_inodes - discard the inodes on a device + * @sb: superblock ++ * @show: whether we should display any busy inodes found + * + * Discard all of the inodes for a given superblock. If the discard + * fails because there are busy inodes then a non zero value is returned. + * If the discard is successful all the inodes have been discarded. 
+ */ + +-int invalidate_inodes(struct super_block * sb) ++int invalidate_inodes(struct super_block * sb, int show) + { + int busy; + LIST_HEAD(throw_away); + + spin_lock(&inode_lock); +- busy = invalidate_list(&inode_in_use, sb, &throw_away); +- busy |= invalidate_list(&inode_unused, sb, &throw_away); +- busy |= invalidate_list(&inode_unused_pagecache, sb, &throw_away); +- busy |= invalidate_list(&sb->s_dirty, sb, &throw_away); +- busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away); ++ busy = invalidate_list(&inode_in_use, sb, &throw_away, show); ++ busy |= invalidate_list(&inode_unused, sb, &throw_away, show); ++ busy |= invalidate_list(&inode_unused_pagecache, sb, &throw_away, show); ++ busy |= invalidate_list(&sb->s_dirty, sb, &throw_away, show); ++ busy |= invalidate_list(&sb->s_locked_inodes, sb, &throw_away, show); + spin_unlock(&inode_lock); + + dispose_list(&throw_away); +@@ -713,7 +720,7 @@ int invalidate_device(kdev_t dev, int do + * hold). + */ + shrink_dcache_sb(sb); +- res = invalidate_inodes(sb); ++ res = invalidate_inodes(sb, 0); + drop_super(sb); + } + invalidate_buffers(dev); +--- kernel-2.4.20/fs/super.c~invalidate_show-2.4.20-rh 2003-05-24 01:56:24.000000000 -0400 ++++ kernel-2.4.20-root/fs/super.c 2003-06-02 00:35:00.000000000 -0400 +@@ -943,7 +943,7 @@ void kill_super(struct super_block *sb) + lock_super(sb); + lock_kernel(); + sb->s_flags &= ~MS_ACTIVE; +- invalidate_inodes(sb); /* bad name - it should be evict_inodes() */ ++ invalidate_inodes(sb, 0); /* bad name - it should be evict_inodes() */ + if (sop) { + if (sop->write_super && sb->s_dirt) + sop->write_super(sb); +@@ -952,7 +952,7 @@ void kill_super(struct super_block *sb) + } + + /* Forget any remaining inodes */ +- if (invalidate_inodes(sb)) { ++ if (invalidate_inodes(sb, 1)) { + printk(KERN_ERR "VFS: Busy inodes after unmount. " + "Self-destruct in 5 seconds. 
Have a nice day...\n"); + } +--- kernel-2.4.20/include/linux/fs.h~invalidate_show-2.4.20-rh 2003-06-02 00:31:47.000000000 -0400 ++++ kernel-2.4.20-root/include/linux/fs.h 2003-06-02 00:35:00.000000000 -0400 +@@ -1284,7 +1284,7 @@ static inline void mark_buffer_dirty_ino + extern void set_buffer_flushtime(struct buffer_head *); + extern void balance_dirty(void); + extern int check_disk_change(kdev_t); +-extern int invalidate_inodes(struct super_block *); ++extern int invalidate_inodes(struct super_block *, int); + extern int invalidate_device(kdev_t, int); + extern void invalidate_inode_pages(struct inode *); + extern void invalidate_inode_pages2(struct address_space *); +--- kernel-2.4.20/fs/smbfs/inode.c~invalidate_show-2.4.20-rh 2002-11-28 18:53:15.000000000 -0500 ++++ kernel-2.4.20-root/fs/smbfs/inode.c 2003-06-02 00:35:00.000000000 -0400 +@@ -167,7 +167,7 @@ smb_invalidate_inodes(struct smb_sb_info + { + VERBOSE("\n"); + shrink_dcache_sb(SB_of(server)); +- invalidate_inodes(SB_of(server)); ++ invalidate_inodes(SB_of(server), 0); + } + + /* + +_ diff --git a/lustre/kernel_patches/patches/invalidate_show.patch b/lustre/kernel_patches/patches/invalidate_show.patch index c3ae2f5..9273c5c 100644 --- a/lustre/kernel_patches/patches/invalidate_show.patch +++ b/lustre/kernel_patches/patches/invalidate_show.patch @@ -1,6 +1,15 @@ ---- lum/fs/inode.c Sat Oct 19 11:42:42 2002 -+++ linux-2.4.18-uml35-ext3online/fs/inode.c Mon Oct 14 00:41:20 2002 -@@ -606,7 +553,8 @@ static void dispose_list(struct list_hea + + + + fs/inode.c | 21 ++++++++++++++------- + fs/smbfs/inode.c | 2 +- + fs/super.c | 4 ++-- + include/linux/fs.h | 2 +- + 4 files changed, 18 insertions(+), 11 deletions(-) + +--- linux-rh-2.4.20-8/fs/inode.c~invalidate_show 2003-04-11 14:04:56.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/inode.c 2003-04-16 20:59:35.000000000 +0800 +@@ -604,7 +604,8 @@ static void dispose_list(struct list_hea /* * Invalidate all inodes for a device. 
*/ @@ -10,7 +19,7 @@ { struct list_head *next; int busy = 0, count = 0; -@@ -631,6 +579,11 @@ static int invalidate_list(struct list_h +@@ -629,6 +630,11 @@ static int invalidate_list(struct list_h count++; continue; } @@ -22,7 +31,7 @@ busy = 1; } /* only unused inodes may be cached with i_count zero */ -@@ -649,22 +601,23 @@ static int invalidate_list(struct list_h +@@ -647,22 +653,23 @@ static int invalidate_list(struct list_h /** * invalidate_inodes - discard the inodes on a device * @sb: superblock @@ -51,7 +60,7 @@ spin_unlock(&inode_lock); dispose_list(&throw_away); -@@ -690,7 +643,7 @@ int invalidate_device(kdev_t dev, int do +@@ -688,7 +695,7 @@ int invalidate_device(kdev_t dev, int do * hold). */ shrink_dcache_sb(sb); @@ -60,9 +69,9 @@ drop_super(sb); } invalidate_buffers(dev); ---- lum/fs/super.c.orig Sat Oct 19 11:42:42 2002 -+++ lum/fs/super.c Wed Oct 30 17:16:55 2002 -@@ -936,7 +936,7 @@ +--- linux-rh-2.4.20-8/fs/super.c~invalidate_show 2003-04-11 14:04:57.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/super.c 2003-04-16 20:59:35.000000000 +0800 +@@ -943,7 +943,7 @@ void kill_super(struct super_block *sb) lock_super(sb); lock_kernel(); sb->s_flags &= ~MS_ACTIVE; @@ -71,7 +80,7 @@ if (sop) { if (sop->write_super && sb->s_dirt) sop->write_super(sb); -@@ -945,7 +945,7 @@ +@@ -952,7 +952,7 @@ void kill_super(struct super_block *sb) } /* Forget any remaining inodes */ @@ -80,9 +89,9 @@ printk(KERN_ERR "VFS: Busy inodes after unmount. " "Self-destruct in 5 seconds. 
Have a nice day...\n"); } ---- lum/include/linux/fs.h Wed Oct 30 17:10:42 2002 -+++ lum/include/linux/fs.h.orig Tue Oct 22 23:15:00 2002 -@@ -1261,7 +1261,7 @@ +--- linux-rh-2.4.20-8/include/linux/fs.h~invalidate_show 2003-04-16 20:55:35.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/linux/fs.h 2003-04-16 20:59:35.000000000 +0800 +@@ -1283,7 +1283,7 @@ static inline void mark_buffer_dirty_ino extern void set_buffer_flushtime(struct buffer_head *); extern void balance_dirty(void); extern int check_disk_change(kdev_t); @@ -91,9 +100,9 @@ extern int invalidate_device(kdev_t, int); extern void invalidate_inode_pages(struct inode *); extern void invalidate_inode_pages2(struct address_space *); ---- lum/fs/smbfs/inode.c.orig Mon Feb 25 12:38:09 2002 -+++ lum/fs/smbfs/inode.c Thu Feb 6 21:34:26 2003 -@@ -166,7 +166,7 @@ +--- linux-rh-2.4.20-8/fs/smbfs/inode.c~invalidate_show 2003-04-16 20:59:48.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/smbfs/inode.c 2003-04-16 21:00:43.000000000 +0800 +@@ -167,7 +167,7 @@ smb_invalidate_inodes(struct smb_sb_info { VERBOSE("\n"); shrink_dcache_sb(SB_of(server)); @@ -102,3 +111,5 @@ } /* + +_ diff --git a/lustre/kernel_patches/patches/iod-rmap-exports-2.4.20.patch b/lustre/kernel_patches/patches/iod-rmap-exports-2.4.20.patch new file mode 100644 index 0000000..3fdf3fd --- /dev/null +++ b/lustre/kernel_patches/patches/iod-rmap-exports-2.4.20.patch @@ -0,0 +1,86 @@ + fs/Makefile | 4 +++- + fs/inode.c | 4 +++- + mm/Makefile | 2 +- + mm/page_alloc.c | 1 + + mm/vmscan.c | 3 +++ + 5 files changed, 11 insertions(+), 3 deletions(-) + +--- linux-rh-2.4.20-6/fs/inode.c~iod-rmap-exports Tue Apr 1 01:01:56 2003 ++++ linux-rh-2.4.20-6-braam/fs/inode.c Tue Apr 1 01:01:56 2003 +@@ -5,6 +5,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -66,7 +67,8 @@ static LIST_HEAD(anon_hash_chain); /* fo + * NOTE! You also have to own the lock if you change + * the i_state of an inode while it is in use.. 
+ */ +-static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; ++spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; ++EXPORT_SYMBOL(inode_lock); + + /* + * Statistics gathering.. +--- linux-rh-2.4.20-6/fs/Makefile~iod-rmap-exports Tue Apr 1 01:01:56 2003 ++++ linux-rh-2.4.20-6-braam/fs/Makefile Tue Apr 1 01:02:34 2003 +@@ -1,3 +1,5 @@ ++ ++ + # + # Makefile for the Linux filesystems. + # +@@ -7,7 +9,7 @@ + + O_TARGET := fs.o + +-export-objs := filesystems.o open.o dcache.o buffer.o dquot.o dcookies.o ++export-objs := filesystems.o open.o dcache.o buffer.o dquot.o dcookies.o inode.o + mod-subdirs := nls + + obj-y := open.o read_write.o devices.o file_table.o buffer.o \ +--- linux-rh-2.4.20-6/mm/vmscan.c~iod-rmap-exports Tue Apr 1 01:01:56 2003 ++++ linux-rh-2.4.20-6-braam/mm/vmscan.c Tue Apr 1 01:01:56 2003 +@@ -15,6 +15,8 @@ + * O(1) rmap vm, Arjan van de ven + */ + ++#include ++#include + #include + #include + #include +@@ -1061,6 +1063,7 @@ void wakeup_kswapd(unsigned int gfp_mask + set_current_state(TASK_RUNNING); + remove_wait_queue(&kswapd_done, &wait); + } ++EXPORT_SYMBOL(wakeup_kswapd); + + static void wakeup_memwaiters(void) + { +--- linux-rh-2.4.20-6/mm/Makefile~iod-rmap-exports Tue Apr 1 01:01:56 2003 ++++ linux-rh-2.4.20-6-braam/mm/Makefile Tue Apr 1 01:01:56 2003 +@@ -9,7 +9,7 @@ + + O_TARGET := mm.o + +-export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o ++export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o vmscan.o + + obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ + vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ +--- linux-rh-2.4.20-6/mm/page_alloc.c~iod-rmap-exports Tue Apr 1 01:01:56 2003 ++++ linux-rh-2.4.20-6-braam/mm/page_alloc.c Tue Apr 1 01:01:56 2003 +@@ -27,6 +27,7 @@ + + int nr_swap_pages; + pg_data_t *pgdat_list; ++EXPORT_SYMBOL(pgdat_list); + + /* + * + +_ diff --git a/lustre/kernel_patches/patches/iod-rmap-exports.patch b/lustre/kernel_patches/patches/iod-rmap-exports.patch index 8df0d82..5ba68dd 
100644 --- a/lustre/kernel_patches/patches/iod-rmap-exports.patch +++ b/lustre/kernel_patches/patches/iod-rmap-exports.patch @@ -1,5 +1,12 @@ ---- linux/fs/inode.c.b_io 2003-02-18 16:39:16.000000000 -0800 -+++ linux/fs/inode.c 2003-02-18 16:39:45.000000000 -0800 + fs/Makefile | 4 +++- + fs/inode.c | 4 +++- + mm/Makefile | 2 +- + mm/page_alloc.c | 1 + + mm/vmscan.c | 3 +++ + 5 files changed, 11 insertions(+), 3 deletions(-) + +--- linux-2.4.18-18/fs/inode.c~iod-rmap-exports Thu Apr 3 00:40:01 2003 ++++ linux-2.4.18-18-braam/fs/inode.c Thu Apr 3 00:40:01 2003 @@ -5,6 +5,7 @@ */ @@ -8,7 +15,7 @@ #include #include #include -@@ -66,7 +67,8 @@ +@@ -66,7 +67,8 @@ static LIST_HEAD(anon_hash_chain); /* fo * NOTE! You also have to own the lock if you change * the i_state of an inode while it is in use.. */ @@ -18,9 +25,15 @@ /* * Statistics gathering.. ---- linux/fs/Makefile.b_io 2003-02-18 16:39:16.000000000 -0800 -+++ linux/fs/Makefile 2003-02-18 16:39:37.000000000 -0800 -@@ -7,7 +7,7 @@ +--- linux-2.4.18-18/fs/Makefile~iod-rmap-exports Thu Apr 3 00:40:01 2003 ++++ linux-2.4.18-18-braam/fs/Makefile Thu Apr 3 00:40:29 2003 +@@ -1,3 +1,5 @@ ++ ++ + # + # Makefile for the Linux filesystems. + # +@@ -7,7 +9,7 @@ O_TARGET := fs.o @@ -29,8 +42,8 @@ mod-subdirs := nls obj-y := open.o read_write.o devices.o file_table.o buffer.o \ ---- linux/mm/vmscan.c.b_io 2003-02-18 16:39:16.000000000 -0800 -+++ linux/mm/vmscan.c 2003-02-18 16:40:01.000000000 -0800 +--- linux-2.4.18-18/mm/vmscan.c~iod-rmap-exports Thu Apr 3 00:40:01 2003 ++++ linux-2.4.18-18-braam/mm/vmscan.c Thu Apr 3 00:40:01 2003 @@ -14,6 +14,8 @@ * Multiqueue VM started 5.8.00, Rik van Riel. 
*/ @@ -40,7 +53,7 @@ #include #include #include -@@ -837,6 +839,7 @@ +@@ -837,6 +839,7 @@ void wakeup_kswapd(unsigned int gfp_mask set_current_state(TASK_RUNNING); remove_wait_queue(&kswapd_done, &wait); } @@ -48,8 +61,8 @@ static void wakeup_memwaiters(void) { ---- linux/mm/Makefile.b_io 2003-02-18 16:39:16.000000000 -0800 -+++ linux/mm/Makefile 2003-02-18 16:39:37.000000000 -0800 +--- linux-2.4.18-18/mm/Makefile~iod-rmap-exports Thu Apr 3 00:40:01 2003 ++++ linux-2.4.18-18-braam/mm/Makefile Thu Apr 3 00:40:01 2003 @@ -9,7 +9,7 @@ O_TARGET := mm.o @@ -59,9 +72,9 @@ obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ ---- linux-chaos/mm/page_alloc.c.b_io_export Wed Jan 29 17:00:32 2003 -+++ linux-chaos/mm/page_alloc.c Wed Jan 29 17:01:31 2003 -@@ -31,6 +31,7 @@ +--- linux-2.4.18-18/mm/page_alloc.c~iod-rmap-exports Thu Apr 3 00:40:01 2003 ++++ linux-2.4.18-18-braam/mm/page_alloc.c Thu Apr 3 00:40:01 2003 +@@ -31,6 +31,7 @@ int nr_active_pages; int nr_inactive_dirty_pages; int nr_inactive_clean_pages; pg_data_t *pgdat_list; @@ -69,3 +82,5 @@ /* * The zone_table array is used to look up the address of the + +_ diff --git a/lustre/kernel_patches/patches/iod-stock-24-exports.patch b/lustre/kernel_patches/patches/iod-stock-24-exports.patch new file mode 100644 index 0000000..2070377 --- /dev/null +++ b/lustre/kernel_patches/patches/iod-stock-24-exports.patch @@ -0,0 +1,48 @@ + fs/Makefile | 2 +- + fs/inode.c | 4 +++- + mm/page_alloc.c | 1 + + 3 files changed, 5 insertions(+), 2 deletions(-) + +--- linux-2.4.20/fs/inode.c~iod-stock-24-exports Wed Apr 2 23:21:20 2003 ++++ linux-2.4.20-braam/fs/inode.c Wed Apr 2 23:21:20 2003 +@@ -5,6 +5,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -66,7 +67,8 @@ static LIST_HEAD(anon_hash_chain); /* fo + * NOTE! You also have to own the lock if you change + * the i_state of an inode while it is in use.. 
+ */ +-static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; ++spinlock_t inode_lock = SPIN_LOCK_UNLOCKED; ++EXPORT_SYMBOL(inode_lock); + + /* + * Statistics gathering.. +--- linux-2.4.20/fs/Makefile~iod-stock-24-exports Wed Apr 2 23:21:20 2003 ++++ linux-2.4.20-braam/fs/Makefile Wed Apr 2 23:21:53 2003 +@@ -7,7 +7,7 @@ + + O_TARGET := fs.o + +-export-objs := filesystems.o open.o dcache.o buffer.o ++export-objs := filesystems.o open.o dcache.o buffer.o inode.o + mod-subdirs := nls + + obj-y := open.o read_write.o devices.o file_table.o buffer.o \ +--- linux-2.4.20/mm/page_alloc.c~iod-stock-24-exports Wed Apr 2 23:21:20 2003 ++++ linux-2.4.20-braam/mm/page_alloc.c Wed Apr 2 23:21:20 2003 +@@ -28,6 +28,7 @@ int nr_inactive_pages; + LIST_HEAD(inactive_list); + LIST_HEAD(active_list); + pg_data_t *pgdat_list; ++EXPORT_SYMBOL(pgdat_list); + + /* + * + +_ diff --git a/lustre/kernel_patches/patches/iod-stock-24-exports_hp.patch b/lustre/kernel_patches/patches/iod-stock-24-exports_hp.patch index 669b44d..3035f55 100644 --- a/lustre/kernel_patches/patches/iod-stock-24-exports_hp.patch +++ b/lustre/kernel_patches/patches/iod-stock-24-exports_hp.patch @@ -1,5 +1,10 @@ ---- linux-2.4.19-hp2_pnnl4_Lv13/fs/inode.c.iod-export 2003-02-27 14:28:04.000000000 -0800 -+++ linux-2.4.19-hp2_pnnl4_Lv13/fs/inode.c 2003-03-03 13:54:59.000000000 -0800 + fs/Makefile | 2 +- + fs/inode.c | 4 +++- + mm/page_alloc.c | 1 + + 3 files changed, 5 insertions(+), 2 deletions(-) + +--- linux/fs/inode.c~iod-stock-24-exports_hp Wed Apr 9 10:44:54 2003 ++++ linux-mmonroe/fs/inode.c Wed Apr 9 10:49:50 2003 @@ -5,6 +5,7 @@ */ @@ -8,7 +13,7 @@ #include #include #include -@@ -66,7 +67,8 @@ +@@ -66,7 +67,8 @@ static LIST_HEAD(anon_hash_chain); /* fo * NOTE! You also have to own the lock if you change * the i_state of an inode while it is in use.. */ @@ -18,8 +23,8 @@ /* * Statistics gathering.. 
---- linux-2.4.19-hp2_pnnl4_Lv13/fs/Makefile.iod-export 2003-02-27 14:28:01.000000000 -0800 -+++ linux-2.4.19-hp2_pnnl4_Lv13/fs/Makefile 2003-03-03 13:56:11.000000000 -0800 +--- linux/fs/Makefile~iod-stock-24-exports_hp Wed Apr 9 10:26:08 2003 ++++ linux-mmonroe/fs/Makefile Wed Apr 9 10:49:50 2003 @@ -7,7 +7,7 @@ O_TARGET := fs.o @@ -29,13 +34,15 @@ mod-subdirs := nls xfs obj-y := open.o read_write.o devices.o file_table.o buffer.o \ ---- linux-2.4.19-hp2_pnnl4_Lv13/mm/page_alloc.c.iod-export 2003-02-27 14:28:01.000000000 -0800 -+++ linux-2.4.19-hp2_pnnl4_Lv13/mm/page_alloc.c 2003-03-03 13:54:59.000000000 -0800 -@@ -28,6 +28,7 @@ +--- linux/mm/page_alloc.c~iod-stock-24-exports_hp Wed Apr 9 10:26:14 2003 ++++ linux-mmonroe/mm/page_alloc.c Wed Apr 9 10:49:50 2003 +@@ -28,6 +28,7 @@ int nr_inactive_pages; LIST_HEAD(inactive_list); LIST_HEAD(active_list); pg_data_t *pgdat_list; +EXPORT_SYMBOL(pgdat_list); - /* Used to look up the address of the struct zone encoded in page->zone */ - zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES]; + /* + * + +_ diff --git a/lustre/kernel_patches/patches/iopen-2.4.18.patch b/lustre/kernel_patches/patches/iopen-2.4.18.patch new file mode 100644 index 0000000..d8dbdfb --- /dev/null +++ b/lustre/kernel_patches/patches/iopen-2.4.18.patch @@ -0,0 +1,414 @@ + 0 files changed + +--- linux-2.4.18-chaos52/Documentation/filesystems/ext2.txt~iopen-2.4.18 2003-04-13 15:21:33.000000000 +0800 ++++ linux-2.4.18-chaos52-root/Documentation/filesystems/ext2.txt 2003-06-03 17:10:55.000000000 +0800 +@@ -35,6 +35,22 @@ resgid=n The group ID which may use th + + sb=n Use alternate superblock at this location. + ++iopen Makes an invisible pseudo-directory called ++ __iopen__ available in the root directory ++ of the filesystem. Allows open-by-inode- ++ number. i.e., inode 3145 can be accessed ++ via /mntpt/__iopen__/3145 ++ ++iopen_nopriv This option makes the iopen directory be ++ world-readable. 
This may be safer since it ++ allows daemons to run as an unprivileged user, ++ however it significantly changes the security ++ model of a Unix filesystem, since previously ++ all files under a mode 700 directory were not ++ generally available even if the ++ permissions on the file itself are ++ world-readable. ++ + grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. + + +--- linux-2.4.18-chaos52/fs/ext3/Makefile~iopen-2.4.18 2003-06-01 03:24:07.000000000 +0800 ++++ linux-2.4.18-chaos52-root/fs/ext3/Makefile 2003-06-03 17:10:55.000000000 +0800 +@@ -11,7 +11,7 @@ O_TARGET := ext3.o + + export-objs := super.o inode.o xattr.o + +-obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o xattr.o + obj-m := $(O_TARGET) + +--- linux-2.4.18-chaos52/fs/ext3/inode.c~iopen-2.4.18 2003-06-03 17:10:21.000000000 +0800 ++++ linux-2.4.18-chaos52-root/fs/ext3/inode.c 2003-06-03 17:10:55.000000000 +0800 +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include "iopen.h" + + /* + * SEARCH_FROM_ZERO forces each block allocation to search from the start +@@ -2135,6 +2136,9 @@ void ext3_read_inode(struct inode * inod + struct buffer_head *bh; + int block; + ++ if (ext3_iopen_get_inode(inode)) ++ return; ++ + if(ext3_get_inode_loc(inode, &iloc)) + goto bad_inode; + bh = iloc.bh; +--- /dev/null 2002-08-31 07:31:37.000000000 +0800 ++++ linux-2.4.18-chaos52-root/fs/ext3/iopen.c 2003-06-03 17:10:55.000000000 +0800 +@@ -0,0 +1,259 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. 
++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. ++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. 
++ */ ++static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT3_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, 0, 0); ++ ++ if ((ino != EXT3_ROOT_INO && ++ //ino != EXT3_ACL_IDX_INO && ++ //ino != EXT3_ACL_DATA_INO && ++ ino < EXT3_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = iget(dir->i_sb, ino); ++ if (!inode) ++ return ERR_PTR(-EACCES); ++ if (is_bad_inode(inode)) { ++ iput(inode); ++ return ERR_PTR(-ENOENT); ++ } ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ alternate->d_vfs_flags |= DCACHE_REFERENCED; ++ iput(inode); ++ spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; ++ spin_unlock(&dcache_lock); ++ ++ d_add(dentry, inode); ++ return NULL; ++} ++ ++#define do_switch(x,y) do { \ ++ __typeof__ (x) __tmp = x; \ ++ x = y; y = __tmp; } while (0) ++ ++static inline void switch_names(struct dentry *dentry, struct dentry *target) ++{ ++ const unsigned char *old_name, *new_name; ++ ++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); ++ old_name = target->d_name.name; ++ new_name = dentry->d_name.name; ++ if (old_name == target->d_iname) ++ old_name = 
dentry->d_iname; ++ if (new_name == dentry->d_iname) ++ new_name = target->d_iname; ++ target->d_name.name = new_name; ++ dentry->d_name.name = old_name; ++} ++ ++/* This function is spliced into ext3_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. ++ */ ++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* preferrably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ /* verify this dentry is really new */ ++ assert(!de->d_inode); ++ assert(list_empty(&de->d_subdirs)); ++ assert(list_empty(&de->d_alias)); ++ ++ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) { ++ spin_unlock(&dcache_lock); ++ return NULL; ++ } ++ ++ /* Move the goal to the de hash queue - like d_move() */ ++ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; ++ list_del(&goal->d_hash); ++ list_add(&goal->d_hash, &de->d_hash); ++ ++ list_del(&goal->d_child); ++ list_del(&de->d_child); ++ ++ /* Switch the parents and the names.. */ ++ switch_names(goal, de); ++ do_switch(goal->d_parent, de->d_parent); ++ do_switch(goal->d_name.len, de->d_name.len); ++ do_switch(goal->d_name.hash, de->d_name.hash); ++ ++ /* And add them back to the (new) parent lists */ ++ list_add(&goal->d_child, &goal->d_parent->d_subdirs); ++ list_add(&de->d_child, &de->d_parent->d_subdirs); ++ spin_unlock(&dcache_lock); ++ ++ return goal; ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. 
++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ lookup: iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ read: generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext3_lookup and returns 1 if the file ++ * name is __iopen__ and dentry has been filled in appropriately. ++ */ ++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (dir->i_ino != EXT3_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = iget(dir->i_sb, EXT3_BAD_INO); ++ ++ if (!inode) ++ return 0; ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into read_inode; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this function returns 0. 
++ */ ++int ext3_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ inode->u.ext3_i.i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +--- /dev/null 2002-08-31 07:31:37.000000000 +0800 ++++ linux-2.4.18-chaos52-root/fs/ext3/iopen.h 2003-06-03 17:10:55.000000000 +0800 +@@ -0,0 +1,13 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ */ ++ ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); +--- linux-2.4.18-chaos52/fs/ext3/namei.c~iopen-2.4.18 2003-06-03 17:10:20.000000000 +0800 ++++ linux-2.4.18-chaos52-root/fs/ext3/namei.c 2003-06-03 17:10:55.000000000 +0800 +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include "iopen.h" + + /* + * define how far ahead to read directories while searching them. 
+@@ -703,16 +704,21 @@ cleanup_and_exit: + brelse (bh_use[ra_ptr]); + return ret; + } ++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode); + + static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) + { + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; ++ struct dentry *alternate = NULL; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { +@@ -723,6 +729,12 @@ static struct dentry *ext3_lookup(struct + if (!inode) + return ERR_PTR(-EACCES); + } ++ ++ if (inode && (alternate = iopen_connect_dentry(dentry, inode))) { ++ iput(inode); ++ return alternate; ++ } ++ + d_add(dentry, inode); + return NULL; + } +--- linux-2.4.18-chaos52/fs/ext3/super.c~iopen-2.4.18 2003-06-03 17:10:21.000000000 +0800 ++++ linux-2.4.18-chaos52-root/fs/ext3/super.c 2003-06-03 17:10:55.000000000 +0800 +@@ -820,6 +820,17 @@ static int parse_options (char * options + || !strcmp (this_char, "quota") + || !strcmp (this_char, "usrquota")) + /* Don't do anything ;-) */ ; ++ else if (!strcmp (this_char, "iopen")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } else if (!strcmp (this_char, "noiopen")) { ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } ++ else if (!strcmp (this_char, "iopen_nopriv")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } + else if (!strcmp (this_char, "journal")) { + /* @@@ FIXME */ + /* Eventually we will want to be able to create +--- linux-2.4.18-chaos52/include/linux/ext3_fs.h~iopen-2.4.18 2003-06-03 17:10:22.000000000 +0800 ++++ linux-2.4.18-chaos52-root/include/linux/ext3_fs.h 2003-06-03 17:12:08.000000000 +0800 +@@ -321,6 +321,8 @@ struct ext3_inode { + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal 
format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + #define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */ ++#define EXT3_MOUNT_IOPEN 0x8000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x10000 /* Make iopen world-readable */ + #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + +_ diff --git a/lustre/kernel_patches/patches/iopen-2.4.20.patch b/lustre/kernel_patches/patches/iopen-2.4.20.patch new file mode 100644 index 0000000..3038cc87 --- /dev/null +++ b/lustre/kernel_patches/patches/iopen-2.4.20.patch @@ -0,0 +1,423 @@ + Documentation/filesystems/ext2.txt | 16 ++ + fs/ext3/Makefile | 2 + fs/ext3/inode.c | 4 + fs/ext3/iopen.c | 240 +++++++++++++++++++++++++++++++++++++ + fs/ext3/iopen.h | 15 ++ + fs/ext3/namei.c | 13 +- + fs/ext3/super.c | 11 + + include/linux/ext3_fs.h | 2 + 8 files changed, 301 insertions(+), 2 deletions(-) + +--- linux-2.4.20/Documentation/filesystems/ext2.txt~iopen 2001-07-11 16:44:45.000000000 -0600 ++++ linux-2.4.20-braam/Documentation/filesystems/ext2.txt 2003-05-17 14:06:00.000000000 -0600 +@@ -35,6 +35,22 @@ resgid=n The group ID which may use th + + sb=n Use alternate superblock at this location. + ++iopen Makes an invisible pseudo-directory called ++ __iopen__ available in the root directory ++ of the filesystem. Allows open-by-inode- ++ number. i.e., inode 3145 can be accessed ++ via /mntpt/__iopen__/3145 ++ ++iopen_nopriv This option makes the iopen directory be ++ world-readable. This may be safer since it ++ allows daemons to run as an unprivileged user, ++ however it significantly changes the security ++ model of a Unix filesystem, since previously ++ all files under a mode 700 directory were not ++ generally available even if the ++ permissions on the file itself are ++ world-readable. ++ + grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. 
+ + +--- linux-2.4.20/fs/ext3/Makefile~iopen 2003-05-17 14:05:57.000000000 -0600 ++++ linux-2.4.20-braam/fs/ext3/Makefile 2003-05-17 14:06:00.000000000 -0600 +@@ -11,7 +11,7 @@ O_TARGET := ext3.o + + export-objs := ext3-exports.o + +-obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o + obj-m := $(O_TARGET) + +--- linux-2.4.20/fs/ext3/inode.c~iopen 2003-05-17 14:06:00.000000000 -0600 ++++ linux-2.4.20-braam/fs/ext3/inode.c 2003-05-17 14:06:00.000000000 -0600 +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include "iopen.h" + + /* + * SEARCH_FROM_ZERO forces each block allocation to search from the start +@@ -2137,6 +2138,9 @@ void ext3_read_inode(struct inode * inod + struct buffer_head *bh; + int block; + ++ if (ext3_iopen_get_inode(inode)) ++ return; ++ + if(ext3_get_inode_loc(inode, &iloc)) + goto bad_inode; + bh = iloc.bh; +--- /dev/null 2003-01-30 03:24:37.000000000 -0700 ++++ linux-2.4.20-braam/fs/ext3/iopen.c 2003-05-17 22:18:55.000000000 -0600 +@@ -0,0 +1,259 @@ ++/* ++ * linux/fs/ext3/iopen.c ++ * ++ * Special support for open by inode number ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. 
++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "iopen.h" ++ ++#ifndef assert ++#define assert(test) J_ASSERT(test) ++#endif ++ ++#define IOPEN_NAME_LEN 32 ++ ++/* ++ * This implements looking up an inode by number. ++ */ ++static struct dentry *iopen_lookup(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ unsigned long ino; ++ struct list_head *lp; ++ struct dentry *alternate; ++ char buf[IOPEN_NAME_LEN]; ++ ++ if (dentry->d_name.len >= IOPEN_NAME_LEN) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ memcpy(buf, dentry->d_name.name, dentry->d_name.len); ++ buf[dentry->d_name.len] = 0; ++ ++ if (strcmp(buf, ".") == 0) ++ ino = dir->i_ino; ++ else if (strcmp(buf, "..") == 0) ++ ino = EXT3_ROOT_INO; ++ else ++ ino = simple_strtoul(buf, 0, 0); ++ ++ if ((ino != EXT3_ROOT_INO && ++ //ino != EXT3_ACL_IDX_INO && ++ //ino != EXT3_ACL_DATA_INO && ++ ino < EXT3_FIRST_INO(dir->i_sb)) || ++ ino > le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) ++ return ERR_PTR(-ENOENT); ++ ++ inode = iget(dir->i_sb, ino); ++ if (!inode) ++ return ERR_PTR(-EACCES); ++ if (is_bad_inode(inode)) { ++ iput(inode); ++ return ERR_PTR(-ENOENT); ++ } ++ ++ /* preferably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ list_for_each(lp, &inode->i_dentry) { ++ alternate = list_entry(lp, struct dentry, d_alias); ++ assert(!(alternate->d_flags & DCACHE_NFSD_DISCONNECTED)); ++ } ++ ++ if (!list_empty(&inode->i_dentry)) { ++ alternate = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ dget_locked(alternate); ++ alternate->d_vfs_flags |= DCACHE_REFERENCED; ++ iput(inode); ++ 
spin_unlock(&dcache_lock); ++ return alternate; ++ } ++ dentry->d_flags |= DCACHE_NFSD_DISCONNECTED; ++ spin_unlock(&dcache_lock); ++ ++ d_add(dentry, inode); ++ return NULL; ++} ++ ++#define do_switch(x,y) do { \ ++ __typeof__ (x) __tmp = x; \ ++ x = y; y = __tmp; } while (0) ++ ++static inline void switch_names(struct dentry *dentry, struct dentry *target) ++{ ++ const unsigned char *old_name, *new_name; ++ ++ memcpy(dentry->d_iname, target->d_iname, DNAME_INLINE_LEN); ++ old_name = target->d_name.name; ++ new_name = dentry->d_name.name; ++ if (old_name == target->d_iname) ++ old_name = dentry->d_iname; ++ if (new_name == dentry->d_iname) ++ new_name = target->d_iname; ++ target->d_name.name = new_name; ++ dentry->d_name.name = old_name; ++} ++ ++/* This function is spliced into ext3_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. ++ */ ++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode) ++{ ++ struct dentry *tmp, *goal = NULL; ++ struct list_head *lp; ++ ++ /* preferably return a connected dentry */ ++ spin_lock(&dcache_lock); ++ /* verify this dentry is really new */ ++ assert(!de->d_inode); ++ assert(list_empty(&de->d_subdirs)); ++ assert(list_empty(&de->d_alias)); ++ ++ ++ list_for_each(lp, &inode->i_dentry) { ++ tmp = list_entry(lp, struct dentry, d_alias); ++ if (tmp->d_flags & DCACHE_NFSD_DISCONNECTED) { ++ assert(tmp->d_alias.next == &inode->i_dentry); ++ assert(tmp->d_alias.prev == &inode->i_dentry); ++ goal = tmp; ++ dget_locked(goal); ++ break; ++ } ++ } ++ ++ if (!goal) { ++ spin_unlock(&dcache_lock); ++ return NULL; ++ } ++ ++ /* Move the goal to the de hash queue - like d_move() */ ++ goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; ++ list_del(&goal->d_hash); ++ list_add(&goal->d_hash, &de->d_hash); ++ ++ list_del(&goal->d_child); ++ list_del(&de->d_child); ++ ++ /* Switch the parents and the names.. 
*/ ++ switch_names(goal, de); ++ do_switch(goal->d_parent, de->d_parent); ++ do_switch(goal->d_name.len, de->d_name.len); ++ do_switch(goal->d_name.hash, de->d_name.hash); ++ ++ /* And add them back to the (new) parent lists */ ++ list_add(&goal->d_child, &goal->d_parent->d_subdirs); ++ list_add(&de->d_child, &de->d_parent->d_subdirs); ++ spin_unlock(&dcache_lock); ++ ++ return goal; ++} ++ ++/* ++ * These are the special structures for the iopen pseudo directory. ++ */ ++ ++static struct inode_operations iopen_inode_operations = { ++ lookup: iopen_lookup, /* BKL held */ ++}; ++ ++static struct file_operations iopen_file_operations = { ++ read: generic_read_dir, ++}; ++ ++static int match_dentry(struct dentry *dentry, const char *name) ++{ ++ int len; ++ ++ len = strlen(name); ++ if (dentry->d_name.len != len) ++ return 0; ++ if (strncmp(dentry->d_name.name, name, len)) ++ return 0; ++ return 1; ++} ++ ++/* ++ * This function is spliced into ext3_lookup and returns 1 if the file ++ * name is __iopen__ and dentry has been filled in appropriately. ++ */ ++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) ++{ ++ struct inode *inode; ++ ++ if (dir->i_ino != EXT3_ROOT_INO || ++ !test_opt(dir->i_sb, IOPEN) || ++ !match_dentry(dentry, "__iopen__")) ++ return 0; ++ ++ inode = iget(dir->i_sb, EXT3_BAD_INO); ++ ++ if (!inode) ++ return 0; ++ d_add(dentry, inode); ++ return 1; ++} ++ ++/* ++ * This function is spliced into read_inode; it returns 1 if inode ++ * number is the one for /__iopen__, in which case the inode is filled ++ * in appropriately. Otherwise, this function returns 0. 
++ */ ++int ext3_iopen_get_inode(struct inode *inode) ++{ ++ if (inode->i_ino != EXT3_BAD_INO) ++ return 0; ++ ++ inode->i_mode = S_IFDIR | S_IRUSR | S_IXUSR; ++ if (test_opt(inode->i_sb, IOPEN_NOPRIV)) ++ inode->i_mode |= 0777; ++ inode->i_uid = 0; ++ inode->i_gid = 0; ++ inode->i_nlink = 1; ++ inode->i_size = 4096; ++ inode->i_atime = CURRENT_TIME; ++ inode->i_ctime = CURRENT_TIME; ++ inode->i_mtime = CURRENT_TIME; ++ inode->u.ext3_i.i_dtime = 0; ++ inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size ++ * (for stat), not the fs block ++ * size */ ++ inode->i_blocks = 0; ++ inode->i_version = 1; ++ inode->i_generation = 0; ++ ++ inode->i_op = &iopen_inode_operations; ++ inode->i_fop = &iopen_file_operations; ++ inode->i_mapping->a_ops = 0; ++ ++ return 1; ++} +--- /dev/null 2003-01-30 03:24:37.000000000 -0700 ++++ linux-2.4.20-braam/fs/ext3/iopen.h 2003-05-17 14:06:00.000000000 -0600 +@@ -0,0 +1,13 @@ ++/* ++ * iopen.h ++ * ++ * Special support for opening files by inode number. ++ * ++ * Copyright (C) 2001 by Theodore Ts'o (tytso@alum.mit.edu). ++ * ++ * This file may be redistributed under the terms of the GNU General ++ * Public License. ++ */ ++ ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); +--- linux-2.4.20/fs/ext3/namei.c~iopen 2003-05-17 14:05:59.000000000 -0600 ++++ linux-2.4.20-braam/fs/ext3/namei.c 2003-05-17 22:23:08.000000000 -0600 +@@ -35,7 +35,7 @@ + #include + #include + #include +- ++#include "iopen.h" + + /* + * define how far ahead to read directories while searching them. 
+@@ -921,16 +921,21 @@ errout: + return NULL; + } + #endif ++struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode); + + static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) + { + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; ++ struct dentry *alternate = NULL; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; ++ + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { +@@ -942,6 +947,12 @@ static struct dentry *ext3_lookup(struct + return ERR_PTR(-EACCES); + } + } ++ ++ if (inode && (alternate = iopen_connect_dentry(dentry, inode))) { ++ iput(inode); ++ return alternate; ++ } ++ + d_add(dentry, inode); + return NULL; + } +--- linux-2.4.20/fs/ext3/super.c~iopen 2003-05-17 14:05:59.000000000 -0600 ++++ linux-2.4.20-braam/fs/ext3/super.c 2003-05-17 14:06:00.000000000 -0600 +@@ -820,6 +820,17 @@ static int parse_options (char * options + || !strcmp (this_char, "quota") + || !strcmp (this_char, "usrquota")) + /* Don't do anything ;-) */ ; ++ else if (!strcmp (this_char, "iopen")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } else if (!strcmp (this_char, "noiopen")) { ++ clear_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } ++ else if (!strcmp (this_char, "iopen_nopriv")) { ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ } + else if (!strcmp (this_char, "journal")) { + /* @@@ FIXME */ + /* Eventually we will want to be able to create +--- linux-2.4.20/include/linux/ext3_fs.h~iopen 2003-05-17 14:05:59.000000000 -0600 ++++ linux-2.4.20-braam/include/linux/ext3_fs.h 2003-05-17 14:06:29.000000000 -0600 +@@ -322,6 +322,8 @@ struct ext3_inode { + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + 
#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ ++#define EXT3_MOUNT_IOPEN 0x8000 /* Allow access via iopen */ ++#define EXT3_MOUNT_IOPEN_NOPRIV 0x10000 /* Make iopen world-readable */ + #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + +_ diff --git a/lustre/kernel_patches/patches/kmem_cache_validate_2.4.20-rh.patch b/lustre/kernel_patches/patches/kmem_cache_validate_2.4.20-rh.patch new file mode 100644 index 0000000..8113828 --- /dev/null +++ b/lustre/kernel_patches/patches/kmem_cache_validate_2.4.20-rh.patch @@ -0,0 +1,124 @@ + + + + arch/i386/mm/init.c | 6 +++++ + arch/ia64/mm/init.c | 6 +++++ + include/linux/slab.h | 1 + kernel/ksyms.c | 1 + mm/slab.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ + 5 files changed, 67 insertions(+) + +--- rh-2.4.20/arch/i386/mm/init.c~kmem_cache_validate_2.4.20-rh 2003-04-11 14:05:09.000000000 +0800 ++++ rh-2.4.20-root/arch/i386/mm/init.c 2003-04-13 10:51:58.000000000 +0800 +@@ -43,6 +43,12 @@ unsigned long highstart_pfn, highend_pfn + static unsigned long totalram_pages; + static unsigned long totalhigh_pages; + ++struct page *check_get_page(unsigned long kaddr) ++{ ++#warning FIXME: Lustre team, is this solid? ++ return virt_to_page(kaddr); ++} ++ + int do_check_pgt_cache(int low, int high) + { + return 0; /* FIXME! */ +--- rh-2.4.20/arch/ia64/mm/init.c~kmem_cache_validate_2.4.20-rh 2003-04-11 14:04:43.000000000 +0800 ++++ rh-2.4.20-root/arch/ia64/mm/init.c 2003-04-13 10:51:58.000000000 +0800 +@@ -45,6 +45,12 @@ unsigned long vmalloc_end = VMALLOC_END_ + static struct page *vmem_map; + static unsigned long num_dma_physpages; + ++struct page *check_get_page(unsigned long kaddr) ++{ ++#warning FIXME: Lustre team, is this solid? 
++ return virt_to_page(kaddr); ++} ++ + int + do_check_pgt_cache (int low, int high) + { +--- rh-2.4.20/include/linux/slab.h~kmem_cache_validate_2.4.20-rh 2003-04-12 15:46:39.000000000 +0800 ++++ rh-2.4.20-root/include/linux/slab.h 2003-04-13 10:53:00.000000000 +0800 +@@ -57,6 +57,7 @@ extern int kmem_cache_destroy(kmem_cache + extern int kmem_cache_shrink(kmem_cache_t *); + extern void *kmem_cache_alloc(kmem_cache_t *, int); + extern void kmem_cache_free(kmem_cache_t *, void *); ++extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp); + extern unsigned int kmem_cache_size(kmem_cache_t *); + + extern void *kmalloc(size_t, int); +--- rh-2.4.20/kernel/ksyms.c~kmem_cache_validate_2.4.20-rh 2003-04-12 16:15:26.000000000 +0800 ++++ rh-2.4.20-root/kernel/ksyms.c 2003-04-13 10:54:10.000000000 +0800 +@@ -123,6 +123,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); + EXPORT_SYMBOL(kmem_cache_shrink); + EXPORT_SYMBOL(kmem_cache_alloc); + EXPORT_SYMBOL(kmem_cache_free); ++EXPORT_SYMBOL(kmem_cache_validate); + EXPORT_SYMBOL(kmem_cache_size); + EXPORT_SYMBOL(kmalloc); + EXPORT_SYMBOL(kfree); +--- rh-2.4.20/mm/slab.c~kmem_cache_validate_2.4.20-rh 2003-04-11 14:04:56.000000000 +0800 ++++ rh-2.4.20-root/mm/slab.c 2003-04-13 10:51:58.000000000 +0800 +@@ -1208,6 +1208,59 @@ failed: + * Called with the cache-lock held. + */ + ++extern struct page *check_get_page(unsigned long kaddr); ++struct page *page_mem_map(struct page *page); ++static int kmem_check_cache_obj (kmem_cache_t * cachep, ++ slab_t *slabp, void * objp) ++{ ++ int i; ++ unsigned int objnr; ++ ++#if DEBUG ++ if (cachep->flags & SLAB_RED_ZONE) { ++ objp -= BYTES_PER_WORD; ++ if ( *(unsigned long *)objp != RED_MAGIC2) ++ /* Either write before start, or a double free. */ ++ return 0; ++ if (*(unsigned long *)(objp+cachep->objsize - ++ BYTES_PER_WORD) != RED_MAGIC2) ++ /* Either write past end, or a double free. 
*/ ++ return 0; ++ } ++#endif ++ ++ objnr = (objp-slabp->s_mem)/cachep->objsize; ++ if (objnr >= cachep->num) ++ return 0; ++ if (objp != slabp->s_mem + objnr*cachep->objsize) ++ return 0; ++ ++ /* Check slab's freelist to see if this obj is there. */ ++ for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { ++ if (i == objnr) ++ return 0; ++ } ++ return 1; ++} ++ ++ ++int kmem_cache_validate(kmem_cache_t *cachep, void *objp) ++{ ++ struct page *page = check_get_page((unsigned long)objp); ++ ++ if (!VALID_PAGE(page)) ++ return 0; ++ ++ if (!PageSlab(page)) ++ return 0; ++ ++ /* XXX check for freed slab objects ? */ ++ if (!kmem_check_cache_obj(cachep, GET_PAGE_SLAB(page), objp)) ++ return 0; ++ ++ return (cachep == GET_PAGE_CACHE(page)); ++} ++ + #if DEBUG + static int kmem_extra_free_checks (kmem_cache_t * cachep, + slab_t *slabp, void * objp) + +_ diff --git a/lustre/kernel_patches/patches/kmem_cache_validate_2.4.20.patch b/lustre/kernel_patches/patches/kmem_cache_validate_2.4.20.patch new file mode 100644 index 0000000..e802312 --- /dev/null +++ b/lustre/kernel_patches/patches/kmem_cache_validate_2.4.20.patch @@ -0,0 +1,116 @@ + 0 files changed + +--- linux-2.4.20-8/arch/ia64/mm/init.c~kmem_cache_validate_2.4.20 2002-11-29 07:53:09.000000000 +0800 ++++ linux-2.4.20-8-root/arch/ia64/mm/init.c 2003-06-01 01:44:13.000000000 +0800 +@@ -45,6 +45,12 @@ static struct page *vmem_map; + static unsigned long num_dma_physpages; + #endif + ++struct page *check_get_page(unsigned long kaddr) ++{ ++#warning FIXME: Lustre team, is this solid? 
++ return virt_to_page(kaddr); ++} ++ + int + do_check_pgt_cache (int low, int high) + { +--- linux-2.4.20-8/include/linux/slab.h~kmem_cache_validate_2.4.20 2002-11-29 07:53:15.000000000 +0800 ++++ linux-2.4.20-8-root/include/linux/slab.h 2003-06-01 01:44:13.000000000 +0800 +@@ -56,6 +56,7 @@ extern kmem_cache_t *kmem_cache_create(c + extern int kmem_cache_destroy(kmem_cache_t *); + extern int kmem_cache_shrink(kmem_cache_t *); + extern void *kmem_cache_alloc(kmem_cache_t *, int); ++extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp); + extern void kmem_cache_free(kmem_cache_t *, void *); + extern unsigned int kmem_cache_size(kmem_cache_t *); + +--- linux-2.4.20-8/kernel/ksyms.c~kmem_cache_validate_2.4.20 2003-06-01 01:44:11.000000000 +0800 ++++ linux-2.4.20-8-root/kernel/ksyms.c 2003-06-01 01:44:13.000000000 +0800 +@@ -103,6 +103,7 @@ EXPORT_SYMBOL(kmem_find_general_cachep); + EXPORT_SYMBOL(kmem_cache_create); + EXPORT_SYMBOL(kmem_cache_destroy); + EXPORT_SYMBOL(kmem_cache_shrink); ++EXPORT_SYMBOL(kmem_cache_validate); + EXPORT_SYMBOL(kmem_cache_alloc); + EXPORT_SYMBOL(kmem_cache_free); + EXPORT_SYMBOL(kmem_cache_size); +--- linux-2.4.20-8/mm/slab.c~kmem_cache_validate_2.4.20 2003-06-01 01:44:08.000000000 +0800 ++++ linux-2.4.20-8-root/mm/slab.c 2003-06-01 01:44:13.000000000 +0800 +@@ -1205,6 +1205,59 @@ failed: + * Called with the cache-lock held. + */ + ++extern struct page *check_get_page(unsigned long kaddr); ++struct page *page_mem_map(struct page *page); ++static int kmem_check_cache_obj (kmem_cache_t * cachep, ++ slab_t *slabp, void * objp) ++{ ++ int i; ++ unsigned int objnr; ++ ++#if DEBUG ++ if (cachep->flags & SLAB_RED_ZONE) { ++ objp -= BYTES_PER_WORD; ++ if ( *(unsigned long *)objp != RED_MAGIC2) ++ /* Either write before start, or a double free. */ ++ return 0; ++ if (*(unsigned long *)(objp+cachep->objsize - ++ BYTES_PER_WORD) != RED_MAGIC2) ++ /* Either write past end, or a double free. 
*/ ++ return 0; ++ } ++#endif ++ ++ objnr = (objp-slabp->s_mem)/cachep->objsize; ++ if (objnr >= cachep->num) ++ return 0; ++ if (objp != slabp->s_mem + objnr*cachep->objsize) ++ return 0; ++ ++ /* Check slab's freelist to see if this obj is there. */ ++ for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { ++ if (i == objnr) ++ return 0; ++ } ++ return 1; ++} ++ ++ ++int kmem_cache_validate(kmem_cache_t *cachep, void *objp) ++{ ++ struct page *page = check_get_page((unsigned long)objp); ++ ++ if (!VALID_PAGE(page)) ++ return 0; ++ ++ if (!PageSlab(page)) ++ return 0; ++ ++ /* XXX check for freed slab objects ? */ ++ if (!kmem_check_cache_obj(cachep, GET_PAGE_SLAB(page), objp)) ++ return 0; ++ ++ return (cachep == GET_PAGE_CACHE(page)); ++} ++ + #if DEBUG + static int kmem_extra_free_checks (kmem_cache_t * cachep, + slab_t *slabp, void * objp) +--- linux-2.4.20-8/arch/i386/mm/init.c~kmem_cache_validate_2.4.20 2002-11-29 07:53:09.000000000 +0800 ++++ linux-2.4.20-8-root/arch/i386/mm/init.c 2003-06-01 01:46:43.000000000 +0800 +@@ -43,6 +43,12 @@ unsigned long highstart_pfn, highend_pfn + static unsigned long totalram_pages; + static unsigned long totalhigh_pages; + ++struct page *check_get_page(unsigned long kaddr) ++{ ++#warning FIXME: Lustre team, is this solid? 
++ return virt_to_page(kaddr); ++} ++ + int do_check_pgt_cache(int low, int high) + { + int freed = 0; + +_ diff --git a/lustre/kernel_patches/patches/kmem_cache_validate_hp.patch b/lustre/kernel_patches/patches/kmem_cache_validate_hp.patch index 03385a7..04b49ea 100644 --- a/lustre/kernel_patches/patches/kmem_cache_validate_hp.patch +++ b/lustre/kernel_patches/patches/kmem_cache_validate_hp.patch @@ -1,12 +1,13 @@ + arch/i386/mm/init.c | 6 +++++ arch/ia64/mm/init.c | 6 +++++ include/linux/slab.h | 1 kernel/ksyms.c | 1 mm/slab.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ - 4 files changed, 61 insertions(+) + 5 files changed, 67 insertions(+) ---- linux-2.4.19-hp2_pnnl2/arch/ia64/mm/init.c~kmem_cache_validate_hp Sun Jan 19 18:59:23 2003 -+++ linux-2.4.19-hp2_pnnl2-root/arch/ia64/mm/init.c Sun Jan 19 18:59:24 2003 -@@ -44,6 +44,12 @@ unsigned long vmalloc_end = VMALLOC_END_ +--- linux/arch/ia64/mm/init.c~kmem_cache_validate_hp 2003-04-11 14:24:25.000000000 +0800 ++++ linux-root/arch/ia64/mm/init.c 2003-05-16 20:03:56.000000000 +0800 +@@ -45,6 +45,12 @@ unsigned long vmalloc_end = VMALLOC_END_ static struct page *vmem_map; static unsigned long num_dma_physpages; @@ -19,8 +20,8 @@ int do_check_pgt_cache (int low, int high) { ---- linux-2.4.19-hp2_pnnl2/include/linux/slab.h~kmem_cache_validate_hp Sun Jan 19 18:59:23 2003 -+++ linux-2.4.19-hp2_pnnl2-root/include/linux/slab.h Sun Jan 19 19:01:07 2003 +--- linux/include/linux/slab.h~kmem_cache_validate_hp 2002-11-29 07:53:15.000000000 +0800 ++++ linux-root/include/linux/slab.h 2003-05-16 20:03:56.000000000 +0800 @@ -56,6 +56,7 @@ extern kmem_cache_t *kmem_cache_create(c extern int kmem_cache_destroy(kmem_cache_t *); extern int kmem_cache_shrink(kmem_cache_t *); @@ -29,9 +30,9 @@ extern void kmem_cache_free(kmem_cache_t *, void *); extern unsigned int kmem_cache_size(kmem_cache_t *); ---- linux-2.4.19-hp2_pnnl2/kernel/ksyms.c~kmem_cache_validate_hp Sun Jan 19 18:59:23 2003 -+++ 
linux-2.4.19-hp2_pnnl2-root/kernel/ksyms.c Sun Jan 19 19:00:32 2003 -@@ -118,6 +118,7 @@ EXPORT_SYMBOL(kmem_find_general_cachep); +--- linux/kernel/ksyms.c~kmem_cache_validate_hp 2003-05-16 20:03:55.000000000 +0800 ++++ linux-root/kernel/ksyms.c 2003-05-16 20:03:56.000000000 +0800 +@@ -119,6 +119,7 @@ EXPORT_SYMBOL(kmem_find_general_cachep); EXPORT_SYMBOL(kmem_cache_create); EXPORT_SYMBOL(kmem_cache_destroy); EXPORT_SYMBOL(kmem_cache_shrink); @@ -39,9 +40,9 @@ EXPORT_SYMBOL(kmem_cache_alloc); EXPORT_SYMBOL(kmem_cache_free); EXPORT_SYMBOL(kmem_cache_size); ---- linux-2.4.19-hp2_pnnl2/mm/slab.c~kmem_cache_validate_hp Sun Jan 19 18:59:23 2003 -+++ linux-2.4.19-hp2_pnnl2-root/mm/slab.c Sun Jan 19 18:59:24 2003 -@@ -1207,6 +1207,59 @@ failed: +--- linux/mm/slab.c~kmem_cache_validate_hp 2002-11-29 07:53:15.000000000 +0800 ++++ linux-root/mm/slab.c 2003-05-16 20:03:56.000000000 +0800 +@@ -1205,6 +1205,59 @@ failed: * Called with the cache-lock held. */ @@ -101,5 +102,20 @@ #if DEBUG static int kmem_extra_free_checks (kmem_cache_t * cachep, slab_t *slabp, void * objp) +--- linux/arch/i386/mm/init.c~kmem_cache_validate_hp 2003-05-16 20:03:22.000000000 +0800 ++++ linux-root/arch/i386/mm/init.c 2003-05-16 20:06:16.000000000 +0800 +@@ -42,6 +42,12 @@ mmu_gather_t mmu_gathers[NR_CPUS]; + unsigned long highstart_pfn, highend_pfn; + static unsigned long totalram_pages; + static unsigned long totalhigh_pages; ++ ++struct page *check_get_page(unsigned long kaddr) ++{ ++#warning FIXME: Lustre team, is this solid? 
++ return virt_to_page(kaddr); ++} + + int do_check_pgt_cache(int low, int high) + { _ diff --git a/lustre/extN/linux-2.4.18ea-0.8.26.diff b/lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26.patch similarity index 93% rename from lustre/extN/linux-2.4.18ea-0.8.26.diff rename to lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26.patch index 4c8fb86..75ebcd0 100644 --- a/lustre/extN/linux-2.4.18ea-0.8.26.diff +++ b/lustre/kernel_patches/patches/linux-2.4.18ea-0.8.26.patch @@ -1,30 +1,7 @@ -Linux Extended Attributes -- Kernel Patch -24 April 2002, 11:31:18 + 0 files changed - -This patch is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This patch is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. 
- -You should have received a copy of the GNU General Public License -along with this patch; if not, write to the Free Software Foundation, -Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - -After extracting the linux-2.4.18.tar.gz package, apply this patch as follows: - - cd linux - patch -p1 < ../linux-2.4.18ea-0.8.26.patch - -diff -Nur linux-2.4.18/fs/ext3/ialloc.c linux-2.4.18ea/fs/ext3/ialloc.c ---- linux-2.4.18/fs/ext3/ialloc.c Sun Feb 24 04:42:59 2002 -+++ linux-2.4.18ea/fs/ext3/ialloc.c Sun Feb 24 04:34:43 2002 +--- linux-2.4.18-18/fs/ext3/ialloc.c~linux-2.4.18ea-0.8.26 2003-04-20 16:14:31.000000000 +0800 ++++ linux-2.4.18-18-root/fs/ext3/ialloc.c 2003-04-20 16:14:31.000000000 +0800 @@ -17,6 +17,7 @@ #include #include @@ -33,7 +10,7 @@ diff -Nur linux-2.4.18/fs/ext3/ialloc.c linux-2.4.18ea/fs/ext3/ialloc.c #include #include #include -@@ -216,6 +217,7 @@ +@@ -216,6 +217,7 @@ void ext3_free_inode (handle_t *handle, * as writing the quota to disk may need the lock as well. 
*/ DQUOT_INIT(inode); @@ -41,9 +18,8 @@ diff -Nur linux-2.4.18/fs/ext3/ialloc.c linux-2.4.18ea/fs/ext3/ialloc.c DQUOT_FREE_INODE(inode); DQUOT_DROP(inode); -diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c ---- linux-2.4.18/fs/ext3/inode.c Sun Feb 24 04:42:59 2002 -+++ linux-2.4.18ea/fs/ext3/inode.c Thu Mar 14 21:51:59 2002 +--- linux-2.4.18-18/fs/ext3/inode.c~linux-2.4.18ea-0.8.26 2003-04-20 16:14:31.000000000 +0800 ++++ linux-2.4.18-18-root/fs/ext3/inode.c 2003-04-20 16:14:31.000000000 +0800 @@ -39,6 +39,18 @@ */ #undef SEARCH_FROM_ZERO @@ -72,7 +48,7 @@ diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c struct inode *inode, struct buffer_head *bh, int blocknr) { -@@ -164,9 +176,7 @@ +@@ -164,9 +176,7 @@ void ext3_delete_inode (struct inode * i { handle_t *handle; @@ -83,7 +59,7 @@ diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c goto no_delete; lock_kernel(); -@@ -1845,6 +1855,8 @@ +@@ -1861,6 +1871,8 @@ void ext3_truncate(struct inode * inode) if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return; @@ -92,7 +68,7 @@ diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; -@@ -1992,8 +2004,6 @@ +@@ -2008,8 +2020,6 @@ int ext3_get_inode_loc (struct inode *in struct ext3_group_desc * gdp; if ((inode->i_ino != EXT3_ROOT_INO && @@ -101,7 +77,7 @@ diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c inode->i_ino != EXT3_JOURNAL_INO && inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || inode->i_ino > le32_to_cpu( -@@ -2120,10 +2130,7 @@ +@@ -2136,10 +2146,7 @@ void ext3_read_inode(struct inode * inod brelse (iloc.bh); @@ -113,7 +89,7 @@ diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c inode->i_op = &ext3_file_inode_operations; inode->i_fop = &ext3_file_operations; inode->i_mapping->a_ops = &ext3_aops; -@@ -2131,7 +2138,7 @@ +@@ -2147,7 +2154,7 @@ void ext3_read_inode(struct 
inode * inod inode->i_op = &ext3_dir_inode_operations; inode->i_fop = &ext3_dir_operations; } else if (S_ISLNK(inode->i_mode)) { @@ -122,10 +98,9 @@ diff -Nur linux-2.4.18/fs/ext3/inode.c linux-2.4.18ea/fs/ext3/inode.c inode->i_op = &ext3_fast_symlink_inode_operations; else { inode->i_op = &page_symlink_inode_operations; -diff -Nur linux-2.4.18/fs/ext3/namei.c linux-2.4.18ea/fs/ext3/namei.c ---- linux-2.4.18/fs/ext3/namei.c Fri Nov 9 23:25:04 2001 -+++ linux-2.4.18ea/fs/ext3/namei.c Mon Mar 11 03:27:00 2002 -@@ -23,6 +23,7 @@ +--- linux-2.4.18-18/fs/ext3/namei.c~linux-2.4.18ea-0.8.26 2003-04-20 16:14:31.000000000 +0800 ++++ linux-2.4.18-18-root/fs/ext3/namei.c 2003-04-20 16:14:31.000000000 +0800 +@@ -27,6 +27,7 @@ #include #include #include @@ -133,15 +108,15 @@ diff -Nur linux-2.4.18/fs/ext3/namei.c linux-2.4.18ea/fs/ext3/namei.c #include #include #include -@@ -435,6 +435,7 @@ static int ext3_add_nondir(handle_t *han - return 0; - } +@@ -1183,6 +1184,7 @@ static int ext3_add_nondir(handle_t *han + d_instantiate(dentry, inode); + return 0; } + ext3_xattr_drop_inode(handle, inode); ext3_dec_count(handle, inode); iput(inode); return err; -@@ -514,7 +519,7 @@ +@@ -1268,15 +1270,14 @@ static int ext3_mkdir(struct inode * dir if (IS_SYNC(dir)) handle->h_sync = 1; @@ -150,7 +125,7 @@ diff -Nur linux-2.4.18/fs/ext3/namei.c linux-2.4.18ea/fs/ext3/namei.c err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; -@@ -522,7 +527,6 @@ + inode->i_op = &ext3_dir_inode_operations; inode->i_fop = &ext3_dir_operations; - inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; @@ -159,7 +134,7 @@ diff -Nur linux-2.4.18/fs/ext3/namei.c linux-2.4.18ea/fs/ext3/namei.c dir_block = ext3_bread (handle, inode, 0, 1, &err); if (!dir_block) { inode->i_nlink--; /* is this nlink == 0? 
*/ -@@ -549,9 +553,6 @@ +@@ -1303,9 +1304,6 @@ static int ext3_mkdir(struct inode * dir BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); ext3_journal_dirty_metadata(handle, dir_block); brelse (dir_block); @@ -169,16 +144,17 @@ diff -Nur linux-2.4.18/fs/ext3/namei.c linux-2.4.18ea/fs/ext3/namei.c ext3_mark_inode_dirty(handle, inode); err = ext3_add_entry (handle, dentry, inode); if (err) -@@ -917,5 +919,5 @@ +@@ -1671,7 +1669,7 @@ static int ext3_symlink (struct inode * + if (IS_ERR(inode)) goto out_stop; - if (l > sizeof (inode->u.ext3_i.i_data)) { + if (l > sizeof(EXT3_I(inode)->i_data)) { inode->i_op = &page_symlink_inode_operations; inode->i_mapping->a_ops = &ext3_aops; -diff -Nur linux-2.4.18/fs/ext3/super.c linux-2.4.18ea/fs/ext3/super.c ---- linux-2.4.18/fs/ext3/super.c Sun Feb 24 04:42:59 2002 -+++ linux-2.4.18ea/fs/ext3/super.c Thu Apr 4 21:41:05 2002 + /* +--- linux-2.4.18-18/fs/ext3/super.c~linux-2.4.18ea-0.8.26 2003-04-20 16:14:31.000000000 +0800 ++++ linux-2.4.18-18-root/fs/ext3/super.c 2003-04-20 16:14:31.000000000 +0800 @@ -24,6 +24,7 @@ #include #include @@ -187,7 +163,7 @@ diff -Nur linux-2.4.18/fs/ext3/super.c linux-2.4.18ea/fs/ext3/super.c #include #include #include -@@ -404,6 +405,7 @@ +@@ -404,6 +405,7 @@ void ext3_put_super (struct super_block kdev_t j_dev = sbi->s_journal->j_dev; int i; @@ -195,7 +171,7 @@ diff -Nur linux-2.4.18/fs/ext3/super.c linux-2.4.18ea/fs/ext3/super.c journal_destroy(sbi->s_journal); if (!(sb->s_flags & MS_RDONLY)) { EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); -@@ -1734,14 +1772,25 @@ +@@ -1748,14 +1750,25 @@ int ext3_statfs (struct super_block * sb static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super); @@ -224,10 +200,9 @@ diff -Nur linux-2.4.18/fs/ext3/super.c linux-2.4.18ea/fs/ext3/super.c + return error; } - EXPORT_NO_SYMBOLS; -diff -Nur linux-2.4.18/fs/ext3/xattr.c linux-2.4.18ea/fs/ext3/xattr.c ---- linux-2.4.18/fs/ext3/xattr.c Thu Jan 1 01:00:00 1970 -+++ 
linux-2.4.18ea/fs/ext3/xattr.c Wed Apr 3 13:19:05 2002 + EXPORT_SYMBOL(ext3_bread); +--- /dev/null 2002-08-31 07:31:37.000000000 +0800 ++++ linux-2.4.18-18-root/fs/ext3/xattr.c 2003-04-20 16:14:31.000000000 +0800 @@ -0,0 +1,1247 @@ +/* + * linux/fs/ext3/xattr.c @@ -302,11 +277,11 @@ diff -Nur linux-2.4.18/fs/ext3/xattr.c linux-2.4.18ea/fs/ext3/xattr.c +#include + +/* These symbols may be needed by a module. */ -+EXPORT_SYMBOL(extN_xattr_register); -+EXPORT_SYMBOL(extN_xattr_unregister); -+EXPORT_SYMBOL(extN_xattr_get); -+EXPORT_SYMBOL(extN_xattr_list); -+EXPORT_SYMBOL(extN_xattr_set); ++EXPORT_SYMBOL(ext3_xattr_register); ++EXPORT_SYMBOL(ext3_xattr_unregister); ++EXPORT_SYMBOL(ext3_xattr_get); ++EXPORT_SYMBOL(ext3_xattr_list); ++EXPORT_SYMBOL(ext3_xattr_set); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) +# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1) @@ -1476,9 +1451,8 @@ diff -Nur linux-2.4.18/fs/ext3/xattr.c linux-2.4.18ea/fs/ext3/xattr.c +} + +#endif /* CONFIG_EXT3_FS_XATTR_SHARING */ -diff -Nur linux-2.4.18/include/linux/ext3_fs.h linux-2.4.18ea/include/linux/ext3_fs.h ---- linux-2.4.18/include/linux/ext3_fs.h Sun Feb 24 04:42:59 2002 -+++ linux-2.4.18ea/include/linux/ext3_fs.h Mon Mar 11 03:27:00 2002 +--- linux-2.4.18-18/include/linux/ext3_fs.h~linux-2.4.18ea-0.8.26 2003-04-20 16:14:31.000000000 +0800 ++++ linux-2.4.18-18-root/include/linux/ext3_fs.h 2003-04-20 16:14:31.000000000 +0800 @@ -58,8 +58,6 @@ */ #define EXT3_BAD_INO 1 /* Bad blocks inode */ @@ -1525,7 +1499,7 @@ diff -Nur linux-2.4.18/include/linux/ext3_fs.h linux-2.4.18ea/include/linux/ext3 * Structure of a blocks group descriptor */ struct ext3_group_desc -@@ -512,7 +487,7 @@ +@@ -513,7 +488,7 @@ struct ext3_super_block { #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ @@ -1534,8 +1508,9 @@ diff -Nur linux-2.4.18/include/linux/ext3_fs.h linux-2.4.18ea/include/linux/ext3 #define 
EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ EXT3_FEATURE_INCOMPAT_RECOVER) #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ -@@ -603,4 +578,22 @@ - */ +@@ -606,6 +581,24 @@ struct ext3_iloc + unsigned long block_group; + }; +/* Defined for extended attributes */ +#define CONFIG_EXT3_FS_XATTR y @@ -1556,8 +1531,9 @@ diff -Nur linux-2.4.18/include/linux/ext3_fs.h linux-2.4.18ea/include/linux/ext3 +#endif + /* - * Ok, these declarations are also in but none of the -@@ -628,6 +603,7 @@ + * Function prototypes + */ +@@ -647,6 +640,7 @@ extern void ext3_check_inodes_bitmap (st extern unsigned long ext3_count_free (struct buffer_head *, unsigned); /* inode.c */ @@ -1565,9 +1541,8 @@ diff -Nur linux-2.4.18/include/linux/ext3_fs.h linux-2.4.18ea/include/linux/ext3 extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); -diff -Nur linux-2.4.18/include/linux/ext3_jbd.h linux-2.4.18ea/include/linux/ext3_jbd.h ---- linux-2.4.18/include/linux/ext3_jbd.h Fri Dec 21 18:42:03 2001 -+++ linux-2.4.18ea/include/linux/ext3_jbd.h Mon Mar 25 00:11:36 2002 +--- linux-2.4.18-18/include/linux/ext3_jbd.h~linux-2.4.18ea-0.8.26 2003-04-20 16:14:31.000000000 +0800 ++++ linux-2.4.18-18-root/include/linux/ext3_jbd.h 2003-04-20 16:14:31.000000000 +0800 @@ -30,13 +30,19 @@ #define EXT3_SINGLEDATA_TRANS_BLOCKS 8 @@ -1589,9 +1564,8 @@ diff -Nur linux-2.4.18/include/linux/ext3_jbd.h linux-2.4.18ea/include/linux/ext extern int ext3_writepage_trans_blocks(struct inode *inode); -diff -Nur linux-2.4.18/include/linux/ext3_xattr.h linux-2.4.18ea/include/linux/ext3_xattr.h ---- linux-2.4.18/include/linux/ext3_xattr.h Thu Jan 1 01:00:00 1970 -+++ linux-2.4.18ea/include/linux/ext3_xattr.h Fri Apr 5 10:08:01 2002 +--- /dev/null 2002-08-31 07:31:37.000000000 +0800 ++++ linux-2.4.18-18-root/include/linux/ext3_xattr.h 2003-04-20 16:14:31.000000000 +0800 
@@ -0,0 +1,155 @@ +/* + File: linux/ext3_xattr.h @@ -1748,9 +1722,8 @@ diff -Nur linux-2.4.18/include/linux/ext3_xattr.h linux-2.4.18ea/include/linux/e + +#endif /* __KERNEL__ */ + -diff -Nur linux-2.4.18/include/linux/xattr.h linux-2.4.18ea/include/linux/xattr.h ---- linux-2.4.18/include/linux/xattr.h Thu Jan 1 01:00:00 1970 -+++ linux-2.4.18ea/include/linux/xattr.h Sun Mar 24 23:42:21 2002 +--- /dev/null 2002-08-31 07:31:37.000000000 +0800 ++++ linux-2.4.18-18-root/include/linux/xattr.h 2003-04-20 16:14:31.000000000 +0800 @@ -0,0 +1,15 @@ +/* + File: linux/xattr.h @@ -1767,3 +1740,20 @@ diff -Nur linux-2.4.18/include/linux/xattr.h linux-2.4.18ea/include/linux/xattr. +#define XATTR_REPLACE 2 /* set value, fail if attr does not exist */ + +#endif /* _LINUX_XATTR_H */ +--- linux-2.4.18-18/fs/ext3/Makefile~linux-2.4.18ea-0.8.26 2003-04-20 16:14:54.000000000 +0800 ++++ linux-2.4.18-18-root/fs/ext3/Makefile 2003-04-20 16:15:15.000000000 +0800 +@@ -9,10 +9,10 @@ + + O_TARGET := ext3.o + +-export-objs := super.o inode.o ++export-objs := super.o inode.o xattr.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o ++ ioctl.o namei.o super.o symlink.o xattr.o + obj-m := $(O_TARGET) + + include $(TOPDIR)/Rules.make + +_ diff --git a/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-chaos.patch b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-chaos.patch new file mode 100644 index 0000000..5c6c6a9 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-chaos.patch @@ -0,0 +1,5538 @@ + Documentation/Configure.help | 66 ++ + arch/alpha/defconfig | 7 + arch/alpha/kernel/entry.S | 12 + arch/arm/defconfig | 7 + arch/arm/kernel/calls.S | 24 + arch/i386/defconfig | 7 + arch/ia64/defconfig | 7 + arch/m68k/defconfig | 7 + arch/mips/defconfig | 7 + arch/mips64/defconfig | 7 + arch/ppc/defconfig | 14 + arch/ppc64/kernel/misc.S | 2 + arch/s390/defconfig | 7 + arch/s390/kernel/entry.S | 24 + 
arch/s390x/defconfig | 7 + arch/s390x/kernel/entry.S | 24 + arch/s390x/kernel/wrapper32.S | 92 +++ + arch/sparc/defconfig | 7 + arch/sparc/kernel/systbls.S | 10 + arch/sparc64/defconfig | 7 + arch/sparc64/kernel/systbls.S | 20 + fs/Config.in | 14 + fs/Makefile | 3 + fs/ext2/Makefile | 4 + fs/ext2/file.c | 5 + fs/ext2/ialloc.c | 2 + fs/ext2/inode.c | 34 - + fs/ext2/namei.c | 14 + fs/ext2/super.c | 29 + fs/ext2/symlink.c | 14 + fs/ext2/xattr.c | 1212 +++++++++++++++++++++++++++++++++++++++++ + fs/ext2/xattr_user.c | 103 +++ + fs/ext3/Makefile | 10 + fs/ext3/file.c | 5 + fs/ext3/ialloc.c | 2 + fs/ext3/inode.c | 35 - + fs/ext3/namei.c | 21 + fs/ext3/super.c | 36 + + fs/ext3/symlink.c | 14 + fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/xattr_user.c | 111 +++ + fs/jfs/jfs_xattr.h | 6 + fs/jfs/xattr.c | 6 + fs/mbcache.c | 648 ++++++++++++++++++++++ + include/asm-arm/unistd.h | 2 + include/asm-ppc64/unistd.h | 2 + include/asm-s390/unistd.h | 15 + include/asm-s390x/unistd.h | 15 + include/asm-sparc/unistd.h | 24 + include/asm-sparc64/unistd.h | 24 + include/linux/cache_def.h | 15 + include/linux/errno.h | 4 + include/linux/ext2_fs.h | 31 - + include/linux/ext2_xattr.h | 157 +++++ + include/linux/ext3_fs.h | 31 - + include/linux/ext3_jbd.h | 8 + include/linux/ext3_xattr.h | 157 +++++ + include/linux/fs.h | 2 + include/linux/mbcache.h | 69 ++ + kernel/ksyms.c | 4 + mm/vmscan.c | 36 + + fs/ext3/ext3-exports.c | 14 + + 62 files changed, 4331 insertions(+), 197 deletions(-) + +--- linux-rh-2.4.20-8/Documentation/Configure.help~linux-2.4.20-xattr-0.8.54-chaos 2003-05-07 17:33:50.000000000 +0800 ++++ linux-rh-2.4.20-8-root/Documentation/Configure.help 2003-05-07 17:34:25.000000000 +0800 +@@ -15226,6 +15226,39 @@ CONFIG_EXT2_FS + be compiled as a module, and so this could be dangerous. Most + everyone wants to say Y here. 
+ ++Ext2 extended attributes ++CONFIG_EXT2_FS_XATTR ++ Extended attributes are name:value pairs associated with inodes by ++ the kernel or by users (see the attr(5) manual page, or visit ++ http://acl.bestbits.at/ for details). ++ ++ If unsure, say N. ++ ++Ext2 extended attribute block sharing ++CONFIG_EXT2_FS_XATTR_SHARING ++ This option enables code for sharing identical extended attribute ++ blocks among multiple inodes. ++ ++ Usually, say Y. ++ ++Ext2 extended user attributes ++CONFIG_EXT2_FS_XATTR_USER ++ This option enables extended user attributes on ext2. Processes can ++ associate extended user attributes with inodes to store additional ++ information such as the character encoding of files, etc. (see the ++ attr(5) manual page, or visit http://acl.bestbits.at/ for details). ++ ++ If unsure, say N. ++ ++Ext2 trusted extended attributes ++CONFIG_EXT2_FS_XATTR_TRUSTED ++ This option enables extended attributes on ext2 that are accessible ++ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this ++ is only the super user. Trusted extended attributes are meant for ++ implementing system/security services. ++ ++ If unsure, say N. ++ + Ext3 journalling file system support (EXPERIMENTAL) + CONFIG_EXT3_FS + This is the journalling version of the Second extended file system +@@ -15258,6 +15291,39 @@ CONFIG_EXT3_FS + of your root partition (the one containing the directory /) cannot + be compiled as a module, and so this may be dangerous. + ++Ext3 extended attributes ++CONFIG_EXT3_FS_XATTR ++ Extended attributes are name:value pairs associated with inodes by ++ the kernel or by users (see the attr(5) manual page, or visit ++ http://acl.bestbits.at/ for details). ++ ++ If unsure, say N. ++ ++Ext3 extended attribute block sharing ++CONFIG_EXT3_FS_XATTR_SHARING ++ This option enables code for sharing identical extended attribute ++ blocks among multiple inodes. ++ ++ Usually, say Y. ++ ++Ext3 extended user attributes ++CONFIG_EXT3_FS_XATTR_USER ++ This option enables extended user attributes on ext3. 
Processes can ++ associate extended user attributes with inodes to store additional ++ information such as the character encoding of files, etc. (see the ++ attr(5) manual page, or visit http://acl.bestbits.at/ for details). ++ ++ If unsure, say N. ++ ++Ext3 trusted extended attributes ++CONFIG_EXT3_FS_XATTR_TRUSTED ++ This option enables extended attributes on ext3 that are accessible ++ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this ++ is only the super user. Trusted extended attributes are meant for ++ implementing system/security services. ++ ++ If unsure, say N. ++ + Journal Block Device support (JBD for ext3) (EXPERIMENTAL) + CONFIG_JBD + This is a generic journalling layer for block devices. It is +--- linux-rh-2.4.20-8/arch/alpha/defconfig~linux-2.4.20-xattr-0.8.54-chaos 2001-11-20 07:19:42.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/alpha/defconfig 2003-05-07 17:34:25.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_ALPHA=y + # CONFIG_UID16 is not set + # CONFIG_RWSEM_GENERIC_SPINLOCK is not set +--- linux-rh-2.4.20-8/arch/alpha/kernel/entry.S~linux-2.4.20-xattr-0.8.54-chaos 2003-04-11 14:04:53.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/alpha/kernel/entry.S 2003-05-07 17:34:25.000000000 +0800 +@@ -1162,6 +1162,18 @@ sys_call_table: + .quad sys_readahead + .quad sys_ni_syscall /* 380, sys_security */ + .quad sys_tkill ++ .quad sys_setxattr ++ .quad sys_lsetxattr ++ .quad sys_fsetxattr ++ .quad sys_getxattr /* 385 */ ++ .quad sys_lgetxattr ++ .quad sys_fgetxattr ++ .quad sys_listxattr ++ .quad sys_llistxattr ++ .quad sys_flistxattr /* 390 */ ++ .quad sys_removexattr ++ .quad sys_lremovexattr ++ .quad sys_fremovexattr + + /* 
Remember to update everything, kids. */ + .ifne (. - sys_call_table) - (NR_SYSCALLS * 8) +--- linux-rh-2.4.20-8/arch/arm/defconfig~linux-2.4.20-xattr-0.8.54-chaos 2001-05-20 08:43:05.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/arm/defconfig 2003-05-07 17:34:25.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_ARM=y + # CONFIG_EISA is not set + # CONFIG_SBUS is not set +--- linux-rh-2.4.20-8/arch/arm/kernel/calls.S~linux-2.4.20-xattr-0.8.54-chaos 2002-08-03 08:39:42.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/arm/kernel/calls.S 2003-05-07 17:34:25.000000000 +0800 +@@ -240,18 +240,18 @@ __syscall_start: + .long SYMBOL_NAME(sys_ni_syscall) /* Security */ + .long SYMBOL_NAME(sys_gettid) + /* 225 */ .long SYMBOL_NAME(sys_readahead) +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_setxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_lsetxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_fsetxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_getxattr */ +-/* 230 */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_lgetxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_fgetxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_listxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_llistxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_flistxattr */ +-/* 235 */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_removexattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_lremovexattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_fremovexattr */ ++ .long SYMBOL_NAME(sys_setxattr) ++ .long SYMBOL_NAME(sys_lsetxattr) ++ .long SYMBOL_NAME(sys_fsetxattr) ++ .long SYMBOL_NAME(sys_getxattr) ++/* 230 */ .long SYMBOL_NAME(sys_lgetxattr) ++ .long 
SYMBOL_NAME(sys_fgetxattr) ++ .long SYMBOL_NAME(sys_listxattr) ++ .long SYMBOL_NAME(sys_llistxattr) ++ .long SYMBOL_NAME(sys_flistxattr) ++/* 235 */ .long SYMBOL_NAME(sys_removexattr) ++ .long SYMBOL_NAME(sys_lremovexattr) ++ .long SYMBOL_NAME(sys_fremovexattr) + .long SYMBOL_NAME(sys_tkill) + /* + * Please check 2.5 _before_ adding calls here, +--- linux-rh-2.4.20-8/arch/i386/defconfig~linux-2.4.20-xattr-0.8.54-chaos 2003-04-11 14:04:53.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/i386/defconfig 2003-05-07 17:34:25.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_X86=y + CONFIG_ISA=y + # CONFIG_SBUS is not set +--- linux-rh-2.4.20-8/arch/ia64/defconfig~linux-2.4.20-xattr-0.8.54-chaos 2003-04-11 14:04:43.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/ia64/defconfig 2003-05-07 17:34:25.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + + # + # Code maturity level options +--- linux-rh-2.4.20-8/arch/m68k/defconfig~linux-2.4.20-xattr-0.8.54-chaos 2000-06-20 03:56:08.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/m68k/defconfig 2003-05-07 17:34:25.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# 
CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_UID16=y + + # +--- linux-rh-2.4.20-8/arch/mips/defconfig~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:10.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/mips/defconfig 2003-05-07 17:34:25.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_MIPS=y + CONFIG_MIPS32=y + # CONFIG_MIPS64 is not set +--- linux-rh-2.4.20-8/arch/mips64/defconfig~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:10.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/mips64/defconfig 2003-05-07 17:34:25.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_MIPS=y + # CONFIG_MIPS32 is not set + CONFIG_MIPS64=y +--- linux-rh-2.4.20-8/arch/ppc/defconfig~linux-2.4.20-xattr-0.8.54-chaos 2003-04-11 14:04:43.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/ppc/defconfig 2003-05-07 17:34:25.000000000 +0800 +@@ -1,6 +1,20 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# 
CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + # CONFIG_UID16 is not set + # CONFIG_RWSEM_GENERIC_SPINLOCK is not set + CONFIG_RWSEM_XCHGADD_ALGORITHM=y +--- linux-rh-2.4.20-8/arch/ppc64/kernel/misc.S~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:11.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/ppc64/kernel/misc.S 2003-05-07 17:34:25.000000000 +0800 +@@ -731,6 +731,7 @@ _GLOBAL(sys_call_table32) + .llong .sys_gettid /* 207 */ + #if 0 /* Reserved syscalls */ + .llong .sys_tkill /* 208 */ ++#endif + .llong .sys_setxattr + .llong .sys_lsetxattr /* 210 */ + .llong .sys_fsetxattr +@@ -743,6 +744,7 @@ _GLOBAL(sys_call_table32) + .llong .sys_removexattr + .llong .sys_lremovexattr + .llong .sys_fremovexattr /* 220 */ ++#if 0 /* Reserved syscalls */ + .llong .sys_futex + #endif + .llong .sys_perfmonctl /* Put this here for now ... */ +--- linux-rh-2.4.20-8/arch/s390/defconfig~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:11.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/s390/defconfig 2003-05-07 17:34:25.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + # CONFIG_ISA is not set + # CONFIG_EISA is not set + # CONFIG_MCA is not set +--- linux-rh-2.4.20-8/arch/s390/kernel/entry.S~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:11.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/s390/kernel/entry.S 2003-05-07 17:34:25.000000000 +0800 +@@ -558,18 +558,18 @@ sys_call_table: + .long sys_fcntl64 + .long sys_ni_syscall + .long sys_ni_syscall +- .long sys_ni_syscall /* 224 - reserved for setxattr */ +- .long 
sys_ni_syscall /* 225 - reserved for lsetxattr */ +- .long sys_ni_syscall /* 226 - reserved for fsetxattr */ +- .long sys_ni_syscall /* 227 - reserved for getxattr */ +- .long sys_ni_syscall /* 228 - reserved for lgetxattr */ +- .long sys_ni_syscall /* 229 - reserved for fgetxattr */ +- .long sys_ni_syscall /* 230 - reserved for listxattr */ +- .long sys_ni_syscall /* 231 - reserved for llistxattr */ +- .long sys_ni_syscall /* 232 - reserved for flistxattr */ +- .long sys_ni_syscall /* 233 - reserved for removexattr */ +- .long sys_ni_syscall /* 234 - reserved for lremovexattr */ +- .long sys_ni_syscall /* 235 - reserved for fremovexattr */ ++ .long sys_setxattr ++ .long sys_lsetxattr /* 225 */ ++ .long sys_fsetxattr ++ .long sys_getxattr ++ .long sys_lgetxattr ++ .long sys_fgetxattr ++ .long sys_listxattr /* 230 */ ++ .long sys_llistxattr ++ .long sys_flistxattr ++ .long sys_removexattr ++ .long sys_lremovexattr ++ .long sys_fremovexattr /* 235 */ + .long sys_gettid + .long sys_tkill + .rept 255-237 +--- linux-rh-2.4.20-8/arch/s390x/defconfig~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:11.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/s390x/defconfig 2003-05-07 17:34:25.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + # CONFIG_ISA is not set + # CONFIG_EISA is not set + # CONFIG_MCA is not set +--- linux-rh-2.4.20-8/arch/s390x/kernel/entry.S~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:11.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/s390x/kernel/entry.S 2003-05-07 17:34:25.000000000 +0800 +@@ -591,18 +591,18 @@ sys_call_table: + .long SYSCALL(sys_ni_syscall,sys32_fcntl64_wrapper) + .long SYSCALL(sys_ni_syscall,sys_ni_syscall) + .long 
SYSCALL(sys_ni_syscall,sys_ni_syscall) +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 224 - reserved for setxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 225 - reserved for lsetxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 226 - reserved for fsetxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 227 - reserved for getxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 228 - reserved for lgetxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 229 - reserved for fgetxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 230 - reserved for listxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 231 - reserved for llistxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 232 - reserved for flistxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 233 - reserved for removexattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 234 - reserved for lremovexattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 235 - reserved for fremovexattr */ ++ .long SYSCALL(sys_setxattr,sys32_setxattr_wrapper) ++ .long SYSCALL(sys_lsetxattr,sys32_lsetxattr_wrapper) /* 225 */ ++ .long SYSCALL(sys_fsetxattr,sys32_fsetxattr_wrapper) ++ .long SYSCALL(sys_getxattr,sys32_getxattr_wrapper) ++ .long SYSCALL(sys_lgetxattr,sys32_lgetxattr_wrapper) ++ .long SYSCALL(sys_fgetxattr,sys32_fgetxattr_wrapper) ++ .long SYSCALL(sys_listxattr,sys32_listxattr_wrapper) /* 230 */ ++ .long SYSCALL(sys_llistxattr,sys32_llistxattr_wrapper) ++ .long SYSCALL(sys_flistxattr,sys32_flistxattr_wrapper) ++ .long SYSCALL(sys_removexattr,sys32_removexattr_wrapper) ++ .long SYSCALL(sys_lremovexattr,sys32_lremovexattr_wrapper) ++ .long SYSCALL(sys_fremovexattr,sys32_fremovexattr_wrapper)/* 235 */ + .long SYSCALL(sys_gettid,sys_gettid) + .long SYSCALL(sys_tkill,sys_tkill) + .rept 255-237 +--- linux-rh-2.4.20-8/arch/s390x/kernel/wrapper32.S~linux-2.4.20-xattr-0.8.54-chaos 2002-02-26 03:37:56.000000000 +0800 ++++ 
linux-rh-2.4.20-8-root/arch/s390x/kernel/wrapper32.S 2003-05-07 17:34:25.000000000 +0800 +@@ -1091,3 +1091,95 @@ sys32_fstat64_wrapper: + llgtr %r3,%r3 # struct stat64 * + llgfr %r4,%r4 # long + jg sys32_fstat64 # branch to system call ++ ++ .globl sys32_setxattr_wrapper ++sys32_setxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ lgfr %r6,%r6 # int ++ jg sys_setxattr ++ ++ .globl sys32_lsetxattr_wrapper ++sys32_lsetxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ lgfr %r6,%r6 # int ++ jg sys_lsetxattr ++ ++ .globl sys32_fsetxattr_wrapper ++sys32_fsetxattr_wrapper: ++ lgfr %r2,%r2 # int ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ lgfr %r6,%r6 # int ++ jg sys_fsetxattr ++ ++ .globl sys32_getxattr_wrapper ++sys32_getxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ jg sys_getxattr ++ ++ .globl sys32_lgetxattr_wrapper ++sys32_lgetxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ jg sys_lgetxattr ++ ++ .globl sys32_fgetxattr_wrapper ++sys32_fgetxattr_wrapper: ++ lgfr %r2,%r2 # int ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ jg sys_fgetxattr ++ ++ .globl sys32_listxattr_wrapper ++sys32_listxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgfr %r4,%r4 # size_t ++ jg sys_listxattr ++ ++ .globl sys32_llistxattr_wrapper ++sys32_llistxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgfr %r4,%r4 # size_t ++ jg sys_llistxattr ++ ++ .globl sys32_flistxattr_wrapper ++sys32_flistxattr_wrapper: ++ lgfr %r2,%r2 # int ++ llgtr %r3,%r3 # char * ++ llgfr %r4,%r4 # size_t ++ jg sys_flistxattr ++ ++ .globl sys32_removexattr_wrapper ++sys32_removexattr_wrapper: ++ llgtr %r2,%r2 # char * 
++ llgtr %r3,%r3 # char * ++ jg sys_removexattr ++ ++ .globl sys32_lremovexattr_wrapper ++sys32_lremovexattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ jg sys_lremovexattr ++ ++ .globl sys32_fremovexattr_wrapper ++sys32_fremovexattr_wrapper: ++ lgfr %r2,%r2 # int ++ llgtr %r3,%r3 # char * ++ jg sys_fremovexattr ++ ++ +--- linux-rh-2.4.20-8/arch/sparc/defconfig~linux-2.4.20-xattr-0.8.54-chaos 2002-08-03 08:39:43.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/sparc/defconfig 2003-05-07 17:34:25.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_UID16=y + CONFIG_HIGHMEM=y + +--- linux-rh-2.4.20-8/arch/sparc/kernel/systbls.S~linux-2.4.20-xattr-0.8.54-chaos 2002-08-03 08:39:43.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/sparc/kernel/systbls.S 2003-05-07 17:34:25.000000000 +0800 +@@ -51,11 +51,11 @@ sys_call_table: + /*150*/ .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 + /*155*/ .long sys_fcntl64, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount + /*160*/ .long sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall +-/*165*/ .long sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall +-/*170*/ .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getdents +-/*175*/ .long sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall +-/*180*/ .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_sigpending, sys_query_module +-/*185*/ .long sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sys_newuname ++/*165*/ .long sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_setxattr 
++/*170*/ .long sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys_getdents ++/*175*/ .long sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr ++/*180*/ .long sys_flistxattr, sys_removexattr, sys_lremovexattr, sys_sigpending, sys_query_module ++/*185*/ .long sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sys_newuname + /*190*/ .long sys_init_module, sys_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall + /*195*/ .long sys_nis_syscall, sys_nis_syscall, sys_getppid, sparc_sigaction, sys_sgetmask + /*200*/ .long sys_ssetmask, sys_sigsuspend, sys_newlstat, sys_uselib, old_readdir +--- linux-rh-2.4.20-8/arch/sparc64/defconfig~linux-2.4.20-xattr-0.8.54-chaos 2003-04-11 14:04:43.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/sparc64/defconfig 2003-05-07 17:34:25.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + + # + # Code maturity level options +--- linux-rh-2.4.20-8/arch/sparc64/kernel/systbls.S~linux-2.4.20-xattr-0.8.54-chaos 2002-08-03 08:39:43.000000000 +0800 ++++ linux-rh-2.4.20-8-root/arch/sparc64/kernel/systbls.S 2003-05-07 17:34:25.000000000 +0800 +@@ -52,11 +52,11 @@ sys_call_table32: + /*150*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 + .word sys32_fcntl64, sys_nis_syscall, sys32_statfs, sys32_fstatfs, sys_oldumount + /*160*/ .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall +- .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_nis_syscall +-/*170*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_getdents +- .word sys_setsid, sys_fchdir, sys_nis_syscall, 
sys_nis_syscall, sys_nis_syscall +-/*180*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_sigpending, sys32_query_module +- .word sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sparc64_newuname ++ .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_setxattr ++/*170*/ .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys32_getdents ++ .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr ++/*180*/ .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys32_sigpending, sys32_query_module ++ .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sparc64_newuname + /*190*/ .word sys32_init_module, sparc64_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall + .word sys_nis_syscall, sys_nis_syscall, sys_getppid, sys32_sigaction, sys_sgetmask + /*200*/ .word sys_ssetmask, sys_sigsuspend, sys32_newlstat, sys_uselib, old32_readdir +@@ -111,11 +111,11 @@ sys_call_table: + /*150*/ .word sys_getsockname, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 + .word sys_nis_syscall, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount + /*160*/ .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_utrap_install +- .word sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall +-/*170*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getdents +- .word sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall +-/*180*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_query_module +- .word sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sparc64_newuname ++ .word sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_setxattr ++/*170*/ .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys_getdents ++ .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr ++/*180*/ .word sys_flistxattr, sys_removexattr, 
sys_lremovexattr, sys_nis_syscall, sys_query_module ++ .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sparc64_newuname + /*190*/ .word sys_init_module, sparc64_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall + .word sys_nis_syscall, sys_nis_syscall, sys_getppid, sys_nis_syscall, sys_sgetmask + /*200*/ .word sys_ssetmask, sys_nis_syscall, sys_newlstat, sys_uselib, sys_nis_syscall +--- linux-rh-2.4.20-8/fs/Config.in~linux-2.4.20-xattr-0.8.54-chaos 2003-04-11 14:05:03.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/Config.in 2003-05-07 17:34:25.000000000 +0800 +@@ -34,6 +34,11 @@ dep_mbool ' Debug Befs' CONFIG_BEFS_DEB + dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL + + tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS ++dep_mbool ' Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS ++dep_bool ' Ext3 extended attribute block sharing' \ ++ CONFIG_EXT3_FS_XATTR_SHARING $CONFIG_EXT3_FS_XATTR ++dep_bool ' Ext3 extended user attributes' \ ++ CONFIG_EXT3_FS_XATTR_USER $CONFIG_EXT3_FS_XATTR + # CONFIG_JBD could be its own option (even modular), but until there are + # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS + # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS +@@ -93,6 +98,11 @@ dep_mbool ' QNX4FS write support (DANGE + tristate 'ROM file system support' CONFIG_ROMFS_FS + + tristate 'Second extended fs support' CONFIG_EXT2_FS ++dep_mbool ' Ext2 extended attributes' CONFIG_EXT2_FS_XATTR $CONFIG_EXT2_FS ++dep_bool ' Ext2 extended attribute block sharing' \ ++ CONFIG_EXT2_FS_XATTR_SHARING $CONFIG_EXT2_FS_XATTR ++dep_bool ' Ext2 extended user attributes' \ ++ CONFIG_EXT2_FS_XATTR_USER $CONFIG_EXT2_FS_XATTR + + tristate 'System V/Xenix/V7/Coherent file system support' CONFIG_SYSV_FS + +@@ -164,6 +174,10 @@ else + define_tristate CONFIG_ZISOFS_FS n + fi + ++# Meta block cache for Extended Attributes 
(ext2/ext3) ++#tristate 'Meta block cache' CONFIG_FS_MBCACHE ++define_tristate CONFIG_FS_MBCACHE y ++ + mainmenu_option next_comment + comment 'Partition Types' + source fs/partitions/Config.in +--- linux-rh-2.4.20-8/fs/Makefile~linux-2.4.20-xattr-0.8.54-chaos 2003-05-07 17:33:58.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/Makefile 2003-05-07 17:34:25.000000000 +0800 +@@ -84,6 +84,9 @@ obj-y += binfmt_script.o + + obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o + ++export-objs += mbcache.o ++obj-$(CONFIG_FS_MBCACHE) += mbcache.o ++ + # persistent filesystems + obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) + +--- linux-rh-2.4.20-8/fs/ext2/Makefile~linux-2.4.20-xattr-0.8.54-chaos 2001-10-11 23:05:18.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext2/Makefile 2003-05-07 17:34:25.000000000 +0800 +@@ -13,4 +13,8 @@ obj-y := balloc.o bitmap.o dir.o file + ioctl.o namei.o super.o symlink.o + obj-m := $(O_TARGET) + ++export-objs += xattr.o ++obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o ++obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o ++ + include $(TOPDIR)/Rules.make +--- linux-rh-2.4.20-8/fs/ext2/file.c~linux-2.4.20-xattr-0.8.54-chaos 2001-10-11 23:05:18.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext2/file.c 2003-05-07 17:34:25.000000000 +0800 +@@ -20,6 +20,7 @@ + + #include + #include ++#include + #include + + /* +@@ -51,4 +52,8 @@ struct file_operations ext2_file_operati + + struct inode_operations ext2_file_inode_operations = { + truncate: ext2_truncate, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, + }; +--- linux-rh-2.4.20-8/fs/ext2/ialloc.c~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:15.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext2/ialloc.c 2003-05-07 17:34:25.000000000 +0800 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -167,6 +168,7 @@ void ext2_free_inode (struct inode * ino + */ + if (!is_bad_inode(inode)) { + /* Quota is already 
initialized in iput() */ ++ ext2_xattr_delete_inode(inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + } +--- linux-rh-2.4.20-8/fs/ext2/inode.c~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:15.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext2/inode.c 2003-05-07 17:34:25.000000000 +0800 +@@ -39,6 +39,18 @@ MODULE_LICENSE("GPL"); + static int ext2_update_inode(struct inode * inode, int do_sync); + + /* ++ * Test whether an inode is a fast symlink. ++ */ ++static inline int ext2_inode_is_fast_symlink(struct inode *inode) ++{ ++ int ea_blocks = inode->u.ext2_i.i_file_acl ? ++ (inode->i_sb->s_blocksize >> 9) : 0; ++ ++ return (S_ISLNK(inode->i_mode) && ++ inode->i_blocks - ea_blocks == 0); ++} ++ ++/* + * Called at each iput() + */ + void ext2_put_inode (struct inode * inode) +@@ -53,9 +65,7 @@ void ext2_delete_inode (struct inode * i + { + lock_kernel(); + +- if (is_bad_inode(inode) || +- inode->i_ino == EXT2_ACL_IDX_INO || +- inode->i_ino == EXT2_ACL_DATA_INO) ++ if (is_bad_inode(inode)) + goto no_delete; + inode->u.ext2_i.i_dtime = CURRENT_TIME; + mark_inode_dirty(inode); +@@ -801,6 +811,8 @@ void ext2_truncate (struct inode * inode + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; ++ if (ext2_inode_is_fast_symlink(inode)) ++ return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +@@ -888,8 +900,7 @@ void ext2_read_inode (struct inode * ino + unsigned long offset; + struct ext2_group_desc * gdp; + +- if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO && +- inode->i_ino != EXT2_ACL_DATA_INO && ++ if ((inode->i_ino != EXT2_ROOT_INO && + inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) { + ext2_error (inode->i_sb, "ext2_read_inode", +@@ -974,10 +985,7 @@ void ext2_read_inode (struct inode * ino + for (block = 0; block < EXT2_N_BLOCKS; block++) + inode->u.ext2_i.i_data[block] = raw_inode->i_block[block]; + +- 
if (inode->i_ino == EXT2_ACL_IDX_INO || +- inode->i_ino == EXT2_ACL_DATA_INO) +- /* Nothing to do */ ; +- else if (S_ISREG(inode->i_mode)) { ++ if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext2_file_inode_operations; + inode->i_fop = &ext2_file_operations; + inode->i_mapping->a_ops = &ext2_aops; +@@ -986,15 +994,17 @@ void ext2_read_inode (struct inode * ino + inode->i_fop = &ext2_dir_operations; + inode->i_mapping->a_ops = &ext2_aops; + } else if (S_ISLNK(inode->i_mode)) { +- if (!inode->i_blocks) ++ if (ext2_inode_is_fast_symlink(inode)) + inode->i_op = &ext2_fast_symlink_inode_operations; + else { +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext2_symlink_inode_operations; + inode->i_mapping->a_ops = &ext2_aops; + } +- } else ++ } else { ++ inode->i_op = &ext2_special_inode_operations; + init_special_inode(inode, inode->i_mode, + le32_to_cpu(raw_inode->i_block[0])); ++ } + brelse (bh); + inode->i_attr_flags = 0; + if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) { +--- linux-rh-2.4.20-8/fs/ext2/namei.c~linux-2.4.20-xattr-0.8.54-chaos 2001-10-04 13:57:36.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext2/namei.c 2003-05-07 17:34:25.000000000 +0800 +@@ -31,6 +31,7 @@ + + #include + #include ++#include + #include + + /* +@@ -136,7 +137,7 @@ static int ext2_symlink (struct inode * + + if (l > sizeof (inode->u.ext2_i.i_data)) { + /* slow symlink */ +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext2_symlink_inode_operations; + inode->i_mapping->a_ops = &ext2_aops; + err = block_symlink(inode, symname, l); + if (err) +@@ -345,4 +346,15 @@ struct inode_operations ext2_dir_inode_o + rmdir: ext2_rmdir, + mknod: ext2_mknod, + rename: ext2_rename, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, ++}; ++ ++struct inode_operations ext2_special_inode_operations = { ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: 
ext2_removexattr, + }; +--- linux-rh-2.4.20-8/fs/ext2/super.c~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:15.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext2/super.c 2003-05-07 17:34:25.000000000 +0800 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -125,6 +126,7 @@ void ext2_put_super (struct super_block + int db_count; + int i; + ++ ext2_xattr_put_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { + struct ext2_super_block *es = EXT2_SB(sb)->s_es; + +@@ -175,6 +177,13 @@ static int parse_options (char * options + this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; ++#ifdef CONFIG_EXT2_FS_XATTR_USER ++ if (!strcmp (this_char, "user_xattr")) ++ set_opt (*mount_options, XATTR_USER); ++ else if (!strcmp (this_char, "nouser_xattr")) ++ clear_opt (*mount_options, XATTR_USER); ++ else ++#endif + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -424,6 +433,9 @@ struct super_block * ext2_read_super (st + blocksize = BLOCK_SIZE; + + sb->u.ext2_sb.s_mount_opt = 0; ++#ifdef CONFIG_EXT2_FS_XATTR_USER ++ /* set_opt (sb->u.ext2_sb.s_mount_opt, XATTR_USER); */ ++#endif + if (!parse_options ((char *) data, &sb_block, &resuid, &resgid, + &sb->u.ext2_sb.s_mount_opt)) { + return NULL; +@@ -813,12 +825,27 @@ static DECLARE_FSTYPE_DEV(ext2_fs_type, + + static int __init init_ext2_fs(void) + { +- return register_filesystem(&ext2_fs_type); ++ int error = init_ext2_xattr(); ++ if (error) ++ return error; ++ error = init_ext2_xattr_user(); ++ if (error) ++ goto fail; ++ error = register_filesystem(&ext2_fs_type); ++ if (!error) ++ return 0; ++ ++ exit_ext2_xattr_user(); ++fail: ++ exit_ext2_xattr(); ++ return error; + } + + static void __exit exit_ext2_fs(void) + { + unregister_filesystem(&ext2_fs_type); ++ exit_ext2_xattr_user(); ++ exit_ext2_xattr(); + } + + EXPORT_NO_SYMBOLS; +--- 
linux-rh-2.4.20-8/fs/ext2/symlink.c~linux-2.4.20-xattr-0.8.54-chaos 2000-09-28 04:41:33.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext2/symlink.c 2003-05-07 17:34:25.000000000 +0800 +@@ -19,6 +19,7 @@ + + #include + #include ++#include + + static int ext2_readlink(struct dentry *dentry, char *buffer, int buflen) + { +@@ -32,7 +33,20 @@ static int ext2_follow_link(struct dentr + return vfs_follow_link(nd, s); + } + ++struct inode_operations ext2_symlink_inode_operations = { ++ readlink: page_readlink, ++ follow_link: page_follow_link, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, ++}; ++ + struct inode_operations ext2_fast_symlink_inode_operations = { + readlink: ext2_readlink, + follow_link: ext2_follow_link, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, + }; +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext2/xattr.c 2003-05-07 17:34:25.000000000 +0800 +@@ -0,0 +1,1212 @@ ++/* ++ * linux/fs/ext2/xattr.c ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ * ++ * Fix by Harrison Xing . ++ * Extended attributes for symlinks and special files added per ++ * suggestion of Luka Renko . ++ */ ++ ++/* ++ * Extended attributes are stored on disk blocks allocated outside of ++ * any inode. The i_file_acl field is then made to point to this allocated ++ * block. If all extended attributes of an inode are identical, these ++ * inodes may share the same extended attribute block. Such situations ++ * are automatically detected by keeping a cache of recent attribute block ++ * numbers and hashes over the block's contents in memory. ++ * ++ * ++ * Extended attribute block layout: ++ * ++ * +------------------+ ++ * | header | ++ * | entry 1 | | ++ * | entry 2 | | growing downwards ++ * | entry 3 | v ++ * | four null bytes | ++ * | . . . 
| ++ * | value 1 | ^ ++ * | value 3 | | growing upwards ++ * | value 2 | | ++ * +------------------+ ++ * ++ * The block header is followed by multiple entry descriptors. These entry ++ * descriptors are variable in size, and aligned to EXT2_XATTR_PAD ++ * byte boundaries. The entry descriptors are sorted by attribute name, ++ * so that two extended attribute blocks can be compared efficiently. ++ * ++ * Attribute values are aligned to the end of the block, stored in ++ * no specific order. They are also padded to EXT2_XATTR_PAD byte ++ * boundaries. No additional gaps are left between them. ++ * ++ * Locking strategy ++ * ---------------- ++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of ++ * the xattr inode operations are called, so we are guaranteed that only one ++ * process accesses extended attributes of an inode at any time. ++ * ++ * For writing we also grab the ext2_xattr_sem semaphore. This ensures that ++ * only a single process is modifying an extended attribute block, even ++ * if the block is shared among inodes. ++ * ++ * Note for porting to 2.5 ++ * ----------------------- ++ * The BKL will no longer be held in the xattr inode operations. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* These symbols may be needed by a module. */ ++EXPORT_SYMBOL(ext2_xattr_register); ++EXPORT_SYMBOL(ext2_xattr_unregister); ++EXPORT_SYMBOL(ext2_xattr_get); ++EXPORT_SYMBOL(ext2_xattr_list); ++EXPORT_SYMBOL(ext2_xattr_set); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1) ++#endif ++ ++#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data)) ++#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr)) ++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) ++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) ++ ++#ifdef EXT2_XATTR_DEBUG ++# define ea_idebug(inode, f...)
do { \ ++ printk(KERN_DEBUG "inode %s:%ld: ", \ ++ kdevname(inode->i_dev), inode->i_ino); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++# define ea_bdebug(bh, f...) do { \ ++ printk(KERN_DEBUG "block %s:%ld: ", \ ++ kdevname(bh->b_dev), bh->b_blocknr); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++#else ++# define ea_idebug(f...) ++# define ea_bdebug(f...) ++#endif ++ ++static int ext2_xattr_set2(struct inode *, struct buffer_head *, ++ struct ext2_xattr_header *); ++ ++#ifdef CONFIG_EXT2_FS_XATTR_SHARING ++ ++static int ext2_xattr_cache_insert(struct buffer_head *); ++static struct buffer_head *ext2_xattr_cache_find(struct inode *, ++ struct ext2_xattr_header *); ++static void ext2_xattr_cache_remove(struct buffer_head *); ++static void ext2_xattr_rehash(struct ext2_xattr_header *, ++ struct ext2_xattr_entry *); ++ ++static struct mb_cache *ext2_xattr_cache; ++ ++#else ++# define ext2_xattr_cache_insert(bh) 0 ++# define ext2_xattr_cache_find(inode, header) NULL ++# define ext2_xattr_cache_remove(bh) while(0) {} ++# define ext2_xattr_rehash(header, entry) while(0) {} ++#endif ++ ++/* ++ * If a file system does not share extended attributes among inodes, ++ * we should not need the ext2_xattr_sem semaphore. However, the ++ * filesystem may still contain shared blocks, so we always take ++ * the lock. ++ */ ++ ++DECLARE_MUTEX(ext2_xattr_sem); ++ ++static inline int ++ext2_xattr_new_block(struct inode *inode, int * errp, int force) ++{ ++ struct super_block *sb = inode->i_sb; ++ int goal = le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block) + ++ EXT2_I(inode)->i_block_group * EXT2_BLOCKS_PER_GROUP(sb); ++ ++ /* How can we enforce the allocation? */ ++ int block = ext2_new_block(inode, goal, 0, 0, errp); ++#ifdef OLD_QUOTAS ++ if (!*errp) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#endif ++ return block; ++} ++ ++static inline int ++ext2_xattr_quota_alloc(struct inode *inode, int force) ++{ ++ /* How can we enforce the allocation? 
*/ ++#ifdef OLD_QUOTAS ++ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1); ++ if (!error) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#else ++ int error = DQUOT_ALLOC_BLOCK(inode, 1); ++#endif ++ return error; ++} ++ ++#ifdef OLD_QUOTAS ++ ++static inline void ++ext2_xattr_quota_free(struct inode *inode) ++{ ++ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1); ++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; ++} ++ ++static inline void ++ext2_xattr_free_block(struct inode * inode, unsigned long block) ++{ ++ ext2_free_blocks(inode, block, 1); ++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; ++} ++ ++#else ++# define ext2_xattr_quota_free(inode) \ ++ DQUOT_FREE_BLOCK(inode, 1) ++# define ext2_xattr_free_block(inode, block) \ ++ ext2_free_blocks(inode, block, 1) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) ++ ++static inline struct buffer_head * ++sb_bread(struct super_block *sb, int block) ++{ ++ return bread(sb->s_dev, block, sb->s_blocksize); ++} ++ ++static inline struct buffer_head * ++sb_getblk(struct super_block *sb, int block) ++{ ++ return getblk(sb->s_dev, block, sb->s_blocksize); ++} ++ ++#endif ++ ++struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX]; ++rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED; ++ ++int ++ext2_xattr_register(int name_index, struct ext2_xattr_handler *handler) ++{ ++ int error = -EINVAL; ++ ++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { ++ write_lock(&ext2_handler_lock); ++ if (!ext2_xattr_handlers[name_index-1]) { ++ ext2_xattr_handlers[name_index-1] = handler; ++ error = 0; ++ } ++ write_unlock(&ext2_handler_lock); ++ } ++ return error; ++} ++ ++void ++ext2_xattr_unregister(int name_index, struct ext2_xattr_handler *handler) ++{ ++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { /* both bounds must hold: the slot is name_index-1 */ ++ write_lock(&ext2_handler_lock); ++ ext2_xattr_handlers[name_index-1] = NULL; ++ write_unlock(&ext2_handler_lock); ++ } ++} ++ ++static inline const char * ++strcmp_prefix(const
char *a, const char *a_prefix) ++{ ++ while (*a_prefix && *a == *a_prefix) { ++ a++; ++ a_prefix++; ++ } ++ return *a_prefix ? NULL : a; ++} ++ ++/* ++ * Decode the extended attribute name, and translate it into ++ * the name_index and name suffix. ++ */ ++static struct ext2_xattr_handler * ++ext2_xattr_resolve_name(const char **name) ++{ ++ struct ext2_xattr_handler *handler = NULL; ++ int i; ++ ++ if (!*name) ++ return NULL; ++ read_lock(&ext2_handler_lock); ++ for (i=0; i<EXT2_XATTR_INDEX_MAX; i++) { ++ if (ext2_xattr_handlers[i]) { ++ const char *n = strcmp_prefix(*name, ++ ext2_xattr_handlers[i]->prefix); ++ if (n) { ++ handler = ext2_xattr_handlers[i]; ++ *name = n; ++ break; ++ } ++ } ++ } ++ read_unlock(&ext2_handler_lock); ++ return handler; ++} ++ ++static inline struct ext2_xattr_handler * ++ext2_xattr_handler(int name_index) ++{ ++ struct ext2_xattr_handler *handler = NULL; ++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { ++ read_lock(&ext2_handler_lock); ++ handler = ext2_xattr_handlers[name_index-1]; ++ read_unlock(&ext2_handler_lock); ++ } ++ return handler; ++} ++ ++/* ++ * Inode operation getxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++ssize_t ++ext2_getxattr(struct dentry *dentry, const char *name, ++ void *buffer, size_t size) ++{ ++ struct ext2_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext2_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->get(inode, name, buffer, size); ++} ++ ++/* ++ * Inode operation listxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++ssize_t ++ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) ++{ ++ return ext2_xattr_list(dentry->d_inode, buffer, size); ++} ++ ++/* ++ * Inode operation setxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext2_setxattr(struct dentry *dentry, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ struct ext2_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ if (size == 0)
++ value = ""; /* empty EA, do not remove */ ++ handler = ext2_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, value, size, flags); ++} ++ ++/* ++ * Inode operation removexattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext2_removexattr(struct dentry *dentry, const char *name) ++{ ++ struct ext2_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext2_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, NULL, 0, XATTR_REPLACE); ++} ++ ++/* ++ * ext2_xattr_get() ++ * ++ * Copy an extended attribute into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. ++ */ ++int ++ext2_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext2_xattr_entry *entry; ++ unsigned int block, size; ++ char *end; ++ int name_len, error; ++ ++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", ++ name_index, name, buffer, (long)buffer_size); ++ ++ if (name == NULL) ++ return -EINVAL; ++ if (!EXT2_I(inode)->i_file_acl) ++ return -ENOATTR; ++ block = EXT2_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* find named attribute */ ++ name_len = strlen(name); ++ ++ 
error = -ERANGE; ++ if (name_len > 255) ++ goto cleanup; ++ entry = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext2_xattr_entry *next = ++ EXT2_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (name_index == entry->e_name_index && ++ name_len == entry->e_name_len && ++ memcmp(name, entry->e_name, name_len) == 0) ++ goto found; ++ entry = next; ++ } ++ /* Check the remaining name entries */ ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext2_xattr_entry *next = ++ EXT2_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ entry = next; ++ } ++ if (ext2_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ error = -ENOATTR; ++ goto cleanup; ++found: ++ /* check the buffer size */ ++ if (entry->e_value_block != 0) ++ goto bad_block; ++ size = le32_to_cpu(entry->e_value_size); ++ if (size > inode->i_sb->s_blocksize || ++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) ++ goto bad_block; ++ ++ if (ext2_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (buffer) { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ /* return value of attribute */ ++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), ++ size); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * ext2_xattr_list() ++ * ++ * Copy a list of attribute names into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. 
++ */ ++int ++ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext2_xattr_entry *entry; ++ unsigned int block, size = 0; ++ char *buf, *end; ++ int error; ++ ++ ea_idebug(inode, "buffer=%p, buffer_size=%ld", ++ buffer, (long)buffer_size); ++ ++ if (!EXT2_I(inode)->i_file_acl) ++ return 0; ++ block = EXT2_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* compute the size required for the list of attribute names */ ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT2_XATTR_NEXT(entry)) { ++ struct ext2_xattr_handler *handler; ++ struct ext2_xattr_entry *next = ++ EXT2_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ ++ handler = ext2_xattr_handler(entry->e_name_index); ++ if (handler) ++ size += handler->list(NULL, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ ++ if (ext2_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (!buffer) { ++ error = size; ++ goto cleanup; ++ } else { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ } ++ ++ /* list the attribute names */ ++ buf = buffer; ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT2_XATTR_NEXT(entry)) { ++ struct ext2_xattr_handler *handler; ++ ++ handler = ext2_xattr_handler(entry->e_name_index); ++ if (handler) ++ buf += handler->list(buf, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* 
++ * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is ++ * not set, set it. ++ */ ++static void ext2_xattr_update_super_block(struct super_block *sb) ++{ ++ if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR)) ++ return; ++ ++ lock_super(sb); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++ EXT2_SB(sb)->s_feature_compat |= EXT2_FEATURE_COMPAT_EXT_ATTR; ++#endif ++ EXT2_SB(sb)->s_es->s_feature_compat |= ++ cpu_to_le32(EXT2_FEATURE_COMPAT_EXT_ATTR); ++ sb->s_dirt = 1; ++ mark_buffer_dirty(EXT2_SB(sb)->s_sbh); ++ unlock_super(sb); ++} ++ ++/* ++ * ext2_xattr_set() ++ * ++ * Create, replace or remove an extended attribute for this inode. Buffer ++ * is NULL to remove an existing extended attribute, and non-NULL to ++ * either replace an existing extended attribute, or create a new extended ++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE ++ * specify that an extended attribute must exist and must not exist ++ * previous to the call, respectively. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++int ++ext2_xattr_set(struct inode *inode, int name_index, const char *name, ++ const void *value, size_t value_len, int flags) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *bh = NULL; ++ struct ext2_xattr_header *header = NULL; ++ struct ext2_xattr_entry *here, *last; ++ unsigned int name_len; ++ int block = EXT2_I(inode)->i_file_acl; ++ int min_offs = sb->s_blocksize, not_found = 1, free, error; ++ char *end; ++ ++ /* ++ * header -- Points either into bh, or to a temporarily ++ * allocated buffer. ++ * here -- The named entry found, or the place for inserting, within ++ * the block pointed to by header. ++ * last -- Points right after the last named entry within the block ++ * pointed to by header. ++ * min_offs -- The offset of the first value (values are aligned ++ * towards the end of the block). ++ * end -- Points right after the block pointed to by header. 
++ */ ++ ++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", ++ name_index, name, value, (long)value_len); ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) ++ return -EPERM; ++ if (value == NULL) ++ value_len = 0; ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255 || value_len > sb->s_blocksize) ++ return -ERANGE; ++ down(&ext2_xattr_sem); ++ ++ if (block) { ++ /* The inode already has an extended attribute block. */ ++ ++ bh = sb_bread(sb, block); ++ error = -EIO; ++ if (!bh) ++ goto cleanup; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), ++ le32_to_cpu(HDR(bh)->h_refcount)); ++ header = HDR(bh); ++ end = bh->b_data + bh->b_size; ++ if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ header->h_blocks != cpu_to_le32(1)) { ++bad_block: ext2_error(sb, "ext2_xattr_set", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* Find the named attribute. */ ++ here = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(here)) { ++ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!here->e_value_block && here->e_value_size) { ++ int offs = le16_to_cpu(here->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ not_found = name_index - here->e_name_index; ++ if (!not_found) ++ not_found = name_len - here->e_name_len; ++ if (!not_found) ++ not_found = memcmp(name, here->e_name,name_len); ++ if (not_found <= 0) ++ break; ++ here = next; ++ } ++ last = here; ++ /* We still need to compute min_offs and last. 
*/ ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!last->e_value_block && last->e_value_size) { ++ int offs = le16_to_cpu(last->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ last = next; ++ } ++ ++ /* Check whether we have enough space left. */ ++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); ++ } else { ++ /* We will use a new extended attribute block. */ ++ free = sb->s_blocksize - ++ sizeof(struct ext2_xattr_header) - sizeof(__u32); ++ here = last = NULL; /* avoid gcc uninitialized warning. */ ++ } ++ ++ if (not_found) { ++ /* Request to remove a nonexistent attribute? */ ++ error = -ENOATTR; ++ if (flags & XATTR_REPLACE) ++ goto cleanup; ++ error = 0; ++ if (value == NULL) ++ goto cleanup; ++ else ++ free -= EXT2_XATTR_LEN(name_len); ++ } else { ++ /* Request to create an existing attribute? */ ++ error = -EEXIST; ++ if (flags & XATTR_CREATE) ++ goto cleanup; ++ if (!here->e_value_block && here->e_value_size) { ++ unsigned int size = le32_to_cpu(here->e_value_size); ++ ++ if (le16_to_cpu(here->e_value_offs) + size > ++ sb->s_blocksize || size > sb->s_blocksize) ++ goto bad_block; ++ free += EXT2_XATTR_SIZE(size); ++ } ++ } ++ free -= EXT2_XATTR_SIZE(value_len); ++ error = -ENOSPC; ++ if (free < 0) ++ goto cleanup; ++ ++ /* Here we know that we can set the new attribute. 
*/ ++ ++ if (header) { ++ if (header->h_refcount == cpu_to_le32(1)) { ++ ea_bdebug(bh, "modifying in-place"); ++ ext2_xattr_cache_remove(bh); ++ } else { ++ long offset; /* pointer delta: must be pointer-sized, int truncates on 64-bit */ ++ ++ ea_bdebug(bh, "cloning"); ++ header = kmalloc(bh->b_size, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memcpy(header, HDR(bh), bh->b_size); ++ header->h_refcount = cpu_to_le32(1); ++ offset = (char *)header - bh->b_data; ++ here = ENTRY((char *)here + offset); ++ last = ENTRY((char *)last + offset); ++ } ++ } else { ++ /* Allocate a buffer where we construct the new block. */ ++ header = kmalloc(sb->s_blocksize, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memset(header, 0, sb->s_blocksize); ++ end = (char *)header + sb->s_blocksize; ++ header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC); ++ header->h_blocks = header->h_refcount = cpu_to_le32(1); ++ last = here = ENTRY(header+1); ++ } ++ ++ if (not_found) { ++ /* Insert the new name. */ ++ int size = EXT2_XATTR_LEN(name_len); ++ int rest = (char *)last - (char *)here; ++ memmove((char *)here + size, here, rest); ++ memset(here, 0, size); ++ here->e_name_index = name_index; ++ here->e_name_len = name_len; ++ memcpy(here->e_name, name, name_len); ++ } else { ++ /* Remove the old value. */ ++ if (!here->e_value_block && here->e_value_size) { ++ char *first_val = (char *)header + min_offs; ++ int offs = le16_to_cpu(here->e_value_offs); ++ char *val = (char *)header + offs; ++ size_t size = EXT2_XATTR_SIZE( ++ le32_to_cpu(here->e_value_size)); ++ memmove(first_val + size, first_val, val - first_val); ++ memset(first_val, 0, size); ++ here->e_value_offs = 0; ++ min_offs += size; ++ ++ /* Adjust all value offsets.
*/ ++ last = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(last)) { ++ int o = le16_to_cpu(last->e_value_offs); ++ if (!last->e_value_block && o < offs) ++ last->e_value_offs = ++ cpu_to_le16(o + size); ++ last = EXT2_XATTR_NEXT(last); ++ } ++ } ++ if (value == NULL) { ++ /* Remove this attribute. */ ++ if (EXT2_XATTR_NEXT(ENTRY(header+1)) == last) { ++ /* This block is now empty. */ ++ error = ext2_xattr_set2(inode, bh, NULL); ++ goto cleanup; ++ } else { ++ /* Remove the old name. */ ++ int size = EXT2_XATTR_LEN(name_len); ++ last = ENTRY((char *)last - size); ++ memmove(here, (char*)here + size, ++ (char*)last - (char*)here); ++ memset(last, 0, size); ++ } ++ } ++ } ++ ++ if (value != NULL) { ++ /* Insert the new value. */ ++ here->e_value_size = cpu_to_le32(value_len); ++ if (value_len) { ++ size_t size = EXT2_XATTR_SIZE(value_len); ++ char *val = (char *)header + min_offs - size; ++ here->e_value_offs = ++ cpu_to_le16((char *)val - (char *)header); ++ memset(val + size - EXT2_XATTR_PAD, 0, ++ EXT2_XATTR_PAD); /* Clear the pad bytes. */ ++ memcpy(val, value, value_len); ++ } ++ } ++ ext2_xattr_rehash(header, here); ++ ++ error = ext2_xattr_set2(inode, bh, header); ++ ++cleanup: ++ brelse(bh); ++ if (!(bh && header == HDR(bh))) ++ kfree(header); ++ up(&ext2_xattr_sem); ++ ++ return error; ++} ++ ++/* ++ * Second half of ext2_xattr_set(): Update the file system. ++ */ ++static int ++ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, ++ struct ext2_xattr_header *header) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *new_bh = NULL; ++ int error; ++ ++ if (header) { ++ new_bh = ext2_xattr_cache_find(inode, header); ++ if (new_bh) { ++ /* ++ * We found an identical block in the cache. ++ * The old block will be released after updating ++ * the inode. 
++ */ ++ ea_bdebug(old_bh, "reusing block %ld", ++ new_bh->b_blocknr); ++ ++ error = -EDQUOT; ++ if (ext2_xattr_quota_alloc(inode, 1)) ++ goto cleanup; ++ ++ HDR(new_bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); ++ ea_bdebug(new_bh, "refcount now=%d", ++ le32_to_cpu(HDR(new_bh)->h_refcount)); ++ } else if (old_bh && header == HDR(old_bh)) { ++ /* Keep this block. */ ++ new_bh = old_bh; ++ ext2_xattr_cache_insert(new_bh); ++ } else { ++ /* We need to allocate a new block */ ++ int force = EXT2_I(inode)->i_file_acl != 0; ++ int block = ext2_xattr_new_block(inode, &error, force); ++ if (error) ++ goto cleanup; ++ ea_idebug(inode, "creating block %d", block); ++ ++ new_bh = sb_getblk(sb, block); ++ if (!new_bh) { ++ ext2_xattr_free_block(inode, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(new_bh); ++ memcpy(new_bh->b_data, header, new_bh->b_size); ++ mark_buffer_uptodate(new_bh, 1); ++ unlock_buffer(new_bh); ++ ext2_xattr_cache_insert(new_bh); ++ ++ ext2_xattr_update_super_block(sb); ++ } ++ mark_buffer_dirty(new_bh); ++ if (IS_SYNC(inode)) { ++ ll_rw_block(WRITE, 1, &new_bh); ++ wait_on_buffer(new_bh); ++ error = -EIO; ++ if (buffer_req(new_bh) && !buffer_uptodate(new_bh)) ++ goto cleanup; ++ } ++ } ++ ++ /* Update the inode. */ ++ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; ++ inode->i_ctime = CURRENT_TIME; ++ if (IS_SYNC(inode)) { ++ error = ext2_sync_inode (inode); ++ if (error) ++ goto cleanup; ++ } else ++ mark_inode_dirty(inode); ++ ++ error = 0; ++ if (old_bh && old_bh != new_bh) { ++ /* ++ * If there was an old block, and we are not still using it, ++ * we now release the old block. ++ */ ++ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); ++ ++ if (refcount == 1) { ++ /* Free the old block. */ ++ ea_bdebug(old_bh, "freeing"); ++ ext2_xattr_free_block(inode, old_bh->b_blocknr); ++ mark_buffer_clean(old_bh); ++ } else { ++ /* Decrement the refcount only. 
*/ ++ refcount--; ++ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); ++ ext2_xattr_quota_free(inode); ++ mark_buffer_dirty(old_bh); ++ ea_bdebug(old_bh, "refcount now=%d", refcount); ++ } ++ } ++ ++cleanup: ++ if (old_bh != new_bh) ++ brelse(new_bh); ++ ++ return error; ++} ++ ++/* ++ * ext2_xattr_delete_inode() ++ * ++ * Free extended attribute resources associated with this inode. This ++ * is called immediately before an inode is freed. ++ */ ++void ++ext2_xattr_delete_inode(struct inode *inode) ++{ ++ struct buffer_head *bh; ++ unsigned int block = EXT2_I(inode)->i_file_acl; ++ ++ if (!block) ++ return; ++ down(&ext2_xattr_sem); ++ ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) { ++ ext2_error(inode->i_sb, "ext2_xattr_delete_inode", ++ "inode %ld: block %d read error", inode->i_ino, block); ++ goto cleanup; ++ } ++ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++ ext2_error(inode->i_sb, "ext2_xattr_delete_inode", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ goto cleanup; ++ } ++ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { ++ ext2_xattr_cache_remove(bh); ++ ext2_xattr_free_block(inode, block); ++ bforget(bh); ++ bh = NULL; ++ } else { ++ HDR(bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ mark_buffer_dirty(bh); ++ if (IS_SYNC(inode)) { ++ ll_rw_block(WRITE, 1, &bh); ++ wait_on_buffer(bh); ++ } ++ ext2_xattr_quota_free(inode); ++ } ++ EXT2_I(inode)->i_file_acl = 0; ++ ++cleanup: ++ brelse(bh); ++ up(&ext2_xattr_sem); ++} ++ ++/* ++ * ext2_xattr_put_super() ++ * ++ * This is called when a file system is unmounted. 
++ */ ++void ++ext2_xattr_put_super(struct super_block *sb) ++{ ++#ifdef CONFIG_EXT2_FS_XATTR_SHARING ++ mb_cache_shrink(ext2_xattr_cache, sb->s_dev); ++#endif ++} ++ ++#ifdef CONFIG_EXT2_FS_XATTR_SHARING ++ ++/* ++ * ext2_xattr_cache_insert() ++ * ++ * Create a new entry in the extended attribute cache, and insert ++ * it unless such an entry is already in the cache. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++static int ++ext2_xattr_cache_insert(struct buffer_head *bh) ++{ ++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); ++ struct mb_cache_entry *ce; ++ int error; ++ ++ ce = mb_cache_entry_alloc(ext2_xattr_cache); ++ if (!ce) ++ return -ENOMEM; ++ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash); ++ if (error) { ++ mb_cache_entry_free(ce); ++ if (error == -EBUSY) { ++ ea_bdebug(bh, "already in cache (%d cache entries)", ++ atomic_read(&ext2_xattr_cache->c_entry_count)); ++ error = 0; ++ } ++ } else { ++ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, ++ atomic_read(&ext2_xattr_cache->c_entry_count)); ++ mb_cache_entry_release(ce); ++ } ++ return error; ++} ++ ++/* ++ * ext2_xattr_cmp() ++ * ++ * Compare two extended attribute blocks for equality. ++ * ++ * Returns 0 if the blocks are equal, 1 if they differ, and ++ * a negative error number on errors. 
++ */ ++static int ++ext2_xattr_cmp(struct ext2_xattr_header *header1, ++ struct ext2_xattr_header *header2) ++{ ++ struct ext2_xattr_entry *entry1, *entry2; ++ ++ entry1 = ENTRY(header1+1); ++ entry2 = ENTRY(header2+1); ++ while (!IS_LAST_ENTRY(entry1)) { ++ if (IS_LAST_ENTRY(entry2)) ++ return 1; ++ if (entry1->e_hash != entry2->e_hash || ++ entry1->e_name_len != entry2->e_name_len || ++ entry1->e_value_size != entry2->e_value_size || ++ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) ++ return 1; ++ if (entry1->e_value_block != 0 || entry2->e_value_block != 0) ++ return -EIO; ++ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), ++ (char *)header2 + le16_to_cpu(entry2->e_value_offs), ++ le32_to_cpu(entry1->e_value_size))) ++ return 1; ++ ++ entry1 = EXT2_XATTR_NEXT(entry1); ++ entry2 = EXT2_XATTR_NEXT(entry2); ++ } ++ if (!IS_LAST_ENTRY(entry2)) ++ return 1; ++ return 0; ++} ++ ++/* ++ * ext2_xattr_cache_find() ++ * ++ * Find an identical extended attribute block. ++ * ++ * Returns a pointer to the block found, or NULL if such a block was ++ * not found or an error occurred. 
++ */ ++static struct buffer_head * ++ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) ++{ ++ __u32 hash = le32_to_cpu(header->h_hash); ++ struct mb_cache_entry *ce; ++ ++ if (!header->h_hash) ++ return NULL; /* never share */ ++ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ++ ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, inode->i_dev, hash); ++ while (ce) { ++ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); ++ ++ if (!bh) { ++ ext2_error(inode->i_sb, "ext2_xattr_cache_find", ++ "inode %ld: block %ld read error", ++ inode->i_ino, ce->e_block); ++ } else if (le32_to_cpu(HDR(bh)->h_refcount) > ++ EXT2_XATTR_REFCOUNT_MAX) { ++ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block, ++ le32_to_cpu(HDR(bh)->h_refcount), ++ EXT2_XATTR_REFCOUNT_MAX); ++ } else if (!ext2_xattr_cmp(header, HDR(bh))) { ++ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); ++ mb_cache_entry_release(ce); ++ return bh; ++ } ++ brelse(bh); ++ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash); ++ } ++ return NULL; ++} ++ ++/* ++ * ext2_xattr_cache_remove() ++ * ++ * Remove the cache entry of a block from the cache. Called when a ++ * block becomes invalid. ++ */ ++static void ++ext2_xattr_cache_remove(struct buffer_head *bh) ++{ ++ struct mb_cache_entry *ce; ++ ++ ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_dev, bh->b_blocknr); ++ if (ce) { ++ ea_bdebug(bh, "removing (%d cache entries remaining)", ++ atomic_read(&ext2_xattr_cache->c_entry_count)-1); ++ mb_cache_entry_free(ce); ++ } else ++ ea_bdebug(bh, "no cache entry"); ++} ++ ++#define NAME_HASH_SHIFT 5 ++#define VALUE_HASH_SHIFT 16 ++ ++/* ++ * ext2_xattr_hash_entry() ++ * ++ * Compute the hash of an extended attribute. 
++ */ ++static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header, ++ struct ext2_xattr_entry *entry) ++{ ++ __u32 hash = 0; ++ char *name = entry->e_name; ++ int n; ++ ++ for (n=0; n < entry->e_name_len; n++) { ++ hash = (hash << NAME_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ ++ *name++; ++ } ++ ++ if (entry->e_value_block == 0 && entry->e_value_size != 0) { ++ __u32 *value = (__u32 *)((char *)header + ++ le16_to_cpu(entry->e_value_offs)); ++ for (n = (le32_to_cpu(entry->e_value_size) + ++ EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) { ++ hash = (hash << VALUE_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ ++ le32_to_cpu(*value++); ++ } ++ } ++ entry->e_hash = cpu_to_le32(hash); ++} ++ ++#undef NAME_HASH_SHIFT ++#undef VALUE_HASH_SHIFT ++ ++#define BLOCK_HASH_SHIFT 16 ++ ++/* ++ * ext2_xattr_rehash() ++ * ++ * Re-compute the extended attribute hash value after an entry has changed. ++ */ ++static void ext2_xattr_rehash(struct ext2_xattr_header *header, ++ struct ext2_xattr_entry *entry) ++{ ++ struct ext2_xattr_entry *here; ++ __u32 hash = 0; ++ ++ ext2_xattr_hash_entry(header, entry); ++ here = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(here)) { ++ if (!here->e_hash) { ++ /* Block is not shared if an entry's hash value == 0 */ ++ hash = 0; ++ break; ++ } ++ hash = (hash << BLOCK_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ ++ le32_to_cpu(here->e_hash); ++ here = EXT2_XATTR_NEXT(here); ++ } ++ header->h_hash = cpu_to_le32(hash); ++} ++ ++#undef BLOCK_HASH_SHIFT ++ ++int __init ++init_ext2_xattr(void) ++{ ++ ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL, ++ sizeof(struct mb_cache_entry) + ++ sizeof(struct mb_cache_entry_index), 1, 61); ++ if (!ext2_xattr_cache) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void ++exit_ext2_xattr(void) ++{ ++ mb_cache_destroy(ext2_xattr_cache); ++} ++ ++#else /* CONFIG_EXT2_FS_XATTR_SHARING */ ++ ++int __init ++init_ext2_xattr(void) ++{ ++ return 
0; ++} ++ ++void ++exit_ext2_xattr(void) ++{ ++} ++ ++#endif /* CONFIG_EXT2_FS_XATTR_SHARING */ +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext2/xattr_user.c 2003-05-07 17:34:25.000000000 +0800 +@@ -0,0 +1,103 @@ ++/* ++ * linux/fs/ext2/xattr_user.c ++ * Handler for extended user attributes. ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_EXT2_FS_POSIX_ACL ++# include ++#endif ++ ++#define XATTR_USER_PREFIX "user." ++ ++static size_t ++ext2_xattr_user_list(char *list, struct inode *inode, ++ const char *name, int name_len) ++{ ++ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1; ++ ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return 0; ++ ++ if (list) { ++ memcpy(list, XATTR_USER_PREFIX, prefix_len); ++ memcpy(list+prefix_len, name, name_len); ++ list[prefix_len + name_len] = '\0'; ++ } ++ return prefix_len + name_len + 1; ++} ++ ++static int ++ext2_xattr_user_get(struct inode *inode, const char *name, ++ void *buffer, size_t size) ++{ ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++#ifdef CONFIG_EXT2_FS_POSIX_ACL ++ error = ext2_permission_locked(inode, MAY_READ); ++#else ++ error = permission(inode, MAY_READ); ++#endif ++ if (error) ++ return error; ++ ++ return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, ++ buffer, size); ++} ++ ++static int ++ext2_xattr_user_set(struct inode *inode, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++ if ( !S_ISREG(inode->i_mode) && ++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) ++ return -EPERM; ++#ifdef CONFIG_EXT2_FS_POSIX_ACL ++ error = ext2_permission_locked(inode, MAY_WRITE); ++#else ++ error = permission(inode, MAY_WRITE); ++#endif ++ if (error) ++ 
return error; ++ ++ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, ++ value, size, flags); ++} ++ ++struct ext2_xattr_handler ext2_xattr_user_handler = { ++ prefix: XATTR_USER_PREFIX, ++ list: ext2_xattr_user_list, ++ get: ext2_xattr_user_get, ++ set: ext2_xattr_user_set, ++}; ++ ++int __init ++init_ext2_xattr_user(void) ++{ ++ return ext2_xattr_register(EXT2_XATTR_INDEX_USER, ++ &ext2_xattr_user_handler); ++} ++ ++void ++exit_ext2_xattr_user(void) ++{ ++ ext2_xattr_unregister(EXT2_XATTR_INDEX_USER, ++ &ext2_xattr_user_handler); ++} +--- linux-rh-2.4.20-8/fs/ext3/Makefile~linux-2.4.20-xattr-0.8.54-chaos 2003-05-07 17:33:59.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext3/Makefile 2003-05-07 17:45:13.000000000 +0800 +@@ -1,5 +1,5 @@ + # +-# Makefile for the linux ext2-filesystem routines. ++# Makefile for the linux ext3-filesystem routines. + # + # Note! Dependencies are done automagically by 'make dep', which also + # removes any old dependencies. DON'T put your own dependencies here +@@ -9,10 +9,14 @@ + + O_TARGET := ext3.o + +-export-objs := super.o inode.o ++export-objs := ext3-exports.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o hash.o ++ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o + obj-m := $(O_TARGET) + ++export-objs += xattr.o ++obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o ++obj-$(CONFIG_EXT3_FS_XATTR_USER) += xattr_user.o ++ + include $(TOPDIR)/Rules.make +--- linux-rh-2.4.20-8/fs/ext3/file.c~linux-2.4.20-xattr-0.8.54-chaos 2003-05-07 17:33:59.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext3/file.c 2003-05-07 17:34:25.000000000 +0800 +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -126,5 +127,9 @@ struct file_operations ext3_file_operati + struct inode_operations ext3_file_inode_operations = { + truncate: ext3_truncate, /* BKL held */ + setattr: ext3_setattr, /* BKL held */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ 
getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ + }; + +--- linux-rh-2.4.20-8/fs/ext3/ialloc.c~linux-2.4.20-xattr-0.8.54-chaos 2003-04-11 14:04:48.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext3/ialloc.c 2003-05-07 17:34:25.000000000 +0800 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -216,6 +217,7 @@ void ext3_free_inode (handle_t *handle, + * as writing the quota to disk may need the lock as well. + */ + DQUOT_INIT(inode); ++ ext3_xattr_delete_inode(handle, inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + +--- linux-rh-2.4.20-8/fs/ext3/inode.c~linux-2.4.20-xattr-0.8.54-chaos 2003-04-11 14:04:58.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext3/inode.c 2003-05-07 17:34:25.000000000 +0800 +@@ -39,6 +39,18 @@ + */ + #undef SEARCH_FROM_ZERO + ++/* ++ * Test whether an inode is a fast symlink. ++ */ ++static inline int ext3_inode_is_fast_symlink(struct inode *inode) ++{ ++ int ea_blocks = inode->u.ext3_i.i_file_acl ? ++ (inode->i_sb->s_blocksize >> 9) : 0; ++ ++ return (S_ISLNK(inode->i_mode) && ++ inode->i_blocks - ea_blocks == 0); ++} ++ + /* The ext3 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. +@@ -48,7 +60,7 @@ + * still needs to be revoked. 
+ */ + +-static int ext3_forget(handle_t *handle, int is_metadata, ++int ext3_forget(handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + int blocknr) + { +@@ -179,9 +191,7 @@ void ext3_delete_inode (struct inode * i + { + handle_t *handle; + +- if (is_bad_inode(inode) || +- inode->i_ino == EXT3_ACL_IDX_INO || +- inode->i_ino == EXT3_ACL_DATA_INO) ++ if (is_bad_inode(inode)) + goto no_delete; + + lock_kernel(); +@@ -1874,6 +1884,8 @@ void ext3_truncate(struct inode * inode) + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; ++ if (ext3_inode_is_fast_symlink(inode)) ++ return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +@@ -2021,8 +2033,6 @@ int ext3_get_inode_loc (struct inode *in + struct ext3_group_desc * gdp; + + if ((inode->i_ino != EXT3_ROOT_INO && +- inode->i_ino != EXT3_ACL_IDX_INO && +- inode->i_ino != EXT3_ACL_DATA_INO && + inode->i_ino != EXT3_JOURNAL_INO && + inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu( +@@ -2149,10 +2159,7 @@ void ext3_read_inode(struct inode * inod + + brelse (iloc.bh); + +- if (inode->i_ino == EXT3_ACL_IDX_INO || +- inode->i_ino == EXT3_ACL_DATA_INO) +- /* Nothing to do */ ; +- else if (S_ISREG(inode->i_mode)) { ++ if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + inode->i_mapping->a_ops = &ext3_aops; +@@ -2160,15 +2167,17 @@ void ext3_read_inode(struct inode * inod + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { +- if (!inode->i_blocks) ++ if (ext3_inode_is_fast_symlink(inode)) + inode->i_op = &ext3_fast_symlink_inode_operations; + else { +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext3_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + } +- } else ++ } else { ++ inode->i_op = &ext3_special_inode_operations; + 
init_special_inode(inode, inode->i_mode, + le32_to_cpu(iloc.raw_inode->i_block[0])); ++ } + /* inode->i_attr_flags = 0; unused */ + if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */ +--- linux-rh-2.4.20-8/fs/ext3/namei.c~linux-2.4.20-xattr-0.8.54-chaos 2003-05-07 17:33:59.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext3/namei.c 2003-05-07 17:34:25.000000000 +0800 +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1613,7 +1614,7 @@ static int ext3_mkdir(struct inode * dir + if (IS_SYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFDIR); ++ inode = ext3_new_inode (handle, dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -1621,7 +1622,6 @@ static int ext3_mkdir(struct inode * dir + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; +- inode->i_blocks = 0; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { + inode->i_nlink--; /* is this nlink == 0? */ +@@ -1648,9 +1648,6 @@ static int ext3_mkdir(struct inode * dir + BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_block); + brelse (dir_block); +- inode->i_mode = S_IFDIR | mode; +- if (dir->i_mode & S_ISGID) +- inode->i_mode |= S_ISGID; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); + if (err) { +@@ -2019,7 +2016,7 @@ static int ext3_symlink (struct inode * + goto out_stop; + + if (l > sizeof (EXT3_I(inode)->i_data)) { +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext3_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + /* + * block_symlink() calls back into ext3_prepare/commit_write. 
+@@ -2245,4 +2242,16 @@ struct inode_operations ext3_dir_inode_o + rmdir: ext3_rmdir, /* BKL held */ + mknod: ext3_mknod, /* BKL held */ + rename: ext3_rename, /* BKL held */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ + }; ++ ++struct inode_operations ext3_special_inode_operations = { ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ ++}; ++ +--- linux-rh-2.4.20-8/fs/ext3/super.c~linux-2.4.20-xattr-0.8.54-chaos 2003-05-07 17:33:59.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext3/super.c 2003-05-07 17:40:45.000000000 +0800 +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -406,6 +407,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); +@@ -502,6 +504,7 @@ static int parse_options (char * options + int is_remount) + { + unsigned long *mount_options = &sbi->s_mount_opt; ++ + uid_t *resuid = &sbi->s_resuid; + gid_t *resgid = &sbi->s_resgid; + char * this_char; +@@ -514,6 +517,13 @@ static int parse_options (char * options + this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; ++#ifdef CONFIG_EXT3_FS_XATTR_USER ++ if (!strcmp (this_char, "user_xattr")) ++ set_opt (*mount_options, XATTR_USER); ++ else if (!strcmp (this_char, "nouser_xattr")) ++ clear_opt (*mount_options, XATTR_USER); ++ else ++#endif + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -931,6 +941,12 @@ struct super_block * ext3_read_super (st + sbi->s_mount_opt = 0; + 
sbi->s_resuid = EXT3_DEF_RESUID; + sbi->s_resgid = EXT3_DEF_RESGID; ++ ++ /* Default extended attribute flags */ ++#ifdef CONFIG_EXT3_FS_XATTR_USER ++ /* set_opt(sbi->s_mount_opt, XATTR_USER); */ ++#endif ++ + if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) { + sb->s_dev = 0; + goto out_fail; +@@ -1768,17 +1784,29 @@ static DECLARE_FSTYPE_DEV(ext3_fs_type, + + static int __init init_ext3_fs(void) + { +- return register_filesystem(&ext3_fs_type); ++ int error = init_ext3_xattr(); ++ if (error) ++ return error; ++ error = init_ext3_xattr_user(); ++ if (error) ++ goto fail; ++ error = register_filesystem(&ext3_fs_type); ++ if (!error) ++ return 0; ++ ++ exit_ext3_xattr_user(); ++fail: ++ exit_ext3_xattr(); ++ return error; + } + + static void __exit exit_ext3_fs(void) + { + unregister_filesystem(&ext3_fs_type); ++ exit_ext3_xattr_user(); ++ exit_ext3_xattr(); + } + +-EXPORT_SYMBOL(ext3_force_commit); +-EXPORT_SYMBOL(ext3_bread); +- + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); + MODULE_LICENSE("GPL"); +--- linux-rh-2.4.20-8/fs/ext3/symlink.c~linux-2.4.20-xattr-0.8.54-chaos 2001-11-10 06:25:04.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext3/symlink.c 2003-05-07 17:34:25.000000000 +0800 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) + { +@@ -33,7 +34,20 @@ static int ext3_follow_link(struct dentr + return vfs_follow_link(nd, s); + } + ++struct inode_operations ext3_symlink_inode_operations = { ++ readlink: page_readlink, /* BKL not held. Don't need */ ++ follow_link: page_follow_link, /* BKL not held. 
Don't need */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ ++}; ++ + struct inode_operations ext3_fast_symlink_inode_operations = { + readlink: ext3_readlink, /* BKL not held. Don't need */ + follow_link: ext3_follow_link, /* BKL not held. Don't need */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ + }; +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext3/xattr.c 2003-05-07 17:42:06.000000000 +0800 +@@ -0,0 +1,1225 @@ ++/* ++ * linux/fs/ext3/xattr.c ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ * ++ * Fix by Harrison Xing . ++ * Ext3 code with a lot of help from Eric Jarman . ++ * Extended attributes for symlinks and special files added per ++ * suggestion of Luka Renko . ++ */ ++ ++/* ++ * Extended attributes are stored on disk blocks allocated outside of ++ * any inode. The i_file_acl field is then made to point to this allocated ++ * block. If all extended attributes of an inode are identical, these ++ * inodes may share the same extended attribute block. Such situations ++ * are automatically detected by keeping a cache of recent attribute block ++ * numbers and hashes over the block's contents in memory. ++ * ++ * ++ * Extended attribute block layout: ++ * ++ * +------------------+ ++ * | header | ++ * | entry 1 | | ++ * | entry 2 | | growing downwards ++ * | entry 3 | v ++ * | four null bytes | ++ * | . . . | ++ * | value 1 | ^ ++ * | value 3 | | growing upwards ++ * | value 2 | | ++ * +------------------+ ++ * ++ * The block header is followed by multiple entry descriptors. These entry ++ * descriptors are variable in size, and aligned to EXT3_XATTR_PAD ++ * byte boundaries. 
The entry descriptors are sorted by attribute name, ++ * so that two extended attribute blocks can be compared efficiently. ++ * ++ * Attribute values are aligned to the end of the block, stored in ++ * no specific order. They are also padded to EXT3_XATTR_PAD byte ++ * boundaries. No additional gaps are left between them. ++ * ++ * Locking strategy ++ * ---------------- ++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of ++ * the xattr inode operations are called, so we are guaranteed that only one ++ * process accesses extended attributes of an inode at any time. ++ * ++ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that ++ * only a single process is modifying an extended attribute block, even ++ * if the block is shared among inodes. ++ * ++ * Note for porting to 2.5 ++ * ----------------------- ++ * The BKL will no longer be held in the xattr inode operations. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define EXT3_EA_USER "user." ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1) ++#endif ++ ++#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) ++#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) ++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) ++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) ++ ++#ifdef EXT3_XATTR_DEBUG ++# define ea_idebug(inode, f...) do { \ ++ printk(KERN_DEBUG "inode %s:%ld: ", \ ++ kdevname(inode->i_dev), inode->i_ino); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++# define ea_bdebug(bh, f...) do { \ ++ printk(KERN_DEBUG "block %s:%ld: ", \ ++ kdevname(bh->b_dev), bh->b_blocknr); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++#else ++# define ea_idebug(f...) ++# define ea_bdebug(f...) 
++#endif ++ ++static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *, ++ struct ext3_xattr_header *); ++ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ ++static int ext3_xattr_cache_insert(struct buffer_head *); ++static struct buffer_head *ext3_xattr_cache_find(struct inode *, ++ struct ext3_xattr_header *); ++static void ext3_xattr_cache_remove(struct buffer_head *); ++static void ext3_xattr_rehash(struct ext3_xattr_header *, ++ struct ext3_xattr_entry *); ++ ++static struct mb_cache *ext3_xattr_cache; ++ ++#else ++# define ext3_xattr_cache_insert(bh) 0 ++# define ext3_xattr_cache_find(inode, header) NULL ++# define ext3_xattr_cache_remove(bh) while(0) {} ++# define ext3_xattr_rehash(header, entry) while(0) {} ++#endif ++ ++/* ++ * If a file system does not share extended attributes among inodes, ++ * we should not need the ext3_xattr_sem semaphore. However, the ++ * filesystem may still contain shared blocks, so we always take ++ * the lock. ++ */ ++ ++DECLARE_MUTEX(ext3_xattr_sem); ++ ++static inline int ++ext3_xattr_new_block(handle_t *handle, struct inode *inode, ++ int * errp, int force) ++{ ++ struct super_block *sb = inode->i_sb; ++ int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + ++ EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb); ++ ++ /* How can we enforce the allocation? */ ++ int block = ext3_new_block(handle, inode, goal, 0, 0, errp); ++#ifdef OLD_QUOTAS ++ if (!*errp) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#endif ++ return block; ++} ++ ++static inline int ++ext3_xattr_quota_alloc(struct inode *inode, int force) ++{ ++ /* How can we enforce the allocation? 
*/ ++#ifdef OLD_QUOTAS ++ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1); ++ if (!error) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#else ++ int error = DQUOT_ALLOC_BLOCK(inode, 1); ++#endif ++ return error; ++} ++ ++#ifdef OLD_QUOTAS ++ ++static inline void ++ext3_xattr_quota_free(struct inode *inode) ++{ ++ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1); ++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; ++} ++ ++static inline void ++ext3_xattr_free_block(handle_t *handle, struct inode * inode, ++ unsigned long block) ++{ ++ ext3_free_blocks(handle, inode, block, 1); ++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; ++} ++ ++#else ++# define ext3_xattr_quota_free(inode) \ ++ DQUOT_FREE_BLOCK(inode, 1) ++# define ext3_xattr_free_block(handle, inode, block) \ ++ ext3_free_blocks(handle, inode, block, 1) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) ++ ++static inline struct buffer_head * ++sb_bread(struct super_block *sb, int block) ++{ ++ return bread(sb->s_dev, block, sb->s_blocksize); ++} ++ ++static inline struct buffer_head * ++sb_getblk(struct super_block *sb, int block) ++{ ++ return getblk(sb->s_dev, block, sb->s_blocksize); ++} ++ ++#endif ++ ++struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX]; ++rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED; ++ ++int ++ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler) ++{ ++ int error = -EINVAL; ++ ++ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { ++ write_lock(&ext3_handler_lock); ++ if (!ext3_xattr_handlers[name_index-1]) { ++ ext3_xattr_handlers[name_index-1] = handler; ++ error = 0; ++ } ++ write_unlock(&ext3_handler_lock); ++ } ++ return error; ++} ++ ++void ++ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler) ++{ ++ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { ++ write_lock(&ext3_handler_lock); ++ ext3_xattr_handlers[name_index-1] = NULL; ++ write_unlock(&ext3_handler_lock); ++ } ++} ++
++static inline const char * ++strcmp_prefix(const char *a, const char *a_prefix) ++{ ++ while (*a_prefix && *a == *a_prefix) { ++ a++; ++ a_prefix++; ++ } ++ return *a_prefix ? NULL : a; ++} ++ ++/* ++ * Decode the extended attribute name, and translate it into ++ * the name_index and name suffix. ++ */ ++static inline struct ext3_xattr_handler * ++ext3_xattr_resolve_name(const char **name) ++{ ++ struct ext3_xattr_handler *handler = NULL; ++ int i; ++ ++ if (!*name) ++ return NULL; ++ read_lock(&ext3_handler_lock); ++ for (i=0; i<EXT3_XATTR_INDEX_MAX; i++) { ++ if (ext3_xattr_handlers[i]) { ++ const char *n = strcmp_prefix(*name, ++ ext3_xattr_handlers[i]->prefix); ++ if (n) { ++ handler = ext3_xattr_handlers[i]; ++ *name = n; ++ break; ++ } ++ } ++ } ++ read_unlock(&ext3_handler_lock); ++ return handler; ++} ++ ++static inline struct ext3_xattr_handler * ++ext3_xattr_handler(int name_index) ++{ ++ struct ext3_xattr_handler *handler = NULL; ++ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { ++ read_lock(&ext3_handler_lock); ++ handler = ext3_xattr_handlers[name_index-1]; ++ read_unlock(&ext3_handler_lock); ++ } ++ return handler; ++} ++ ++/* ++ * Inode operation getxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++ssize_t ++ext3_getxattr(struct dentry *dentry, const char *name, ++ void *buffer, size_t size) ++{ ++ struct ext3_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext3_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->get(inode, name, buffer, size); ++} ++ ++/* ++ * Inode operation listxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++ssize_t ++ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) ++{ ++ return ext3_xattr_list(dentry->d_inode, buffer, size); ++} ++ ++/* ++ * Inode operation setxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext3_setxattr(struct dentry *dentry, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ struct ext3_xattr_handler *handler; ++
struct inode *inode = dentry->d_inode; ++ ++ if (size == 0) ++ value = ""; /* empty EA, do not remove */ ++ handler = ext3_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, value, size, flags); ++} ++ ++/* ++ * Inode operation removexattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext3_removexattr(struct dentry *dentry, const char *name) ++{ ++ struct ext3_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext3_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, NULL, 0, XATTR_REPLACE); ++} ++ ++/* ++ * ext3_xattr_get() ++ * ++ * Copy an extended attribute into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. ++ */ ++int ++ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ unsigned int block, size; ++ char *end; ++ int name_len, error; ++ ++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", ++ name_index, name, buffer, (long)buffer_size); ++ ++ if (name == NULL) ++ return -EINVAL; ++ if (!EXT3_I(inode)->i_file_acl) ++ return -ENOATTR; ++ block = EXT3_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* 
find named attribute */ ++ name_len = strlen(name); ++ ++ error = -ERANGE; ++ if (name_len > 255) ++ goto cleanup; ++ entry = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (name_index == entry->e_name_index && ++ name_len == entry->e_name_len && ++ memcmp(name, entry->e_name, name_len) == 0) ++ goto found; ++ entry = next; ++ } ++ /* Check the remaining name entries */ ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ entry = next; ++ } ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ error = -ENOATTR; ++ goto cleanup; ++found: ++ /* check the buffer size */ ++ if (entry->e_value_block != 0) ++ goto bad_block; ++ size = le32_to_cpu(entry->e_value_size); ++ if (size > inode->i_sb->s_blocksize || ++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) ++ goto bad_block; ++ ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (buffer) { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ /* return value of attribute */ ++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), ++ size); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_list() ++ * ++ * Copy a list of attribute names into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. 
++ */ ++int ++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ unsigned int block, size = 0; ++ char *buf, *end; ++ int error; ++ ++ ea_idebug(inode, "buffer=%p, buffer_size=%ld", ++ buffer, (long)buffer_size); ++ ++ if (!EXT3_I(inode)->i_file_acl) ++ return 0; ++ block = EXT3_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_list", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* compute the size required for the list of attribute names */ ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT3_XATTR_NEXT(entry)) { ++ struct ext3_xattr_handler *handler; ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ ++ handler = ext3_xattr_handler(entry->e_name_index); ++ if (handler) ++ size += handler->list(NULL, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (!buffer) { ++ error = size; ++ goto cleanup; ++ } else { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ } ++ ++ /* list the attribute names */ ++ buf = buffer; ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT3_XATTR_NEXT(entry)) { ++ struct ext3_xattr_handler *handler; ++ ++ handler = ext3_xattr_handler(entry->e_name_index); ++ if (handler) ++ buf += handler->list(buf, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* 
++ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is ++ * not set, set it. ++ */ ++static void ext3_xattr_update_super_block(handle_t *handle, ++ struct super_block *sb) ++{ ++ if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) ++ return; ++ ++ lock_super(sb); ++ ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++ EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR; ++#endif ++ EXT3_SB(sb)->s_es->s_feature_compat |= ++ cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR); ++ sb->s_dirt = 1; ++ ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ unlock_super(sb); ++} ++ ++/* ++ * ext3_xattr_set() ++ * ++ * Create, replace or remove an extended attribute for this inode. Buffer ++ * is NULL to remove an existing extended attribute, and non-NULL to ++ * either replace an existing extended attribute, or create a new extended ++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE ++ * specify that an extended attribute must exist and must not exist ++ * previous to the call, respectively. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++int ++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, const void *value, size_t value_len, int flags) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_header *header = NULL; ++ struct ext3_xattr_entry *here, *last; ++ unsigned int name_len; ++ int block = EXT3_I(inode)->i_file_acl; ++ int min_offs = sb->s_blocksize, not_found = 1, free, error; ++ char *end; ++ ++ /* ++ * header -- Points either into bh, or to a temporarily ++ * allocated buffer. ++ * here -- The named entry found, or the place for inserting, within ++ * the block pointed to by header. ++ * last -- Points right after the last named entry within the block ++ * pointed to by header. 
++ * min_offs -- The offset of the first value (values are aligned ++ * towards the end of the block). ++ * end -- Points right after the block pointed to by header. ++ */ ++ ++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", ++ name_index, name, value, (long)value_len); ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) ++ return -EPERM; ++ if (value == NULL) ++ value_len = 0; ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255 || value_len > sb->s_blocksize) ++ return -ERANGE; ++ down(&ext3_xattr_sem); ++ ++ if (block) { ++ /* The inode already has an extended attribute block. */ ++ bh = sb_bread(sb, block); ++ error = -EIO; ++ if (!bh) ++ goto cleanup; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), ++ le32_to_cpu(HDR(bh)->h_refcount)); ++ header = HDR(bh); ++ end = bh->b_data + bh->b_size; ++ if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ header->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(sb, "ext3_xattr_set", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* Find the named attribute. */ ++ here = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(here)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!here->e_value_block && here->e_value_size) { ++ int offs = le16_to_cpu(here->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ not_found = name_index - here->e_name_index; ++ if (!not_found) ++ not_found = name_len - here->e_name_len; ++ if (!not_found) ++ not_found = memcmp(name, here->e_name,name_len); ++ if (not_found <= 0) ++ break; ++ here = next; ++ } ++ last = here; ++ /* We still need to compute min_offs and last. 
*/ ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!last->e_value_block && last->e_value_size) { ++ int offs = le16_to_cpu(last->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ last = next; ++ } ++ ++ /* Check whether we have enough space left. */ ++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); ++ } else { ++ /* We will use a new extended attribute block. */ ++ free = sb->s_blocksize - ++ sizeof(struct ext3_xattr_header) - sizeof(__u32); ++ here = last = NULL; /* avoid gcc uninitialized warning. */ ++ } ++ ++ if (not_found) { ++ /* Request to remove a nonexistent attribute? */ ++ error = -ENOATTR; ++ if (flags & XATTR_REPLACE) ++ goto cleanup; ++ error = 0; ++ if (value == NULL) ++ goto cleanup; ++ else ++ free -= EXT3_XATTR_LEN(name_len); ++ } else { ++ /* Request to create an existing attribute? */ ++ error = -EEXIST; ++ if (flags & XATTR_CREATE) ++ goto cleanup; ++ if (!here->e_value_block && here->e_value_size) { ++ unsigned int size = le32_to_cpu(here->e_value_size); ++ ++ if (le16_to_cpu(here->e_value_offs) + size > ++ sb->s_blocksize || size > sb->s_blocksize) ++ goto bad_block; ++ free += EXT3_XATTR_SIZE(size); ++ } ++ } ++ free -= EXT3_XATTR_SIZE(value_len); ++ error = -ENOSPC; ++ if (free < 0) ++ goto cleanup; ++ ++ /* Here we know that we can set the new attribute. 
*/ ++ ++ if (header) { ++ if (header->h_refcount == cpu_to_le32(1)) { ++ ea_bdebug(bh, "modifying in-place"); ++ ext3_xattr_cache_remove(bh); ++ error = ext3_journal_get_write_access(handle, bh); ++ if (error) ++ goto cleanup; ++ } else { ++ int offset; ++ ++ ea_bdebug(bh, "cloning"); ++ header = kmalloc(bh->b_size, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memcpy(header, HDR(bh), bh->b_size); ++ header->h_refcount = cpu_to_le32(1); ++ offset = (char *)header - bh->b_data; ++ here = ENTRY((char *)here + offset); ++ last = ENTRY((char *)last + offset); ++ } ++ } else { ++ /* Allocate a buffer where we construct the new block. */ ++ header = kmalloc(sb->s_blocksize, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memset(header, 0, sb->s_blocksize); ++ end = (char *)header + sb->s_blocksize; ++ header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); ++ header->h_blocks = header->h_refcount = cpu_to_le32(1); ++ last = here = ENTRY(header+1); ++ } ++ ++ if (not_found) { ++ /* Insert the new name. */ ++ int size = EXT3_XATTR_LEN(name_len); ++ int rest = (char *)last - (char *)here; ++ memmove((char *)here + size, here, rest); ++ memset(here, 0, size); ++ here->e_name_index = name_index; ++ here->e_name_len = name_len; ++ memcpy(here->e_name, name, name_len); ++ } else { ++ /* Remove the old value. */ ++ if (!here->e_value_block && here->e_value_size) { ++ char *first_val = (char *)header + min_offs; ++ int offs = le16_to_cpu(here->e_value_offs); ++ char *val = (char *)header + offs; ++ size_t size = EXT3_XATTR_SIZE( ++ le32_to_cpu(here->e_value_size)); ++ memmove(first_val + size, first_val, val - first_val); ++ memset(first_val, 0, size); ++ here->e_value_offs = 0; ++ min_offs += size; ++ ++ /* Adjust all value offsets. 
*/ ++ last = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(last)) { ++ int o = le16_to_cpu(last->e_value_offs); ++ if (!last->e_value_block && o < offs) ++ last->e_value_offs = ++ cpu_to_le16(o + size); ++ last = EXT3_XATTR_NEXT(last); ++ } ++ } ++ if (value == NULL) { ++ /* Remove this attribute. */ ++ if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) { ++ /* This block is now empty. */ ++ error = ext3_xattr_set2(handle, inode, bh,NULL); ++ goto cleanup; ++ } else { ++ /* Remove the old name. */ ++ int size = EXT3_XATTR_LEN(name_len); ++ last = ENTRY((char *)last - size); ++ memmove(here, (char*)here + size, ++ (char*)last - (char*)here); ++ memset(last, 0, size); ++ } ++ } ++ } ++ ++ if (value != NULL) { ++ /* Insert the new value. */ ++ here->e_value_size = cpu_to_le32(value_len); ++ if (value_len) { ++ size_t size = EXT3_XATTR_SIZE(value_len); ++ char *val = (char *)header + min_offs - size; ++ here->e_value_offs = ++ cpu_to_le16((char *)val - (char *)header); ++ memset(val + size - EXT3_XATTR_PAD, 0, ++ EXT3_XATTR_PAD); /* Clear the pad bytes. */ ++ memcpy(val, value, value_len); ++ } ++ } ++ ext3_xattr_rehash(header, here); ++ ++ error = ext3_xattr_set2(handle, inode, bh, header); ++ ++cleanup: ++ brelse(bh); ++ if (!(bh && header == HDR(bh))) ++ kfree(header); ++ up(&ext3_xattr_sem); ++ ++ return error; ++} ++ ++/* ++ * Second half of ext3_xattr_set(): Update the file system. ++ */ ++static int ++ext3_xattr_set2(handle_t *handle, struct inode *inode, ++ struct buffer_head *old_bh, struct ext3_xattr_header *header) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *new_bh = NULL; ++ int error; ++ ++ if (header) { ++ new_bh = ext3_xattr_cache_find(inode, header); ++ if (new_bh) { ++ /* ++ * We found an identical block in the cache. ++ * The old block will be released after updating ++ * the inode. 
++ */ ++ ea_bdebug(old_bh, "reusing block %ld", ++ new_bh->b_blocknr); ++ ++ error = -EDQUOT; ++ if (ext3_xattr_quota_alloc(inode, 1)) ++ goto cleanup; ++ ++ error = ext3_journal_get_write_access(handle, new_bh); ++ if (error) ++ goto cleanup; ++ HDR(new_bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); ++ ea_bdebug(new_bh, "refcount now=%d", ++ le32_to_cpu(HDR(new_bh)->h_refcount)); ++ } else if (old_bh && header == HDR(old_bh)) { ++ /* Keep this block. */ ++ new_bh = old_bh; ++ ext3_xattr_cache_insert(new_bh); ++ } else { ++ /* We need to allocate a new block */ ++ int force = EXT3_I(inode)->i_file_acl != 0; ++ int block = ext3_xattr_new_block(handle, inode, ++ &error, force); ++ if (error) ++ goto cleanup; ++ ea_idebug(inode, "creating block %d", block); ++ ++ new_bh = sb_getblk(sb, block); ++ if (!new_bh) { ++getblk_failed: ext3_xattr_free_block(handle, inode, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(new_bh); ++ error = ext3_journal_get_create_access(handle, new_bh); ++ if (error) { ++ unlock_buffer(new_bh); ++ goto getblk_failed; ++ } ++ memcpy(new_bh->b_data, header, new_bh->b_size); ++ mark_buffer_uptodate(new_bh, 1); ++ unlock_buffer(new_bh); ++ ext3_xattr_cache_insert(new_bh); ++ ++ ext3_xattr_update_super_block(handle, sb); ++ } ++ error = ext3_journal_dirty_metadata(handle, new_bh); ++ if (error) ++ goto cleanup; ++ } ++ ++ /* Update the inode. */ ++ EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; ++ inode->i_ctime = CURRENT_TIME; ++ ext3_mark_inode_dirty(handle, inode); ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++ error = 0; ++ if (old_bh && old_bh != new_bh) { ++ /* ++ * If there was an old block, and we are not still using it, ++ * we now release the old block. ++ */ ++ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); ++ ++ error = ext3_journal_get_write_access(handle, old_bh); ++ if (error) ++ goto cleanup; ++ if (refcount == 1) { ++ /* Free the old block. 
*/ ++ ea_bdebug(old_bh, "freeing"); ++ ext3_xattr_free_block(handle, inode, old_bh->b_blocknr); ++ ++ /* ext3_forget() calls bforget() for us, but we ++ let our caller release old_bh, so we need to ++ duplicate the handle before. */ ++ get_bh(old_bh); ++ ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr); ++ } else { ++ /* Decrement the refcount only. */ ++ refcount--; ++ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); ++ ext3_xattr_quota_free(inode); ++ ext3_journal_dirty_metadata(handle, old_bh); ++ ea_bdebug(old_bh, "refcount now=%d", refcount); ++ } ++ } ++ ++cleanup: ++ if (old_bh != new_bh) ++ brelse(new_bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_delete_inode() ++ * ++ * Free extended attribute resources associated with this inode. This ++ * is called immediately before an inode is freed. ++ */ ++void ++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) ++{ ++ struct buffer_head *bh; ++ unsigned int block = EXT3_I(inode)->i_file_acl; ++ ++ if (!block) ++ return; ++ down(&ext3_xattr_sem); ++ ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) { ++ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", ++ "inode %ld: block %d read error", inode->i_ino, block); ++ goto cleanup; ++ } ++ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ goto cleanup; ++ } ++ ext3_journal_get_write_access(handle, bh); ++ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { ++ ext3_xattr_cache_remove(bh); ++ ext3_xattr_free_block(handle, inode, block); ++ ext3_forget(handle, 1, inode, bh, block); ++ bh = NULL; ++ } else { ++ HDR(bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ ext3_journal_dirty_metadata(handle, bh); ++ if (IS_SYNC(inode)) ++ 
handle->h_sync = 1; ++ ext3_xattr_quota_free(inode); ++ } ++ EXT3_I(inode)->i_file_acl = 0; ++ ++cleanup: ++ brelse(bh); ++ up(&ext3_xattr_sem); ++} ++ ++/* ++ * ext3_xattr_put_super() ++ * ++ * This is called when a file system is unmounted. ++ */ ++void ++ext3_xattr_put_super(struct super_block *sb) ++{ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ mb_cache_shrink(ext3_xattr_cache, sb->s_dev); ++#endif ++} ++ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ ++/* ++ * ext3_xattr_cache_insert() ++ * ++ * Create a new entry in the extended attribute cache, and insert ++ * it unless such an entry is already in the cache. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++static int ++ext3_xattr_cache_insert(struct buffer_head *bh) ++{ ++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); ++ struct mb_cache_entry *ce; ++ int error; ++ ++ ce = mb_cache_entry_alloc(ext3_xattr_cache); ++ if (!ce) ++ return -ENOMEM; ++ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash); ++ if (error) { ++ mb_cache_entry_free(ce); ++ if (error == -EBUSY) { ++ ea_bdebug(bh, "already in cache (%d cache entries)", ++ atomic_read(&ext3_xattr_cache->c_entry_count)); ++ error = 0; ++ } ++ } else { ++ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, ++ atomic_read(&ext3_xattr_cache->c_entry_count)); ++ mb_cache_entry_release(ce); ++ } ++ return error; ++} ++ ++/* ++ * ext3_xattr_cmp() ++ * ++ * Compare two extended attribute blocks for equality. ++ * ++ * Returns 0 if the blocks are equal, 1 if they differ, and ++ * a negative error number on errors. 
++ */ ++static int ++ext3_xattr_cmp(struct ext3_xattr_header *header1, ++ struct ext3_xattr_header *header2) ++{ ++ struct ext3_xattr_entry *entry1, *entry2; ++ ++ entry1 = ENTRY(header1+1); ++ entry2 = ENTRY(header2+1); ++ while (!IS_LAST_ENTRY(entry1)) { ++ if (IS_LAST_ENTRY(entry2)) ++ return 1; ++ if (entry1->e_hash != entry2->e_hash || ++ entry1->e_name_len != entry2->e_name_len || ++ entry1->e_value_size != entry2->e_value_size || ++ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) ++ return 1; ++ if (entry1->e_value_block != 0 || entry2->e_value_block != 0) ++ return -EIO; ++ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), ++ (char *)header2 + le16_to_cpu(entry2->e_value_offs), ++ le32_to_cpu(entry1->e_value_size))) ++ return 1; ++ ++ entry1 = EXT3_XATTR_NEXT(entry1); ++ entry2 = EXT3_XATTR_NEXT(entry2); ++ } ++ if (!IS_LAST_ENTRY(entry2)) ++ return 1; ++ return 0; ++} ++ ++/* ++ * ext3_xattr_cache_find() ++ * ++ * Find an identical extended attribute block. ++ * ++ * Returns a pointer to the block found, or NULL if such a block was ++ * not found or an error occurred. 
++ */ ++static struct buffer_head * ++ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header) ++{ ++ __u32 hash = le32_to_cpu(header->h_hash); ++ struct mb_cache_entry *ce; ++ ++ if (!header->h_hash) ++ return NULL; /* never share */ ++ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ++ ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash); ++ while (ce) { ++ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); ++ ++ if (!bh) { ++ ext3_error(inode->i_sb, "ext3_xattr_cache_find", ++ "inode %ld: block %ld read error", ++ inode->i_ino, ce->e_block); ++ } else if (le32_to_cpu(HDR(bh)->h_refcount) > ++ EXT3_XATTR_REFCOUNT_MAX) { ++ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block, ++ le32_to_cpu(HDR(bh)->h_refcount), ++ EXT3_XATTR_REFCOUNT_MAX); ++ } else if (!ext3_xattr_cmp(header, HDR(bh))) { ++ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); ++ mb_cache_entry_release(ce); ++ return bh; ++ } ++ brelse(bh); ++ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash); ++ } ++ return NULL; ++} ++ ++/* ++ * ext3_xattr_cache_remove() ++ * ++ * Remove the cache entry of a block from the cache. Called when a ++ * block becomes invalid. ++ */ ++static void ++ext3_xattr_cache_remove(struct buffer_head *bh) ++{ ++ struct mb_cache_entry *ce; ++ ++ ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr); ++ if (ce) { ++ ea_bdebug(bh, "removing (%d cache entries remaining)", ++ atomic_read(&ext3_xattr_cache->c_entry_count)-1); ++ mb_cache_entry_free(ce); ++ } else ++ ea_bdebug(bh, "no cache entry"); ++} ++ ++#define NAME_HASH_SHIFT 5 ++#define VALUE_HASH_SHIFT 16 ++ ++/* ++ * ext3_xattr_hash_entry() ++ * ++ * Compute the hash of an extended attribute. 
++ */ ++static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, ++ struct ext3_xattr_entry *entry) ++{ ++ __u32 hash = 0; ++ char *name = entry->e_name; ++ int n; ++ ++ for (n=0; n < entry->e_name_len; n++) { ++ hash = (hash << NAME_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ ++ *name++; ++ } ++ ++ if (entry->e_value_block == 0 && entry->e_value_size != 0) { ++ __u32 *value = (__u32 *)((char *)header + ++ le16_to_cpu(entry->e_value_offs)); ++ for (n = (le32_to_cpu(entry->e_value_size) + ++ EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { ++ hash = (hash << VALUE_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ ++ le32_to_cpu(*value++); ++ } ++ } ++ entry->e_hash = cpu_to_le32(hash); ++} ++ ++#undef NAME_HASH_SHIFT ++#undef VALUE_HASH_SHIFT ++ ++#define BLOCK_HASH_SHIFT 16 ++ ++/* ++ * ext3_xattr_rehash() ++ * ++ * Re-compute the extended attribute hash value after an entry has changed. ++ */ ++static void ext3_xattr_rehash(struct ext3_xattr_header *header, ++ struct ext3_xattr_entry *entry) ++{ ++ struct ext3_xattr_entry *here; ++ __u32 hash = 0; ++ ++ ext3_xattr_hash_entry(header, entry); ++ here = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(here)) { ++ if (!here->e_hash) { ++ /* Block is not shared if an entry's hash value == 0 */ ++ hash = 0; ++ break; ++ } ++ hash = (hash << BLOCK_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ ++ le32_to_cpu(here->e_hash); ++ here = EXT3_XATTR_NEXT(here); ++ } ++ header->h_hash = cpu_to_le32(hash); ++} ++ ++#undef BLOCK_HASH_SHIFT ++ ++int __init ++init_ext3_xattr(void) ++{ ++ ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, ++ sizeof(struct mb_cache_entry) + ++ sizeof(struct mb_cache_entry_index), 1, 61); ++ if (!ext3_xattr_cache) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void ++exit_ext3_xattr(void) ++{ ++ if (ext3_xattr_cache) ++ mb_cache_destroy(ext3_xattr_cache); ++ ext3_xattr_cache = NULL; ++} ++ ++#else /* CONFIG_EXT3_FS_XATTR_SHARING */ ++ 
++int __init ++init_ext3_xattr(void) ++{ ++ return 0; ++} ++ ++void ++exit_ext3_xattr(void) ++{ ++} ++ ++#endif /* CONFIG_EXT3_FS_XATTR_SHARING */ +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/ext3/xattr_user.c 2003-05-07 17:34:25.000000000 +0800 +@@ -0,0 +1,111 @@ ++/* ++ * linux/fs/ext3/xattr_user.c ++ * Handler for extended user attributes. ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_EXT3_FS_POSIX_ACL ++# include ++#endif ++ ++#define XATTR_USER_PREFIX "user." ++ ++static size_t ++ext3_xattr_user_list(char *list, struct inode *inode, ++ const char *name, int name_len) ++{ ++ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1; ++ ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return 0; ++ ++ if (list) { ++ memcpy(list, XATTR_USER_PREFIX, prefix_len); ++ memcpy(list+prefix_len, name, name_len); ++ list[prefix_len + name_len] = '\0'; ++ } ++ return prefix_len + name_len + 1; ++} ++ ++static int ++ext3_xattr_user_get(struct inode *inode, const char *name, ++ void *buffer, size_t size) ++{ ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++#ifdef CONFIG_EXT3_FS_POSIX_ACL ++ error = ext3_permission_locked(inode, MAY_READ); ++#else ++ error = permission(inode, MAY_READ); ++#endif ++ if (error) ++ return error; ++ ++ return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, ++ buffer, size); ++} ++ ++static int ++ext3_xattr_user_set(struct inode *inode, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ handle_t *handle; ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++ if ( !S_ISREG(inode->i_mode) && ++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) ++ return -EPERM; ++#ifdef CONFIG_EXT3_FS_POSIX_ACL ++ error = ext3_permission_locked(inode, 
MAY_WRITE); ++#else ++ error = permission(inode, MAY_WRITE); ++#endif ++ if (error) ++ return error; ++ ++ handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name, ++ value, size, flags); ++ ext3_journal_stop(handle, inode); ++ ++ return error; ++} ++ ++struct ext3_xattr_handler ext3_xattr_user_handler = { ++ prefix: XATTR_USER_PREFIX, ++ list: ext3_xattr_user_list, ++ get: ext3_xattr_user_get, ++ set: ext3_xattr_user_set, ++}; ++ ++int __init ++init_ext3_xattr_user(void) ++{ ++ return ext3_xattr_register(EXT3_XATTR_INDEX_USER, ++ &ext3_xattr_user_handler); ++} ++ ++void ++exit_ext3_xattr_user(void) ++{ ++ ext3_xattr_unregister(EXT3_XATTR_INDEX_USER, ++ &ext3_xattr_user_handler); ++} +--- linux-rh-2.4.20-8/fs/jfs/jfs_xattr.h~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:15.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/jfs/jfs_xattr.h 2003-05-07 17:34:25.000000000 +0800 +@@ -52,8 +52,10 @@ struct jfs_ea_list { + #define END_EALIST(ealist) \ + ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist))) + +-extern int __jfs_setxattr(struct inode *, const char *, void *, size_t, int); +-extern int jfs_setxattr(struct dentry *, const char *, void *, size_t, int); ++extern int __jfs_setxattr(struct inode *, const char *, const void *, size_t, ++ int); ++extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t, ++ int); + extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t); + extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t); + extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); +--- linux-rh-2.4.20-8/fs/jfs/xattr.c~linux-2.4.20-xattr-0.8.54-chaos 2002-11-29 07:53:15.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/jfs/xattr.c 2003-05-07 17:34:25.000000000 +0800 +@@ -641,7 +641,7 @@ static int ea_put(struct inode *inode, s + } + + static int can_set_xattr(struct 
inode *inode, const char *name, +- void *value, size_t value_len) ++ const void *value, size_t value_len) + { + if (IS_RDONLY(inode)) + return -EROFS; +@@ -660,7 +660,7 @@ static int can_set_xattr(struct inode *i + return permission(inode, MAY_WRITE); + } + +-int __jfs_setxattr(struct inode *inode, const char *name, void *value, ++int __jfs_setxattr(struct inode *inode, const char *name, const void *value, + size_t value_len, int flags) + { + struct jfs_ea_list *ealist; +@@ -799,7 +799,7 @@ int __jfs_setxattr(struct inode *inode, + return rc; + } + +-int jfs_setxattr(struct dentry *dentry, const char *name, void *value, ++int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t value_len, int flags) + { + if (value == NULL) { /* empty EA, do not remove */ +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-rh-2.4.20-8-root/fs/mbcache.c 2003-05-07 17:34:25.000000000 +0800 +@@ -0,0 +1,648 @@ ++/* ++ * linux/fs/mbcache.c ++ * (C) 2001-2002 Andreas Gruenbacher, ++ */ ++ ++/* ++ * Filesystem Meta Information Block Cache (mbcache) ++ * ++ * The mbcache caches blocks of block devices that need to be located ++ * by their device/block number, as well as by other criteria (such ++ * as the block's contents). ++ * ++ * There can only be one cache entry in a cache per device and block number. ++ * Additional indexes need not be unique in this sense. The number of ++ * additional indexes (=other criteria) can be hardwired at compile time ++ * or specified at cache create time. ++ * ++ * Each cache entry is of fixed size. An entry may be `valid' or `invalid' ++ * in the cache. A valid entry is in the main hash tables of the cache, ++ * and may also be in the lru list. An invalid entry is not in any hashes ++ * or lists. ++ * ++ * A valid cache entry is only in the lru list if no handles refer to it. ++ * Invalid cache entries will be freed when the last handle to the cache ++ * entry is released. 
Entries that cannot be freed immediately are put ++ * back on the lru list. ++ */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#ifdef MB_CACHE_DEBUG ++# define mb_debug(f...) do { \ ++ printk(KERN_DEBUG f); \ ++ printk("\n"); \ ++ } while (0) ++#define mb_assert(c) do { if (!(c)) \ ++ printk(KERN_ERR "assertion " #c " failed\n"); \ ++ } while(0) ++#else ++# define mb_debug(f...) do { } while(0) ++# define mb_assert(c) do { } while(0) ++#endif ++#define mb_error(f...) do { \ ++ printk(KERN_ERR f); \ ++ printk("\n"); \ ++ } while(0) ++ ++MODULE_AUTHOR("Andreas Gruenbacher "); ++MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) ++MODULE_LICENSE("GPL"); ++#endif ++ ++EXPORT_SYMBOL(mb_cache_create); ++EXPORT_SYMBOL(mb_cache_shrink); ++EXPORT_SYMBOL(mb_cache_destroy); ++EXPORT_SYMBOL(mb_cache_entry_alloc); ++EXPORT_SYMBOL(mb_cache_entry_insert); ++EXPORT_SYMBOL(mb_cache_entry_release); ++EXPORT_SYMBOL(mb_cache_entry_takeout); ++EXPORT_SYMBOL(mb_cache_entry_free); ++EXPORT_SYMBOL(mb_cache_entry_dup); ++EXPORT_SYMBOL(mb_cache_entry_get); ++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) ++EXPORT_SYMBOL(mb_cache_entry_find_first); ++EXPORT_SYMBOL(mb_cache_entry_find_next); ++#endif ++ ++ ++/* ++ * Global data: list of all mbcache's, lru list, and a spinlock for ++ * accessing cache data structures on SMP machines. The lru list is ++ * global across all mbcaches. ++ */ ++ ++static LIST_HEAD(mb_cache_list); ++static LIST_HEAD(mb_cache_lru_list); ++static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED; ++ ++static inline int ++mb_cache_indexes(struct mb_cache *cache) ++{ ++#ifdef MB_CACHE_INDEXES_COUNT ++ return MB_CACHE_INDEXES_COUNT; ++#else ++ return cache->c_indexes_count; ++#endif ++} ++ ++/* ++ * What the mbcache registers as to get shrunk dynamically. 
++ */ ++ ++static void ++mb_cache_memory_pressure(int priority, unsigned int gfp_mask); ++ ++static struct cache_definition mb_cache_definition = { ++ "mb_cache", ++ mb_cache_memory_pressure ++}; ++ ++ ++static inline int ++__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) ++{ ++ return !list_empty(&ce->e_block_list); ++} ++ ++ ++static inline void ++__mb_cache_entry_unhash(struct mb_cache_entry *ce) ++{ ++ int n; ++ ++ if (__mb_cache_entry_is_hashed(ce)) { ++ list_del_init(&ce->e_block_list); ++ for (n=0; ne_cache); n++) ++ list_del(&ce->e_indexes[n].o_list); ++ } ++} ++ ++ ++static inline void ++__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask) ++{ ++ struct mb_cache *cache = ce->e_cache; ++ ++ mb_assert(atomic_read(&ce->e_used) == 0); ++ if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) { ++ /* free failed -- put back on the lru list ++ for freeing later. */ ++ spin_lock(&mb_cache_spinlock); ++ list_add(&ce->e_lru_list, &mb_cache_lru_list); ++ spin_unlock(&mb_cache_spinlock); ++ } else { ++ kmem_cache_free(cache->c_entry_cache, ce); ++ atomic_dec(&cache->c_entry_count); ++ } ++} ++ ++ ++static inline void ++__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) ++{ ++ if (atomic_dec_and_test(&ce->e_used)) { ++ if (__mb_cache_entry_is_hashed(ce)) ++ list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); ++ else { ++ spin_unlock(&mb_cache_spinlock); ++ __mb_cache_entry_forget(ce, GFP_KERNEL); ++ return; ++ } ++ } ++ spin_unlock(&mb_cache_spinlock); ++} ++ ++ ++/* ++ * mb_cache_memory_pressure() memory pressure callback ++ * ++ * This function is called by the kernel memory management when memory ++ * gets low. 
++ * ++ * @priority: Amount by which to shrink the cache (0 = highest priority) ++ * @gfp_mask: passed through to the cache's free callback ++ */ ++static void ++mb_cache_memory_pressure(int priority, unsigned int gfp_mask) ++{ ++ LIST_HEAD(free_list); ++ struct list_head *l, *ltmp; ++ int count = 0; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each(l, &mb_cache_list) { ++ struct mb_cache *cache = ++ list_entry(l, struct mb_cache, c_cache_list); ++ mb_debug("cache %s (%d)", cache->c_name, ++ atomic_read(&cache->c_entry_count)); ++ count += atomic_read(&cache->c_entry_count); ++ } ++ mb_debug("trying to free %d of %d entries", ++ count / (priority ? priority : 1), count); ++ if (priority) ++ count /= priority; ++ while (count-- && !list_empty(&mb_cache_lru_list)) { ++ struct mb_cache_entry *ce = ++ list_entry(mb_cache_lru_list.next, ++ struct mb_cache_entry, e_lru_list); ++ list_del(&ce->e_lru_list); ++ __mb_cache_entry_unhash(ce); ++ list_add_tail(&ce->e_lru_list, &free_list); ++ } ++ spin_unlock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &free_list) { ++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, ++ e_lru_list), gfp_mask); ++ } ++} ++ ++ ++/* ++ * mb_cache_create() create a new cache ++ * ++ * All entries in one cache are equal size. Cache entries may be from ++ * multiple devices. If this is the first mbcache created, registers ++ * the cache with kernel memory management. Returns NULL if no more ++ * memory was available. ++ * ++ * @name: name of the cache (informal) ++ * @cache_op: contains the callback called when freeing a cache entry ++ * @entry_size: The size of a cache entry, including ++ * struct mb_cache_entry ++ * @indexes_count: number of additional indexes in the cache. Must equal ++ * MB_CACHE_INDEXES_COUNT if the number of indexes is ++ * hardwired. 
++ * @bucket_count: number of hash buckets ++ */ ++struct mb_cache * ++mb_cache_create(const char *name, struct mb_cache_op *cache_op, ++ size_t entry_size, int indexes_count, int bucket_count) ++{ ++ int m=0, n; ++ struct mb_cache *cache = NULL; ++ ++ if(entry_size < sizeof(struct mb_cache_entry) + ++ indexes_count * sizeof(struct mb_cache_entry_index)) ++ return NULL; ++ ++ MOD_INC_USE_COUNT; ++ cache = kmalloc(sizeof(struct mb_cache) + ++ indexes_count * sizeof(struct list_head), GFP_KERNEL); ++ if (!cache) ++ goto fail; ++ cache->c_name = name; ++ cache->c_op.free = NULL; ++ if (cache_op) ++ cache->c_op.free = cache_op->free; ++ atomic_set(&cache->c_entry_count, 0); ++ cache->c_bucket_count = bucket_count; ++#ifdef MB_CACHE_INDEXES_COUNT ++ mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT); ++#else ++ cache->c_indexes_count = indexes_count; ++#endif ++ cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!cache->c_block_hash) ++ goto fail; ++ for (n=0; nc_block_hash[n]); ++ for (m=0; mc_indexes_hash[m] = kmalloc(bucket_count * ++ sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!cache->c_indexes_hash[m]) ++ goto fail; ++ for (n=0; nc_indexes_hash[m][n]); ++ } ++ cache->c_entry_cache = kmem_cache_create(name, entry_size, 0, ++ 0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL); ++ if (!cache->c_entry_cache) ++ goto fail; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_add(&cache->c_cache_list, &mb_cache_list); ++ spin_unlock(&mb_cache_spinlock); ++ return cache; ++ ++fail: ++ if (cache) { ++ while (--m >= 0) ++ kfree(cache->c_indexes_hash[m]); ++ if (cache->c_block_hash) ++ kfree(cache->c_block_hash); ++ kfree(cache); ++ } ++ MOD_DEC_USE_COUNT; ++ return NULL; ++} ++ ++ ++/* ++ * mb_cache_shrink() ++ * ++ * Removes all cache entries of a device from the cache. All cache entries ++ * currently in use cannot be freed, and thus remain in the cache. 
++ * ++ * @cache: which cache to shrink ++ * @dev: which device's cache entries to shrink ++ */ ++void ++mb_cache_shrink(struct mb_cache *cache, kdev_t dev) ++{ ++ LIST_HEAD(free_list); ++ struct list_head *l, *ltmp; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &mb_cache_lru_list) { ++ struct mb_cache_entry *ce = ++ list_entry(l, struct mb_cache_entry, e_lru_list); ++ if (ce->e_dev == dev) { ++ list_del(&ce->e_lru_list); ++ list_add_tail(&ce->e_lru_list, &free_list); ++ __mb_cache_entry_unhash(ce); ++ } ++ } ++ spin_unlock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &free_list) { ++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, ++ e_lru_list), GFP_KERNEL); ++ } ++} ++ ++ ++/* ++ * mb_cache_destroy() ++ * ++ * Shrinks the cache to its minimum possible size (hopefully 0 entries), ++ * and then destroys it. If this was the last mbcache, un-registers the ++ * mbcache from kernel memory management. ++ */ ++void ++mb_cache_destroy(struct mb_cache *cache) ++{ ++ LIST_HEAD(free_list); ++ struct list_head *l, *ltmp; ++ int n; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &mb_cache_lru_list) { ++ struct mb_cache_entry *ce = ++ list_entry(l, struct mb_cache_entry, e_lru_list); ++ if (ce->e_cache == cache) { ++ list_del(&ce->e_lru_list); ++ list_add_tail(&ce->e_lru_list, &free_list); ++ __mb_cache_entry_unhash(ce); ++ } ++ } ++ list_del(&cache->c_cache_list); ++ spin_unlock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &free_list) { ++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, ++ e_lru_list), GFP_KERNEL); ++ } ++ ++ if (atomic_read(&cache->c_entry_count) > 0) { ++ mb_error("cache %s: %d orphaned entries", ++ cache->c_name, ++ atomic_read(&cache->c_entry_count)); ++ } ++ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0)) ++ /* We don't have kmem_cache_destroy() in 2.2.x */ ++ kmem_cache_shrink(cache->c_entry_cache); ++#else ++ kmem_cache_destroy(cache->c_entry_cache); ++#endif ++ for 
(n=0; n < mb_cache_indexes(cache); n++) ++ kfree(cache->c_indexes_hash[n]); ++ kfree(cache->c_block_hash); ++ kfree(cache); ++ ++ MOD_DEC_USE_COUNT; ++} ++ ++ ++/* ++ * mb_cache_entry_alloc() ++ * ++ * Allocates a new cache entry. The new entry will not be valid initially, ++ * and thus cannot be looked up yet. It should be filled with data, and ++ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL ++ * if no more memory was available. ++ */ ++struct mb_cache_entry * ++mb_cache_entry_alloc(struct mb_cache *cache) ++{ ++ struct mb_cache_entry *ce; ++ ++ atomic_inc(&cache->c_entry_count); ++ ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL); ++ if (ce) { ++ INIT_LIST_HEAD(&ce->e_lru_list); ++ INIT_LIST_HEAD(&ce->e_block_list); ++ ce->e_cache = cache; ++ atomic_set(&ce->e_used, 1); ++ } ++ return ce; ++} ++ ++ ++/* ++ * mb_cache_entry_insert() ++ * ++ * Inserts an entry that was allocated using mb_cache_entry_alloc() into ++ * the cache. After this, the cache entry can be looked up, but is not yet ++ * in the lru list as the caller still holds a handle to it. Returns 0 on ++ * success, or -EBUSY if a cache entry for that device + block exists ++ * already (this may happen after a failed lookup, if another process has ++ * inserted the same cache entry in the meantime). ++ * ++ * @dev: device the cache entry belongs to ++ * @block: block number ++ * @keys: array of additional keys. There must be indexes_count entries ++ * in the array (as specified when creating the cache). 
++ */ ++int ++mb_cache_entry_insert(struct mb_cache_entry *ce, kdev_t dev, ++ unsigned long block, unsigned int keys[]) ++{ ++ struct mb_cache *cache = ce->e_cache; ++ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count; ++ struct list_head *l; ++ int error = -EBUSY, n; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each(l, &cache->c_block_hash[bucket]) { ++ struct mb_cache_entry *ce = ++ list_entry(l, struct mb_cache_entry, e_block_list); ++ if (ce->e_dev == dev && ce->e_block == block) ++ goto out; ++ } ++ __mb_cache_entry_unhash(ce); ++ ce->e_dev = dev; ++ ce->e_block = block; ++ list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); ++ for (n=0; ne_indexes[n].o_key = keys[n]; ++ bucket = keys[n] % cache->c_bucket_count; ++ list_add(&ce->e_indexes[n].o_list, ++ &cache->c_indexes_hash[n][bucket]); ++ } ++out: ++ spin_unlock(&mb_cache_spinlock); ++ return error; ++} ++ ++ ++/* ++ * mb_cache_entry_release() ++ * ++ * Release a handle to a cache entry. When the last handle to a cache entry ++ * is released it is either freed (if it is invalid) or otherwise inserted ++ * in to the lru list. ++ */ ++void ++mb_cache_entry_release(struct mb_cache_entry *ce) ++{ ++ spin_lock(&mb_cache_spinlock); ++ __mb_cache_entry_release_unlock(ce); ++} ++ ++ ++/* ++ * mb_cache_entry_takeout() ++ * ++ * Take a cache entry out of the cache, making it invalid. The entry can later ++ * be re-inserted using mb_cache_entry_insert(), or released using ++ * mb_cache_entry_release(). ++ */ ++void ++mb_cache_entry_takeout(struct mb_cache_entry *ce) ++{ ++ spin_lock(&mb_cache_spinlock); ++ mb_assert(list_empty(&ce->e_lru_list)); ++ __mb_cache_entry_unhash(ce); ++ spin_unlock(&mb_cache_spinlock); ++} ++ ++ ++/* ++ * mb_cache_entry_free() ++ * ++ * This is equivalent to the sequence mb_cache_entry_takeout() -- ++ * mb_cache_entry_release(). 
++ */ ++void ++mb_cache_entry_free(struct mb_cache_entry *ce) ++{ ++ spin_lock(&mb_cache_spinlock); ++ mb_assert(list_empty(&ce->e_lru_list)); ++ __mb_cache_entry_unhash(ce); ++ __mb_cache_entry_release_unlock(ce); ++} ++ ++ ++/* ++ * mb_cache_entry_dup() ++ * ++ * Duplicate a handle to a cache entry (does not duplicate the cache entry ++ * itself). After the call, both the old and the new handle must be released. ++ */ ++struct mb_cache_entry * ++mb_cache_entry_dup(struct mb_cache_entry *ce) ++{ ++ atomic_inc(&ce->e_used); ++ return ce; ++} ++ ++ ++/* ++ * mb_cache_entry_get() ++ * ++ * Get a cache entry by device / block number. (There can only be one entry ++ * in the cache per device and block.) Returns NULL if no such cache entry ++ * exists. ++ */ ++struct mb_cache_entry * ++mb_cache_entry_get(struct mb_cache *cache, kdev_t dev, unsigned long block) ++{ ++ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count; ++ struct list_head *l; ++ struct mb_cache_entry *ce; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each(l, &cache->c_block_hash[bucket]) { ++ ce = list_entry(l, struct mb_cache_entry, e_block_list); ++ if (ce->e_dev == dev && ce->e_block == block) { ++ if (!list_empty(&ce->e_lru_list)) ++ list_del_init(&ce->e_lru_list); ++ atomic_inc(&ce->e_used); ++ goto cleanup; ++ } ++ } ++ ce = NULL; ++ ++cleanup: ++ spin_unlock(&mb_cache_spinlock); ++ return ce; ++} ++ ++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) ++ ++static struct mb_cache_entry * ++__mb_cache_entry_find(struct list_head *l, struct list_head *head, ++ int index, kdev_t dev, unsigned int key) ++{ ++ while (l != head) { ++ struct mb_cache_entry *ce = ++ list_entry(l, struct mb_cache_entry, ++ e_indexes[index].o_list); ++ if (ce->e_dev == dev && ce->e_indexes[index].o_key == key) { ++ if (!list_empty(&ce->e_lru_list)) ++ list_del_init(&ce->e_lru_list); ++ atomic_inc(&ce->e_used); ++ return ce; ++ } ++ l = l->next; ++ } ++ return NULL; ++} ++ ++ ++/* ++ * 
mb_cache_entry_find_first() ++ * ++ * Find the first cache entry on a given device with a certain key in ++ * an additional index. Additional matches can be found with ++ * mb_cache_entry_find_next(). Returns NULL if no match was found. ++ * ++ * @cache: the cache to search ++ * @index: the number of the additional index to search (0<=indexc_bucket_count; ++ struct list_head *l; ++ struct mb_cache_entry *ce; ++ ++ mb_assert(index < mb_cache_indexes(cache)); ++ spin_lock(&mb_cache_spinlock); ++ l = cache->c_indexes_hash[index][bucket].next; ++ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], ++ index, dev, key); ++ spin_unlock(&mb_cache_spinlock); ++ return ce; ++} ++ ++ ++/* ++ * mb_cache_entry_find_next() ++ * ++ * Find the next cache entry on a given device with a certain key in an ++ * additional index. Returns NULL if no match could be found. The previous ++ * entry is automatically released, so that mb_cache_entry_find_next() can ++ * be called like this: ++ * ++ * entry = mb_cache_entry_find_first(); ++ * while (entry) { ++ * ... 
++ * entry = mb_cache_entry_find_next(entry, ...); ++ * } ++ * ++ * @prev: The previous match ++ * @index: the number of the additional index to search (0<=indexe_cache; ++ unsigned int bucket = key % cache->c_bucket_count; ++ struct list_head *l; ++ struct mb_cache_entry *ce; ++ ++ mb_assert(index < mb_cache_indexes(cache)); ++ spin_lock(&mb_cache_spinlock); ++ l = prev->e_indexes[index].o_list.next; ++ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], ++ index, dev, key); ++ __mb_cache_entry_release_unlock(prev); ++ return ce; ++} ++ ++#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ ++ ++static int __init init_mbcache(void) ++{ ++ register_cache(&mb_cache_definition); ++ return 0; ++} ++ ++static void __exit exit_mbcache(void) ++{ ++ unregister_cache(&mb_cache_definition); ++} ++ ++module_init(init_mbcache) ++module_exit(exit_mbcache) ++ +--- linux-rh-2.4.20-8/include/asm-arm/unistd.h~linux-2.4.20-xattr-0.8.54-chaos 2003-04-11 14:04:53.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/asm-arm/unistd.h 2003-05-07 17:34:25.000000000 +0800 +@@ -244,7 +244,6 @@ + #define __NR_security (__NR_SYSCALL_BASE+223) + #define __NR_gettid (__NR_SYSCALL_BASE+224) + #define __NR_readahead (__NR_SYSCALL_BASE+225) +-#if 0 /* allocated in 2.5 */ + #define __NR_setxattr (__NR_SYSCALL_BASE+226) + #define __NR_lsetxattr (__NR_SYSCALL_BASE+227) + #define __NR_fsetxattr (__NR_SYSCALL_BASE+228) +@@ -257,7 +256,6 @@ + #define __NR_removexattr (__NR_SYSCALL_BASE+235) + #define __NR_lremovexattr (__NR_SYSCALL_BASE+236) + #define __NR_fremovexattr (__NR_SYSCALL_BASE+237) +-#endif + #define __NR_tkill (__NR_SYSCALL_BASE+238) + /* + * Please check 2.5 _before_ adding calls here, +--- linux-rh-2.4.20-8/include/asm-ppc64/unistd.h~linux-2.4.20-xattr-0.8.54-chaos 2002-08-03 08:39:45.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/asm-ppc64/unistd.h 2003-05-07 17:34:25.000000000 +0800 +@@ -218,6 +218,7 @@ + #define __NR_gettid 207 + #if 0 /* 
Reserved syscalls */ + #define __NR_tkill 208 ++#endif + #define __NR_setxattr 209 + #define __NR_lsetxattr 210 + #define __NR_fsetxattr 211 +@@ -230,6 +231,7 @@ + #define __NR_removexattr 218 + #define __NR_lremovexattr 219 + #define __NR_fremovexattr 220 ++#if 0 /* Reserved syscalls */ + #define __NR_futex 221 + #endif + +--- linux-rh-2.4.20-8/include/asm-s390/unistd.h~linux-2.4.20-xattr-0.8.54-chaos 2002-08-03 08:39:45.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/asm-s390/unistd.h 2003-05-07 17:34:25.000000000 +0800 +@@ -212,9 +212,18 @@ + #define __NR_madvise 219 + #define __NR_getdents64 220 + #define __NR_fcntl64 221 +-/* +- * Numbers 224-235 are reserved for posix acl +- */ ++#define __NR_setxattr 224 ++#define __NR_lsetxattr 225 ++#define __NR_fsetxattr 226 ++#define __NR_getxattr 227 ++#define __NR_lgetxattr 228 ++#define __NR_fgetxattr 229 ++#define __NR_listxattr 230 ++#define __NR_llistxattr 231 ++#define __NR_flistxattr 232 ++#define __NR_removexattr 233 ++#define __NR_lremovexattr 234 ++#define __NR_fremovexattr 235 + #define __NR_gettid 236 + #define __NR_tkill 237 + +--- linux-rh-2.4.20-8/include/asm-s390x/unistd.h~linux-2.4.20-xattr-0.8.54-chaos 2002-08-03 08:39:45.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/asm-s390x/unistd.h 2003-05-07 17:34:25.000000000 +0800 +@@ -180,9 +180,18 @@ + #define __NR_pivot_root 217 + #define __NR_mincore 218 + #define __NR_madvise 219 +-/* +- * Numbers 224-235 are reserved for posix acl +- */ ++#define __NR_setxattr 224 ++#define __NR_lsetxattr 225 ++#define __NR_fsetxattr 226 ++#define __NR_getxattr 227 ++#define __NR_lgetxattr 228 ++#define __NR_fgetxattr 229 ++#define __NR_listxattr 230 ++#define __NR_llistxattr 231 ++#define __NR_flistxattr 232 ++#define __NR_removexattr 233 ++#define __NR_lremovexattr 234 ++#define __NR_fremovexattr 235 + #define __NR_gettid 236 + #define __NR_tkill 237 + +--- linux-rh-2.4.20-8/include/asm-sparc/unistd.h~linux-2.4.20-xattr-0.8.54-chaos 2002-08-03 
08:39:45.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/asm-sparc/unistd.h 2003-05-07 17:34:25.000000000 +0800 +@@ -184,24 +184,24 @@ + /* #define __NR_exportfs 166 SunOS Specific */ + #define __NR_mount 167 /* Common */ + #define __NR_ustat 168 /* Common */ +-/* #define __NR_semsys 169 SunOS Specific */ +-/* #define __NR_msgsys 170 SunOS Specific */ +-/* #define __NR_shmsys 171 SunOS Specific */ +-/* #define __NR_auditsys 172 SunOS Specific */ +-/* #define __NR_rfssys 173 SunOS Specific */ ++#define __NR_setxattr 169 /* SunOS: semsys */ ++#define __NR_lsetxattr 170 /* SunOS: msgsys */ ++#define __NR_fsetxattr 171 /* SunOS: shmsys */ ++#define __NR_getxattr 172 /* SunOS: auditsys */ ++#define __NR_lgetxattr 173 /* SunOS: rfssys */ + #define __NR_getdents 174 /* Common */ + #define __NR_setsid 175 /* Common */ + #define __NR_fchdir 176 /* Common */ +-/* #define __NR_fchroot 177 SunOS Specific */ +-/* #define __NR_vpixsys 178 SunOS Specific */ +-/* #define __NR_aioread 179 SunOS Specific */ +-/* #define __NR_aiowrite 180 SunOS Specific */ +-/* #define __NR_aiowait 181 SunOS Specific */ +-/* #define __NR_aiocancel 182 SunOS Specific */ ++#define __NR_fgetxattr 177 /* SunOS: fchroot */ ++#define __NR_listxattr 178 /* SunOS: vpixsys */ ++#define __NR_llistxattr 179 /* SunOS: aioread */ ++#define __NR_flistxattr 180 /* SunOS: aiowrite */ ++#define __NR_removexattr 181 /* SunOS: aiowait */ ++#define __NR_lremovexattr 182 /* SunOS: aiocancel */ + #define __NR_sigpending 183 /* Common */ + #define __NR_query_module 184 /* Linux Specific */ + #define __NR_setpgid 185 /* Common */ +-/* #define __NR_pathconf 186 SunOS Specific */ ++#define __NR_fremovexattr 186 /* SunOS: pathconf */ + #define __NR_tkill 187 /* SunOS: fpathconf */ + /* #define __NR_sysconf 188 SunOS Specific */ + #define __NR_uname 189 /* Linux Specific */ +--- linux-rh-2.4.20-8/include/asm-sparc64/unistd.h~linux-2.4.20-xattr-0.8.54-chaos 2002-08-03 08:39:45.000000000 +0800 ++++ 
linux-rh-2.4.20-8-root/include/asm-sparc64/unistd.h 2003-05-07 17:34:25.000000000 +0800 +@@ -184,24 +184,24 @@ + /* #define __NR_exportfs 166 SunOS Specific */ + #define __NR_mount 167 /* Common */ + #define __NR_ustat 168 /* Common */ +-/* #define __NR_semsys 169 SunOS Specific */ +-/* #define __NR_msgsys 170 SunOS Specific */ +-/* #define __NR_shmsys 171 SunOS Specific */ +-/* #define __NR_auditsys 172 SunOS Specific */ +-/* #define __NR_rfssys 173 SunOS Specific */ ++#define __NR_setxattr 169 /* SunOS: semsys */ ++#define __NR_lsetxattr 170 /* SunOS: msgsys */ ++#define __NR_fsetxattr 171 /* SunOS: shmsys */ ++#define __NR_getxattr 172 /* SunOS: auditsys */ ++#define __NR_lgetxattr 173 /* SunOS: rfssys */ + #define __NR_getdents 174 /* Common */ + #define __NR_setsid 175 /* Common */ + #define __NR_fchdir 176 /* Common */ +-/* #define __NR_fchroot 177 SunOS Specific */ +-/* #define __NR_vpixsys 178 SunOS Specific */ +-/* #define __NR_aioread 179 SunOS Specific */ +-/* #define __NR_aiowrite 180 SunOS Specific */ +-/* #define __NR_aiowait 181 SunOS Specific */ +-/* #define __NR_aiocancel 182 SunOS Specific */ ++#define __NR_fgetxattr 177 /* SunOS: fchroot */ ++#define __NR_listxattr 178 /* SunOS: vpixsys */ ++#define __NR_llistxattr 179 /* SunOS: aioread */ ++#define __NR_flistxattr 180 /* SunOS: aiowrite */ ++#define __NR_removexattr 181 /* SunOS: aiowait */ ++#define __NR_lremovexattr 182 /* SunOS: aiocancel */ + #define __NR_sigpending 183 /* Common */ + #define __NR_query_module 184 /* Linux Specific */ + #define __NR_setpgid 185 /* Common */ +-/* #define __NR_pathconf 186 SunOS Specific */ ++#define __NR_fremovexattr 186 /* SunOS: pathconf */ + #define __NR_tkill 187 /* SunOS: fpathconf */ + /* #define __NR_sysconf 188 SunOS Specific */ + #define __NR_uname 189 /* Linux Specific */ +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/linux/cache_def.h 2003-05-07 17:34:25.000000000 +0800 +@@ -0,0 +1,15 @@ ++/* ++ * 
linux/cache_def.h ++ * Handling of caches defined in drivers, filesystems, ... ++ * ++ * Copyright (C) 2002 by Andreas Gruenbacher, ++ */ ++ ++struct cache_definition { ++ const char *name; ++ void (*shrink)(int, unsigned int); ++ struct list_head link; ++}; ++ ++extern void register_cache(struct cache_definition *); ++extern void unregister_cache(struct cache_definition *); +--- linux-rh-2.4.20-8/include/linux/errno.h~linux-2.4.20-xattr-0.8.54-chaos 2003-04-11 14:04:53.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/linux/errno.h 2003-05-07 17:34:25.000000000 +0800 +@@ -26,4 +26,8 @@ + + #endif + ++/* Defined for extended attributes */ ++#define ENOATTR ENODATA /* No such attribute */ ++#define ENOTSUP EOPNOTSUPP /* Operation not supported */ ++ + #endif +--- linux-rh-2.4.20-8/include/linux/ext2_fs.h~linux-2.4.20-xattr-0.8.54-chaos 2003-04-12 15:46:42.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/linux/ext2_fs.h 2003-05-07 17:34:25.000000000 +0800 +@@ -57,8 +57,6 @@ + */ + #define EXT2_BAD_INO 1 /* Bad blocks inode */ + #define EXT2_ROOT_INO 2 /* Root inode */ +-#define EXT2_ACL_IDX_INO 3 /* ACL inode */ +-#define EXT2_ACL_DATA_INO 4 /* ACL inode */ + #define EXT2_BOOT_LOADER_INO 5 /* Boot loader inode */ + #define EXT2_UNDEL_DIR_INO 6 /* Undelete directory inode */ + +@@ -86,7 +84,6 @@ + #else + # define EXT2_BLOCK_SIZE(s) (EXT2_MIN_BLOCK_SIZE << (s)->s_log_block_size) + #endif +-#define EXT2_ACLE_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_acl_entry)) + #define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32)) + #ifdef __KERNEL__ + # define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +@@ -121,28 +118,6 @@ + #endif + + /* +- * ACL structures +- */ +-struct ext2_acl_header /* Header of Access Control Lists */ +-{ +- __u32 aclh_size; +- __u32 aclh_file_count; +- __u32 aclh_acle_count; +- __u32 aclh_first_acle; +-}; +- +-struct ext2_acl_entry /* Access Control List Entry */ +-{ +- __u32 acle_size; +- __u16 acle_perms; /* Access 
permissions */ +- __u16 acle_type; /* Type of entry */ +- __u16 acle_tag; /* User or group identity */ +- __u16 acle_pad1; +- __u32 acle_next; /* Pointer on next entry for the */ +- /* same inode or on next free entry */ +-}; +- +-/* + * Structure of a blocks group descriptor + */ + struct ext2_group_desc +@@ -314,6 +289,7 @@ struct ext2_inode { + #define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ + #define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */ + #define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */ ++#define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ + + #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt + #define set_opt(o, opt) o |= EXT2_MOUNT_##opt +@@ -397,6 +373,7 @@ struct ext2_super_block { + + #ifdef __KERNEL__ + #define EXT2_SB(sb) (&((sb)->u.ext2_sb)) ++#define EXT2_I(inode) (&((inode)->u.ext2_i)) + #else + /* Assume that user mode programs are passing in an ext2fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test +@@ -466,7 +443,7 @@ struct ext2_super_block { + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 + #define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff + +-#define EXT2_FEATURE_COMPAT_SUPP 0 ++#define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT2_FEATURE_INCOMPAT_SUPP EXT2_FEATURE_INCOMPAT_FILETYPE + #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \ +@@ -623,8 +600,10 @@ extern struct address_space_operations e + + /* namei.c */ + extern struct inode_operations ext2_dir_inode_operations; ++extern struct inode_operations ext2_special_inode_operations; + + /* symlink.c */ ++extern struct inode_operations ext2_symlink_inode_operations; + extern struct inode_operations ext2_fast_symlink_inode_operations; + + #endif /* __KERNEL__ */ +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/linux/ext2_xattr.h 2003-05-07 17:34:25.000000000 +0800 +@@ 
-0,0 +1,157 @@ ++/* ++ File: linux/ext2_xattr.h ++ ++ On-disk format of extended attributes for the ext2 filesystem. ++ ++ (C) 2001 Andreas Gruenbacher, ++*/ ++ ++#include ++#include ++#include ++ ++/* Magic value in attribute blocks */ ++#define EXT2_XATTR_MAGIC 0xEA020000 ++ ++/* Maximum number of references to one attribute block */ ++#define EXT2_XATTR_REFCOUNT_MAX 1024 ++ ++/* Name indexes */ ++#define EXT2_XATTR_INDEX_MAX 10 ++#define EXT2_XATTR_INDEX_USER 1 ++#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS 2 ++#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT 3 ++ ++struct ext2_xattr_header { ++ __u32 h_magic; /* magic number for identification */ ++ __u32 h_refcount; /* reference count */ ++ __u32 h_blocks; /* number of disk blocks used */ ++ __u32 h_hash; /* hash value of all attributes */ ++ __u32 h_reserved[4]; /* zero right now */ ++}; ++ ++struct ext2_xattr_entry { ++ __u8 e_name_len; /* length of name */ ++ __u8 e_name_index; /* attribute name index */ ++ __u16 e_value_offs; /* offset in disk block of value */ ++ __u32 e_value_block; /* disk block attribute is stored on (n/i) */ ++ __u32 e_value_size; /* size of attribute value */ ++ __u32 e_hash; /* hash value of name and value */ ++ char e_name[0]; /* attribute name */ ++}; ++ ++#define EXT2_XATTR_PAD_BITS 2 ++#define EXT2_XATTR_PAD (1<e_name_len)) ) ++#define EXT2_XATTR_SIZE(size) \ ++ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) ++ ++#ifdef __KERNEL__ ++ ++# ifdef CONFIG_EXT2_FS_XATTR ++ ++struct ext2_xattr_handler { ++ char *prefix; ++ size_t (*list)(char *list, struct inode *inode, const char *name, ++ int name_len); ++ int (*get)(struct inode *inode, const char *name, void *buffer, ++ size_t size); ++ int (*set)(struct inode *inode, const char *name, const void *buffer, ++ size_t size, int flags); ++}; ++ ++extern int ext2_xattr_register(int, struct ext2_xattr_handler *); ++extern void ext2_xattr_unregister(int, struct ext2_xattr_handler *); ++ ++extern int ext2_setxattr(struct dentry *, const char *, 
const void *, size_t, int); ++extern ssize_t ext2_getxattr(struct dentry *, const char *, void *, size_t); ++extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); ++extern int ext2_removexattr(struct dentry *, const char *); ++ ++extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); ++extern int ext2_xattr_list(struct inode *, char *, size_t); ++extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); ++ ++extern void ext2_xattr_delete_inode(struct inode *); ++extern void ext2_xattr_put_super(struct super_block *); ++ ++extern int init_ext2_xattr(void) __init; ++extern void exit_ext2_xattr(void); ++ ++# else /* CONFIG_EXT2_FS_XATTR */ ++# define ext2_setxattr NULL ++# define ext2_getxattr NULL ++# define ext2_listxattr NULL ++# define ext2_removexattr NULL ++ ++static inline int ++ext2_xattr_get(struct inode *inode, int name_index, ++ const char *name, void *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext2_xattr_list(struct inode *inode, char *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext2_xattr_set(struct inode *inode, int name_index, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ return -ENOTSUP; ++} ++ ++static inline void ++ext2_xattr_delete_inode(struct inode *inode) ++{ ++} ++ ++static inline void ++ext2_xattr_put_super(struct super_block *sb) ++{ ++} ++ ++static inline int ++init_ext2_xattr(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext2_xattr(void) ++{ ++} ++ ++# endif /* CONFIG_EXT2_FS_XATTR */ ++ ++# ifdef CONFIG_EXT2_FS_XATTR_USER ++ ++extern int init_ext2_xattr_user(void) __init; ++extern void exit_ext2_xattr_user(void); ++ ++# else /* CONFIG_EXT2_FS_XATTR_USER */ ++ ++static inline int ++init_ext2_xattr_user(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext2_xattr_user(void) ++{ ++} ++ ++# endif /* CONFIG_EXT2_FS_XATTR_USER */ ++ ++#endif /* __KERNEL__ */ ++ +--- 
linux-rh-2.4.20-8/include/linux/ext3_fs.h~linux-2.4.20-xattr-0.8.54-chaos 2003-05-07 17:33:59.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/linux/ext3_fs.h 2003-05-07 17:34:25.000000000 +0800 +@@ -63,8 +63,6 @@ + */ + #define EXT3_BAD_INO 1 /* Bad blocks inode */ + #define EXT3_ROOT_INO 2 /* Root inode */ +-#define EXT3_ACL_IDX_INO 3 /* ACL inode */ +-#define EXT3_ACL_DATA_INO 4 /* ACL inode */ + #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ + #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ + #define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ +@@ -94,7 +92,6 @@ + #else + # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) + #endif +-#define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry)) + #define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) + #ifdef __KERNEL__ + # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +@@ -129,28 +126,6 @@ + #endif + + /* +- * ACL structures +- */ +-struct ext3_acl_header /* Header of Access Control Lists */ +-{ +- __u32 aclh_size; +- __u32 aclh_file_count; +- __u32 aclh_acle_count; +- __u32 aclh_first_acle; +-}; +- +-struct ext3_acl_entry /* Access Control List Entry */ +-{ +- __u32 acle_size; +- __u16 acle_perms; /* Access permissions */ +- __u16 acle_type; /* Type of entry */ +- __u16 acle_tag; /* User or group identity */ +- __u16 acle_pad1; +- __u32 acle_next; /* Pointer on next entry for the */ +- /* same inode or on next free entry */ +-}; +- +-/* + * Structure of a blocks group descriptor + */ + struct ext3_group_desc +@@ -344,6 +319,7 @@ struct ext3_inode { + #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ ++#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + 
#ifndef _LINUX_EXT2_FS_H +@@ -520,7 +496,7 @@ struct ext3_super_block { + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + +-#define EXT3_FEATURE_COMPAT_SUPP 0 ++#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ +@@ -703,6 +679,7 @@ extern void ext3_check_inodes_bitmap (st + extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + /* inode.c */ ++extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); + +@@ -771,8 +748,10 @@ extern struct address_space_operations e + + /* namei.c */ + extern struct inode_operations ext3_dir_inode_operations; ++extern struct inode_operations ext3_special_inode_operations; + + /* symlink.c */ ++extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + + +--- linux-rh-2.4.20-8/include/linux/ext3_jbd.h~linux-2.4.20-xattr-0.8.54-chaos 2003-05-07 17:33:59.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/linux/ext3_jbd.h 2003-05-07 17:34:25.000000000 +0800 +@@ -30,13 +30,19 @@ + + #define EXT3_SINGLEDATA_TRANS_BLOCKS 8U + ++/* Extended attributes may touch two data buffers, two bitmap buffers, ++ * and two group and summaries. */ ++ ++#define EXT3_XATTR_TRANS_BLOCKS 8 ++ + /* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). 
The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +-#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2) ++#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \ ++ EXT3_XATTR_TRANS_BLOCKS - 2) + + extern int ext3_writepage_trans_blocks(struct inode *inode); + +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/linux/ext3_xattr.h 2003-05-07 17:34:25.000000000 +0800 +@@ -0,0 +1,157 @@ ++/* ++ File: linux/ext3_xattr.h ++ ++ On-disk format of extended attributes for the ext3 filesystem. ++ ++ (C) 2001 Andreas Gruenbacher, ++*/ ++ ++#include ++#include ++#include ++ ++/* Magic value in attribute blocks */ ++#define EXT3_XATTR_MAGIC 0xEA020000 ++ ++/* Maximum number of references to one attribute block */ ++#define EXT3_XATTR_REFCOUNT_MAX 1024 ++ ++/* Name indexes */ ++#define EXT3_XATTR_INDEX_MAX 10 ++#define EXT3_XATTR_INDEX_USER 1 ++#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2 ++#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3 ++ ++struct ext3_xattr_header { ++ __u32 h_magic; /* magic number for identification */ ++ __u32 h_refcount; /* reference count */ ++ __u32 h_blocks; /* number of disk blocks used */ ++ __u32 h_hash; /* hash value of all attributes */ ++ __u32 h_reserved[4]; /* zero right now */ ++}; ++ ++struct ext3_xattr_entry { ++ __u8 e_name_len; /* length of name */ ++ __u8 e_name_index; /* attribute name index */ ++ __u16 e_value_offs; /* offset in disk block of value */ ++ __u32 e_value_block; /* disk block attribute is stored on (n/i) */ ++ __u32 e_value_size; /* size of attribute value */ ++ __u32 e_hash; /* hash value of name and value */ ++ char e_name[0]; /* attribute name */ ++}; ++ ++#define EXT3_XATTR_PAD_BITS 2 ++#define EXT3_XATTR_PAD (1<e_name_len)) ) ++#define EXT3_XATTR_SIZE(size) \ ++ (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND) ++ ++#ifdef __KERNEL__ ++ ++# ifdef CONFIG_EXT3_FS_XATTR ++ ++struct 
ext3_xattr_handler { ++ char *prefix; ++ size_t (*list)(char *list, struct inode *inode, const char *name, ++ int name_len); ++ int (*get)(struct inode *inode, const char *name, void *buffer, ++ size_t size); ++ int (*set)(struct inode *inode, const char *name, const void *buffer, ++ size_t size, int flags); ++}; ++ ++extern int ext3_xattr_register(int, struct ext3_xattr_handler *); ++extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *); ++ ++extern int ext3_setxattr(struct dentry *, const char *, const void *, size_t, int); ++extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t); ++extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); ++extern int ext3_removexattr(struct dentry *, const char *); ++ ++extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); ++extern int ext3_xattr_list(struct inode *, char *, size_t); ++extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int); ++ ++extern void ext3_xattr_delete_inode(handle_t *, struct inode *); ++extern void ext3_xattr_put_super(struct super_block *); ++ ++extern int init_ext3_xattr(void) __init; ++extern void exit_ext3_xattr(void); ++ ++# else /* CONFIG_EXT3_FS_XATTR */ ++# define ext3_setxattr NULL ++# define ext3_getxattr NULL ++# define ext3_listxattr NULL ++# define ext3_removexattr NULL ++ ++static inline int ++ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext3_xattr_list(struct inode *inode, void *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, const void *value, size_t size, int flags) ++{ ++ return -ENOTSUP; ++} ++ ++static inline void ++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) ++{ ++} ++ ++static inline void ++ext3_xattr_put_super(struct 
super_block *sb) ++{ ++} ++ ++static inline int ++init_ext3_xattr(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext3_xattr(void) ++{ ++} ++ ++# endif /* CONFIG_EXT3_FS_XATTR */ ++ ++# ifdef CONFIG_EXT3_FS_XATTR_USER ++ ++extern int init_ext3_xattr_user(void) __init; ++extern void exit_ext3_xattr_user(void); ++ ++# else /* CONFIG_EXT3_FS_XATTR_USER */ ++ ++static inline int ++init_ext3_xattr_user(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext3_xattr_user(void) ++{ ++} ++ ++#endif /* CONFIG_EXT3_FS_XATTR_USER */ ++ ++#endif /* __KERNEL__ */ ++ +--- linux-rh-2.4.20-8/include/linux/fs.h~linux-2.4.20-xattr-0.8.54-chaos 2003-05-07 17:33:58.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/linux/fs.h 2003-05-07 17:34:25.000000000 +0800 +@@ -915,7 +915,7 @@ struct inode_operations { + int (*setattr) (struct dentry *, struct iattr *); + int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct dentry *, struct iattr *); +- int (*setxattr) (struct dentry *, const char *, void *, size_t, int); ++ int (*setxattr) (struct dentry *, const char *, const void *, size_t, int); + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr) (struct dentry *, char *, size_t); + int (*removexattr) (struct dentry *, const char *); +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-rh-2.4.20-8-root/include/linux/mbcache.h 2003-05-07 17:34:25.000000000 +0800 +@@ -0,0 +1,69 @@ ++/* ++ File: linux/mbcache.h ++ ++ (C) 2001 by Andreas Gruenbacher, ++*/ ++ ++/* Hardwire the number of additional indexes */ ++#define MB_CACHE_INDEXES_COUNT 1 ++ ++struct mb_cache_entry; ++ ++struct mb_cache_op { ++ int (*free)(struct mb_cache_entry *, int); ++}; ++ ++struct mb_cache { ++ struct list_head c_cache_list; ++ const char *c_name; ++ struct mb_cache_op c_op; ++ atomic_t c_entry_count; ++ int c_bucket_count; ++#ifndef MB_CACHE_INDEXES_COUNT ++ int c_indexes_count; ++#endif ++ kmem_cache_t *c_entry_cache; ++ struct 
list_head *c_block_hash; ++ struct list_head *c_indexes_hash[0]; ++}; ++ ++struct mb_cache_entry_index { ++ struct list_head o_list; ++ unsigned int o_key; ++}; ++ ++struct mb_cache_entry { ++ struct list_head e_lru_list; ++ struct mb_cache *e_cache; ++ atomic_t e_used; ++ kdev_t e_dev; ++ unsigned long e_block; ++ struct list_head e_block_list; ++ struct mb_cache_entry_index e_indexes[0]; ++}; ++ ++/* Functions on caches */ ++ ++struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t, ++ int, int); ++void mb_cache_shrink(struct mb_cache *, kdev_t); ++void mb_cache_destroy(struct mb_cache *); ++ ++/* Functions on cache entries */ ++ ++struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *); ++int mb_cache_entry_insert(struct mb_cache_entry *, kdev_t, unsigned long, ++ unsigned int[]); ++void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]); ++void mb_cache_entry_release(struct mb_cache_entry *); ++void mb_cache_entry_takeout(struct mb_cache_entry *); ++void mb_cache_entry_free(struct mb_cache_entry *); ++struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *); ++struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, kdev_t, ++ unsigned long); ++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) ++struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int, ++ kdev_t, unsigned int); ++struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int, ++ kdev_t, unsigned int); ++#endif +--- linux-rh-2.4.20-8/kernel/ksyms.c~linux-2.4.20-xattr-0.8.54-chaos 2003-05-07 17:33:58.000000000 +0800 ++++ linux-rh-2.4.20-8-root/kernel/ksyms.c 2003-05-07 17:34:25.000000000 +0800 +@@ -12,6 +12,7 @@ + #define __KERNEL_SYSCALLS__ + #include + #include ++#include + #include + #include + #include +@@ -107,6 +108,7 @@ EXPORT_SYMBOL(exit_mm); + EXPORT_SYMBOL(exit_files); + EXPORT_SYMBOL(exit_fs); + EXPORT_SYMBOL(exit_sighand); ++EXPORT_SYMBOL(copy_fs_struct); + + /* internal kernel 
memory management */ + EXPORT_SYMBOL(_alloc_pages); +@@ -125,6 +127,8 @@ EXPORT_SYMBOL(kmem_cache_alloc); + EXPORT_SYMBOL(kmem_cache_free); + EXPORT_SYMBOL(kmem_cache_validate); + EXPORT_SYMBOL(kmem_cache_size); ++EXPORT_SYMBOL(register_cache); ++EXPORT_SYMBOL(unregister_cache); + EXPORT_SYMBOL(kmalloc); + EXPORT_SYMBOL(kfree); + EXPORT_SYMBOL(vfree); +--- linux-rh-2.4.20-8/mm/vmscan.c~linux-2.4.20-xattr-0.8.54-chaos 2003-05-07 17:33:58.000000000 +0800 ++++ linux-rh-2.4.20-8-root/mm/vmscan.c 2003-05-07 17:34:25.000000000 +0800 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -444,6 +445,39 @@ static inline void kachunk_cache(struct + + #define BATCH_WORK_AMOUNT 64 + ++static DECLARE_MUTEX(other_caches_sem); ++static LIST_HEAD(cache_definitions); ++ ++void register_cache(struct cache_definition *cache) ++{ ++ down(&other_caches_sem); ++ list_add(&cache->link, &cache_definitions); ++ up(&other_caches_sem); ++} ++ ++void unregister_cache(struct cache_definition *cache) ++{ ++ down(&other_caches_sem); ++ list_del(&cache->link); ++ up(&other_caches_sem); ++} ++ ++static void shrink_other_caches(unsigned int priority, int gfp_mask) ++{ ++ struct list_head *p; ++ ++ if (down_trylock(&other_caches_sem)) ++ return; ++ ++ list_for_each_prev(p, &cache_definitions) { ++ struct cache_definition *cache = ++ list_entry(p, struct cache_definition, link); ++ ++ cache->shrink(priority, gfp_mask); ++ } ++ up(&other_caches_sem); ++} ++ + /* + * returns the active cache ratio relative to the total active list + * times 10 (eg. 
30% cache returns 3) +@@ -887,7 +921,7 @@ static int do_try_to_free_pages_kswapd(u + + ret += shrink_dcache_memory(DEF_PRIORITY, gfp_mask); + ret += shrink_icache_memory(DEF_PRIORITY, gfp_mask); +- // ret += shrink_other_caches(DEF_PRIORITY, gfp_mask); ++ shrink_other_caches(DEF_PRIORITY, gfp_mask); + #ifdef CONFIG_QUOTA + ret += shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); + #endif +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-root/fs/ext3/ext3-exports.c 2003-05-05 18:19:11.000000000 +0800 +@@ -0,0 +1,13 @@ ++#include ++#include ++#include ++#include ++#include ++ ++EXPORT_SYMBOL(ext3_force_commit); ++EXPORT_SYMBOL(ext3_bread); ++EXPORT_SYMBOL(ext3_xattr_register); ++EXPORT_SYMBOL(ext3_xattr_unregister); ++EXPORT_SYMBOL(ext3_xattr_get); ++EXPORT_SYMBOL(ext3_xattr_list); ++EXPORT_SYMBOL(ext3_xattr_set); + +_ diff --git a/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-hp.patch b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-hp.patch new file mode 100644 index 0000000..f0f92e5 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54-hp.patch @@ -0,0 +1,5536 @@ + Documentation/Configure.help | 66 ++ + arch/alpha/defconfig | 7 + arch/alpha/kernel/entry.S | 12 + arch/arm/defconfig | 7 + arch/arm/kernel/calls.S | 24 + arch/i386/defconfig | 7 + arch/ia64/defconfig | 7 + arch/m68k/defconfig | 7 + arch/mips/defconfig | 7 + arch/mips64/defconfig | 7 + arch/ppc/defconfig | 14 + arch/ppc64/kernel/misc.S | 2 + arch/s390/defconfig | 7 + arch/s390/kernel/entry.S | 24 + arch/s390x/defconfig | 7 + arch/s390x/kernel/entry.S | 24 + arch/s390x/kernel/wrapper32.S | 92 +++ + arch/sparc/defconfig | 7 + arch/sparc/kernel/systbls.S | 10 + arch/sparc64/defconfig | 7 + arch/sparc64/kernel/systbls.S | 20 + fs/Config.in | 14 + fs/Makefile | 3 + fs/ext2/Makefile | 4 + fs/ext2/file.c | 5 + fs/ext2/ialloc.c | 2 + fs/ext2/inode.c | 34 - + fs/ext2/namei.c | 14 + fs/ext2/super.c | 29 + fs/ext2/symlink.c | 14 + fs/ext2/xattr.c | 1212 
+++++++++++++++++++++++++++++++++++++++++ + fs/ext2/xattr_user.c | 103 +++ + fs/ext3/Makefile | 9 + fs/ext3/ext3-exports.c | 13 + fs/ext3/file.c | 5 + fs/ext3/ialloc.c | 2 + fs/ext3/inode.c | 35 - + fs/ext3/namei.c | 21 + fs/ext3/super.c | 36 + + fs/ext3/symlink.c | 14 + fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/xattr_user.c | 111 +++ + fs/jfs/jfs_xattr.h | 6 + fs/jfs/xattr.c | 6 + fs/mbcache.c | 648 ++++++++++++++++++++++ + include/asm-arm/unistd.h | 2 + include/asm-ppc64/unistd.h | 2 + include/asm-s390/unistd.h | 15 + include/asm-s390x/unistd.h | 15 + include/asm-sparc/unistd.h | 24 + include/asm-sparc64/unistd.h | 24 + include/linux/cache_def.h | 15 + include/linux/errno.h | 4 + include/linux/ext2_fs.h | 31 - + include/linux/ext2_xattr.h | 157 +++++ + include/linux/ext3_fs.h | 31 - + include/linux/ext3_jbd.h | 8 + include/linux/ext3_xattr.h | 157 +++++ + include/linux/fs.h | 2 + include/linux/mbcache.h | 69 ++ + kernel/ksyms.c | 4 + mm/vmscan.c | 35 + + 62 files changed, 4343 insertions(+), 182 deletions(-) + +--- linux/Documentation/Configure.help~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:23 2003 ++++ linux-mmonroe/Documentation/Configure.help Fri May 16 08:43:00 2003 +@@ -15309,6 +15309,39 @@ CONFIG_EXT2_FS + be compiled as a module, and so this could be dangerous. Most + everyone wants to say Y here. + ++Ext2 extended attributes ++CONFIG_EXT2_FS_XATTR ++ Extended attributes are name:value pairs associated with inodes by ++ the kernel or by users (see the attr(5) manual page, or visit ++ for details). ++ ++ If unsure, say N. ++ ++Ext2 extended attribute block sharing ++CONFIG_EXT2_FS_XATTR_SHARING ++ This options enables code for sharing identical extended attribute ++ blocks among multiple inodes. ++ ++ Usually, say Y. ++ ++Ext2 extended user attributes ++CONFIG_EXT2_FS_XATTR_USER ++ This option enables extended user attributes on ext2. 
Processes can ++ associate extended user attributes with inodes to store additional ++ information such as the character encoding of files, etc. (see the ++ attr(5) manual page, or visit for details). ++ ++ If unsure, say N. ++ ++Ext2 trusted extended attributes ++CONFIG_EXT2_FS_XATTR_TRUSTED ++ This option enables extended attributes on ext2 that are accessible ++ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this ++ is only the super user. Trusted extended attributes are meant for ++ implementing system/security services. ++ ++ If unsure, say N. ++ + Ext3 journalling file system support (EXPERIMENTAL) + CONFIG_EXT3_FS + This is the journalling version of the Second extended file system +@@ -15341,6 +15374,39 @@ CONFIG_EXT3_FS + of your root partition (the one containing the directory /) cannot + be compiled as a module, and so this may be dangerous. + ++Ext3 extended attributes ++CONFIG_EXT3_FS_XATTR ++ Extended attributes are name:value pairs associated with inodes by ++ the kernel or by users (see the attr(5) manual page, or visit ++ for details). ++ ++ If unsure, say N. ++ ++Ext3 extended attribute block sharing ++CONFIG_EXT3_FS_XATTR_SHARING ++ This options enables code for sharing identical extended attribute ++ blocks among multiple inodes. ++ ++ Usually, say Y. ++ ++Ext3 extended user attributes ++CONFIG_EXT3_FS_XATTR_USER ++ This option enables extended user attributes on ext3. Processes can ++ associate extended user attributes with inodes to store additional ++ information such as the character encoding of files, etc. (see the ++ attr(5) manual page, or visit for details). ++ ++ If unsure, say N. ++ ++Ext3 trusted extended attributes ++CONFIG_EXT3_FS_XATTR_TRUSTED ++ This option enables extended attributes on ext3 that are accessible ++ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this ++ is only the super user. Trusted extended attributes are meant for ++ implementing system/security services. ++ ++ If unsure, say N. 
++ + Journal Block Device support (JBD for ext3) (EXPERIMENTAL) + CONFIG_JBD + This is a generic journalling layer for block devices. It is +--- linux/arch/alpha/defconfig~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/arch/alpha/defconfig Fri May 16 08:43:00 2003 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_ALPHA=y + # CONFIG_UID16 is not set + # CONFIG_RWSEM_GENERIC_SPINLOCK is not set +--- linux/arch/alpha/kernel/entry.S~linux-2.4.20-xattr-0.8.54-hp Fri Aug 2 17:39:42 2002 ++++ linux-mmonroe/arch/alpha/kernel/entry.S Fri May 16 08:43:00 2003 +@@ -1154,6 +1154,18 @@ sys_call_table: + .quad sys_readahead + .quad sys_ni_syscall /* 380, sys_security */ + .quad sys_tkill ++ .quad sys_setxattr ++ .quad sys_lsetxattr ++ .quad sys_fsetxattr ++ .quad sys_getxattr /* 385 */ ++ .quad sys_lgetxattr ++ .quad sys_fgetxattr ++ .quad sys_listxattr ++ .quad sys_llistxattr ++ .quad sys_flistxattr /* 390 */ ++ .quad sys_removexattr ++ .quad sys_lremovexattr ++ .quad sys_fremovexattr + + /* Remember to update everything, kids. */ + .ifne (. 
- sys_call_table) - (NR_SYSCALLS * 8) +--- linux/arch/arm/defconfig~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/arch/arm/defconfig Fri May 16 08:43:00 2003 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_ARM=y + # CONFIG_EISA is not set + # CONFIG_SBUS is not set +--- linux/arch/arm/kernel/calls.S~linux-2.4.20-xattr-0.8.54-hp Fri Aug 2 17:39:42 2002 ++++ linux-mmonroe/arch/arm/kernel/calls.S Fri May 16 08:43:00 2003 +@@ -240,18 +240,18 @@ __syscall_start: + .long SYMBOL_NAME(sys_ni_syscall) /* Security */ + .long SYMBOL_NAME(sys_gettid) + /* 225 */ .long SYMBOL_NAME(sys_readahead) +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_setxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_lsetxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_fsetxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_getxattr */ +-/* 230 */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_lgetxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_fgetxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_listxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_llistxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_flistxattr */ +-/* 235 */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_removexattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_lremovexattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_fremovexattr */ ++ .long SYMBOL_NAME(sys_setxattr) ++ .long SYMBOL_NAME(sys_lsetxattr) ++ .long SYMBOL_NAME(sys_fsetxattr) ++ .long SYMBOL_NAME(sys_getxattr) ++/* 230 */ .long SYMBOL_NAME(sys_lgetxattr) ++ .long SYMBOL_NAME(sys_fgetxattr) ++ .long SYMBOL_NAME(sys_listxattr) ++ .long SYMBOL_NAME(sys_llistxattr) ++ .long SYMBOL_NAME(sys_flistxattr) ++/* 235 */ .long 
SYMBOL_NAME(sys_removexattr) ++ .long SYMBOL_NAME(sys_lremovexattr) ++ .long SYMBOL_NAME(sys_fremovexattr) + .long SYMBOL_NAME(sys_tkill) + /* + * Please check 2.5 _before_ adding calls here, +--- linux/arch/i386/defconfig~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/arch/i386/defconfig Fri May 16 08:43:00 2003 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_X86=y + CONFIG_ISA=y + # CONFIG_SBUS is not set +--- linux/arch/ia64/defconfig~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/arch/ia64/defconfig Fri May 16 08:43:00 2003 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + + # + # Code maturity level options +--- linux/arch/m68k/defconfig~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/arch/m68k/defconfig Fri May 16 08:43:00 2003 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_UID16=y + + # +--- linux/arch/mips/defconfig~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/arch/mips/defconfig Fri May 16 08:43:01 2003 +@@ -1,6 +1,13 @@ + # + # 
Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_MIPS=y + CONFIG_MIPS32=y + # CONFIG_MIPS64 is not set +--- linux/arch/mips64/defconfig~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/arch/mips64/defconfig Fri May 16 08:43:01 2003 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_MIPS=y + # CONFIG_MIPS32 is not set + CONFIG_MIPS64=y +--- linux/arch/ppc/defconfig~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/arch/ppc/defconfig Fri May 16 08:43:01 2003 +@@ -1,6 +1,20 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + # CONFIG_UID16 is not set + # CONFIG_RWSEM_GENERIC_SPINLOCK is not set + CONFIG_RWSEM_XCHGADD_ALGORITHM=y +--- linux/arch/ppc64/kernel/misc.S~linux-2.4.20-xattr-0.8.54-hp Thu Nov 28 15:53:11 2002 ++++ linux-mmonroe/arch/ppc64/kernel/misc.S Fri May 16 08:43:01 
2003 +@@ -731,6 +731,7 @@ _GLOBAL(sys_call_table32) + .llong .sys_gettid /* 207 */ + #if 0 /* Reserved syscalls */ + .llong .sys_tkill /* 208 */ ++#endif + .llong .sys_setxattr + .llong .sys_lsetxattr /* 210 */ + .llong .sys_fsetxattr +@@ -743,6 +744,7 @@ _GLOBAL(sys_call_table32) + .llong .sys_removexattr + .llong .sys_lremovexattr + .llong .sys_fremovexattr /* 220 */ ++#if 0 /* Reserved syscalls */ + .llong .sys_futex + #endif + .llong .sys_perfmonctl /* Put this here for now ... */ +--- linux/arch/s390/defconfig~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/arch/s390/defconfig Fri May 16 08:43:01 2003 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + # CONFIG_ISA is not set + # CONFIG_EISA is not set + # CONFIG_MCA is not set +--- linux/arch/s390/kernel/entry.S~linux-2.4.20-xattr-0.8.54-hp Thu Nov 28 15:53:11 2002 ++++ linux-mmonroe/arch/s390/kernel/entry.S Fri May 16 08:43:01 2003 +@@ -558,18 +558,18 @@ sys_call_table: + .long sys_fcntl64 + .long sys_ni_syscall + .long sys_ni_syscall +- .long sys_ni_syscall /* 224 - reserved for setxattr */ +- .long sys_ni_syscall /* 225 - reserved for lsetxattr */ +- .long sys_ni_syscall /* 226 - reserved for fsetxattr */ +- .long sys_ni_syscall /* 227 - reserved for getxattr */ +- .long sys_ni_syscall /* 228 - reserved for lgetxattr */ +- .long sys_ni_syscall /* 229 - reserved for fgetxattr */ +- .long sys_ni_syscall /* 230 - reserved for listxattr */ +- .long sys_ni_syscall /* 231 - reserved for llistxattr */ +- .long sys_ni_syscall /* 232 - reserved for flistxattr */ +- .long sys_ni_syscall /* 233 - reserved for removexattr */ +- .long sys_ni_syscall /* 234 - reserved for lremovexattr */ +- .long 
sys_ni_syscall /* 235 - reserved for fremovexattr */ ++ .long sys_setxattr ++ .long sys_lsetxattr /* 225 */ ++ .long sys_fsetxattr ++ .long sys_getxattr ++ .long sys_lgetxattr ++ .long sys_fgetxattr ++ .long sys_listxattr /* 230 */ ++ .long sys_llistxattr ++ .long sys_flistxattr ++ .long sys_removexattr ++ .long sys_lremovexattr ++ .long sys_fremovexattr /* 235 */ + .long sys_gettid + .long sys_tkill + .rept 255-237 +--- linux/arch/s390x/defconfig~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/arch/s390x/defconfig Fri May 16 08:43:01 2003 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + # CONFIG_ISA is not set + # CONFIG_EISA is not set + # CONFIG_MCA is not set +--- linux/arch/s390x/kernel/entry.S~linux-2.4.20-xattr-0.8.54-hp Thu Nov 28 15:53:11 2002 ++++ linux-mmonroe/arch/s390x/kernel/entry.S Fri May 16 08:43:01 2003 +@@ -591,18 +591,18 @@ sys_call_table: + .long SYSCALL(sys_ni_syscall,sys32_fcntl64_wrapper) + .long SYSCALL(sys_ni_syscall,sys_ni_syscall) + .long SYSCALL(sys_ni_syscall,sys_ni_syscall) +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 224 - reserved for setxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 225 - reserved for lsetxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 226 - reserved for fsetxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 227 - reserved for getxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 228 - reserved for lgetxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 229 - reserved for fgetxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 230 - reserved for listxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 231 - reserved for 
llistxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 232 - reserved for flistxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 233 - reserved for removexattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 234 - reserved for lremovexattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 235 - reserved for fremovexattr */ ++ .long SYSCALL(sys_setxattr,sys32_setxattr_wrapper) ++ .long SYSCALL(sys_lsetxattr,sys32_lsetxattr_wrapper) /* 225 */ ++ .long SYSCALL(sys_fsetxattr,sys32_fsetxattr_wrapper) ++ .long SYSCALL(sys_getxattr,sys32_getxattr_wrapper) ++ .long SYSCALL(sys_lgetxattr,sys32_lgetxattr_wrapper) ++ .long SYSCALL(sys_fgetxattr,sys32_fgetxattr_wrapper) ++ .long SYSCALL(sys_listxattr,sys32_listxattr_wrapper) /* 230 */ ++ .long SYSCALL(sys_llistxattr,sys32_llistxattr_wrapper) ++ .long SYSCALL(sys_flistxattr,sys32_flistxattr_wrapper) ++ .long SYSCALL(sys_removexattr,sys32_removexattr_wrapper) ++ .long SYSCALL(sys_lremovexattr,sys32_lremovexattr_wrapper) ++ .long SYSCALL(sys_fremovexattr,sys32_fremovexattr_wrapper)/* 235 */ + .long SYSCALL(sys_gettid,sys_gettid) + .long SYSCALL(sys_tkill,sys_tkill) + .rept 255-237 +--- linux/arch/s390x/kernel/wrapper32.S~linux-2.4.20-xattr-0.8.54-hp Mon Feb 25 11:37:56 2002 ++++ linux-mmonroe/arch/s390x/kernel/wrapper32.S Fri May 16 08:43:01 2003 +@@ -1091,3 +1091,95 @@ sys32_fstat64_wrapper: + llgtr %r3,%r3 # struct stat64 * + llgfr %r4,%r4 # long + jg sys32_fstat64 # branch to system call ++ ++ .globl sys32_setxattr_wrapper ++sys32_setxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ lgfr %r6,%r6 # int ++ jg sys_setxattr ++ ++ .globl sys32_lsetxattr_wrapper ++sys32_lsetxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ lgfr %r6,%r6 # int ++ jg sys_lsetxattr ++ ++ .globl sys32_fsetxattr_wrapper ++sys32_fsetxattr_wrapper: ++ lgfr %r2,%r2 # int ++ llgtr %r3,%r3 
# char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ lgfr %r6,%r6 # int ++ jg sys_fsetxattr ++ ++ .globl sys32_getxattr_wrapper ++sys32_getxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ jg sys_getxattr ++ ++ .globl sys32_lgetxattr_wrapper ++sys32_lgetxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ jg sys_lgetxattr ++ ++ .globl sys32_fgetxattr_wrapper ++sys32_fgetxattr_wrapper: ++ lgfr %r2,%r2 # int ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ jg sys_fgetxattr ++ ++ .globl sys32_listxattr_wrapper ++sys32_listxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgfr %r4,%r4 # size_t ++ jg sys_listxattr ++ ++ .globl sys32_llistxattr_wrapper ++sys32_llistxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgfr %r4,%r4 # size_t ++ jg sys_llistxattr ++ ++ .globl sys32_flistxattr_wrapper ++sys32_flistxattr_wrapper: ++ lgfr %r2,%r2 # int ++ llgtr %r3,%r3 # char * ++ llgfr %r4,%r4 # size_t ++ jg sys_flistxattr ++ ++ .globl sys32_removexattr_wrapper ++sys32_removexattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ jg sys_removexattr ++ ++ .globl sys32_lremovexattr_wrapper ++sys32_lremovexattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ jg sys_lremovexattr ++ ++ .globl sys32_fremovexattr_wrapper ++sys32_fremovexattr_wrapper: ++ lgfr %r2,%r2 # int ++ llgtr %r3,%r3 # char * ++ jg sys_fremovexattr ++ ++ +--- linux/arch/sparc/defconfig~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/arch/sparc/defconfig Fri May 16 08:43:01 2003 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# 
CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_UID16=y + CONFIG_HIGHMEM=y + +--- linux/arch/sparc/kernel/systbls.S~linux-2.4.20-xattr-0.8.54-hp Fri Aug 2 17:39:43 2002 ++++ linux-mmonroe/arch/sparc/kernel/systbls.S Fri May 16 08:43:01 2003 +@@ -51,11 +51,11 @@ sys_call_table: + /*150*/ .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 + /*155*/ .long sys_fcntl64, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount + /*160*/ .long sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall +-/*165*/ .long sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall +-/*170*/ .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getdents +-/*175*/ .long sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall +-/*180*/ .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_sigpending, sys_query_module +-/*185*/ .long sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sys_newuname ++/*165*/ .long sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_setxattr ++/*170*/ .long sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys_getdents ++/*175*/ .long sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr ++/*180*/ .long sys_flistxattr, sys_removexattr, sys_lremovexattr, sys_sigpending, sys_query_module ++/*185*/ .long sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sys_newuname + /*190*/ .long sys_init_module, sys_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall + /*195*/ .long sys_nis_syscall, sys_nis_syscall, sys_getppid, sparc_sigaction, sys_sgetmask + /*200*/ .long sys_ssetmask, sys_sigsuspend, sys_newlstat, sys_uselib, old_readdir +--- linux/arch/sparc64/defconfig~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/arch/sparc64/defconfig Fri May 16 08:43:01 2003 +@@ -1,6 +1,13 @@ + # + # 
Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + + # + # Code maturity level options +--- linux/arch/sparc64/kernel/systbls.S~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/arch/sparc64/kernel/systbls.S Fri May 16 08:43:01 2003 +@@ -52,11 +52,11 @@ sys_call_table32: + /*150*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 + .word sys32_fcntl64, sys_nis_syscall, sys32_statfs, sys32_fstatfs, sys_oldumount + /*160*/ .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall +- .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_nis_syscall +-/*170*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_getdents +- .word sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall +-/*180*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_sigpending, sys32_query_module +- .word sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sparc64_newuname ++ .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_setxattr ++/*170*/ .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys32_getdents ++ .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr ++/*180*/ .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys32_sigpending, sys32_query_module ++ .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sparc64_newuname + /*190*/ .word sys32_init_module, sparc64_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall + .word sys_nis_syscall, sys_nis_syscall, sys_getppid, sys32_sigaction, sys_sgetmask + /*200*/ .word sys_ssetmask, sys_sigsuspend, sys32_newlstat, 
sys_uselib, old32_readdir +@@ -111,11 +111,11 @@ sys_call_table: + /*150*/ .word sys_getsockname, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 + .word sys_nis_syscall, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount + /*160*/ .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_utrap_install +- .word sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall +-/*170*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getdents +- .word sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall +-/*180*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_query_module +- .word sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sparc64_newuname ++ .word sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_setxattr ++/*170*/ .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys_getdents ++ .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr ++/*180*/ .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys_nis_syscall, sys_query_module ++ .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sparc64_newuname + /*190*/ .word sys_init_module, sparc64_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall + .word sys_nis_syscall, sys_nis_syscall, sys_getppid, sys_nis_syscall, sys_sgetmask + /*200*/ .word sys_ssetmask, sys_nis_syscall, sys_newlstat, sys_uselib, sys_nis_syscall +--- linux/fs/Config.in~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:14 2003 ++++ linux-mmonroe/fs/Config.in Fri May 16 08:43:01 2003 +@@ -35,6 +35,11 @@ dep_mbool ' Debug Befs' CONFIG_BEFS_DEB + dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL + + tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS ++dep_mbool ' Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS ++dep_bool ' Ext3 extended attribute block sharing' \ ++ 
CONFIG_EXT3_FS_XATTR_SHARING $CONFIG_EXT3_FS_XATTR ++dep_bool ' Ext3 extended user attributes' \ ++ CONFIG_EXT3_FS_XATTR_USER $CONFIG_EXT3_FS_XATTR + # CONFIG_JBD could be its own option (even modular), but until there are + # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS + # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS +@@ -98,6 +103,11 @@ dep_mbool ' QNX4FS write support (DANGE + tristate 'ROM file system support' CONFIG_ROMFS_FS + + tristate 'Second extended fs support' CONFIG_EXT2_FS ++dep_mbool ' Ext2 extended attributes' CONFIG_EXT2_FS_XATTR $CONFIG_EXT2_FS ++dep_bool ' Ext2 extended attribute block sharing' \ ++ CONFIG_EXT2_FS_XATTR_SHARING $CONFIG_EXT2_FS_XATTR ++dep_bool ' Ext2 extended user attributes' \ ++ CONFIG_EXT2_FS_XATTR_USER $CONFIG_EXT2_FS_XATTR + + tristate 'System V/Xenix/V7/Coherent file system support' CONFIG_SYSV_FS + +@@ -176,6 +186,10 @@ else + define_tristate CONFIG_ZISOFS_FS n + fi + ++# Meta block cache for Extended Attributes (ext2/ext3) ++#tristate 'Meta block cache' CONFIG_FS_MBCACHE ++define_tristate CONFIG_FS_MBCACHE y ++ + mainmenu_option next_comment + comment 'Partition Types' + source fs/partitions/Config.in +--- linux/fs/Makefile~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:42:46 2003 ++++ linux-mmonroe/fs/Makefile Fri May 16 08:43:01 2003 +@@ -80,6 +80,9 @@ obj-y += binfmt_script.o + + obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o + ++export-objs += mbcache.o ++obj-$(CONFIG_FS_MBCACHE) += mbcache.o ++ + # persistent filesystems + obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) + +--- linux/fs/ext2/Makefile~linux-2.4.20-xattr-0.8.54-hp Thu Oct 11 08:05:18 2001 ++++ linux-mmonroe/fs/ext2/Makefile Fri May 16 08:43:01 2003 +@@ -13,4 +13,8 @@ obj-y := balloc.o bitmap.o dir.o file + ioctl.o namei.o super.o symlink.o + obj-m := $(O_TARGET) + ++export-objs += xattr.o ++obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o ++obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o ++ + include 
$(TOPDIR)/Rules.make +--- linux/fs/ext2/file.c~linux-2.4.20-xattr-0.8.54-hp Thu Oct 11 08:05:18 2001 ++++ linux-mmonroe/fs/ext2/file.c Fri May 16 08:43:01 2003 +@@ -20,6 +20,7 @@ + + #include + #include ++#include + #include + + /* +@@ -51,4 +52,8 @@ struct file_operations ext2_file_operati + + struct inode_operations ext2_file_inode_operations = { + truncate: ext2_truncate, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, + }; +--- linux/fs/ext2/ialloc.c~linux-2.4.20-xattr-0.8.54-hp Thu Nov 28 15:53:15 2002 ++++ linux-mmonroe/fs/ext2/ialloc.c Fri May 16 08:43:01 2003 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -167,6 +168,7 @@ void ext2_free_inode (struct inode * ino + */ + if (!is_bad_inode(inode)) { + /* Quota is already initialized in iput() */ ++ ext2_xattr_delete_inode(inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + } +--- linux/fs/ext2/inode.c~linux-2.4.20-xattr-0.8.54-hp Thu Nov 28 15:53:15 2002 ++++ linux-mmonroe/fs/ext2/inode.c Fri May 16 08:43:01 2003 +@@ -39,6 +39,18 @@ MODULE_LICENSE("GPL"); + static int ext2_update_inode(struct inode * inode, int do_sync); + + /* ++ * Test whether an inode is a fast symlink. ++ */ ++static inline int ext2_inode_is_fast_symlink(struct inode *inode) ++{ ++ int ea_blocks = inode->u.ext2_i.i_file_acl ? 
++ (inode->i_sb->s_blocksize >> 9) : 0; ++ ++ return (S_ISLNK(inode->i_mode) && ++ inode->i_blocks - ea_blocks == 0); ++} ++ ++/* + * Called at each iput() + */ + void ext2_put_inode (struct inode * inode) +@@ -53,9 +65,7 @@ void ext2_delete_inode (struct inode * i + { + lock_kernel(); + +- if (is_bad_inode(inode) || +- inode->i_ino == EXT2_ACL_IDX_INO || +- inode->i_ino == EXT2_ACL_DATA_INO) ++ if (is_bad_inode(inode)) + goto no_delete; + inode->u.ext2_i.i_dtime = CURRENT_TIME; + mark_inode_dirty(inode); +@@ -801,6 +811,8 @@ void ext2_truncate (struct inode * inode + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; ++ if (ext2_inode_is_fast_symlink(inode)) ++ return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +@@ -888,8 +900,7 @@ void ext2_read_inode (struct inode * ino + unsigned long offset; + struct ext2_group_desc * gdp; + +- if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO && +- inode->i_ino != EXT2_ACL_DATA_INO && ++ if ((inode->i_ino != EXT2_ROOT_INO && + inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) { + ext2_error (inode->i_sb, "ext2_read_inode", +@@ -974,10 +985,7 @@ void ext2_read_inode (struct inode * ino + for (block = 0; block < EXT2_N_BLOCKS; block++) + inode->u.ext2_i.i_data[block] = raw_inode->i_block[block]; + +- if (inode->i_ino == EXT2_ACL_IDX_INO || +- inode->i_ino == EXT2_ACL_DATA_INO) +- /* Nothing to do */ ; +- else if (S_ISREG(inode->i_mode)) { ++ if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext2_file_inode_operations; + inode->i_fop = &ext2_file_operations; + inode->i_mapping->a_ops = &ext2_aops; +@@ -986,15 +994,17 @@ void ext2_read_inode (struct inode * ino + inode->i_fop = &ext2_dir_operations; + inode->i_mapping->a_ops = &ext2_aops; + } else if (S_ISLNK(inode->i_mode)) { +- if (!inode->i_blocks) ++ if (ext2_inode_is_fast_symlink(inode)) + inode->i_op = 
&ext2_fast_symlink_inode_operations; + else { +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext2_symlink_inode_operations; + inode->i_mapping->a_ops = &ext2_aops; + } +- } else ++ } else { ++ inode->i_op = &ext2_special_inode_operations; + init_special_inode(inode, inode->i_mode, + le32_to_cpu(raw_inode->i_block[0])); ++ } + brelse (bh); + inode->i_attr_flags = 0; + if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) { +--- linux/fs/ext2/namei.c~linux-2.4.20-xattr-0.8.54-hp Wed Oct 3 22:57:36 2001 ++++ linux-mmonroe/fs/ext2/namei.c Fri May 16 08:43:01 2003 +@@ -31,6 +31,7 @@ + + #include + #include ++#include + #include + + /* +@@ -136,7 +137,7 @@ static int ext2_symlink (struct inode * + + if (l > sizeof (inode->u.ext2_i.i_data)) { + /* slow symlink */ +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext2_symlink_inode_operations; + inode->i_mapping->a_ops = &ext2_aops; + err = block_symlink(inode, symname, l); + if (err) +@@ -345,4 +346,15 @@ struct inode_operations ext2_dir_inode_o + rmdir: ext2_rmdir, + mknod: ext2_mknod, + rename: ext2_rename, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, ++}; ++ ++struct inode_operations ext2_special_inode_operations = { ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, + }; +--- linux/fs/ext2/super.c~linux-2.4.20-xattr-0.8.54-hp Thu Nov 28 15:53:15 2002 ++++ linux-mmonroe/fs/ext2/super.c Fri May 16 08:43:01 2003 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -125,6 +126,7 @@ void ext2_put_super (struct super_block + int db_count; + int i; + ++ ext2_xattr_put_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { + struct ext2_super_block *es = EXT2_SB(sb)->s_es; + +@@ -175,6 +177,13 @@ static int parse_options (char * options + this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) 
+ *value++ = 0; ++#ifdef CONFIG_EXT2_FS_XATTR_USER ++ if (!strcmp (this_char, "user_xattr")) ++ set_opt (*mount_options, XATTR_USER); ++ else if (!strcmp (this_char, "nouser_xattr")) ++ clear_opt (*mount_options, XATTR_USER); ++ else ++#endif + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -424,6 +433,9 @@ struct super_block * ext2_read_super (st + blocksize = BLOCK_SIZE; + + sb->u.ext2_sb.s_mount_opt = 0; ++#ifdef CONFIG_EXT2_FS_XATTR_USER ++ /* set_opt (sb->u.ext2_sb.s_mount_opt, XATTR_USER); */ ++#endif + if (!parse_options ((char *) data, &sb_block, &resuid, &resgid, + &sb->u.ext2_sb.s_mount_opt)) { + return NULL; +@@ -813,12 +825,27 @@ static DECLARE_FSTYPE_DEV(ext2_fs_type, + + static int __init init_ext2_fs(void) + { +- return register_filesystem(&ext2_fs_type); ++ int error = init_ext2_xattr(); ++ if (error) ++ return error; ++ error = init_ext2_xattr_user(); ++ if (error) ++ goto fail; ++ error = register_filesystem(&ext2_fs_type); ++ if (!error) ++ return 0; ++ ++ exit_ext2_xattr_user(); ++fail: ++ exit_ext2_xattr(); ++ return error; + } + + static void __exit exit_ext2_fs(void) + { + unregister_filesystem(&ext2_fs_type); ++ exit_ext2_xattr_user(); ++ exit_ext2_xattr(); + } + + EXPORT_NO_SYMBOLS; +--- linux/fs/ext2/symlink.c~linux-2.4.20-xattr-0.8.54-hp Wed Sep 27 13:41:33 2000 ++++ linux-mmonroe/fs/ext2/symlink.c Fri May 16 08:43:01 2003 +@@ -19,6 +19,7 @@ + + #include + #include ++#include + + static int ext2_readlink(struct dentry *dentry, char *buffer, int buflen) + { +@@ -32,7 +33,20 @@ static int ext2_follow_link(struct dentr + return vfs_follow_link(nd, s); + } + ++struct inode_operations ext2_symlink_inode_operations = { ++ readlink: page_readlink, ++ follow_link: page_follow_link, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, ++}; ++ + struct inode_operations ext2_fast_symlink_inode_operations = { 
+ readlink: ext2_readlink, + follow_link: ext2_follow_link, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, + }; +--- /dev/null Mon May 20 21:11:23 2002 ++++ linux-mmonroe/fs/ext2/xattr.c Fri May 16 08:43:01 2003 +@@ -0,0 +1,1212 @@ ++/* ++ * linux/fs/ext2/xattr.c ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ * ++ * Fix by Harrison Xing . ++ * Extended attributes for symlinks and special files added per ++ * suggestion of Luka Renko . ++ */ ++ ++/* ++ * Extended attributes are stored on disk blocks allocated outside of ++ * any inode. The i_file_acl field is then made to point to this allocated ++ * block. If all extended attributes of an inode are identical, these ++ * inodes may share the same extended attribute block. Such situations ++ * are automatically detected by keeping a cache of recent attribute block ++ * numbers and hashes over the block's contents in memory. ++ * ++ * ++ * Extended attribute block layout: ++ * ++ * +------------------+ ++ * | header | ++ * | entry 1 | | ++ * | entry 2 | | growing downwards ++ * | entry 3 | v ++ * | four null bytes | ++ * | . . . | ++ * | value 1 | ^ ++ * | value 3 | | growing upwards ++ * | value 2 | | ++ * +------------------+ ++ * ++ * The block header is followed by multiple entry descriptors. These entry ++ * descriptors are variable in size, and aligned to EXT2_XATTR_PAD ++ * byte boundaries. The entry descriptors are sorted by attribute name, ++ * so that two extended attribute blocks can be compared efficiently. ++ * ++ * Attribute values are aligned to the end of the block, stored in ++ * no specific order. They are also padded to EXT2_XATTR_PAD byte ++ * boundaries. No additional gaps are left between them. 
++ * ++ * Locking strategy ++ * ---------------- ++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of ++ * the xattr inode operations are called, so we are guaranteed that only one ++ * process accesses extended attributes of an inode at any time. ++ * ++ * For writing we also grab the ext2_xattr_sem semaphore. This ensures that ++ * only a single process is modifying an extended attribute block, even ++ * if the block is shared among inodes. ++ * ++ * Note for porting to 2.5 ++ * ----------------------- ++ * The BKL will no longer be held in the xattr inode operations. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* These symbols may be needed by a module. */ ++EXPORT_SYMBOL(ext2_xattr_register); ++EXPORT_SYMBOL(ext2_xattr_unregister); ++EXPORT_SYMBOL(ext2_xattr_get); ++EXPORT_SYMBOL(ext2_xattr_list); ++EXPORT_SYMBOL(ext2_xattr_set); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1) ++#endif ++ ++#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data)) ++#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr)) ++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) ++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) ++ ++#ifdef EXT2_XATTR_DEBUG ++# define ea_idebug(inode, f...) do { \ ++ printk(KERN_DEBUG "inode %s:%ld: ", \ ++ kdevname(inode->i_dev), inode->i_ino); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++# define ea_bdebug(bh, f...) do { \ ++ printk(KERN_DEBUG "block %s:%ld: ", \ ++ kdevname(bh->b_dev), bh->b_blocknr); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++#else ++# define ea_idebug(f...) ++# define ea_bdebug(f...) 
++#endif ++ ++static int ext2_xattr_set2(struct inode *, struct buffer_head *, ++ struct ext2_xattr_header *); ++ ++#ifdef CONFIG_EXT2_FS_XATTR_SHARING ++ ++static int ext2_xattr_cache_insert(struct buffer_head *); ++static struct buffer_head *ext2_xattr_cache_find(struct inode *, ++ struct ext2_xattr_header *); ++static void ext2_xattr_cache_remove(struct buffer_head *); ++static void ext2_xattr_rehash(struct ext2_xattr_header *, ++ struct ext2_xattr_entry *); ++ ++static struct mb_cache *ext2_xattr_cache; ++ ++#else ++# define ext2_xattr_cache_insert(bh) 0 ++# define ext2_xattr_cache_find(inode, header) NULL ++# define ext2_xattr_cache_remove(bh) while(0) {} ++# define ext2_xattr_rehash(header, entry) while(0) {} ++#endif ++ ++/* ++ * If a file system does not share extended attributes among inodes, ++ * we should not need the ext2_xattr_sem semaphore. However, the ++ * filesystem may still contain shared blocks, so we always take ++ * the lock. ++ */ ++ ++DECLARE_MUTEX(ext2_xattr_sem); ++ ++static inline int ++ext2_xattr_new_block(struct inode *inode, int * errp, int force) ++{ ++ struct super_block *sb = inode->i_sb; ++ int goal = le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block) + ++ EXT2_I(inode)->i_block_group * EXT2_BLOCKS_PER_GROUP(sb); ++ ++ /* How can we enforce the allocation? */ ++ int block = ext2_new_block(inode, goal, 0, 0, errp); ++#ifdef OLD_QUOTAS ++ if (!*errp) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#endif ++ return block; ++} ++ ++static inline int ++ext2_xattr_quota_alloc(struct inode *inode, int force) ++{ ++ /* How can we enforce the allocation? 
*/ ++#ifdef OLD_QUOTAS ++ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1); ++ if (!error) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#else ++ int error = DQUOT_ALLOC_BLOCK(inode, 1); ++#endif ++ return error; ++} ++ ++#ifdef OLD_QUOTAS ++ ++static inline void ++ext2_xattr_quota_free(struct inode *inode) ++{ ++ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1); ++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; ++} ++ ++static inline void ++ext2_xattr_free_block(struct inode * inode, unsigned long block) ++{ ++ ext2_free_blocks(inode, block, 1); ++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; ++} ++ ++#else ++# define ext2_xattr_quota_free(inode) \ ++ DQUOT_FREE_BLOCK(inode, 1) ++# define ext2_xattr_free_block(inode, block) \ ++ ext2_free_blocks(inode, block, 1) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) ++ ++static inline struct buffer_head * ++sb_bread(struct super_block *sb, int block) ++{ ++ return bread(sb->s_dev, block, sb->s_blocksize); ++} ++ ++static inline struct buffer_head * ++sb_getblk(struct super_block *sb, int block) ++{ ++ return getblk(sb->s_dev, block, sb->s_blocksize); ++} ++ ++#endif ++ ++struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX]; ++rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED; ++ ++int ++ext2_xattr_register(int name_index, struct ext2_xattr_handler *handler) ++{ ++ int error = -EINVAL; ++ ++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { ++ write_lock(&ext2_handler_lock); ++ if (!ext2_xattr_handlers[name_index-1]) { ++ ext2_xattr_handlers[name_index-1] = handler; ++ error = 0; ++ } ++ write_unlock(&ext2_handler_lock); ++ } ++ return error; ++} ++ ++void ++ext2_xattr_unregister(int name_index, struct ext2_xattr_handler *handler) ++{ ++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { ++ write_lock(&ext2_handler_lock); ++ ext2_xattr_handlers[name_index-1] = NULL; ++ write_unlock(&ext2_handler_lock); ++ } ++} ++ ++static inline const char * ++strcmp_prefix(const 
char *a, const char *a_prefix) ++{ ++ while (*a_prefix && *a == *a_prefix) { ++ a++; ++ a_prefix++; ++ } ++ return *a_prefix ? NULL : a; ++} ++ ++/* ++ * Decode the extended attribute name, and translate it into ++ * the name_index and name suffix. ++ */ ++static struct ext2_xattr_handler * ++ext2_xattr_resolve_name(const char **name) ++{ ++ struct ext2_xattr_handler *handler = NULL; ++ int i; ++ ++ if (!*name) ++ return NULL; ++ read_lock(&ext2_handler_lock); ++ for (i=0; i<EXT2_XATTR_INDEX_MAX; i++) { ++ if (ext2_xattr_handlers[i]) { ++ const char *n = strcmp_prefix(*name, ++ ext2_xattr_handlers[i]->prefix); ++ if (n) { ++ handler = ext2_xattr_handlers[i]; ++ *name = n; ++ break; ++ } ++ } ++ } ++ read_unlock(&ext2_handler_lock); ++ return handler; ++} ++ ++static inline struct ext2_xattr_handler * ++ext2_xattr_handler(int name_index) ++{ ++ struct ext2_xattr_handler *handler = NULL; ++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { ++ read_lock(&ext2_handler_lock); ++ handler = ext2_xattr_handlers[name_index-1]; ++ read_unlock(&ext2_handler_lock); ++ } ++ return handler; ++} ++ ++/* ++ * Inode operation getxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++ssize_t ++ext2_getxattr(struct dentry *dentry, const char *name, ++ void *buffer, size_t size) ++{ ++ struct ext2_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext2_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->get(inode, name, buffer, size); ++} ++ ++/* ++ * Inode operation listxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++ssize_t ++ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) ++{ ++ return ext2_xattr_list(dentry->d_inode, buffer, size); ++} ++ ++/* ++ * Inode operation setxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext2_setxattr(struct dentry *dentry, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ struct ext2_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ if (size == 0) 
++ value = ""; /* empty EA, do not remove */ ++ handler = ext2_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, value, size, flags); ++} ++ ++/* ++ * Inode operation removexattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext2_removexattr(struct dentry *dentry, const char *name) ++{ ++ struct ext2_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext2_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, NULL, 0, XATTR_REPLACE); ++} ++ ++/* ++ * ext2_xattr_get() ++ * ++ * Copy an extended attribute into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. ++ */ ++int ++ext2_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext2_xattr_entry *entry; ++ unsigned int block, size; ++ char *end; ++ int name_len, error; ++ ++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", ++ name_index, name, buffer, (long)buffer_size); ++ ++ if (name == NULL) ++ return -EINVAL; ++ if (!EXT2_I(inode)->i_file_acl) ++ return -ENOATTR; ++ block = EXT2_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* find named attribute */ ++ name_len = strlen(name); ++ ++ 
error = -ERANGE; ++ if (name_len > 255) ++ goto cleanup; ++ entry = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext2_xattr_entry *next = ++ EXT2_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (name_index == entry->e_name_index && ++ name_len == entry->e_name_len && ++ memcmp(name, entry->e_name, name_len) == 0) ++ goto found; ++ entry = next; ++ } ++ /* Check the remaining name entries */ ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext2_xattr_entry *next = ++ EXT2_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ entry = next; ++ } ++ if (ext2_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ error = -ENOATTR; ++ goto cleanup; ++found: ++ /* check the buffer size */ ++ if (entry->e_value_block != 0) ++ goto bad_block; ++ size = le32_to_cpu(entry->e_value_size); ++ if (size > inode->i_sb->s_blocksize || ++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) ++ goto bad_block; ++ ++ if (ext2_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (buffer) { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ /* return value of attribute */ ++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), ++ size); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * ext2_xattr_list() ++ * ++ * Copy a list of attribute names into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. 
++ */ ++int ++ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext2_xattr_entry *entry; ++ unsigned int block, size = 0; ++ char *buf, *end; ++ int error; ++ ++ ea_idebug(inode, "buffer=%p, buffer_size=%ld", ++ buffer, (long)buffer_size); ++ ++ if (!EXT2_I(inode)->i_file_acl) ++ return 0; ++ block = EXT2_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* compute the size required for the list of attribute names */ ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT2_XATTR_NEXT(entry)) { ++ struct ext2_xattr_handler *handler; ++ struct ext2_xattr_entry *next = ++ EXT2_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ ++ handler = ext2_xattr_handler(entry->e_name_index); ++ if (handler) ++ size += handler->list(NULL, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ ++ if (ext2_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (!buffer) { ++ error = size; ++ goto cleanup; ++ } else { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ } ++ ++ /* list the attribute names */ ++ buf = buffer; ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT2_XATTR_NEXT(entry)) { ++ struct ext2_xattr_handler *handler; ++ ++ handler = ext2_xattr_handler(entry->e_name_index); ++ if (handler) ++ buf += handler->list(buf, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* 
++ * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is ++ * not set, set it. ++ */ ++static void ext2_xattr_update_super_block(struct super_block *sb) ++{ ++ if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR)) ++ return; ++ ++ lock_super(sb); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++ EXT2_SB(sb)->s_feature_compat |= EXT2_FEATURE_COMPAT_EXT_ATTR; ++#endif ++ EXT2_SB(sb)->s_es->s_feature_compat |= ++ cpu_to_le32(EXT2_FEATURE_COMPAT_EXT_ATTR); ++ sb->s_dirt = 1; ++ mark_buffer_dirty(EXT2_SB(sb)->s_sbh); ++ unlock_super(sb); ++} ++ ++/* ++ * ext2_xattr_set() ++ * ++ * Create, replace or remove an extended attribute for this inode. Buffer ++ * is NULL to remove an existing extended attribute, and non-NULL to ++ * either replace an existing extended attribute, or create a new extended ++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE ++ * specify that an extended attribute must exist and must not exist ++ * previous to the call, respectively. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++int ++ext2_xattr_set(struct inode *inode, int name_index, const char *name, ++ const void *value, size_t value_len, int flags) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *bh = NULL; ++ struct ext2_xattr_header *header = NULL; ++ struct ext2_xattr_entry *here, *last; ++ unsigned int name_len; ++ int block = EXT2_I(inode)->i_file_acl; ++ int min_offs = sb->s_blocksize, not_found = 1, free, error; ++ char *end; ++ ++ /* ++ * header -- Points either into bh, or to a temporarily ++ * allocated buffer. ++ * here -- The named entry found, or the place for inserting, within ++ * the block pointed to by header. ++ * last -- Points right after the last named entry within the block ++ * pointed to by header. ++ * min_offs -- The offset of the first value (values are aligned ++ * towards the end of the block). ++ * end -- Points right after the block pointed to by header. 
++ */ ++ ++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", ++ name_index, name, value, (long)value_len); ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) ++ return -EPERM; ++ if (value == NULL) ++ value_len = 0; ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255 || value_len > sb->s_blocksize) ++ return -ERANGE; ++ down(&ext2_xattr_sem); ++ ++ if (block) { ++ /* The inode already has an extended attribute block. */ ++ ++ bh = sb_bread(sb, block); ++ error = -EIO; ++ if (!bh) ++ goto cleanup; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), ++ le32_to_cpu(HDR(bh)->h_refcount)); ++ header = HDR(bh); ++ end = bh->b_data + bh->b_size; ++ if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ header->h_blocks != cpu_to_le32(1)) { ++bad_block: ext2_error(sb, "ext2_xattr_set", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* Find the named attribute. */ ++ here = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(here)) { ++ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!here->e_value_block && here->e_value_size) { ++ int offs = le16_to_cpu(here->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ not_found = name_index - here->e_name_index; ++ if (!not_found) ++ not_found = name_len - here->e_name_len; ++ if (!not_found) ++ not_found = memcmp(name, here->e_name,name_len); ++ if (not_found <= 0) ++ break; ++ here = next; ++ } ++ last = here; ++ /* We still need to compute min_offs and last. 
*/ ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!last->e_value_block && last->e_value_size) { ++ int offs = le16_to_cpu(last->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ last = next; ++ } ++ ++ /* Check whether we have enough space left. */ ++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); ++ } else { ++ /* We will use a new extended attribute block. */ ++ free = sb->s_blocksize - ++ sizeof(struct ext2_xattr_header) - sizeof(__u32); ++ here = last = NULL; /* avoid gcc uninitialized warning. */ ++ } ++ ++ if (not_found) { ++ /* Request to remove a nonexistent attribute? */ ++ error = -ENOATTR; ++ if (flags & XATTR_REPLACE) ++ goto cleanup; ++ error = 0; ++ if (value == NULL) ++ goto cleanup; ++ else ++ free -= EXT2_XATTR_LEN(name_len); ++ } else { ++ /* Request to create an existing attribute? */ ++ error = -EEXIST; ++ if (flags & XATTR_CREATE) ++ goto cleanup; ++ if (!here->e_value_block && here->e_value_size) { ++ unsigned int size = le32_to_cpu(here->e_value_size); ++ ++ if (le16_to_cpu(here->e_value_offs) + size > ++ sb->s_blocksize || size > sb->s_blocksize) ++ goto bad_block; ++ free += EXT2_XATTR_SIZE(size); ++ } ++ } ++ free -= EXT2_XATTR_SIZE(value_len); ++ error = -ENOSPC; ++ if (free < 0) ++ goto cleanup; ++ ++ /* Here we know that we can set the new attribute. 
*/ ++ ++ if (header) { ++ if (header->h_refcount == cpu_to_le32(1)) { ++ ea_bdebug(bh, "modifying in-place"); ++ ext2_xattr_cache_remove(bh); ++ } else { ++ int offset; ++ ++ ea_bdebug(bh, "cloning"); ++ header = kmalloc(bh->b_size, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memcpy(header, HDR(bh), bh->b_size); ++ header->h_refcount = cpu_to_le32(1); ++ offset = (char *)header - bh->b_data; ++ here = ENTRY((char *)here + offset); ++ last = ENTRY((char *)last + offset); ++ } ++ } else { ++ /* Allocate a buffer where we construct the new block. */ ++ header = kmalloc(sb->s_blocksize, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memset(header, 0, sb->s_blocksize); ++ end = (char *)header + sb->s_blocksize; ++ header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC); ++ header->h_blocks = header->h_refcount = cpu_to_le32(1); ++ last = here = ENTRY(header+1); ++ } ++ ++ if (not_found) { ++ /* Insert the new name. */ ++ int size = EXT2_XATTR_LEN(name_len); ++ int rest = (char *)last - (char *)here; ++ memmove((char *)here + size, here, rest); ++ memset(here, 0, size); ++ here->e_name_index = name_index; ++ here->e_name_len = name_len; ++ memcpy(here->e_name, name, name_len); ++ } else { ++ /* Remove the old value. */ ++ if (!here->e_value_block && here->e_value_size) { ++ char *first_val = (char *)header + min_offs; ++ int offs = le16_to_cpu(here->e_value_offs); ++ char *val = (char *)header + offs; ++ size_t size = EXT2_XATTR_SIZE( ++ le32_to_cpu(here->e_value_size)); ++ memmove(first_val + size, first_val, val - first_val); ++ memset(first_val, 0, size); ++ here->e_value_offs = 0; ++ min_offs += size; ++ ++ /* Adjust all value offsets. 
*/ ++ last = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(last)) { ++ int o = le16_to_cpu(last->e_value_offs); ++ if (!last->e_value_block && o < offs) ++ last->e_value_offs = ++ cpu_to_le16(o + size); ++ last = EXT2_XATTR_NEXT(last); ++ } ++ } ++ if (value == NULL) { ++ /* Remove this attribute. */ ++ if (EXT2_XATTR_NEXT(ENTRY(header+1)) == last) { ++ /* This block is now empty. */ ++ error = ext2_xattr_set2(inode, bh, NULL); ++ goto cleanup; ++ } else { ++ /* Remove the old name. */ ++ int size = EXT2_XATTR_LEN(name_len); ++ last = ENTRY((char *)last - size); ++ memmove(here, (char*)here + size, ++ (char*)last - (char*)here); ++ memset(last, 0, size); ++ } ++ } ++ } ++ ++ if (value != NULL) { ++ /* Insert the new value. */ ++ here->e_value_size = cpu_to_le32(value_len); ++ if (value_len) { ++ size_t size = EXT2_XATTR_SIZE(value_len); ++ char *val = (char *)header + min_offs - size; ++ here->e_value_offs = ++ cpu_to_le16((char *)val - (char *)header); ++ memset(val + size - EXT2_XATTR_PAD, 0, ++ EXT2_XATTR_PAD); /* Clear the pad bytes. */ ++ memcpy(val, value, value_len); ++ } ++ } ++ ext2_xattr_rehash(header, here); ++ ++ error = ext2_xattr_set2(inode, bh, header); ++ ++cleanup: ++ brelse(bh); ++ if (!(bh && header == HDR(bh))) ++ kfree(header); ++ up(&ext2_xattr_sem); ++ ++ return error; ++} ++ ++/* ++ * Second half of ext2_xattr_set(): Update the file system. ++ */ ++static int ++ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, ++ struct ext2_xattr_header *header) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *new_bh = NULL; ++ int error; ++ ++ if (header) { ++ new_bh = ext2_xattr_cache_find(inode, header); ++ if (new_bh) { ++ /* ++ * We found an identical block in the cache. ++ * The old block will be released after updating ++ * the inode. 
++ */ ++ ea_bdebug(old_bh, "reusing block %ld", ++ new_bh->b_blocknr); ++ ++ error = -EDQUOT; ++ if (ext2_xattr_quota_alloc(inode, 1)) ++ goto cleanup; ++ ++ HDR(new_bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); ++ ea_bdebug(new_bh, "refcount now=%d", ++ le32_to_cpu(HDR(new_bh)->h_refcount)); ++ } else if (old_bh && header == HDR(old_bh)) { ++ /* Keep this block. */ ++ new_bh = old_bh; ++ ext2_xattr_cache_insert(new_bh); ++ } else { ++ /* We need to allocate a new block */ ++ int force = EXT2_I(inode)->i_file_acl != 0; ++ int block = ext2_xattr_new_block(inode, &error, force); ++ if (error) ++ goto cleanup; ++ ea_idebug(inode, "creating block %d", block); ++ ++ new_bh = sb_getblk(sb, block); ++ if (!new_bh) { ++ ext2_xattr_free_block(inode, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(new_bh); ++ memcpy(new_bh->b_data, header, new_bh->b_size); ++ mark_buffer_uptodate(new_bh, 1); ++ unlock_buffer(new_bh); ++ ext2_xattr_cache_insert(new_bh); ++ ++ ext2_xattr_update_super_block(sb); ++ } ++ mark_buffer_dirty(new_bh); ++ if (IS_SYNC(inode)) { ++ ll_rw_block(WRITE, 1, &new_bh); ++ wait_on_buffer(new_bh); ++ error = -EIO; ++ if (buffer_req(new_bh) && !buffer_uptodate(new_bh)) ++ goto cleanup; ++ } ++ } ++ ++ /* Update the inode. */ ++ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; ++ inode->i_ctime = CURRENT_TIME; ++ if (IS_SYNC(inode)) { ++ error = ext2_sync_inode (inode); ++ if (error) ++ goto cleanup; ++ } else ++ mark_inode_dirty(inode); ++ ++ error = 0; ++ if (old_bh && old_bh != new_bh) { ++ /* ++ * If there was an old block, and we are not still using it, ++ * we now release the old block. ++ */ ++ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); ++ ++ if (refcount == 1) { ++ /* Free the old block. */ ++ ea_bdebug(old_bh, "freeing"); ++ ext2_xattr_free_block(inode, old_bh->b_blocknr); ++ mark_buffer_clean(old_bh); ++ } else { ++ /* Decrement the refcount only. 
*/ ++ refcount--; ++ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); ++ ext2_xattr_quota_free(inode); ++ mark_buffer_dirty(old_bh); ++ ea_bdebug(old_bh, "refcount now=%d", refcount); ++ } ++ } ++ ++cleanup: ++ if (old_bh != new_bh) ++ brelse(new_bh); ++ ++ return error; ++} ++ ++/* ++ * ext2_xattr_delete_inode() ++ * ++ * Free extended attribute resources associated with this inode. This ++ * is called immediately before an inode is freed. ++ */ ++void ++ext2_xattr_delete_inode(struct inode *inode) ++{ ++ struct buffer_head *bh; ++ unsigned int block = EXT2_I(inode)->i_file_acl; ++ ++ if (!block) ++ return; ++ down(&ext2_xattr_sem); ++ ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) { ++ ext2_error(inode->i_sb, "ext2_xattr_delete_inode", ++ "inode %ld: block %d read error", inode->i_ino, block); ++ goto cleanup; ++ } ++ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++ ext2_error(inode->i_sb, "ext2_xattr_delete_inode", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ goto cleanup; ++ } ++ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { ++ ext2_xattr_cache_remove(bh); ++ ext2_xattr_free_block(inode, block); ++ bforget(bh); ++ bh = NULL; ++ } else { ++ HDR(bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ mark_buffer_dirty(bh); ++ if (IS_SYNC(inode)) { ++ ll_rw_block(WRITE, 1, &bh); ++ wait_on_buffer(bh); ++ } ++ ext2_xattr_quota_free(inode); ++ } ++ EXT2_I(inode)->i_file_acl = 0; ++ ++cleanup: ++ brelse(bh); ++ up(&ext2_xattr_sem); ++} ++ ++/* ++ * ext2_xattr_put_super() ++ * ++ * This is called when a file system is unmounted. 
++ */ ++void ++ext2_xattr_put_super(struct super_block *sb) ++{ ++#ifdef CONFIG_EXT2_FS_XATTR_SHARING ++ mb_cache_shrink(ext2_xattr_cache, sb->s_dev); ++#endif ++} ++ ++#ifdef CONFIG_EXT2_FS_XATTR_SHARING ++ ++/* ++ * ext2_xattr_cache_insert() ++ * ++ * Create a new entry in the extended attribute cache, and insert ++ * it unless such an entry is already in the cache. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++static int ++ext2_xattr_cache_insert(struct buffer_head *bh) ++{ ++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); ++ struct mb_cache_entry *ce; ++ int error; ++ ++ ce = mb_cache_entry_alloc(ext2_xattr_cache); ++ if (!ce) ++ return -ENOMEM; ++ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash); ++ if (error) { ++ mb_cache_entry_free(ce); ++ if (error == -EBUSY) { ++ ea_bdebug(bh, "already in cache (%d cache entries)", ++ atomic_read(&ext2_xattr_cache->c_entry_count)); ++ error = 0; ++ } ++ } else { ++ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, ++ atomic_read(&ext2_xattr_cache->c_entry_count)); ++ mb_cache_entry_release(ce); ++ } ++ return error; ++} ++ ++/* ++ * ext2_xattr_cmp() ++ * ++ * Compare two extended attribute blocks for equality. ++ * ++ * Returns 0 if the blocks are equal, 1 if they differ, and ++ * a negative error number on errors. 
++ */ ++static int ++ext2_xattr_cmp(struct ext2_xattr_header *header1, ++ struct ext2_xattr_header *header2) ++{ ++ struct ext2_xattr_entry *entry1, *entry2; ++ ++ entry1 = ENTRY(header1+1); ++ entry2 = ENTRY(header2+1); ++ while (!IS_LAST_ENTRY(entry1)) { ++ if (IS_LAST_ENTRY(entry2)) ++ return 1; ++ if (entry1->e_hash != entry2->e_hash || ++ entry1->e_name_len != entry2->e_name_len || ++ entry1->e_value_size != entry2->e_value_size || ++ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) ++ return 1; ++ if (entry1->e_value_block != 0 || entry2->e_value_block != 0) ++ return -EIO; ++ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), ++ (char *)header2 + le16_to_cpu(entry2->e_value_offs), ++ le32_to_cpu(entry1->e_value_size))) ++ return 1; ++ ++ entry1 = EXT2_XATTR_NEXT(entry1); ++ entry2 = EXT2_XATTR_NEXT(entry2); ++ } ++ if (!IS_LAST_ENTRY(entry2)) ++ return 1; ++ return 0; ++} ++ ++/* ++ * ext2_xattr_cache_find() ++ * ++ * Find an identical extended attribute block. ++ * ++ * Returns a pointer to the block found, or NULL if such a block was ++ * not found or an error occurred. 
++ */ ++static struct buffer_head * ++ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) ++{ ++ __u32 hash = le32_to_cpu(header->h_hash); ++ struct mb_cache_entry *ce; ++ ++ if (!header->h_hash) ++ return NULL; /* never share */ ++ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ++ ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, inode->i_dev, hash); ++ while (ce) { ++ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); ++ ++ if (!bh) { ++ ext2_error(inode->i_sb, "ext2_xattr_cache_find", ++ "inode %ld: block %ld read error", ++ inode->i_ino, ce->e_block); ++ } else if (le32_to_cpu(HDR(bh)->h_refcount) > ++ EXT2_XATTR_REFCOUNT_MAX) { ++ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block, ++ le32_to_cpu(HDR(bh)->h_refcount), ++ EXT2_XATTR_REFCOUNT_MAX); ++ } else if (!ext2_xattr_cmp(header, HDR(bh))) { ++ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); ++ mb_cache_entry_release(ce); ++ return bh; ++ } ++ brelse(bh); ++ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash); ++ } ++ return NULL; ++} ++ ++/* ++ * ext2_xattr_cache_remove() ++ * ++ * Remove the cache entry of a block from the cache. Called when a ++ * block becomes invalid. ++ */ ++static void ++ext2_xattr_cache_remove(struct buffer_head *bh) ++{ ++ struct mb_cache_entry *ce; ++ ++ ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_dev, bh->b_blocknr); ++ if (ce) { ++ ea_bdebug(bh, "removing (%d cache entries remaining)", ++ atomic_read(&ext2_xattr_cache->c_entry_count)-1); ++ mb_cache_entry_free(ce); ++ } else ++ ea_bdebug(bh, "no cache entry"); ++} ++ ++#define NAME_HASH_SHIFT 5 ++#define VALUE_HASH_SHIFT 16 ++ ++/* ++ * ext2_xattr_hash_entry() ++ * ++ * Compute the hash of an extended attribute. 
++ */ ++static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header, ++ struct ext2_xattr_entry *entry) ++{ ++ __u32 hash = 0; ++ char *name = entry->e_name; ++ int n; ++ ++ for (n=0; n < entry->e_name_len; n++) { ++ hash = (hash << NAME_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ ++ *name++; ++ } ++ ++ if (entry->e_value_block == 0 && entry->e_value_size != 0) { ++ __u32 *value = (__u32 *)((char *)header + ++ le16_to_cpu(entry->e_value_offs)); ++ for (n = (le32_to_cpu(entry->e_value_size) + ++ EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) { ++ hash = (hash << VALUE_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ ++ le32_to_cpu(*value++); ++ } ++ } ++ entry->e_hash = cpu_to_le32(hash); ++} ++ ++#undef NAME_HASH_SHIFT ++#undef VALUE_HASH_SHIFT ++ ++#define BLOCK_HASH_SHIFT 16 ++ ++/* ++ * ext2_xattr_rehash() ++ * ++ * Re-compute the extended attribute hash value after an entry has changed. ++ */ ++static void ext2_xattr_rehash(struct ext2_xattr_header *header, ++ struct ext2_xattr_entry *entry) ++{ ++ struct ext2_xattr_entry *here; ++ __u32 hash = 0; ++ ++ ext2_xattr_hash_entry(header, entry); ++ here = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(here)) { ++ if (!here->e_hash) { ++ /* Block is not shared if an entry's hash value == 0 */ ++ hash = 0; ++ break; ++ } ++ hash = (hash << BLOCK_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ ++ le32_to_cpu(here->e_hash); ++ here = EXT2_XATTR_NEXT(here); ++ } ++ header->h_hash = cpu_to_le32(hash); ++} ++ ++#undef BLOCK_HASH_SHIFT ++ ++int __init ++init_ext2_xattr(void) ++{ ++ ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL, ++ sizeof(struct mb_cache_entry) + ++ sizeof(struct mb_cache_entry_index), 1, 61); ++ if (!ext2_xattr_cache) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void ++exit_ext2_xattr(void) ++{ ++ mb_cache_destroy(ext2_xattr_cache); ++} ++ ++#else /* CONFIG_EXT2_FS_XATTR_SHARING */ ++ ++int __init ++init_ext2_xattr(void) ++{ ++ return 
0; ++} ++ ++void ++exit_ext2_xattr(void) ++{ ++} ++ ++#endif /* CONFIG_EXT2_FS_XATTR_SHARING */ +--- /dev/null Mon May 20 21:11:23 2002 ++++ linux-mmonroe/fs/ext2/xattr_user.c Fri May 16 08:43:01 2003 +@@ -0,0 +1,103 @@ ++/* ++ * linux/fs/ext2/xattr_user.c ++ * Handler for extended user attributes. ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_EXT2_FS_POSIX_ACL ++# include ++#endif ++ ++#define XATTR_USER_PREFIX "user." ++ ++static size_t ++ext2_xattr_user_list(char *list, struct inode *inode, ++ const char *name, int name_len) ++{ ++ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1; ++ ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return 0; ++ ++ if (list) { ++ memcpy(list, XATTR_USER_PREFIX, prefix_len); ++ memcpy(list+prefix_len, name, name_len); ++ list[prefix_len + name_len] = '\0'; ++ } ++ return prefix_len + name_len + 1; ++} ++ ++static int ++ext2_xattr_user_get(struct inode *inode, const char *name, ++ void *buffer, size_t size) ++{ ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++#ifdef CONFIG_EXT2_FS_POSIX_ACL ++ error = ext2_permission_locked(inode, MAY_READ); ++#else ++ error = permission(inode, MAY_READ); ++#endif ++ if (error) ++ return error; ++ ++ return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, ++ buffer, size); ++} ++ ++static int ++ext2_xattr_user_set(struct inode *inode, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++ if ( !S_ISREG(inode->i_mode) && ++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) ++ return -EPERM; ++#ifdef CONFIG_EXT2_FS_POSIX_ACL ++ error = ext2_permission_locked(inode, MAY_WRITE); ++#else ++ error = permission(inode, MAY_WRITE); ++#endif ++ if (error) ++ return error; ++ ++ return 
ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, ++ value, size, flags); ++} ++ ++struct ext2_xattr_handler ext2_xattr_user_handler = { ++ prefix: XATTR_USER_PREFIX, ++ list: ext2_xattr_user_list, ++ get: ext2_xattr_user_get, ++ set: ext2_xattr_user_set, ++}; ++ ++int __init ++init_ext2_xattr_user(void) ++{ ++ return ext2_xattr_register(EXT2_XATTR_INDEX_USER, ++ &ext2_xattr_user_handler); ++} ++ ++void ++exit_ext2_xattr_user(void) ++{ ++ ext2_xattr_unregister(EXT2_XATTR_INDEX_USER, ++ &ext2_xattr_user_handler); ++} +--- linux/fs/ext3/Makefile~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:42:46 2003 ++++ linux-mmonroe/fs/ext3/Makefile Fri May 16 08:43:01 2003 +@@ -1,5 +1,5 @@ + # +-# Makefile for the linux ext2-filesystem routines. ++# Makefile for the linux ext3-filesystem routines. + # + # Note! Dependencies are done automagically by 'make dep', which also + # removes any old dependencies. DON'T put your own dependencies here +@@ -9,10 +9,13 @@ + + O_TARGET := ext3.o + +-export-objs := super.o inode.o ++export-objs := ext3-exports.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o hash.o ++ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o + obj-m := $(O_TARGET) + ++obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o ++obj-$(CONFIG_EXT3_FS_XATTR_USER) += xattr_user.o ++ + include $(TOPDIR)/Rules.make +--- linux/fs/ext3/file.c~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:42:46 2003 ++++ linux-mmonroe/fs/ext3/file.c Fri May 16 08:43:01 2003 +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -126,5 +127,9 @@ struct file_operations ext3_file_operati + struct inode_operations ext3_file_inode_operations = { + truncate: ext3_truncate, /* BKL held */ + setattr: ext3_setattr, /* BKL held */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ + }; + +--- 
linux/fs/ext3/ialloc.c~linux-2.4.20-xattr-0.8.54-hp Thu Nov 28 15:53:15 2002 ++++ linux-mmonroe/fs/ext3/ialloc.c Fri May 16 08:43:01 2003 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -216,6 +217,7 @@ void ext3_free_inode (handle_t *handle, + * as writing the quota to disk may need the lock as well. + */ + DQUOT_INIT(inode); ++ ext3_xattr_delete_inode(handle, inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + +--- linux/fs/ext3/inode.c~linux-2.4.20-xattr-0.8.54-hp Thu Nov 28 15:53:15 2002 ++++ linux-mmonroe/fs/ext3/inode.c Fri May 16 08:43:01 2003 +@@ -39,6 +39,18 @@ + */ + #undef SEARCH_FROM_ZERO + ++/* ++ * Test whether an inode is a fast symlink. ++ */ ++static inline int ext3_inode_is_fast_symlink(struct inode *inode) ++{ ++ int ea_blocks = inode->u.ext3_i.i_file_acl ? ++ (inode->i_sb->s_blocksize >> 9) : 0; ++ ++ return (S_ISLNK(inode->i_mode) && ++ inode->i_blocks - ea_blocks == 0); ++} ++ + /* The ext3 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. +@@ -48,7 +60,7 @@ + * still needs to be revoked. 
+ */ + +-static int ext3_forget(handle_t *handle, int is_metadata, ++int ext3_forget(handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + int blocknr) + { +@@ -164,9 +176,7 @@ void ext3_delete_inode (struct inode * i + { + handle_t *handle; + +- if (is_bad_inode(inode) || +- inode->i_ino == EXT3_ACL_IDX_INO || +- inode->i_ino == EXT3_ACL_DATA_INO) ++ if (is_bad_inode(inode)) + goto no_delete; + + lock_kernel(); +@@ -1855,6 +1865,8 @@ void ext3_truncate(struct inode * inode) + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; ++ if (ext3_inode_is_fast_symlink(inode)) ++ return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +@@ -2002,8 +2014,6 @@ int ext3_get_inode_loc (struct inode *in + struct ext3_group_desc * gdp; + + if ((inode->i_ino != EXT3_ROOT_INO && +- inode->i_ino != EXT3_ACL_IDX_INO && +- inode->i_ino != EXT3_ACL_DATA_INO && + inode->i_ino != EXT3_JOURNAL_INO && + inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu( +@@ -2130,10 +2140,7 @@ void ext3_read_inode(struct inode * inod + + brelse (iloc.bh); + +- if (inode->i_ino == EXT3_ACL_IDX_INO || +- inode->i_ino == EXT3_ACL_DATA_INO) +- /* Nothing to do */ ; +- else if (S_ISREG(inode->i_mode)) { ++ if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + inode->i_mapping->a_ops = &ext3_aops; +@@ -2141,15 +2148,17 @@ void ext3_read_inode(struct inode * inod + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { +- if (!inode->i_blocks) ++ if (ext3_inode_is_fast_symlink(inode)) + inode->i_op = &ext3_fast_symlink_inode_operations; + else { +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext3_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + } +- } else ++ } else { ++ inode->i_op = &ext3_special_inode_operations; + 
init_special_inode(inode, inode->i_mode, + le32_to_cpu(iloc.raw_inode->i_block[0])); ++ } + /* inode->i_attr_flags = 0; unused */ + if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */ +--- linux/fs/ext3/namei.c~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:42:47 2003 ++++ linux-mmonroe/fs/ext3/namei.c Fri May 16 08:43:01 2003 +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1611,7 +1612,7 @@ static int ext3_mkdir(struct inode * dir + if (IS_SYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFDIR); ++ inode = ext3_new_inode (handle, dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -1619,7 +1620,6 @@ static int ext3_mkdir(struct inode * dir + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; +- inode->i_blocks = 0; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { + inode->i_nlink--; /* is this nlink == 0? */ +@@ -1646,9 +1646,6 @@ static int ext3_mkdir(struct inode * dir + BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_block); + brelse (dir_block); +- inode->i_mode = S_IFDIR | mode; +- if (dir->i_mode & S_ISGID) +- inode->i_mode |= S_ISGID; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); + if (err) { +@@ -2017,7 +2014,7 @@ static int ext3_symlink (struct inode * + goto out_stop; + + if (l > sizeof (EXT3_I(inode)->i_data)) { +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext3_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + /* + * block_symlink() calls back into ext3_prepare/commit_write. 
+@@ -2244,4 +2241,16 @@ struct inode_operations ext3_dir_inode_o + rmdir: ext3_rmdir, /* BKL held */ + mknod: ext3_mknod, /* BKL held */ + rename: ext3_rename, /* BKL held */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ + }; ++ ++struct inode_operations ext3_special_inode_operations = { ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ ++}; ++ +--- linux/fs/ext3/super.c~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:42:46 2003 ++++ linux-mmonroe/fs/ext3/super.c Fri May 16 08:43:01 2003 +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -406,6 +407,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); +@@ -502,6 +504,7 @@ static int parse_options (char * options + int is_remount) + { + unsigned long *mount_options = &sbi->s_mount_opt; ++ + uid_t *resuid = &sbi->s_resuid; + gid_t *resgid = &sbi->s_resgid; + char * this_char; +@@ -514,6 +517,13 @@ static int parse_options (char * options + this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; ++#ifdef CONFIG_EXT3_FS_XATTR_USER ++ if (!strcmp (this_char, "user_xattr")) ++ set_opt (*mount_options, XATTR_USER); ++ else if (!strcmp (this_char, "nouser_xattr")) ++ clear_opt (*mount_options, XATTR_USER); ++ else ++#endif + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -931,6 +941,12 @@ struct super_block * ext3_read_super (st + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT3_DEF_RESUID; + 
sbi->s_resgid = EXT3_DEF_RESGID; ++ ++ /* Default extended attribute flags */ ++#ifdef CONFIG_EXT3_FS_XATTR_USER ++ /* set_opt(sbi->s_mount_opt, XATTR_USER); */ ++#endif ++ + if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) { + sb->s_dev = 0; + goto out_fail; +@@ -1768,17 +1784,29 @@ static DECLARE_FSTYPE_DEV(ext3_fs_type, + + static int __init init_ext3_fs(void) + { +- return register_filesystem(&ext3_fs_type); ++ int error = init_ext3_xattr(); ++ if (error) ++ return error; ++ error = init_ext3_xattr_user(); ++ if (error) ++ goto fail; ++ error = register_filesystem(&ext3_fs_type); ++ if (!error) ++ return 0; ++ ++ exit_ext3_xattr_user(); ++fail: ++ exit_ext3_xattr(); ++ return error; + } + + static void __exit exit_ext3_fs(void) + { + unregister_filesystem(&ext3_fs_type); ++ exit_ext3_xattr_user(); ++ exit_ext3_xattr(); + } + +-EXPORT_SYMBOL(ext3_force_commit); +-EXPORT_SYMBOL(ext3_bread); +- + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); + MODULE_LICENSE("GPL"); +--- linux/fs/ext3/symlink.c~linux-2.4.20-xattr-0.8.54-hp Fri Nov 9 14:25:04 2001 ++++ linux-mmonroe/fs/ext3/symlink.c Fri May 16 08:43:01 2003 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) + { +@@ -33,7 +34,20 @@ static int ext3_follow_link(struct dentr + return vfs_follow_link(nd, s); + } + ++struct inode_operations ext3_symlink_inode_operations = { ++ readlink: page_readlink, /* BKL not held. Don't need */ ++ follow_link: page_follow_link, /* BKL not held. 
Don't need */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ ++}; ++ + struct inode_operations ext3_fast_symlink_inode_operations = { + readlink: ext3_readlink, /* BKL not held. Don't need */ + follow_link: ext3_follow_link, /* BKL not held. Don't need */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ + }; +--- /dev/null Mon May 20 21:11:23 2002 ++++ linux-mmonroe/fs/ext3/xattr.c Fri May 16 08:43:01 2003 +@@ -0,0 +1,1225 @@ ++/* ++ * linux/fs/ext3/xattr.c ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ * ++ * Fix by Harrison Xing . ++ * Ext3 code with a lot of help from Eric Jarman . ++ * Extended attributes for symlinks and special files added per ++ * suggestion of Luka Renko . ++ */ ++ ++/* ++ * Extended attributes are stored on disk blocks allocated outside of ++ * any inode. The i_file_acl field is then made to point to this allocated ++ * block. If all extended attributes of an inode are identical, these ++ * inodes may share the same extended attribute block. Such situations ++ * are automatically detected by keeping a cache of recent attribute block ++ * numbers and hashes over the block's contents in memory. ++ * ++ * ++ * Extended attribute block layout: ++ * ++ * +------------------+ ++ * | header | ++ * | entry 1 | | ++ * | entry 2 | | growing downwards ++ * | entry 3 | v ++ * | four null bytes | ++ * | . . . | ++ * | value 1 | ^ ++ * | value 3 | | growing upwards ++ * | value 2 | | ++ * +------------------+ ++ * ++ * The block header is followed by multiple entry descriptors. These entry ++ * descriptors are variable in size, and aligned to EXT3_XATTR_PAD ++ * byte boundaries. 
The entry descriptors are sorted by attribute name, ++ * so that two extended attribute blocks can be compared efficiently. ++ * ++ * Attribute values are aligned to the end of the block, stored in ++ * no specific order. They are also padded to EXT3_XATTR_PAD byte ++ * boundaries. No additional gaps are left between them. ++ * ++ * Locking strategy ++ * ---------------- ++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of ++ * the xattr inode operations are called, so we are guaranteed that only one ++ * process accesses extended attributes of an inode at any time. ++ * ++ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that ++ * only a single process is modifying an extended attribute block, even ++ * if the block is shared among inodes. ++ * ++ * Note for porting to 2.5 ++ * ----------------------- ++ * The BKL will no longer be held in the xattr inode operations. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define EXT3_EA_USER "user." ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1) ++#endif ++ ++#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) ++#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) ++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) ++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) ++ ++#ifdef EXT3_XATTR_DEBUG ++# define ea_idebug(inode, f...) do { \ ++ printk(KERN_DEBUG "inode %s:%ld: ", \ ++ kdevname(inode->i_dev), inode->i_ino); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++# define ea_bdebug(bh, f...) do { \ ++ printk(KERN_DEBUG "block %s:%ld: ", \ ++ kdevname(bh->b_dev), bh->b_blocknr); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++#else ++# define ea_idebug(f...) ++# define ea_bdebug(f...) 
++#endif ++ ++static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *, ++ struct ext3_xattr_header *); ++ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ ++static int ext3_xattr_cache_insert(struct buffer_head *); ++static struct buffer_head *ext3_xattr_cache_find(struct inode *, ++ struct ext3_xattr_header *); ++static void ext3_xattr_cache_remove(struct buffer_head *); ++static void ext3_xattr_rehash(struct ext3_xattr_header *, ++ struct ext3_xattr_entry *); ++ ++static struct mb_cache *ext3_xattr_cache; ++ ++#else ++# define ext3_xattr_cache_insert(bh) 0 ++# define ext3_xattr_cache_find(inode, header) NULL ++# define ext3_xattr_cache_remove(bh) while(0) {} ++# define ext3_xattr_rehash(header, entry) while(0) {} ++#endif ++ ++/* ++ * If a file system does not share extended attributes among inodes, ++ * we should not need the ext3_xattr_sem semaphore. However, the ++ * filesystem may still contain shared blocks, so we always take ++ * the lock. ++ */ ++ ++DECLARE_MUTEX(ext3_xattr_sem); ++ ++static inline int ++ext3_xattr_new_block(handle_t *handle, struct inode *inode, ++ int * errp, int force) ++{ ++ struct super_block *sb = inode->i_sb; ++ int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + ++ EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb); ++ ++ /* How can we enforce the allocation? */ ++ int block = ext3_new_block(handle, inode, goal, 0, 0, errp); ++#ifdef OLD_QUOTAS ++ if (!*errp) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#endif ++ return block; ++} ++ ++static inline int ++ext3_xattr_quota_alloc(struct inode *inode, int force) ++{ ++ /* How can we enforce the allocation? 
*/
++#ifdef OLD_QUOTAS
++	int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++	if (!error)
++		inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++	int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++	return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext3_xattr_quota_free(struct inode *inode)
++{
++	DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++	inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext3_xattr_free_block(handle_t *handle, struct inode * inode,
++		      unsigned long block)
++{
++	ext3_free_blocks(handle, inode, block, 1);
++	inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext3_xattr_quota_free(inode) \
++	DQUOT_FREE_BLOCK(inode, 1)
++# define ext3_xattr_free_block(handle, inode, block) \
++	ext3_free_blocks(handle, inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++	return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++	return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
++rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler)
++{
++	int error = -EINVAL;
++
++	if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++		write_lock(&ext3_handler_lock);
++		if (!ext3_xattr_handlers[name_index-1]) {
++			ext3_xattr_handlers[name_index-1] = handler;
++			error = 0;
++		}
++		write_unlock(&ext3_handler_lock);
++	}
++	return error;
++}
++/* Unregister: clear slot name_index-1; out-of-range indexes are ignored. */
++void
++ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler)
++{
++	if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { /* both bounds, as in register */
++		write_lock(&ext3_handler_lock);
++		ext3_xattr_handlers[name_index-1] = NULL;
++		write_unlock(&ext3_handler_lock);
++	}
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++	while (*a_prefix && *a == *a_prefix) {
++		a++;
++		a_prefix++;
++	}
++	return *a_prefix ? NULL : a;
++}
++
++/*
++ * Find the registered handler whose prefix starts *name. On a match,
++ * advance *name past the prefix and return the handler; else return NULL.
++ */
++static inline struct ext3_xattr_handler *
++ext3_xattr_resolve_name(const char **name)
++{
++	struct ext3_xattr_handler *handler = NULL;
++	int i;
++
++	if (!*name)
++		return NULL;
++	read_lock(&ext3_handler_lock);
++	for (i = 0; i < EXT3_XATTR_INDEX_MAX; i++) {
++		if (ext3_xattr_handlers[i]) {
++			const char *n = strcmp_prefix(*name,
++				ext3_xattr_handlers[i]->prefix);
++			if (n) {
++				handler = ext3_xattr_handlers[i];
++				*name = n;
++				break;
++			}
++		}
++	}
++	read_unlock(&ext3_handler_lock);
++	return handler;
++}
++
++static inline struct ext3_xattr_handler *
++ext3_xattr_handler(int name_index)
++{
++	struct ext3_xattr_handler *handler = NULL;
++	if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++		read_lock(&ext3_handler_lock);
++		handler = ext3_xattr_handlers[name_index-1];
++		read_unlock(&ext3_handler_lock);
++	}
++	return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_getxattr(struct dentry *dentry, const char *name,
++	      void *buffer, size_t size)
++{
++	struct ext3_xattr_handler *handler;
++	struct inode *inode = dentry->d_inode;
++
++	handler = ext3_xattr_resolve_name(&name);
++	if (!handler)
++		return -ENOTSUP;
++	return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++	return ext3_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_setxattr(struct dentry *dentry, const char *name,
++	      const void *value, size_t size, int flags)
++{
++	struct ext3_xattr_handler *handler;
++	
struct inode *inode = dentry->d_inode; ++ ++ if (size == 0) ++ value = ""; /* empty EA, do not remove */ ++ handler = ext3_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, value, size, flags); ++} ++ ++/* ++ * Inode operation removexattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext3_removexattr(struct dentry *dentry, const char *name) ++{ ++ struct ext3_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext3_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, NULL, 0, XATTR_REPLACE); ++} ++ ++/* ++ * ext3_xattr_get() ++ * ++ * Copy an extended attribute into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. ++ */ ++int ++ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ unsigned int block, size; ++ char *end; ++ int name_len, error; ++ ++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", ++ name_index, name, buffer, (long)buffer_size); ++ ++ if (name == NULL) ++ return -EINVAL; ++ if (!EXT3_I(inode)->i_file_acl) ++ return -ENOATTR; ++ block = EXT3_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* 
find named attribute */ ++ name_len = strlen(name); ++ ++ error = -ERANGE; ++ if (name_len > 255) ++ goto cleanup; ++ entry = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (name_index == entry->e_name_index && ++ name_len == entry->e_name_len && ++ memcmp(name, entry->e_name, name_len) == 0) ++ goto found; ++ entry = next; ++ } ++ /* Check the remaining name entries */ ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ entry = next; ++ } ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ error = -ENOATTR; ++ goto cleanup; ++found: ++ /* check the buffer size */ ++ if (entry->e_value_block != 0) ++ goto bad_block; ++ size = le32_to_cpu(entry->e_value_size); ++ if (size > inode->i_sb->s_blocksize || ++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) ++ goto bad_block; ++ ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (buffer) { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ /* return value of attribute */ ++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), ++ size); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_list() ++ * ++ * Copy a list of attribute names into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. 
++ */ ++int ++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ unsigned int block, size = 0; ++ char *buf, *end; ++ int error; ++ ++ ea_idebug(inode, "buffer=%p, buffer_size=%ld", ++ buffer, (long)buffer_size); ++ ++ if (!EXT3_I(inode)->i_file_acl) ++ return 0; ++ block = EXT3_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_list", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* compute the size required for the list of attribute names */ ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT3_XATTR_NEXT(entry)) { ++ struct ext3_xattr_handler *handler; ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ ++ handler = ext3_xattr_handler(entry->e_name_index); ++ if (handler) ++ size += handler->list(NULL, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (!buffer) { ++ error = size; ++ goto cleanup; ++ } else { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ } ++ ++ /* list the attribute names */ ++ buf = buffer; ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT3_XATTR_NEXT(entry)) { ++ struct ext3_xattr_handler *handler; ++ ++ handler = ext3_xattr_handler(entry->e_name_index); ++ if (handler) ++ buf += handler->list(buf, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* 
++ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is ++ * not set, set it. ++ */ ++static void ext3_xattr_update_super_block(handle_t *handle, ++ struct super_block *sb) ++{ ++ if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) ++ return; ++ ++ lock_super(sb); ++ ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++ EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR; ++#endif ++ EXT3_SB(sb)->s_es->s_feature_compat |= ++ cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR); ++ sb->s_dirt = 1; ++ ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ unlock_super(sb); ++} ++ ++/* ++ * ext3_xattr_set() ++ * ++ * Create, replace or remove an extended attribute for this inode. Buffer ++ * is NULL to remove an existing extended attribute, and non-NULL to ++ * either replace an existing extended attribute, or create a new extended ++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE ++ * specify that an extended attribute must exist and must not exist ++ * previous to the call, respectively. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++int ++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, const void *value, size_t value_len, int flags) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_header *header = NULL; ++ struct ext3_xattr_entry *here, *last; ++ unsigned int name_len; ++ int block = EXT3_I(inode)->i_file_acl; ++ int min_offs = sb->s_blocksize, not_found = 1, free, error; ++ char *end; ++ ++ /* ++ * header -- Points either into bh, or to a temporarily ++ * allocated buffer. ++ * here -- The named entry found, or the place for inserting, within ++ * the block pointed to by header. ++ * last -- Points right after the last named entry within the block ++ * pointed to by header. 
++ * min_offs -- The offset of the first value (values are aligned ++ * towards the end of the block). ++ * end -- Points right after the block pointed to by header. ++ */ ++ ++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", ++ name_index, name, value, (long)value_len); ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) ++ return -EPERM; ++ if (value == NULL) ++ value_len = 0; ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255 || value_len > sb->s_blocksize) ++ return -ERANGE; ++ down(&ext3_xattr_sem); ++ ++ if (block) { ++ /* The inode already has an extended attribute block. */ ++ bh = sb_bread(sb, block); ++ error = -EIO; ++ if (!bh) ++ goto cleanup; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), ++ le32_to_cpu(HDR(bh)->h_refcount)); ++ header = HDR(bh); ++ end = bh->b_data + bh->b_size; ++ if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ header->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(sb, "ext3_xattr_set", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* Find the named attribute. */ ++ here = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(here)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!here->e_value_block && here->e_value_size) { ++ int offs = le16_to_cpu(here->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ not_found = name_index - here->e_name_index; ++ if (!not_found) ++ not_found = name_len - here->e_name_len; ++ if (!not_found) ++ not_found = memcmp(name, here->e_name,name_len); ++ if (not_found <= 0) ++ break; ++ here = next; ++ } ++ last = here; ++ /* We still need to compute min_offs and last. 
*/ ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!last->e_value_block && last->e_value_size) { ++ int offs = le16_to_cpu(last->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ last = next; ++ } ++ ++ /* Check whether we have enough space left. */ ++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); ++ } else { ++ /* We will use a new extended attribute block. */ ++ free = sb->s_blocksize - ++ sizeof(struct ext3_xattr_header) - sizeof(__u32); ++ here = last = NULL; /* avoid gcc uninitialized warning. */ ++ } ++ ++ if (not_found) { ++ /* Request to remove a nonexistent attribute? */ ++ error = -ENOATTR; ++ if (flags & XATTR_REPLACE) ++ goto cleanup; ++ error = 0; ++ if (value == NULL) ++ goto cleanup; ++ else ++ free -= EXT3_XATTR_LEN(name_len); ++ } else { ++ /* Request to create an existing attribute? */ ++ error = -EEXIST; ++ if (flags & XATTR_CREATE) ++ goto cleanup; ++ if (!here->e_value_block && here->e_value_size) { ++ unsigned int size = le32_to_cpu(here->e_value_size); ++ ++ if (le16_to_cpu(here->e_value_offs) + size > ++ sb->s_blocksize || size > sb->s_blocksize) ++ goto bad_block; ++ free += EXT3_XATTR_SIZE(size); ++ } ++ } ++ free -= EXT3_XATTR_SIZE(value_len); ++ error = -ENOSPC; ++ if (free < 0) ++ goto cleanup; ++ ++ /* Here we know that we can set the new attribute. 
*/ ++ ++ if (header) { ++ if (header->h_refcount == cpu_to_le32(1)) { ++ ea_bdebug(bh, "modifying in-place"); ++ ext3_xattr_cache_remove(bh); ++ error = ext3_journal_get_write_access(handle, bh); ++ if (error) ++ goto cleanup; ++ } else { ++ int offset; ++ ++ ea_bdebug(bh, "cloning"); ++ header = kmalloc(bh->b_size, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memcpy(header, HDR(bh), bh->b_size); ++ header->h_refcount = cpu_to_le32(1); ++ offset = (char *)header - bh->b_data; ++ here = ENTRY((char *)here + offset); ++ last = ENTRY((char *)last + offset); ++ } ++ } else { ++ /* Allocate a buffer where we construct the new block. */ ++ header = kmalloc(sb->s_blocksize, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memset(header, 0, sb->s_blocksize); ++ end = (char *)header + sb->s_blocksize; ++ header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); ++ header->h_blocks = header->h_refcount = cpu_to_le32(1); ++ last = here = ENTRY(header+1); ++ } ++ ++ if (not_found) { ++ /* Insert the new name. */ ++ int size = EXT3_XATTR_LEN(name_len); ++ int rest = (char *)last - (char *)here; ++ memmove((char *)here + size, here, rest); ++ memset(here, 0, size); ++ here->e_name_index = name_index; ++ here->e_name_len = name_len; ++ memcpy(here->e_name, name, name_len); ++ } else { ++ /* Remove the old value. */ ++ if (!here->e_value_block && here->e_value_size) { ++ char *first_val = (char *)header + min_offs; ++ int offs = le16_to_cpu(here->e_value_offs); ++ char *val = (char *)header + offs; ++ size_t size = EXT3_XATTR_SIZE( ++ le32_to_cpu(here->e_value_size)); ++ memmove(first_val + size, first_val, val - first_val); ++ memset(first_val, 0, size); ++ here->e_value_offs = 0; ++ min_offs += size; ++ ++ /* Adjust all value offsets. 
*/ ++ last = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(last)) { ++ int o = le16_to_cpu(last->e_value_offs); ++ if (!last->e_value_block && o < offs) ++ last->e_value_offs = ++ cpu_to_le16(o + size); ++ last = EXT3_XATTR_NEXT(last); ++ } ++ } ++ if (value == NULL) { ++ /* Remove this attribute. */ ++ if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) { ++ /* This block is now empty. */ ++ error = ext3_xattr_set2(handle, inode, bh,NULL); ++ goto cleanup; ++ } else { ++ /* Remove the old name. */ ++ int size = EXT3_XATTR_LEN(name_len); ++ last = ENTRY((char *)last - size); ++ memmove(here, (char*)here + size, ++ (char*)last - (char*)here); ++ memset(last, 0, size); ++ } ++ } ++ } ++ ++ if (value != NULL) { ++ /* Insert the new value. */ ++ here->e_value_size = cpu_to_le32(value_len); ++ if (value_len) { ++ size_t size = EXT3_XATTR_SIZE(value_len); ++ char *val = (char *)header + min_offs - size; ++ here->e_value_offs = ++ cpu_to_le16((char *)val - (char *)header); ++ memset(val + size - EXT3_XATTR_PAD, 0, ++ EXT3_XATTR_PAD); /* Clear the pad bytes. */ ++ memcpy(val, value, value_len); ++ } ++ } ++ ext3_xattr_rehash(header, here); ++ ++ error = ext3_xattr_set2(handle, inode, bh, header); ++ ++cleanup: ++ brelse(bh); ++ if (!(bh && header == HDR(bh))) ++ kfree(header); ++ up(&ext3_xattr_sem); ++ ++ return error; ++} ++ ++/* ++ * Second half of ext3_xattr_set(): Update the file system. ++ */ ++static int ++ext3_xattr_set2(handle_t *handle, struct inode *inode, ++ struct buffer_head *old_bh, struct ext3_xattr_header *header) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *new_bh = NULL; ++ int error; ++ ++ if (header) { ++ new_bh = ext3_xattr_cache_find(inode, header); ++ if (new_bh) { ++ /* ++ * We found an identical block in the cache. ++ * The old block will be released after updating ++ * the inode. 
++ */ ++ ea_bdebug(old_bh, "reusing block %ld", ++ new_bh->b_blocknr); ++ ++ error = -EDQUOT; ++ if (ext3_xattr_quota_alloc(inode, 1)) ++ goto cleanup; ++ ++ error = ext3_journal_get_write_access(handle, new_bh); ++ if (error) ++ goto cleanup; ++ HDR(new_bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); ++ ea_bdebug(new_bh, "refcount now=%d", ++ le32_to_cpu(HDR(new_bh)->h_refcount)); ++ } else if (old_bh && header == HDR(old_bh)) { ++ /* Keep this block. */ ++ new_bh = old_bh; ++ ext3_xattr_cache_insert(new_bh); ++ } else { ++ /* We need to allocate a new block */ ++ int force = EXT3_I(inode)->i_file_acl != 0; ++ int block = ext3_xattr_new_block(handle, inode, ++ &error, force); ++ if (error) ++ goto cleanup; ++ ea_idebug(inode, "creating block %d", block); ++ ++ new_bh = sb_getblk(sb, block); ++ if (!new_bh) { ++getblk_failed: ext3_xattr_free_block(handle, inode, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(new_bh); ++ error = ext3_journal_get_create_access(handle, new_bh); ++ if (error) { ++ unlock_buffer(new_bh); ++ goto getblk_failed; ++ } ++ memcpy(new_bh->b_data, header, new_bh->b_size); ++ mark_buffer_uptodate(new_bh, 1); ++ unlock_buffer(new_bh); ++ ext3_xattr_cache_insert(new_bh); ++ ++ ext3_xattr_update_super_block(handle, sb); ++ } ++ error = ext3_journal_dirty_metadata(handle, new_bh); ++ if (error) ++ goto cleanup; ++ } ++ ++ /* Update the inode. */ ++ EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; ++ inode->i_ctime = CURRENT_TIME; ++ ext3_mark_inode_dirty(handle, inode); ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++ error = 0; ++ if (old_bh && old_bh != new_bh) { ++ /* ++ * If there was an old block, and we are not still using it, ++ * we now release the old block. ++ */ ++ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); ++ ++ error = ext3_journal_get_write_access(handle, old_bh); ++ if (error) ++ goto cleanup; ++ if (refcount == 1) { ++ /* Free the old block. 
*/ ++ ea_bdebug(old_bh, "freeing"); ++ ext3_xattr_free_block(handle, inode, old_bh->b_blocknr); ++ ++ /* ext3_forget() calls bforget() for us, but we ++ let our caller release old_bh, so we need to ++ duplicate the handle before. */ ++ get_bh(old_bh); ++ ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr); ++ } else { ++ /* Decrement the refcount only. */ ++ refcount--; ++ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); ++ ext3_xattr_quota_free(inode); ++ ext3_journal_dirty_metadata(handle, old_bh); ++ ea_bdebug(old_bh, "refcount now=%d", refcount); ++ } ++ } ++ ++cleanup: ++ if (old_bh != new_bh) ++ brelse(new_bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_delete_inode() ++ * ++ * Free extended attribute resources associated with this inode. This ++ * is called immediately before an inode is freed. ++ */ ++void ++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) ++{ ++ struct buffer_head *bh; ++ unsigned int block = EXT3_I(inode)->i_file_acl; ++ ++ if (!block) ++ return; ++ down(&ext3_xattr_sem); ++ ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) { ++ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", ++ "inode %ld: block %d read error", inode->i_ino, block); ++ goto cleanup; ++ } ++ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ goto cleanup; ++ } ++ ext3_journal_get_write_access(handle, bh); ++ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { ++ ext3_xattr_cache_remove(bh); ++ ext3_xattr_free_block(handle, inode, block); ++ ext3_forget(handle, 1, inode, bh, block); ++ bh = NULL; ++ } else { ++ HDR(bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ ext3_journal_dirty_metadata(handle, bh); ++ if (IS_SYNC(inode)) ++ 
handle->h_sync = 1; ++ ext3_xattr_quota_free(inode); ++ } ++ EXT3_I(inode)->i_file_acl = 0; ++ ++cleanup: ++ brelse(bh); ++ up(&ext3_xattr_sem); ++} ++ ++/* ++ * ext3_xattr_put_super() ++ * ++ * This is called when a file system is unmounted. ++ */ ++void ++ext3_xattr_put_super(struct super_block *sb) ++{ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ mb_cache_shrink(ext3_xattr_cache, sb->s_dev); ++#endif ++} ++ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ ++/* ++ * ext3_xattr_cache_insert() ++ * ++ * Create a new entry in the extended attribute cache, and insert ++ * it unless such an entry is already in the cache. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++static int ++ext3_xattr_cache_insert(struct buffer_head *bh) ++{ ++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); ++ struct mb_cache_entry *ce; ++ int error; ++ ++ ce = mb_cache_entry_alloc(ext3_xattr_cache); ++ if (!ce) ++ return -ENOMEM; ++ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash); ++ if (error) { ++ mb_cache_entry_free(ce); ++ if (error == -EBUSY) { ++ ea_bdebug(bh, "already in cache (%d cache entries)", ++ atomic_read(&ext3_xattr_cache->c_entry_count)); ++ error = 0; ++ } ++ } else { ++ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, ++ atomic_read(&ext3_xattr_cache->c_entry_count)); ++ mb_cache_entry_release(ce); ++ } ++ return error; ++} ++ ++/* ++ * ext3_xattr_cmp() ++ * ++ * Compare two extended attribute blocks for equality. ++ * ++ * Returns 0 if the blocks are equal, 1 if they differ, and ++ * a negative error number on errors. 
++ */
++static int
++ext3_xattr_cmp(struct ext3_xattr_header *header1,
++	       struct ext3_xattr_header *header2)
++{
++	struct ext3_xattr_entry *entry1, *entry2;
++
++	entry1 = ENTRY(header1+1);
++	entry2 = ENTRY(header2+1);
++	while (!IS_LAST_ENTRY(entry1)) {
++		if (IS_LAST_ENTRY(entry2))
++			return 1;
++		if (entry1->e_hash != entry2->e_hash ||
++		    entry1->e_name_index != entry2->e_name_index || /* not covered by e_hash */
++		    entry1->e_name_len != entry2->e_name_len ||
++		    entry1->e_value_size != entry2->e_value_size ||
++		    memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++			return 1;
++		if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++			return -EIO;
++		if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++			   (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++			   le32_to_cpu(entry1->e_value_size)))
++			return 1;
++		entry1 = EXT3_XATTR_NEXT(entry1);
++		entry2 = EXT3_XATTR_NEXT(entry2);
++	}
++	if (!IS_LAST_ENTRY(entry2))
++		return 1;
++	return 0;
++}
++
++/*
++ * ext3_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred. 
++ */ ++static struct buffer_head * ++ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header) ++{ ++ __u32 hash = le32_to_cpu(header->h_hash); ++ struct mb_cache_entry *ce; ++ ++ if (!header->h_hash) ++ return NULL; /* never share */ ++ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ++ ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash); ++ while (ce) { ++ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); ++ ++ if (!bh) { ++ ext3_error(inode->i_sb, "ext3_xattr_cache_find", ++ "inode %ld: block %ld read error", ++ inode->i_ino, ce->e_block); ++ } else if (le32_to_cpu(HDR(bh)->h_refcount) > ++ EXT3_XATTR_REFCOUNT_MAX) { ++ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block, ++ le32_to_cpu(HDR(bh)->h_refcount), ++ EXT3_XATTR_REFCOUNT_MAX); ++ } else if (!ext3_xattr_cmp(header, HDR(bh))) { ++ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); ++ mb_cache_entry_release(ce); ++ return bh; ++ } ++ brelse(bh); ++ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash); ++ } ++ return NULL; ++} ++ ++/* ++ * ext3_xattr_cache_remove() ++ * ++ * Remove the cache entry of a block from the cache. Called when a ++ * block becomes invalid. ++ */ ++static void ++ext3_xattr_cache_remove(struct buffer_head *bh) ++{ ++ struct mb_cache_entry *ce; ++ ++ ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr); ++ if (ce) { ++ ea_bdebug(bh, "removing (%d cache entries remaining)", ++ atomic_read(&ext3_xattr_cache->c_entry_count)-1); ++ mb_cache_entry_free(ce); ++ } else ++ ea_bdebug(bh, "no cache entry"); ++} ++ ++#define NAME_HASH_SHIFT 5 ++#define VALUE_HASH_SHIFT 16 ++ ++/* ++ * ext3_xattr_hash_entry() ++ * ++ * Compute the hash of an extended attribute. 
++ */ ++static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, ++ struct ext3_xattr_entry *entry) ++{ ++ __u32 hash = 0; ++ char *name = entry->e_name; ++ int n; ++ ++ for (n=0; n < entry->e_name_len; n++) { ++ hash = (hash << NAME_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ ++ *name++; ++ } ++ ++ if (entry->e_value_block == 0 && entry->e_value_size != 0) { ++ __u32 *value = (__u32 *)((char *)header + ++ le16_to_cpu(entry->e_value_offs)); ++ for (n = (le32_to_cpu(entry->e_value_size) + ++ EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { ++ hash = (hash << VALUE_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ ++ le32_to_cpu(*value++); ++ } ++ } ++ entry->e_hash = cpu_to_le32(hash); ++} ++ ++#undef NAME_HASH_SHIFT ++#undef VALUE_HASH_SHIFT ++ ++#define BLOCK_HASH_SHIFT 16 ++ ++/* ++ * ext3_xattr_rehash() ++ * ++ * Re-compute the extended attribute hash value after an entry has changed. ++ */ ++static void ext3_xattr_rehash(struct ext3_xattr_header *header, ++ struct ext3_xattr_entry *entry) ++{ ++ struct ext3_xattr_entry *here; ++ __u32 hash = 0; ++ ++ ext3_xattr_hash_entry(header, entry); ++ here = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(here)) { ++ if (!here->e_hash) { ++ /* Block is not shared if an entry's hash value == 0 */ ++ hash = 0; ++ break; ++ } ++ hash = (hash << BLOCK_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ ++ le32_to_cpu(here->e_hash); ++ here = EXT3_XATTR_NEXT(here); ++ } ++ header->h_hash = cpu_to_le32(hash); ++} ++ ++#undef BLOCK_HASH_SHIFT ++ ++int __init ++init_ext3_xattr(void) ++{ ++ ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, ++ sizeof(struct mb_cache_entry) + ++ sizeof(struct mb_cache_entry_index), 1, 61); ++ if (!ext3_xattr_cache) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void ++exit_ext3_xattr(void) ++{ ++ if (ext3_xattr_cache) ++ mb_cache_destroy(ext3_xattr_cache); ++ ext3_xattr_cache = NULL; ++} ++ ++#else /* CONFIG_EXT3_FS_XATTR_SHARING */ ++ 
++int __init ++init_ext3_xattr(void) ++{ ++ return 0; ++} ++ ++void ++exit_ext3_xattr(void) ++{ ++} ++ ++#endif /* CONFIG_EXT3_FS_XATTR_SHARING */ +--- /dev/null Mon May 20 21:11:23 2002 ++++ linux-mmonroe/fs/ext3/xattr_user.c Fri May 16 08:43:01 2003 +@@ -0,0 +1,111 @@ ++/* ++ * linux/fs/ext3/xattr_user.c ++ * Handler for extended user attributes. ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_EXT3_FS_POSIX_ACL ++# include ++#endif ++ ++#define XATTR_USER_PREFIX "user." ++ ++static size_t ++ext3_xattr_user_list(char *list, struct inode *inode, ++ const char *name, int name_len) ++{ ++ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1; ++ ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return 0; ++ ++ if (list) { ++ memcpy(list, XATTR_USER_PREFIX, prefix_len); ++ memcpy(list+prefix_len, name, name_len); ++ list[prefix_len + name_len] = '\0'; ++ } ++ return prefix_len + name_len + 1; ++} ++ ++static int ++ext3_xattr_user_get(struct inode *inode, const char *name, ++ void *buffer, size_t size) ++{ ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++#ifdef CONFIG_EXT3_FS_POSIX_ACL ++ error = ext3_permission_locked(inode, MAY_READ); ++#else ++ error = permission(inode, MAY_READ); ++#endif ++ if (error) ++ return error; ++ ++ return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, ++ buffer, size); ++} ++ ++static int ++ext3_xattr_user_set(struct inode *inode, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ handle_t *handle; ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++ if ( !S_ISREG(inode->i_mode) && ++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) ++ return -EPERM; ++#ifdef CONFIG_EXT3_FS_POSIX_ACL ++ error = ext3_permission_locked(inode, MAY_WRITE); ++#else ++ error = 
permission(inode, MAY_WRITE); ++#endif ++ if (error) ++ return error; ++ ++ handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name, ++ value, size, flags); ++ ext3_journal_stop(handle, inode); ++ ++ return error; ++} ++ ++struct ext3_xattr_handler ext3_xattr_user_handler = { ++ prefix: XATTR_USER_PREFIX, ++ list: ext3_xattr_user_list, ++ get: ext3_xattr_user_get, ++ set: ext3_xattr_user_set, ++}; ++ ++int __init ++init_ext3_xattr_user(void) ++{ ++ return ext3_xattr_register(EXT3_XATTR_INDEX_USER, ++ &ext3_xattr_user_handler); ++} ++ ++void ++exit_ext3_xattr_user(void) ++{ ++ ext3_xattr_unregister(EXT3_XATTR_INDEX_USER, ++ &ext3_xattr_user_handler); ++} +--- /dev/null Mon May 20 21:11:23 2002 ++++ linux-mmonroe/fs/ext3/ext3-exports.c Fri May 16 08:43:01 2003 +@@ -0,0 +1,13 @@ ++#include ++#include ++#include ++#include ++#include ++ ++EXPORT_SYMBOL(ext3_force_commit); ++EXPORT_SYMBOL(ext3_bread); ++EXPORT_SYMBOL(ext3_xattr_register); ++EXPORT_SYMBOL(ext3_xattr_unregister); ++EXPORT_SYMBOL(ext3_xattr_get); ++EXPORT_SYMBOL(ext3_xattr_list); ++EXPORT_SYMBOL(ext3_xattr_set); +--- linux/fs/jfs/jfs_xattr.h~linux-2.4.20-xattr-0.8.54-hp Thu Nov 28 15:53:15 2002 ++++ linux-mmonroe/fs/jfs/jfs_xattr.h Fri May 16 08:43:01 2003 +@@ -52,8 +52,10 @@ struct jfs_ea_list { + #define END_EALIST(ealist) \ + ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist))) + +-extern int __jfs_setxattr(struct inode *, const char *, void *, size_t, int); +-extern int jfs_setxattr(struct dentry *, const char *, void *, size_t, int); ++extern int __jfs_setxattr(struct inode *, const char *, const void *, size_t, ++ int); ++extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t, ++ int); + extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t); + extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t); + 
extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); +--- linux/fs/jfs/xattr.c~linux-2.4.20-xattr-0.8.54-hp Thu Nov 28 15:53:15 2002 ++++ linux-mmonroe/fs/jfs/xattr.c Fri May 16 08:43:01 2003 +@@ -641,7 +641,7 @@ static int ea_put(struct inode *inode, s + } + + static int can_set_xattr(struct inode *inode, const char *name, +- void *value, size_t value_len) ++ const void *value, size_t value_len) + { + if (IS_RDONLY(inode)) + return -EROFS; +@@ -660,7 +660,7 @@ static int can_set_xattr(struct inode *i + return permission(inode, MAY_WRITE); + } + +-int __jfs_setxattr(struct inode *inode, const char *name, void *value, ++int __jfs_setxattr(struct inode *inode, const char *name, const void *value, + size_t value_len, int flags) + { + struct jfs_ea_list *ealist; +@@ -799,7 +799,7 @@ int __jfs_setxattr(struct inode *inode, + return rc; + } + +-int jfs_setxattr(struct dentry *dentry, const char *name, void *value, ++int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t value_len, int flags) + { + if (value == NULL) { /* empty EA, do not remove */ +--- /dev/null Mon May 20 21:11:23 2002 ++++ linux-mmonroe/fs/mbcache.c Fri May 16 08:43:01 2003 +@@ -0,0 +1,648 @@ ++/* ++ * linux/fs/mbcache.c ++ * (C) 2001-2002 Andreas Gruenbacher, ++ */ ++ ++/* ++ * Filesystem Meta Information Block Cache (mbcache) ++ * ++ * The mbcache caches blocks of block devices that need to be located ++ * by their device/block number, as well as by other criteria (such ++ * as the block's contents). ++ * ++ * There can only be one cache entry in a cache per device and block number. ++ * Additional indexes need not be unique in this sense. The number of ++ * additional indexes (=other criteria) can be hardwired at compile time ++ * or specified at cache create time. ++ * ++ * Each cache entry is of fixed size. An entry may be `valid' or `invalid' ++ * in the cache. A valid entry is in the main hash tables of the cache, ++ * and may also be in the lru list. 
An invalid entry is not in any hashes ++ * or lists. ++ * ++ * A valid cache entry is only in the lru list if no handles refer to it. ++ * Invalid cache entries will be freed when the last handle to the cache ++ * entry is released. Entries that cannot be freed immediately are put ++ * back on the lru list. ++ */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#ifdef MB_CACHE_DEBUG ++# define mb_debug(f...) do { \ ++ printk(KERN_DEBUG f); \ ++ printk("\n"); \ ++ } while (0) ++#define mb_assert(c) do { if (!(c)) \ ++ printk(KERN_ERR "assertion " #c " failed\n"); \ ++ } while(0) ++#else ++# define mb_debug(f...) do { } while(0) ++# define mb_assert(c) do { } while(0) ++#endif ++#define mb_error(f...) do { \ ++ printk(KERN_ERR f); \ ++ printk("\n"); \ ++ } while(0) ++ ++MODULE_AUTHOR("Andreas Gruenbacher "); ++MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) ++MODULE_LICENSE("GPL"); ++#endif ++ ++EXPORT_SYMBOL(mb_cache_create); ++EXPORT_SYMBOL(mb_cache_shrink); ++EXPORT_SYMBOL(mb_cache_destroy); ++EXPORT_SYMBOL(mb_cache_entry_alloc); ++EXPORT_SYMBOL(mb_cache_entry_insert); ++EXPORT_SYMBOL(mb_cache_entry_release); ++EXPORT_SYMBOL(mb_cache_entry_takeout); ++EXPORT_SYMBOL(mb_cache_entry_free); ++EXPORT_SYMBOL(mb_cache_entry_dup); ++EXPORT_SYMBOL(mb_cache_entry_get); ++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) ++EXPORT_SYMBOL(mb_cache_entry_find_first); ++EXPORT_SYMBOL(mb_cache_entry_find_next); ++#endif ++ ++ ++/* ++ * Global data: list of all mbcache's, lru list, and a spinlock for ++ * accessing cache data structures on SMP machines. The lru list is ++ * global across all mbcaches. 
++ */ ++ ++static LIST_HEAD(mb_cache_list); ++static LIST_HEAD(mb_cache_lru_list); ++static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED; ++ ++static inline int ++mb_cache_indexes(struct mb_cache *cache) ++{ ++#ifdef MB_CACHE_INDEXES_COUNT ++ return MB_CACHE_INDEXES_COUNT; ++#else ++ return cache->c_indexes_count; ++#endif ++} ++ ++/* ++ * What the mbcache registers as to get shrunk dynamically. ++ */ ++ ++static void ++mb_cache_memory_pressure(int priority, unsigned int gfp_mask); ++ ++static struct cache_definition mb_cache_definition = { ++ "mb_cache", ++ mb_cache_memory_pressure ++}; ++ ++ ++static inline int ++__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) ++{ ++ return !list_empty(&ce->e_block_list); ++} ++ ++ ++static inline void ++__mb_cache_entry_unhash(struct mb_cache_entry *ce) ++{ ++ int n; ++ ++ if (__mb_cache_entry_is_hashed(ce)) { ++ list_del_init(&ce->e_block_list); ++ for (n=0; ne_cache); n++) ++ list_del(&ce->e_indexes[n].o_list); ++ } ++} ++ ++ ++static inline void ++__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask) ++{ ++ struct mb_cache *cache = ce->e_cache; ++ ++ mb_assert(atomic_read(&ce->e_used) == 0); ++ if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) { ++ /* free failed -- put back on the lru list ++ for freeing later. 
*/ ++ spin_lock(&mb_cache_spinlock); ++ list_add(&ce->e_lru_list, &mb_cache_lru_list); ++ spin_unlock(&mb_cache_spinlock); ++ } else { ++ kmem_cache_free(cache->c_entry_cache, ce); ++ atomic_dec(&cache->c_entry_count); ++ } ++} ++ ++ ++static inline void ++__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) ++{ ++ if (atomic_dec_and_test(&ce->e_used)) { ++ if (__mb_cache_entry_is_hashed(ce)) ++ list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); ++ else { ++ spin_unlock(&mb_cache_spinlock); ++ __mb_cache_entry_forget(ce, GFP_KERNEL); ++ return; ++ } ++ } ++ spin_unlock(&mb_cache_spinlock); ++} ++ ++ ++/* ++ * mb_cache_memory_pressure() memory pressure callback ++ * ++ * This function is called by the kernel memory management when memory ++ * gets low. ++ * ++ * @priority: Amount by which to shrink the cache (0 = highes priority) ++ * @gfp_mask: (ignored) ++ */ ++static void ++mb_cache_memory_pressure(int priority, unsigned int gfp_mask) ++{ ++ LIST_HEAD(free_list); ++ struct list_head *l, *ltmp; ++ int count = 0; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each(l, &mb_cache_list) { ++ struct mb_cache *cache = ++ list_entry(l, struct mb_cache, c_cache_list); ++ mb_debug("cache %s (%d)", cache->c_name, ++ atomic_read(&cache->c_entry_count)); ++ count += atomic_read(&cache->c_entry_count); ++ } ++ mb_debug("trying to free %d of %d entries", ++ count / (priority ? 
priority : 1), count); ++ if (priority) ++ count /= priority; ++ while (count-- && !list_empty(&mb_cache_lru_list)) { ++ struct mb_cache_entry *ce = ++ list_entry(mb_cache_lru_list.next, ++ struct mb_cache_entry, e_lru_list); ++ list_del(&ce->e_lru_list); ++ __mb_cache_entry_unhash(ce); ++ list_add_tail(&ce->e_lru_list, &free_list); ++ } ++ spin_unlock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &free_list) { ++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, ++ e_lru_list), gfp_mask); ++ } ++} ++ ++ ++/* ++ * mb_cache_create() create a new cache ++ * ++ * All entries in one cache are equal size. Cache entries may be from ++ * multiple devices. If this is the first mbcache created, registers ++ * the cache with kernel memory management. Returns NULL if no more ++ * memory was available. ++ * ++ * @name: name of the cache (informal) ++ * @cache_op: contains the callback called when freeing a cache entry ++ * @entry_size: The size of a cache entry, including ++ * struct mb_cache_entry ++ * @indexes_count: number of additional indexes in the cache. Must equal ++ * MB_CACHE_INDEXES_COUNT if the number of indexes is ++ * hardwired. 
++ * @bucket_count: number of hash buckets ++ */ ++struct mb_cache * ++mb_cache_create(const char *name, struct mb_cache_op *cache_op, ++ size_t entry_size, int indexes_count, int bucket_count) ++{ ++ int m=0, n; ++ struct mb_cache *cache = NULL; ++ ++ if(entry_size < sizeof(struct mb_cache_entry) + ++ indexes_count * sizeof(struct mb_cache_entry_index)) ++ return NULL; ++ ++ MOD_INC_USE_COUNT; ++ cache = kmalloc(sizeof(struct mb_cache) + ++ indexes_count * sizeof(struct list_head), GFP_KERNEL); ++ if (!cache) ++ goto fail; ++ cache->c_name = name; ++ cache->c_op.free = NULL; ++ if (cache_op) ++ cache->c_op.free = cache_op->free; ++ atomic_set(&cache->c_entry_count, 0); ++ cache->c_bucket_count = bucket_count; ++#ifdef MB_CACHE_INDEXES_COUNT ++ mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT); ++#else ++ cache->c_indexes_count = indexes_count; ++#endif ++ cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!cache->c_block_hash) ++ goto fail; ++ for (n=0; nc_block_hash[n]); ++ for (m=0; mc_indexes_hash[m] = kmalloc(bucket_count * ++ sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!cache->c_indexes_hash[m]) ++ goto fail; ++ for (n=0; nc_indexes_hash[m][n]); ++ } ++ cache->c_entry_cache = kmem_cache_create(name, entry_size, 0, ++ 0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL); ++ if (!cache->c_entry_cache) ++ goto fail; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_add(&cache->c_cache_list, &mb_cache_list); ++ spin_unlock(&mb_cache_spinlock); ++ return cache; ++ ++fail: ++ if (cache) { ++ while (--m >= 0) ++ kfree(cache->c_indexes_hash[m]); ++ if (cache->c_block_hash) ++ kfree(cache->c_block_hash); ++ kfree(cache); ++ } ++ MOD_DEC_USE_COUNT; ++ return NULL; ++} ++ ++ ++/* ++ * mb_cache_shrink() ++ * ++ * Removes all cache entires of a device from the cache. All cache entries ++ * currently in use cannot be freed, and thus remain in the cache. 
++ * ++ * @cache: which cache to shrink ++ * @dev: which device's cache entries to shrink ++ */ ++void ++mb_cache_shrink(struct mb_cache *cache, kdev_t dev) ++{ ++ LIST_HEAD(free_list); ++ struct list_head *l, *ltmp; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &mb_cache_lru_list) { ++ struct mb_cache_entry *ce = ++ list_entry(l, struct mb_cache_entry, e_lru_list); ++ if (ce->e_dev == dev) { ++ list_del(&ce->e_lru_list); ++ list_add_tail(&ce->e_lru_list, &free_list); ++ __mb_cache_entry_unhash(ce); ++ } ++ } ++ spin_unlock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &free_list) { ++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, ++ e_lru_list), GFP_KERNEL); ++ } ++} ++ ++ ++/* ++ * mb_cache_destroy() ++ * ++ * Shrinks the cache to its minimum possible size (hopefully 0 entries), ++ * and then destroys it. If this was the last mbcache, un-registers the ++ * mbcache from kernel memory management. ++ */ ++void ++mb_cache_destroy(struct mb_cache *cache) ++{ ++ LIST_HEAD(free_list); ++ struct list_head *l, *ltmp; ++ int n; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &mb_cache_lru_list) { ++ struct mb_cache_entry *ce = ++ list_entry(l, struct mb_cache_entry, e_lru_list); ++ if (ce->e_cache == cache) { ++ list_del(&ce->e_lru_list); ++ list_add_tail(&ce->e_lru_list, &free_list); ++ __mb_cache_entry_unhash(ce); ++ } ++ } ++ list_del(&cache->c_cache_list); ++ spin_unlock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &free_list) { ++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, ++ e_lru_list), GFP_KERNEL); ++ } ++ ++ if (atomic_read(&cache->c_entry_count) > 0) { ++ mb_error("cache %s: %d orphaned entries", ++ cache->c_name, ++ atomic_read(&cache->c_entry_count)); ++ } ++ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0)) ++ /* We don't have kmem_cache_destroy() in 2.2.x */ ++ kmem_cache_shrink(cache->c_entry_cache); ++#else ++ kmem_cache_destroy(cache->c_entry_cache); ++#endif ++ for 
(n=0; n < mb_cache_indexes(cache); n++) ++ kfree(cache->c_indexes_hash[n]); ++ kfree(cache->c_block_hash); ++ kfree(cache); ++ ++ MOD_DEC_USE_COUNT; ++} ++ ++ ++/* ++ * mb_cache_entry_alloc() ++ * ++ * Allocates a new cache entry. The new entry will not be valid initially, ++ * and thus cannot be looked up yet. It should be filled with data, and ++ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL ++ * if no more memory was available. ++ */ ++struct mb_cache_entry * ++mb_cache_entry_alloc(struct mb_cache *cache) ++{ ++ struct mb_cache_entry *ce; ++ ++ atomic_inc(&cache->c_entry_count); ++ ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL); ++ if (ce) { ++ INIT_LIST_HEAD(&ce->e_lru_list); ++ INIT_LIST_HEAD(&ce->e_block_list); ++ ce->e_cache = cache; ++ atomic_set(&ce->e_used, 1); ++ } ++ return ce; ++} ++ ++ ++/* ++ * mb_cache_entry_insert() ++ * ++ * Inserts an entry that was allocated using mb_cache_entry_alloc() into ++ * the cache. After this, the cache entry can be looked up, but is not yet ++ * in the lru list as the caller still holds a handle to it. Returns 0 on ++ * success, or -EBUSY if a cache entry for that device + inode exists ++ * already (this may happen after a failed lookup, if another process has ++ * inserted the same cache entry in the meantime). ++ * ++ * @dev: device the cache entry belongs to ++ * @block: block number ++ * @keys: array of additional keys. There must be indexes_count entries ++ * in the array (as specified when creating the cache). 
++ */ ++int ++mb_cache_entry_insert(struct mb_cache_entry *ce, kdev_t dev, ++ unsigned long block, unsigned int keys[]) ++{ ++ struct mb_cache *cache = ce->e_cache; ++ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count; ++ struct list_head *l; ++ int error = -EBUSY, n; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each(l, &cache->c_block_hash[bucket]) { ++ struct mb_cache_entry *ce = ++ list_entry(l, struct mb_cache_entry, e_block_list); ++ if (ce->e_dev == dev && ce->e_block == block) ++ goto out; ++ } ++ __mb_cache_entry_unhash(ce); ++ ce->e_dev = dev; ++ ce->e_block = block; ++ list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); ++ for (n=0; ne_indexes[n].o_key = keys[n]; ++ bucket = keys[n] % cache->c_bucket_count; ++ list_add(&ce->e_indexes[n].o_list, ++ &cache->c_indexes_hash[n][bucket]); ++ } ++out: ++ spin_unlock(&mb_cache_spinlock); ++ return error; ++} ++ ++ ++/* ++ * mb_cache_entry_release() ++ * ++ * Release a handle to a cache entry. When the last handle to a cache entry ++ * is released it is either freed (if it is invalid) or otherwise inserted ++ * in to the lru list. ++ */ ++void ++mb_cache_entry_release(struct mb_cache_entry *ce) ++{ ++ spin_lock(&mb_cache_spinlock); ++ __mb_cache_entry_release_unlock(ce); ++} ++ ++ ++/* ++ * mb_cache_entry_takeout() ++ * ++ * Take a cache entry out of the cache, making it invalid. The entry can later ++ * be re-inserted using mb_cache_entry_insert(), or released using ++ * mb_cache_entry_release(). ++ */ ++void ++mb_cache_entry_takeout(struct mb_cache_entry *ce) ++{ ++ spin_lock(&mb_cache_spinlock); ++ mb_assert(list_empty(&ce->e_lru_list)); ++ __mb_cache_entry_unhash(ce); ++ spin_unlock(&mb_cache_spinlock); ++} ++ ++ ++/* ++ * mb_cache_entry_free() ++ * ++ * This is equivalent to the sequence mb_cache_entry_takeout() -- ++ * mb_cache_entry_release(). 
++ */ ++void ++mb_cache_entry_free(struct mb_cache_entry *ce) ++{ ++ spin_lock(&mb_cache_spinlock); ++ mb_assert(list_empty(&ce->e_lru_list)); ++ __mb_cache_entry_unhash(ce); ++ __mb_cache_entry_release_unlock(ce); ++} ++ ++ ++/* ++ * mb_cache_entry_dup() ++ * ++ * Duplicate a handle to a cache entry (does not duplicate the cache entry ++ * itself). After the call, both the old and the new handle must be released. ++ */ ++struct mb_cache_entry * ++mb_cache_entry_dup(struct mb_cache_entry *ce) ++{ ++ atomic_inc(&ce->e_used); ++ return ce; ++} ++ ++ ++/* ++ * mb_cache_entry_get() ++ * ++ * Get a cache entry by device / block number. (There can only be one entry ++ * in the cache per device and block.) Returns NULL if no such cache entry ++ * exists. ++ */ ++struct mb_cache_entry * ++mb_cache_entry_get(struct mb_cache *cache, kdev_t dev, unsigned long block) ++{ ++ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count; ++ struct list_head *l; ++ struct mb_cache_entry *ce; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each(l, &cache->c_block_hash[bucket]) { ++ ce = list_entry(l, struct mb_cache_entry, e_block_list); ++ if (ce->e_dev == dev && ce->e_block == block) { ++ if (!list_empty(&ce->e_lru_list)) ++ list_del_init(&ce->e_lru_list); ++ atomic_inc(&ce->e_used); ++ goto cleanup; ++ } ++ } ++ ce = NULL; ++ ++cleanup: ++ spin_unlock(&mb_cache_spinlock); ++ return ce; ++} ++ ++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) ++ ++static struct mb_cache_entry * ++__mb_cache_entry_find(struct list_head *l, struct list_head *head, ++ int index, kdev_t dev, unsigned int key) ++{ ++ while (l != head) { ++ struct mb_cache_entry *ce = ++ list_entry(l, struct mb_cache_entry, ++ e_indexes[index].o_list); ++ if (ce->e_dev == dev && ce->e_indexes[index].o_key == key) { ++ if (!list_empty(&ce->e_lru_list)) ++ list_del_init(&ce->e_lru_list); ++ atomic_inc(&ce->e_used); ++ return ce; ++ } ++ l = l->next; ++ } ++ return NULL; ++} ++ ++ ++/* ++ * 
mb_cache_entry_find_first() ++ * ++ * Find the first cache entry on a given device with a certain key in ++ * an additional index. Additonal matches can be found with ++ * mb_cache_entry_find_next(). Returns NULL if no match was found. ++ * ++ * @cache: the cache to search ++ * @index: the number of the additonal index to search (0<=indexc_bucket_count; ++ struct list_head *l; ++ struct mb_cache_entry *ce; ++ ++ mb_assert(index < mb_cache_indexes(cache)); ++ spin_lock(&mb_cache_spinlock); ++ l = cache->c_indexes_hash[index][bucket].next; ++ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], ++ index, dev, key); ++ spin_unlock(&mb_cache_spinlock); ++ return ce; ++} ++ ++ ++/* ++ * mb_cache_entry_find_next() ++ * ++ * Find the next cache entry on a given device with a certain key in an ++ * additional index. Returns NULL if no match could be found. The previous ++ * entry is atomatically released, so that mb_cache_entry_find_next() can ++ * be called like this: ++ * ++ * entry = mb_cache_entry_find_first(); ++ * while (entry) { ++ * ... 
++ * entry = mb_cache_entry_find_next(entry, ...); ++ * } ++ * ++ * @prev: The previous match ++ * @index: the number of the additonal index to search (0<=indexe_cache; ++ unsigned int bucket = key % cache->c_bucket_count; ++ struct list_head *l; ++ struct mb_cache_entry *ce; ++ ++ mb_assert(index < mb_cache_indexes(cache)); ++ spin_lock(&mb_cache_spinlock); ++ l = prev->e_indexes[index].o_list.next; ++ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], ++ index, dev, key); ++ __mb_cache_entry_release_unlock(prev); ++ return ce; ++} ++ ++#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ ++ ++static int __init init_mbcache(void) ++{ ++ register_cache(&mb_cache_definition); ++ return 0; ++} ++ ++static void __exit exit_mbcache(void) ++{ ++ unregister_cache(&mb_cache_definition); ++} ++ ++module_init(init_mbcache) ++module_exit(exit_mbcache) ++ +--- linux/include/asm-arm/unistd.h~linux-2.4.20-xattr-0.8.54-hp Fri Aug 2 17:39:45 2002 ++++ linux-mmonroe/include/asm-arm/unistd.h Fri May 16 08:43:01 2003 +@@ -244,7 +244,6 @@ + #define __NR_security (__NR_SYSCALL_BASE+223) + #define __NR_gettid (__NR_SYSCALL_BASE+224) + #define __NR_readahead (__NR_SYSCALL_BASE+225) +-#if 0 /* allocated in 2.5 */ + #define __NR_setxattr (__NR_SYSCALL_BASE+226) + #define __NR_lsetxattr (__NR_SYSCALL_BASE+227) + #define __NR_fsetxattr (__NR_SYSCALL_BASE+228) +@@ -257,7 +256,6 @@ + #define __NR_removexattr (__NR_SYSCALL_BASE+235) + #define __NR_lremovexattr (__NR_SYSCALL_BASE+236) + #define __NR_fremovexattr (__NR_SYSCALL_BASE+237) +-#endif + #define __NR_tkill (__NR_SYSCALL_BASE+238) + /* + * Please check 2.5 _before_ adding calls here, +--- linux/include/asm-ppc64/unistd.h~linux-2.4.20-xattr-0.8.54-hp Fri Aug 2 17:39:45 2002 ++++ linux-mmonroe/include/asm-ppc64/unistd.h Fri May 16 08:43:01 2003 +@@ -218,6 +218,7 @@ + #define __NR_gettid 207 + #if 0 /* Reserved syscalls */ + #define __NR_tkill 208 ++#endif + #define __NR_setxattr 209 + #define 
__NR_lsetxattr 210 + #define __NR_fsetxattr 211 +@@ -230,6 +231,7 @@ + #define __NR_removexattr 218 + #define __NR_lremovexattr 219 + #define __NR_fremovexattr 220 ++#if 0 /* Reserved syscalls */ + #define __NR_futex 221 + #endif + +--- linux/include/asm-s390/unistd.h~linux-2.4.20-xattr-0.8.54-hp Fri Aug 2 17:39:45 2002 ++++ linux-mmonroe/include/asm-s390/unistd.h Fri May 16 08:43:01 2003 +@@ -212,9 +212,18 @@ + #define __NR_madvise 219 + #define __NR_getdents64 220 + #define __NR_fcntl64 221 +-/* +- * Numbers 224-235 are reserved for posix acl +- */ ++#define __NR_setxattr 224 ++#define __NR_lsetxattr 225 ++#define __NR_fsetxattr 226 ++#define __NR_getxattr 227 ++#define __NR_lgetxattr 228 ++#define __NR_fgetxattr 229 ++#define __NR_listxattr 230 ++#define __NR_llistxattr 231 ++#define __NR_flistxattr 232 ++#define __NR_removexattr 233 ++#define __NR_lremovexattr 234 ++#define __NR_fremovexattr 235 + #define __NR_gettid 236 + #define __NR_tkill 237 + +--- linux/include/asm-s390x/unistd.h~linux-2.4.20-xattr-0.8.54-hp Fri Aug 2 17:39:45 2002 ++++ linux-mmonroe/include/asm-s390x/unistd.h Fri May 16 08:43:01 2003 +@@ -180,9 +180,18 @@ + #define __NR_pivot_root 217 + #define __NR_mincore 218 + #define __NR_madvise 219 +-/* +- * Numbers 224-235 are reserved for posix acl +- */ ++#define __NR_setxattr 224 ++#define __NR_lsetxattr 225 ++#define __NR_fsetxattr 226 ++#define __NR_getxattr 227 ++#define __NR_lgetxattr 228 ++#define __NR_fgetxattr 229 ++#define __NR_listxattr 230 ++#define __NR_llistxattr 231 ++#define __NR_flistxattr 232 ++#define __NR_removexattr 233 ++#define __NR_lremovexattr 234 ++#define __NR_fremovexattr 235 + #define __NR_gettid 236 + #define __NR_tkill 237 + +--- linux/include/asm-sparc/unistd.h~linux-2.4.20-xattr-0.8.54-hp Fri Aug 2 17:39:45 2002 ++++ linux-mmonroe/include/asm-sparc/unistd.h Fri May 16 08:43:01 2003 +@@ -184,24 +184,24 @@ + /* #define __NR_exportfs 166 SunOS Specific */ + #define __NR_mount 167 /* Common */ + #define __NR_ustat 168 
/* Common */ +-/* #define __NR_semsys 169 SunOS Specific */ +-/* #define __NR_msgsys 170 SunOS Specific */ +-/* #define __NR_shmsys 171 SunOS Specific */ +-/* #define __NR_auditsys 172 SunOS Specific */ +-/* #define __NR_rfssys 173 SunOS Specific */ ++#define __NR_setxattr 169 /* SunOS: semsys */ ++#define __NR_lsetxattr 170 /* SunOS: msgsys */ ++#define __NR_fsetxattr 171 /* SunOS: shmsys */ ++#define __NR_getxattr 172 /* SunOS: auditsys */ ++#define __NR_lgetxattr 173 /* SunOS: rfssys */ + #define __NR_getdents 174 /* Common */ + #define __NR_setsid 175 /* Common */ + #define __NR_fchdir 176 /* Common */ +-/* #define __NR_fchroot 177 SunOS Specific */ +-/* #define __NR_vpixsys 178 SunOS Specific */ +-/* #define __NR_aioread 179 SunOS Specific */ +-/* #define __NR_aiowrite 180 SunOS Specific */ +-/* #define __NR_aiowait 181 SunOS Specific */ +-/* #define __NR_aiocancel 182 SunOS Specific */ ++#define __NR_fgetxattr 177 /* SunOS: fchroot */ ++#define __NR_listxattr 178 /* SunOS: vpixsys */ ++#define __NR_llistxattr 179 /* SunOS: aioread */ ++#define __NR_flistxattr 180 /* SunOS: aiowrite */ ++#define __NR_removexattr 181 /* SunOS: aiowait */ ++#define __NR_lremovexattr 182 /* SunOS: aiocancel */ + #define __NR_sigpending 183 /* Common */ + #define __NR_query_module 184 /* Linux Specific */ + #define __NR_setpgid 185 /* Common */ +-/* #define __NR_pathconf 186 SunOS Specific */ ++#define __NR_fremovexattr 186 /* SunOS: pathconf */ + #define __NR_tkill 187 /* SunOS: fpathconf */ + /* #define __NR_sysconf 188 SunOS Specific */ + #define __NR_uname 189 /* Linux Specific */ +--- linux/include/asm-sparc64/unistd.h~linux-2.4.20-xattr-0.8.54-hp Fri Aug 2 17:39:45 2002 ++++ linux-mmonroe/include/asm-sparc64/unistd.h Fri May 16 08:43:01 2003 +@@ -184,24 +184,24 @@ + /* #define __NR_exportfs 166 SunOS Specific */ + #define __NR_mount 167 /* Common */ + #define __NR_ustat 168 /* Common */ +-/* #define __NR_semsys 169 SunOS Specific */ +-/* #define __NR_msgsys 170 SunOS 
Specific */ +-/* #define __NR_shmsys 171 SunOS Specific */ +-/* #define __NR_auditsys 172 SunOS Specific */ +-/* #define __NR_rfssys 173 SunOS Specific */ ++#define __NR_setxattr 169 /* SunOS: semsys */ ++#define __NR_lsetxattr 170 /* SunOS: msgsys */ ++#define __NR_fsetxattr 171 /* SunOS: shmsys */ ++#define __NR_getxattr 172 /* SunOS: auditsys */ ++#define __NR_lgetxattr 173 /* SunOS: rfssys */ + #define __NR_getdents 174 /* Common */ + #define __NR_setsid 175 /* Common */ + #define __NR_fchdir 176 /* Common */ +-/* #define __NR_fchroot 177 SunOS Specific */ +-/* #define __NR_vpixsys 178 SunOS Specific */ +-/* #define __NR_aioread 179 SunOS Specific */ +-/* #define __NR_aiowrite 180 SunOS Specific */ +-/* #define __NR_aiowait 181 SunOS Specific */ +-/* #define __NR_aiocancel 182 SunOS Specific */ ++#define __NR_fgetxattr 177 /* SunOS: fchroot */ ++#define __NR_listxattr 178 /* SunOS: vpixsys */ ++#define __NR_llistxattr 179 /* SunOS: aioread */ ++#define __NR_flistxattr 180 /* SunOS: aiowrite */ ++#define __NR_removexattr 181 /* SunOS: aiowait */ ++#define __NR_lremovexattr 182 /* SunOS: aiocancel */ + #define __NR_sigpending 183 /* Common */ + #define __NR_query_module 184 /* Linux Specific */ + #define __NR_setpgid 185 /* Common */ +-/* #define __NR_pathconf 186 SunOS Specific */ ++#define __NR_fremovexattr 186 /* SunOS: pathconf */ + #define __NR_tkill 187 /* SunOS: fpathconf */ + /* #define __NR_sysconf 188 SunOS Specific */ + #define __NR_uname 189 /* Linux Specific */ +--- /dev/null Mon May 20 21:11:23 2002 ++++ linux-mmonroe/include/linux/cache_def.h Fri May 16 08:43:01 2003 +@@ -0,0 +1,15 @@ ++/* ++ * linux/cache_def.h ++ * Handling of caches defined in drivers, filesystems, ... 
++ * ++ * Copyright (C) 2002 by Andreas Gruenbacher, ++ */ ++ ++struct cache_definition { ++ const char *name; ++ void (*shrink)(int, unsigned int); ++ struct list_head link; ++}; ++ ++extern void register_cache(struct cache_definition *); ++extern void unregister_cache(struct cache_definition *); +--- linux/include/linux/errno.h~linux-2.4.20-xattr-0.8.54-hp Fri Feb 9 14:46:13 2001 ++++ linux-mmonroe/include/linux/errno.h Fri May 16 08:43:01 2003 +@@ -23,4 +23,8 @@ + + #endif + ++/* Defined for extended attributes */ ++#define ENOATTR ENODATA /* No such attribute */ ++#define ENOTSUP EOPNOTSUPP /* Operation not supported */ ++ + #endif +--- linux/include/linux/ext2_fs.h~linux-2.4.20-xattr-0.8.54-hp Thu Nov 22 11:46:52 2001 ++++ linux-mmonroe/include/linux/ext2_fs.h Fri May 16 08:43:01 2003 +@@ -57,8 +57,6 @@ + */ + #define EXT2_BAD_INO 1 /* Bad blocks inode */ + #define EXT2_ROOT_INO 2 /* Root inode */ +-#define EXT2_ACL_IDX_INO 3 /* ACL inode */ +-#define EXT2_ACL_DATA_INO 4 /* ACL inode */ + #define EXT2_BOOT_LOADER_INO 5 /* Boot loader inode */ + #define EXT2_UNDEL_DIR_INO 6 /* Undelete directory inode */ + +@@ -86,7 +84,6 @@ + #else + # define EXT2_BLOCK_SIZE(s) (EXT2_MIN_BLOCK_SIZE << (s)->s_log_block_size) + #endif +-#define EXT2_ACLE_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_acl_entry)) + #define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32)) + #ifdef __KERNEL__ + # define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +@@ -121,28 +118,6 @@ + #endif + + /* +- * ACL structures +- */ +-struct ext2_acl_header /* Header of Access Control Lists */ +-{ +- __u32 aclh_size; +- __u32 aclh_file_count; +- __u32 aclh_acle_count; +- __u32 aclh_first_acle; +-}; +- +-struct ext2_acl_entry /* Access Control List Entry */ +-{ +- __u32 acle_size; +- __u16 acle_perms; /* Access permissions */ +- __u16 acle_type; /* Type of entry */ +- __u16 acle_tag; /* User or group identity */ +- __u16 acle_pad1; +- __u32 acle_next; /* Pointer on next entry for 
the */ +- /* same inode or on next free entry */ +-}; +- +-/* + * Structure of a blocks group descriptor + */ + struct ext2_group_desc +@@ -314,6 +289,7 @@ struct ext2_inode { + #define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ + #define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */ + #define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */ ++#define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ + + #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt + #define set_opt(o, opt) o |= EXT2_MOUNT_##opt +@@ -397,6 +373,7 @@ struct ext2_super_block { + + #ifdef __KERNEL__ + #define EXT2_SB(sb) (&((sb)->u.ext2_sb)) ++#define EXT2_I(inode) (&((inode)->u.ext2_i)) + #else + /* Assume that user mode programs are passing in an ext2fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test +@@ -466,7 +443,7 @@ struct ext2_super_block { + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 + #define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff + +-#define EXT2_FEATURE_COMPAT_SUPP 0 ++#define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT2_FEATURE_INCOMPAT_SUPP EXT2_FEATURE_INCOMPAT_FILETYPE + #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \ +@@ -623,8 +600,10 @@ extern struct address_space_operations e + + /* namei.c */ + extern struct inode_operations ext2_dir_inode_operations; ++extern struct inode_operations ext2_special_inode_operations; + + /* symlink.c */ ++extern struct inode_operations ext2_symlink_inode_operations; + extern struct inode_operations ext2_fast_symlink_inode_operations; + + #endif /* __KERNEL__ */ +--- /dev/null Mon May 20 21:11:23 2002 ++++ linux-mmonroe/include/linux/ext2_xattr.h Fri May 16 08:43:01 2003 +@@ -0,0 +1,157 @@ ++/* ++ File: linux/ext2_xattr.h ++ ++ On-disk format of extended attributes for the ext2 filesystem. 
++ ++ (C) 2001 Andreas Gruenbacher, ++*/ ++ ++#include ++#include ++#include ++ ++/* Magic value in attribute blocks */ ++#define EXT2_XATTR_MAGIC 0xEA020000 ++ ++/* Maximum number of references to one attribute block */ ++#define EXT2_XATTR_REFCOUNT_MAX 1024 ++ ++/* Name indexes */ ++#define EXT2_XATTR_INDEX_MAX 10 ++#define EXT2_XATTR_INDEX_USER 1 ++#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS 2 ++#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT 3 ++ ++struct ext2_xattr_header { ++ __u32 h_magic; /* magic number for identification */ ++ __u32 h_refcount; /* reference count */ ++ __u32 h_blocks; /* number of disk blocks used */ ++ __u32 h_hash; /* hash value of all attributes */ ++ __u32 h_reserved[4]; /* zero right now */ ++}; ++ ++struct ext2_xattr_entry { ++ __u8 e_name_len; /* length of name */ ++ __u8 e_name_index; /* attribute name index */ ++ __u16 e_value_offs; /* offset in disk block of value */ ++ __u32 e_value_block; /* disk block attribute is stored on (n/i) */ ++ __u32 e_value_size; /* size of attribute value */ ++ __u32 e_hash; /* hash value of name and value */ ++ char e_name[0]; /* attribute name */ ++}; ++ ++#define EXT2_XATTR_PAD_BITS 2 ++#define EXT2_XATTR_PAD (1<e_name_len)) ) ++#define EXT2_XATTR_SIZE(size) \ ++ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) ++ ++#ifdef __KERNEL__ ++ ++# ifdef CONFIG_EXT2_FS_XATTR ++ ++struct ext2_xattr_handler { ++ char *prefix; ++ size_t (*list)(char *list, struct inode *inode, const char *name, ++ int name_len); ++ int (*get)(struct inode *inode, const char *name, void *buffer, ++ size_t size); ++ int (*set)(struct inode *inode, const char *name, const void *buffer, ++ size_t size, int flags); ++}; ++ ++extern int ext2_xattr_register(int, struct ext2_xattr_handler *); ++extern void ext2_xattr_unregister(int, struct ext2_xattr_handler *); ++ ++extern int ext2_setxattr(struct dentry *, const char *, const void *, size_t, int); ++extern ssize_t ext2_getxattr(struct dentry *, const char *, void *, size_t); ++extern 
ssize_t ext2_listxattr(struct dentry *, char *, size_t); ++extern int ext2_removexattr(struct dentry *, const char *); ++ ++extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); ++extern int ext2_xattr_list(struct inode *, char *, size_t); ++extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); ++ ++extern void ext2_xattr_delete_inode(struct inode *); ++extern void ext2_xattr_put_super(struct super_block *); ++ ++extern int init_ext2_xattr(void) __init; ++extern void exit_ext2_xattr(void); ++ ++# else /* CONFIG_EXT2_FS_XATTR */ ++# define ext2_setxattr NULL ++# define ext2_getxattr NULL ++# define ext2_listxattr NULL ++# define ext2_removexattr NULL ++ ++static inline int ++ext2_xattr_get(struct inode *inode, int name_index, ++ const char *name, void *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext2_xattr_list(struct inode *inode, char *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext2_xattr_set(struct inode *inode, int name_index, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ return -ENOTSUP; ++} ++ ++static inline void ++ext2_xattr_delete_inode(struct inode *inode) ++{ ++} ++ ++static inline void ++ext2_xattr_put_super(struct super_block *sb) ++{ ++} ++ ++static inline int ++init_ext2_xattr(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext2_xattr(void) ++{ ++} ++ ++# endif /* CONFIG_EXT2_FS_XATTR */ ++ ++# ifdef CONFIG_EXT2_FS_XATTR_USER ++ ++extern int init_ext2_xattr_user(void) __init; ++extern void exit_ext2_xattr_user(void); ++ ++# else /* CONFIG_EXT2_FS_XATTR_USER */ ++ ++static inline int ++init_ext2_xattr_user(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext2_xattr_user(void) ++{ ++} ++ ++# endif /* CONFIG_EXT2_FS_XATTR_USER */ ++ ++#endif /* __KERNEL__ */ ++ +--- linux/include/linux/ext3_fs.h~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:42:47 2003 ++++ linux-mmonroe/include/linux/ext3_fs.h 
Fri May 16 08:43:01 2003 +@@ -63,8 +63,6 @@ + */ + #define EXT3_BAD_INO 1 /* Bad blocks inode */ + #define EXT3_ROOT_INO 2 /* Root inode */ +-#define EXT3_ACL_IDX_INO 3 /* ACL inode */ +-#define EXT3_ACL_DATA_INO 4 /* ACL inode */ + #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ + #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ + #define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ +@@ -94,7 +92,6 @@ + #else + # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) + #endif +-#define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry)) + #define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) + #ifdef __KERNEL__ + # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +@@ -129,28 +126,6 @@ + #endif + + /* +- * ACL structures +- */ +-struct ext3_acl_header /* Header of Access Control Lists */ +-{ +- __u32 aclh_size; +- __u32 aclh_file_count; +- __u32 aclh_acle_count; +- __u32 aclh_first_acle; +-}; +- +-struct ext3_acl_entry /* Access Control List Entry */ +-{ +- __u32 acle_size; +- __u16 acle_perms; /* Access permissions */ +- __u16 acle_type; /* Type of entry */ +- __u16 acle_tag; /* User or group identity */ +- __u16 acle_pad1; +- __u32 acle_next; /* Pointer on next entry for the */ +- /* same inode or on next free entry */ +-}; +- +-/* + * Structure of a blocks group descriptor + */ + struct ext3_group_desc +@@ -344,6 +319,7 @@ struct ext3_inode { + #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ ++#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -520,7 +496,7 @@ struct ext3_super_block { + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define 
EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + +-#define EXT3_FEATURE_COMPAT_SUPP 0 ++#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ +@@ -703,6 +679,7 @@ extern void ext3_check_inodes_bitmap (st + extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + /* inode.c */ ++extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); + +@@ -771,8 +748,10 @@ extern struct address_space_operations e + + /* namei.c */ + extern struct inode_operations ext3_dir_inode_operations; ++extern struct inode_operations ext3_special_inode_operations; + + /* symlink.c */ ++extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + + +--- linux/include/linux/ext3_jbd.h~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:42:46 2003 ++++ linux-mmonroe/include/linux/ext3_jbd.h Fri May 16 08:43:01 2003 +@@ -30,13 +30,19 @@ + + #define EXT3_SINGLEDATA_TRANS_BLOCKS 8 + ++/* Extended attributes may touch two data buffers, two bitmap buffers, ++ * and two group summaries. */ ++ ++#define EXT3_XATTR_TRANS_BLOCKS 8 ++ + /* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates.
*/ + +-#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2) ++#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \ ++ EXT3_XATTR_TRANS_BLOCKS - 2) + + extern int ext3_writepage_trans_blocks(struct inode *inode); + +--- /dev/null Mon May 20 21:11:23 2002 ++++ linux-mmonroe/include/linux/ext3_xattr.h Fri May 16 08:43:01 2003 +@@ -0,0 +1,157 @@ ++/* ++ File: linux/ext3_xattr.h ++ ++ On-disk format of extended attributes for the ext3 filesystem. ++ ++ (C) 2001 Andreas Gruenbacher, ++*/ ++ ++#include <linux/config.h> ++#include <linux/init.h> ++#include <linux/xattr.h> ++ ++/* Magic value in attribute blocks */ ++#define EXT3_XATTR_MAGIC 0xEA020000 ++ ++/* Maximum number of references to one attribute block */ ++#define EXT3_XATTR_REFCOUNT_MAX 1024 ++ ++/* Name indexes */ ++#define EXT3_XATTR_INDEX_MAX 10 ++#define EXT3_XATTR_INDEX_USER 1 ++#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2 ++#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3 ++ ++struct ext3_xattr_header { ++ __u32 h_magic; /* magic number for identification */ ++ __u32 h_refcount; /* reference count */ ++ __u32 h_blocks; /* number of disk blocks used */ ++ __u32 h_hash; /* hash value of all attributes */ ++ __u32 h_reserved[4]; /* zero right now */ ++}; ++ ++struct ext3_xattr_entry { ++ __u8 e_name_len; /* length of name */ ++ __u8 e_name_index; /* attribute name index */ ++ __u16 e_value_offs; /* offset in disk block of value */ ++ __u32 e_value_block; /* disk block attribute is stored on (n/i) */ ++ __u32 e_value_size; /* size of attribute value */ ++ __u32 e_hash; /* hash value of name and value */ ++ char e_name[0]; /* attribute name */ ++}; ++ ++#define EXT3_XATTR_PAD_BITS 2 ++#define EXT3_XATTR_PAD (1<<EXT3_XATTR_PAD_BITS) ++#define EXT3_XATTR_ROUND (EXT3_XATTR_PAD-1) ++#define EXT3_XATTR_LEN(name_len) \ ++ (((name_len) + EXT3_XATTR_ROUND + \ ++ sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND) ++#define EXT3_XATTR_NEXT(entry) \ ++ ( (struct ext3_xattr_entry *)( \ ++ (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) ) ++#define EXT3_XATTR_SIZE(size) \ ++ (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND) ++ ++#ifdef __KERNEL__ ++ ++# ifdef CONFIG_EXT3_FS_XATTR ++ ++struct ext3_xattr_handler { ++ char *prefix; ++ size_t (*list)(char *list, struct inode *inode, const char *name, ++ int name_len); ++ int (*get)(struct inode
*inode, const char *name, void *buffer, ++ size_t size); ++ int (*set)(struct inode *inode, const char *name, const void *buffer, ++ size_t size, int flags); ++}; ++ ++extern int ext3_xattr_register(int, struct ext3_xattr_handler *); ++extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *); ++ ++extern int ext3_setxattr(struct dentry *, const char *, const void *, size_t, int); ++extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t); ++extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); ++extern int ext3_removexattr(struct dentry *, const char *); ++ ++extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); ++extern int ext3_xattr_list(struct inode *, char *, size_t); ++extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int); ++ ++extern void ext3_xattr_delete_inode(handle_t *, struct inode *); ++extern void ext3_xattr_put_super(struct super_block *); ++ ++extern int init_ext3_xattr(void) __init; ++extern void exit_ext3_xattr(void); ++ ++# else /* CONFIG_EXT3_FS_XATTR */ ++# define ext3_setxattr NULL ++# define ext3_getxattr NULL ++# define ext3_listxattr NULL ++# define ext3_removexattr NULL ++ ++static inline int ++ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext3_xattr_list(struct inode *inode, void *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, const void *value, size_t size, int flags) ++{ ++ return -ENOTSUP; ++} ++ ++static inline void ++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) ++{ ++} ++ ++static inline void ++ext3_xattr_put_super(struct super_block *sb) ++{ ++} ++ ++static inline int ++init_ext3_xattr(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext3_xattr(void) ++{ ++} ++ ++# endif 
/* CONFIG_EXT3_FS_XATTR */ ++ ++# ifdef CONFIG_EXT3_FS_XATTR_USER ++ ++extern int init_ext3_xattr_user(void) __init; ++extern void exit_ext3_xattr_user(void); ++ ++# else /* CONFIG_EXT3_FS_XATTR_USER */ ++ ++static inline int ++init_ext3_xattr_user(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext3_xattr_user(void) ++{ ++} ++ ++#endif /* CONFIG_EXT3_FS_XATTR_USER */ ++ ++#endif /* __KERNEL__ */ ++ +--- linux/include/linux/fs.h~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:42:46 2003 ++++ linux-mmonroe/include/linux/fs.h Fri May 16 08:43:01 2003 +@@ -909,7 +909,7 @@ struct inode_operations { + int (*setattr) (struct dentry *, struct iattr *); + int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct dentry *, struct iattr *); +- int (*setxattr) (struct dentry *, const char *, void *, size_t, int); ++ int (*setxattr) (struct dentry *, const char *, const void *, size_t, int); + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr) (struct dentry *, char *, size_t); + int (*removexattr) (struct dentry *, const char *); +--- /dev/null Mon May 20 21:11:23 2002 ++++ linux-mmonroe/include/linux/mbcache.h Fri May 16 08:43:01 2003 +@@ -0,0 +1,69 @@ ++/* ++ File: linux/mbcache.h ++ ++ (C) 2001 by Andreas Gruenbacher, ++*/ ++ ++/* Hardwire the number of additional indexes */ ++#define MB_CACHE_INDEXES_COUNT 1 ++ ++struct mb_cache_entry; ++ ++struct mb_cache_op { ++ int (*free)(struct mb_cache_entry *, int); ++}; ++ ++struct mb_cache { ++ struct list_head c_cache_list; ++ const char *c_name; ++ struct mb_cache_op c_op; ++ atomic_t c_entry_count; ++ int c_bucket_count; ++#ifndef MB_CACHE_INDEXES_COUNT ++ int c_indexes_count; ++#endif ++ kmem_cache_t *c_entry_cache; ++ struct list_head *c_block_hash; ++ struct list_head *c_indexes_hash[0]; ++}; ++ ++struct mb_cache_entry_index { ++ struct list_head o_list; ++ unsigned int o_key; ++}; ++ ++struct mb_cache_entry { ++ struct list_head e_lru_list; ++ struct 
mb_cache *e_cache; ++ atomic_t e_used; ++ kdev_t e_dev; ++ unsigned long e_block; ++ struct list_head e_block_list; ++ struct mb_cache_entry_index e_indexes[0]; ++}; ++ ++/* Functions on caches */ ++ ++struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t, ++ int, int); ++void mb_cache_shrink(struct mb_cache *, kdev_t); ++void mb_cache_destroy(struct mb_cache *); ++ ++/* Functions on cache entries */ ++ ++struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *); ++int mb_cache_entry_insert(struct mb_cache_entry *, kdev_t, unsigned long, ++ unsigned int[]); ++void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]); ++void mb_cache_entry_release(struct mb_cache_entry *); ++void mb_cache_entry_takeout(struct mb_cache_entry *); ++void mb_cache_entry_free(struct mb_cache_entry *); ++struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *); ++struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, kdev_t, ++ unsigned long); ++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) ++struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int, ++ kdev_t, unsigned int); ++struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int, ++ kdev_t, unsigned int); ++#endif +--- linux/kernel/ksyms.c~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:42:45 2003 ++++ linux-mmonroe/kernel/ksyms.c Fri May 16 08:43:52 2003 +@@ -11,6 +11,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -106,6 +107,7 @@ EXPORT_SYMBOL(exit_mm); + EXPORT_SYMBOL(exit_files); + EXPORT_SYMBOL(exit_fs); + EXPORT_SYMBOL(exit_sighand); ++EXPORT_SYMBOL(copy_fs_struct); + EXPORT_SYMBOL_GPL(make_pages_present); + + /* internal kernel memory management */ +@@ -126,6 +128,8 @@ EXPORT_SYMBOL(kmem_cache_validate); + EXPORT_SYMBOL(kmem_cache_alloc); + EXPORT_SYMBOL(kmem_cache_free); + EXPORT_SYMBOL(kmem_cache_size); ++EXPORT_SYMBOL(register_cache); ++EXPORT_SYMBOL(unregister_cache); + 
EXPORT_SYMBOL(kmalloc); + EXPORT_SYMBOL(kfree); + EXPORT_SYMBOL(vfree); +--- linux/mm/vmscan.c~linux-2.4.20-xattr-0.8.54-hp Fri May 16 08:39:23 2003 ++++ linux-mmonroe/mm/vmscan.c Fri May 16 08:43:01 2003 +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -35,6 +36,39 @@ + */ + #define DEF_PRIORITY (6) + ++static DECLARE_MUTEX(other_caches_sem); ++static LIST_HEAD(cache_definitions); ++ ++void register_cache(struct cache_definition *cache) ++{ ++ down(&other_caches_sem); ++ list_add(&cache->link, &cache_definitions); ++ up(&other_caches_sem); ++} ++ ++void unregister_cache(struct cache_definition *cache) ++{ ++ down(&other_caches_sem); ++ list_del(&cache->link); ++ up(&other_caches_sem); ++} ++ ++static void shrink_other_caches(unsigned int priority, int gfp_mask) ++{ ++ struct list_head *p; ++ ++ if (down_trylock(&other_caches_sem)) ++ return; ++ ++ list_for_each_prev(p, &cache_definitions) { ++ struct cache_definition *cache = ++ list_entry(p, struct cache_definition, link); ++ ++ cache->shrink(priority, gfp_mask); ++ } ++ up(&other_caches_sem); ++} ++ + /* + * The swap-out function returns 1 if it successfully + * scanned all the pages it was asked to (`count'). 
+@@ -579,6 +613,7 @@ static int shrink_caches(zone_t * classz + + shrink_dcache_memory(priority, gfp_mask); + shrink_icache_memory(priority, gfp_mask); ++ shrink_other_caches(priority, gfp_mask); + #ifdef CONFIG_QUOTA + shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); + #endif + +_ diff --git a/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54.patch b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54.patch new file mode 100644 index 0000000..1489989 --- /dev/null +++ b/lustre/kernel_patches/patches/linux-2.4.20-xattr-0.8.54.patch @@ -0,0 +1,5595 @@ + Documentation/Configure.help | 66 ++ + arch/alpha/defconfig | 7 + arch/alpha/kernel/entry.S | 12 + arch/arm/defconfig | 7 + arch/arm/kernel/calls.S | 24 + arch/i386/defconfig | 7 + arch/ia64/defconfig | 7 + arch/ia64/kernel/entry.S | 24 + arch/m68k/defconfig | 7 + arch/mips/defconfig | 7 + arch/mips64/defconfig | 7 + arch/ppc/defconfig | 14 + arch/ppc64/kernel/misc.S | 2 + arch/s390/defconfig | 7 + arch/s390/kernel/entry.S | 24 + arch/s390x/defconfig | 7 + arch/s390x/kernel/entry.S | 24 + arch/s390x/kernel/wrapper32.S | 92 +++ + arch/sparc/defconfig | 7 + arch/sparc/kernel/systbls.S | 10 + arch/sparc64/defconfig | 7 + arch/sparc64/kernel/systbls.S | 20 + fs/Config.in | 14 + fs/Makefile | 3 + fs/ext2/Makefile | 4 + fs/ext2/file.c | 5 + fs/ext2/ialloc.c | 2 + fs/ext2/inode.c | 34 - + fs/ext2/namei.c | 14 + fs/ext2/super.c | 29 + fs/ext2/symlink.c | 14 + fs/ext2/xattr.c | 1212 +++++++++++++++++++++++++++++++++++++++++ + fs/ext2/xattr_user.c | 103 +++ + fs/ext3/Makefile | 10 + fs/ext3/file.c | 5 + fs/ext3/ialloc.c | 2 + fs/ext3/inode.c | 35 - + fs/ext3/namei.c | 21 + fs/ext3/super.c | 36 + + fs/ext3/symlink.c | 14 + fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++ + fs/ext3/xattr_user.c | 111 +++ + fs/jfs/jfs_xattr.h | 6 + fs/jfs/xattr.c | 6 + fs/mbcache.c | 648 ++++++++++++++++++++++ + include/asm-arm/unistd.h | 2 + include/asm-ia64/unistd.h | 13 + include/asm-ppc64/unistd.h | 2 + 
include/asm-s390/unistd.h | 15 + include/asm-s390x/unistd.h | 15 + include/asm-sparc/unistd.h | 24 + include/asm-sparc64/unistd.h | 24 + include/linux/cache_def.h | 15 + include/linux/errno.h | 4 + include/linux/ext2_fs.h | 31 - + include/linux/ext2_xattr.h | 157 +++++ + include/linux/ext3_fs.h | 31 - + include/linux/ext3_jbd.h | 8 + include/linux/ext3_xattr.h | 157 +++++ + include/linux/fs.h | 2 + include/linux/mbcache.h | 69 ++ + kernel/ksyms.c | 4 + mm/vmscan.c | 35 + + fs/ext3/ext3-exports.c | 14 + + 64 files changed, 4355 insertions(+), 195 deletions(-) + +--- linux-2.4.20/Documentation/Configure.help~linux-2.4.20-xattr-0.8.54 2003-05-05 17:43:06.000000000 +0800 ++++ linux-2.4.20-root/Documentation/Configure.help 2003-05-07 18:08:03.000000000 +0800 +@@ -15242,6 +15242,39 @@ CONFIG_EXT2_FS + be compiled as a module, and so this could be dangerous. Most + everyone wants to say Y here. + ++Ext2 extended attributes ++CONFIG_EXT2_FS_XATTR ++ Extended attributes are name:value pairs associated with inodes by ++ the kernel or by users (see the attr(5) manual page, or visit ++ <http://acl.bestbits.at/> for details). ++ ++ If unsure, say N. ++ ++Ext2 extended attribute block sharing ++CONFIG_EXT2_FS_XATTR_SHARING ++ This option enables code for sharing identical extended attribute ++ blocks among multiple inodes. ++ ++ Usually, say Y. ++ ++Ext2 extended user attributes ++CONFIG_EXT2_FS_XATTR_USER ++ This option enables extended user attributes on ext2. Processes can ++ associate extended user attributes with inodes to store additional ++ information such as the character encoding of files, etc. (see the ++ attr(5) manual page, or visit <http://acl.bestbits.at/> for details). ++ ++ If unsure, say N. ++ ++Ext2 trusted extended attributes ++CONFIG_EXT2_FS_XATTR_TRUSTED ++ This option enables extended attributes on ext2 that are accessible ++ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this ++ is only the super user. Trusted extended attributes are meant for ++ implementing system/security services.
++ ++ If unsure, say N. ++ + Ext3 journalling file system support (EXPERIMENTAL) + CONFIG_EXT3_FS + This is the journalling version of the Second extended file system +@@ -15274,6 +15307,39 @@ CONFIG_EXT3_FS + of your root partition (the one containing the directory /) cannot + be compiled as a module, and so this may be dangerous. + ++Ext3 extended attributes ++CONFIG_EXT3_FS_XATTR ++ Extended attributes are name:value pairs associated with inodes by ++ the kernel or by users (see the attr(5) manual page, or visit ++ <http://acl.bestbits.at/> for details). ++ ++ If unsure, say N. ++ ++Ext3 extended attribute block sharing ++CONFIG_EXT3_FS_XATTR_SHARING ++ This option enables code for sharing identical extended attribute ++ blocks among multiple inodes. ++ ++ Usually, say Y. ++ ++Ext3 extended user attributes ++CONFIG_EXT3_FS_XATTR_USER ++ This option enables extended user attributes on ext3. Processes can ++ associate extended user attributes with inodes to store additional ++ information such as the character encoding of files, etc. (see the ++ attr(5) manual page, or visit <http://acl.bestbits.at/> for details). ++ ++ If unsure, say N. ++ ++Ext3 trusted extended attributes ++CONFIG_EXT3_FS_XATTR_TRUSTED ++ This option enables extended attributes on ext3 that are accessible ++ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this ++ is only the super user. Trusted extended attributes are meant for ++ implementing system/security services. ++ ++ If unsure, say N. ++ + Journal Block Device support (JBD for ext3) (EXPERIMENTAL) + CONFIG_JBD + This is a generic journalling layer for block devices. It is
It is +--- linux-2.4.20/arch/alpha/defconfig~linux-2.4.20-xattr-0.8.54 2001-11-20 07:19:42.000000000 +0800 ++++ linux-2.4.20-root/arch/alpha/defconfig 2003-05-07 18:08:03.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_ALPHA=y + # CONFIG_UID16 is not set + # CONFIG_RWSEM_GENERIC_SPINLOCK is not set +--- linux-2.4.20/arch/alpha/kernel/entry.S~linux-2.4.20-xattr-0.8.54 2002-08-03 08:39:42.000000000 +0800 ++++ linux-2.4.20-root/arch/alpha/kernel/entry.S 2003-05-07 18:08:03.000000000 +0800 +@@ -1154,6 +1154,18 @@ sys_call_table: + .quad sys_readahead + .quad sys_ni_syscall /* 380, sys_security */ + .quad sys_tkill ++ .quad sys_setxattr ++ .quad sys_lsetxattr ++ .quad sys_fsetxattr ++ .quad sys_getxattr /* 385 */ ++ .quad sys_lgetxattr ++ .quad sys_fgetxattr ++ .quad sys_listxattr ++ .quad sys_llistxattr ++ .quad sys_flistxattr /* 390 */ ++ .quad sys_removexattr ++ .quad sys_lremovexattr ++ .quad sys_fremovexattr + + /* Remember to update everything, kids. */ + .ifne (. 
- sys_call_table) - (NR_SYSCALLS * 8) +--- linux-2.4.20/arch/arm/defconfig~linux-2.4.20-xattr-0.8.54 2001-05-20 08:43:05.000000000 +0800 ++++ linux-2.4.20-root/arch/arm/defconfig 2003-05-07 18:08:03.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_ARM=y + # CONFIG_EISA is not set + # CONFIG_SBUS is not set +--- linux-2.4.20/arch/arm/kernel/calls.S~linux-2.4.20-xattr-0.8.54 2002-08-03 08:39:42.000000000 +0800 ++++ linux-2.4.20-root/arch/arm/kernel/calls.S 2003-05-07 18:08:03.000000000 +0800 +@@ -240,18 +240,18 @@ __syscall_start: + .long SYMBOL_NAME(sys_ni_syscall) /* Security */ + .long SYMBOL_NAME(sys_gettid) + /* 225 */ .long SYMBOL_NAME(sys_readahead) +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_setxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_lsetxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_fsetxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_getxattr */ +-/* 230 */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_lgetxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_fgetxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_listxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_llistxattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_flistxattr */ +-/* 235 */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_removexattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_lremovexattr */ +- .long SYMBOL_NAME(sys_ni_syscall) /* sys_fremovexattr */ ++ .long SYMBOL_NAME(sys_setxattr) ++ .long SYMBOL_NAME(sys_lsetxattr) ++ .long SYMBOL_NAME(sys_fsetxattr) ++ .long SYMBOL_NAME(sys_getxattr) ++/* 230 */ .long SYMBOL_NAME(sys_lgetxattr) ++ .long SYMBOL_NAME(sys_fgetxattr) ++ .long SYMBOL_NAME(sys_listxattr) ++ .long SYMBOL_NAME(sys_llistxattr) 
++ .long SYMBOL_NAME(sys_flistxattr) ++/* 235 */ .long SYMBOL_NAME(sys_removexattr) ++ .long SYMBOL_NAME(sys_lremovexattr) ++ .long SYMBOL_NAME(sys_fremovexattr) + .long SYMBOL_NAME(sys_tkill) + /* + * Please check 2.5 _before_ adding calls here, +--- linux-2.4.20/arch/i386/defconfig~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:09.000000000 +0800 ++++ linux-2.4.20-root/arch/i386/defconfig 2003-05-07 18:08:03.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_X86=y + CONFIG_ISA=y + # CONFIG_SBUS is not set +--- linux-2.4.20/arch/ia64/defconfig~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:09.000000000 +0800 ++++ linux-2.4.20-root/arch/ia64/defconfig 2003-05-07 18:08:03.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + + # + # Code maturity level options +--- linux-2.4.20/arch/ia64/kernel/entry.S~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:09.000000000 +0800 ++++ linux-2.4.20-root/arch/ia64/kernel/entry.S 2003-05-07 18:08:03.000000000 +0800 +@@ -1170,18 +1170,18 @@ sys_call_table: + data8 sys_getdents64 + data8 sys_getunwind // 1215 + data8 sys_readahead +- data8 ia64_ni_syscall +- data8 ia64_ni_syscall +- data8 ia64_ni_syscall +- data8 ia64_ni_syscall // 1220 +- data8 ia64_ni_syscall +- data8 ia64_ni_syscall +- data8 ia64_ni_syscall +- data8 ia64_ni_syscall +- data8 ia64_ni_syscall // 1225 +- data8 ia64_ni_syscall +- data8 
ia64_ni_syscall +- data8 ia64_ni_syscall ++ data8 sys_setxattr ++ data8 sys_lsetxattr ++ data8 sys_fsetxattr ++ data8 sys_getxattr // 1220 ++ data8 sys_lgetxattr ++ data8 sys_fgetxattr ++ data8 sys_listxattr ++ data8 sys_llistxattr ++ data8 sys_flistxattr // 1225 ++ data8 sys_removexattr ++ data8 sys_lremovexattr ++ data8 sys_fremovexattr + data8 sys_tkill + data8 ia64_ni_syscall // 1230 + data8 ia64_ni_syscall +--- linux-2.4.20/arch/m68k/defconfig~linux-2.4.20-xattr-0.8.54 2000-06-20 03:56:08.000000000 +0800 ++++ linux-2.4.20-root/arch/m68k/defconfig 2003-05-07 18:08:03.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_UID16=y + + # +--- linux-2.4.20/arch/mips/defconfig~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:10.000000000 +0800 ++++ linux-2.4.20-root/arch/mips/defconfig 2003-05-07 18:08:03.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_MIPS=y + CONFIG_MIPS32=y + # CONFIG_MIPS64 is not set +--- linux-2.4.20/arch/mips64/defconfig~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:10.000000000 +0800 ++++ linux-2.4.20-root/arch/mips64/defconfig 2003-05-07 18:08:03.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# 
CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_MIPS=y + # CONFIG_MIPS32 is not set + CONFIG_MIPS64=y +--- linux-2.4.20/arch/ppc/defconfig~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:11.000000000 +0800 ++++ linux-2.4.20-root/arch/ppc/defconfig 2003-05-07 18:08:03.000000000 +0800 +@@ -1,6 +1,20 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + # CONFIG_UID16 is not set + # CONFIG_RWSEM_GENERIC_SPINLOCK is not set + CONFIG_RWSEM_XCHGADD_ALGORITHM=y +--- linux-2.4.20/arch/ppc64/kernel/misc.S~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:11.000000000 +0800 ++++ linux-2.4.20-root/arch/ppc64/kernel/misc.S 2003-05-07 18:08:03.000000000 +0800 +@@ -731,6 +731,7 @@ _GLOBAL(sys_call_table32) + .llong .sys_gettid /* 207 */ + #if 0 /* Reserved syscalls */ + .llong .sys_tkill /* 208 */ ++#endif + .llong .sys_setxattr + .llong .sys_lsetxattr /* 210 */ + .llong .sys_fsetxattr +@@ -743,6 +744,7 @@ _GLOBAL(sys_call_table32) + .llong .sys_removexattr + .llong .sys_lremovexattr + .llong .sys_fremovexattr /* 220 */ ++#if 0 /* Reserved syscalls */ + .llong .sys_futex + #endif + .llong .sys_perfmonctl /* Put this here for now ... 
*/ +--- linux-2.4.20/arch/s390/defconfig~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:11.000000000 +0800 ++++ linux-2.4.20-root/arch/s390/defconfig 2003-05-07 18:08:03.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + # CONFIG_ISA is not set + # CONFIG_EISA is not set + # CONFIG_MCA is not set +--- linux-2.4.20/arch/s390/kernel/entry.S~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:11.000000000 +0800 ++++ linux-2.4.20-root/arch/s390/kernel/entry.S 2003-05-07 18:08:03.000000000 +0800 +@@ -558,18 +558,18 @@ sys_call_table: + .long sys_fcntl64 + .long sys_ni_syscall + .long sys_ni_syscall +- .long sys_ni_syscall /* 224 - reserved for setxattr */ +- .long sys_ni_syscall /* 225 - reserved for lsetxattr */ +- .long sys_ni_syscall /* 226 - reserved for fsetxattr */ +- .long sys_ni_syscall /* 227 - reserved for getxattr */ +- .long sys_ni_syscall /* 228 - reserved for lgetxattr */ +- .long sys_ni_syscall /* 229 - reserved for fgetxattr */ +- .long sys_ni_syscall /* 230 - reserved for listxattr */ +- .long sys_ni_syscall /* 231 - reserved for llistxattr */ +- .long sys_ni_syscall /* 232 - reserved for flistxattr */ +- .long sys_ni_syscall /* 233 - reserved for removexattr */ +- .long sys_ni_syscall /* 234 - reserved for lremovexattr */ +- .long sys_ni_syscall /* 235 - reserved for fremovexattr */ ++ .long sys_setxattr ++ .long sys_lsetxattr /* 225 */ ++ .long sys_fsetxattr ++ .long sys_getxattr ++ .long sys_lgetxattr ++ .long sys_fgetxattr ++ .long sys_listxattr /* 230 */ ++ .long sys_llistxattr ++ .long sys_flistxattr ++ .long sys_removexattr ++ .long sys_lremovexattr ++ .long sys_fremovexattr /* 235 */ + .long sys_gettid + .long sys_tkill + .rept 255-237 +--- 
linux-2.4.20/arch/s390x/defconfig~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:11.000000000 +0800 ++++ linux-2.4.20-root/arch/s390x/defconfig 2003-05-07 18:08:03.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + # CONFIG_ISA is not set + # CONFIG_EISA is not set + # CONFIG_MCA is not set +--- linux-2.4.20/arch/s390x/kernel/entry.S~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:11.000000000 +0800 ++++ linux-2.4.20-root/arch/s390x/kernel/entry.S 2003-05-07 18:08:03.000000000 +0800 +@@ -591,18 +591,18 @@ sys_call_table: + .long SYSCALL(sys_ni_syscall,sys32_fcntl64_wrapper) + .long SYSCALL(sys_ni_syscall,sys_ni_syscall) + .long SYSCALL(sys_ni_syscall,sys_ni_syscall) +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 224 - reserved for setxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 225 - reserved for lsetxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 226 - reserved for fsetxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 227 - reserved for getxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 228 - reserved for lgetxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 229 - reserved for fgetxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 230 - reserved for listxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 231 - reserved for llistxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 232 - reserved for flistxattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 233 - reserved for removexattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 234 - reserved for lremovexattr */ +- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 235 - reserved for fremovexattr */ ++ .long 
SYSCALL(sys_setxattr,sys32_setxattr_wrapper) ++ .long SYSCALL(sys_lsetxattr,sys32_lsetxattr_wrapper) /* 225 */ ++ .long SYSCALL(sys_fsetxattr,sys32_fsetxattr_wrapper) ++ .long SYSCALL(sys_getxattr,sys32_getxattr_wrapper) ++ .long SYSCALL(sys_lgetxattr,sys32_lgetxattr_wrapper) ++ .long SYSCALL(sys_fgetxattr,sys32_fgetxattr_wrapper) ++ .long SYSCALL(sys_listxattr,sys32_listxattr_wrapper) /* 230 */ ++ .long SYSCALL(sys_llistxattr,sys32_llistxattr_wrapper) ++ .long SYSCALL(sys_flistxattr,sys32_flistxattr_wrapper) ++ .long SYSCALL(sys_removexattr,sys32_removexattr_wrapper) ++ .long SYSCALL(sys_lremovexattr,sys32_lremovexattr_wrapper) ++ .long SYSCALL(sys_fremovexattr,sys32_fremovexattr_wrapper)/* 235 */ + .long SYSCALL(sys_gettid,sys_gettid) + .long SYSCALL(sys_tkill,sys_tkill) + .rept 255-237 +--- linux-2.4.20/arch/s390x/kernel/wrapper32.S~linux-2.4.20-xattr-0.8.54 2002-02-26 03:37:56.000000000 +0800 ++++ linux-2.4.20-root/arch/s390x/kernel/wrapper32.S 2003-05-07 18:08:03.000000000 +0800 +@@ -1091,3 +1091,95 @@ sys32_fstat64_wrapper: + llgtr %r3,%r3 # struct stat64 * + llgfr %r4,%r4 # long + jg sys32_fstat64 # branch to system call ++ ++ .globl sys32_setxattr_wrapper ++sys32_setxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ lgfr %r6,%r6 # int ++ jg sys_setxattr ++ ++ .globl sys32_lsetxattr_wrapper ++sys32_lsetxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ lgfr %r6,%r6 # int ++ jg sys_lsetxattr ++ ++ .globl sys32_fsetxattr_wrapper ++sys32_fsetxattr_wrapper: ++ lgfr %r2,%r2 # int ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ lgfr %r6,%r6 # int ++ jg sys_fsetxattr ++ ++ .globl sys32_getxattr_wrapper ++sys32_getxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ jg sys_getxattr ++ ++ .globl sys32_lgetxattr_wrapper 
++sys32_lgetxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ jg sys_lgetxattr ++ ++ .globl sys32_fgetxattr_wrapper ++sys32_fgetxattr_wrapper: ++ lgfr %r2,%r2 # int ++ llgtr %r3,%r3 # char * ++ llgtr %r4,%r4 # void * ++ llgfr %r5,%r5 # size_t ++ jg sys_fgetxattr ++ ++ .globl sys32_listxattr_wrapper ++sys32_listxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgfr %r4,%r4 # size_t ++ jg sys_listxattr ++ ++ .globl sys32_llistxattr_wrapper ++sys32_llistxattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ llgfr %r4,%r4 # size_t ++ jg sys_llistxattr ++ ++ .globl sys32_flistxattr_wrapper ++sys32_flistxattr_wrapper: ++ lgfr %r2,%r2 # int ++ llgtr %r3,%r3 # char * ++ llgfr %r4,%r4 # size_t ++ jg sys_flistxattr ++ ++ .globl sys32_removexattr_wrapper ++sys32_removexattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ jg sys_removexattr ++ ++ .globl sys32_lremovexattr_wrapper ++sys32_lremovexattr_wrapper: ++ llgtr %r2,%r2 # char * ++ llgtr %r3,%r3 # char * ++ jg sys_lremovexattr ++ ++ .globl sys32_fremovexattr_wrapper ++sys32_fremovexattr_wrapper: ++ lgfr %r2,%r2 # int ++ llgtr %r3,%r3 # char * ++ jg sys_fremovexattr ++ ++ +--- linux-2.4.20/arch/sparc/defconfig~linux-2.4.20-xattr-0.8.54 2002-08-03 08:39:43.000000000 +0800 ++++ linux-2.4.20-root/arch/sparc/defconfig 2003-05-07 18:08:03.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + CONFIG_UID16=y + CONFIG_HIGHMEM=y + +--- linux-2.4.20/arch/sparc/kernel/systbls.S~linux-2.4.20-xattr-0.8.54 2002-08-03 08:39:43.000000000 +0800 ++++ linux-2.4.20-root/arch/sparc/kernel/systbls.S 2003-05-07 
18:08:03.000000000 +0800 +@@ -51,11 +51,11 @@ sys_call_table: + /*150*/ .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 + /*155*/ .long sys_fcntl64, sys_nis_syscall, sys_statfs, sys_fstatfs, sys_oldumount + /*160*/ .long sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall +-/*165*/ .long sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall +-/*170*/ .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getdents +-/*175*/ .long sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall +-/*180*/ .long sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_sigpending, sys_query_module +-/*185*/ .long sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sys_newuname ++/*165*/ .long sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_setxattr ++/*170*/ .long sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys_getdents ++/*175*/ .long sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr ++/*180*/ .long sys_flistxattr, sys_removexattr, sys_lremovexattr, sys_sigpending, sys_query_module ++/*185*/ .long sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sys_newuname + /*190*/ .long sys_init_module, sys_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall + /*195*/ .long sys_nis_syscall, sys_nis_syscall, sys_getppid, sparc_sigaction, sys_sgetmask + /*200*/ .long sys_ssetmask, sys_sigsuspend, sys_newlstat, sys_uselib, old_readdir +--- linux-2.4.20/arch/sparc64/defconfig~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:12.000000000 +0800 ++++ linux-2.4.20-root/arch/sparc64/defconfig 2003-05-07 18:08:03.000000000 +0800 +@@ -1,6 +1,13 @@ + # + # Automatically generated make config: don't edit + # ++# CONFIG_EXT3_FS_XATTR is not set ++# CONFIG_EXT3_FS_XATTR_SHARING is not set ++# CONFIG_EXT3_FS_XATTR_USER is not set ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT2_FS_XATTR_SHARING is not set ++# 
CONFIG_EXT2_FS_XATTR_USER is not set ++# CONFIG_FS_MBCACHE is not set + + # + # Code maturity level options +--- linux-2.4.20/arch/sparc64/kernel/systbls.S~linux-2.4.20-xattr-0.8.54 2002-08-03 08:39:43.000000000 +0800 ++++ linux-2.4.20-root/arch/sparc64/kernel/systbls.S 2003-05-07 18:08:03.000000000 +0800 +@@ -52,11 +52,11 @@ sys_call_table32: + /*150*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 + .word sys32_fcntl64, sys_nis_syscall, sys32_statfs, sys32_fstatfs, sys_oldumount + /*160*/ .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_nis_syscall +- .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_nis_syscall +-/*170*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_getdents +- .word sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall +-/*180*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys32_sigpending, sys32_query_module +- .word sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sparc64_newuname ++ .word sys32_quotactl, sys_nis_syscall, sys32_mount, sys_ustat, sys_setxattr ++/*170*/ .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys32_getdents ++ .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr ++/*180*/ .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys32_sigpending, sys32_query_module ++ .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sparc64_newuname + /*190*/ .word sys32_init_module, sparc64_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall + .word sys_nis_syscall, sys_nis_syscall, sys_getppid, sys32_sigaction, sys_sgetmask + /*200*/ .word sys_ssetmask, sys_sigsuspend, sys32_newlstat, sys_uselib, old32_readdir +@@ -111,11 +111,11 @@ sys_call_table: + /*150*/ .word sys_getsockname, sys_nis_syscall, sys_nis_syscall, sys_poll, sys_getdents64 + .word sys_nis_syscall, sys_nis_syscall, sys_statfs, sys_fstatfs, 
sys_oldumount + /*160*/ .word sys_nis_syscall, sys_nis_syscall, sys_getdomainname, sys_setdomainname, sys_utrap_install +- .word sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_nis_syscall +-/*170*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_getdents +- .word sys_setsid, sys_fchdir, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall +-/*180*/ .word sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall, sys_query_module +- .word sys_setpgid, sys_nis_syscall, sys_tkill, sys_nis_syscall, sparc64_newuname ++ .word sys_quotactl, sys_nis_syscall, sys_mount, sys_ustat, sys_setxattr ++/*170*/ .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys_getdents ++ .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr ++/*180*/ .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys_nis_syscall, sys_query_module ++ .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_nis_syscall, sparc64_newuname + /*190*/ .word sys_init_module, sparc64_personality, sys_nis_syscall, sys_nis_syscall, sys_nis_syscall + .word sys_nis_syscall, sys_nis_syscall, sys_getppid, sys_nis_syscall, sys_sgetmask + /*200*/ .word sys_ssetmask, sys_nis_syscall, sys_newlstat, sys_uselib, sys_nis_syscall +--- linux-2.4.20/fs/Config.in~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:15.000000000 +0800 ++++ linux-2.4.20-root/fs/Config.in 2003-05-07 18:08:03.000000000 +0800 +@@ -25,6 +25,11 @@ dep_mbool ' Debug Befs' CONFIG_BEFS_DEB + dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL + + tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS ++dep_mbool ' Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS ++dep_bool ' Ext3 extended attribute block sharing' \ ++ CONFIG_EXT3_FS_XATTR_SHARING $CONFIG_EXT3_FS_XATTR ++dep_bool ' Ext3 extended user attributes' \ ++ CONFIG_EXT3_FS_XATTR_USER $CONFIG_EXT3_FS_XATTR + # CONFIG_JBD could be its own option (even modular), but until 
there are + # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS + # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS +@@ -84,6 +89,11 @@ dep_mbool ' QNX4FS write support (DANGE + tristate 'ROM file system support' CONFIG_ROMFS_FS + + tristate 'Second extended fs support' CONFIG_EXT2_FS ++dep_mbool ' Ext2 extended attributes' CONFIG_EXT2_FS_XATTR $CONFIG_EXT2_FS ++dep_bool ' Ext2 extended attribute block sharing' \ ++ CONFIG_EXT2_FS_XATTR_SHARING $CONFIG_EXT2_FS_XATTR ++dep_bool ' Ext2 extended user attributes' \ ++ CONFIG_EXT2_FS_XATTR_USER $CONFIG_EXT2_FS_XATTR + + tristate 'System V/Xenix/V7/Coherent file system support' CONFIG_SYSV_FS + +@@ -155,6 +165,10 @@ else + define_tristate CONFIG_ZISOFS_FS n + fi + ++# Meta block cache for Extended Attributes (ext2/ext3) ++#tristate 'Meta block cache' CONFIG_FS_MBCACHE ++define_tristate CONFIG_FS_MBCACHE y ++ + mainmenu_option next_comment + comment 'Partition Types' + source fs/partitions/Config.in +--- linux-2.4.20/fs/Makefile~linux-2.4.20-xattr-0.8.54 2003-05-05 19:00:58.000000000 +0800 ++++ linux-2.4.20-root/fs/Makefile 2003-05-07 18:08:03.000000000 +0800 +@@ -79,6 +79,9 @@ obj-y += binfmt_script.o + + obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o + ++export-objs += mbcache.o ++obj-$(CONFIG_FS_MBCACHE) += mbcache.o ++ + # persistent filesystems + obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) + +--- linux-2.4.20/fs/ext2/Makefile~linux-2.4.20-xattr-0.8.54 2001-10-11 23:05:18.000000000 +0800 ++++ linux-2.4.20-root/fs/ext2/Makefile 2003-05-07 18:08:03.000000000 +0800 +@@ -13,4 +13,8 @@ obj-y := balloc.o bitmap.o dir.o file + ioctl.o namei.o super.o symlink.o + obj-m := $(O_TARGET) + ++export-objs += xattr.o ++obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o ++obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o ++ + include $(TOPDIR)/Rules.make +--- linux-2.4.20/fs/ext2/file.c~linux-2.4.20-xattr-0.8.54 2001-10-11 23:05:18.000000000 +0800 ++++ linux-2.4.20-root/fs/ext2/file.c 
2003-05-07 18:08:03.000000000 +0800 +@@ -20,6 +20,7 @@ + + #include + #include ++#include + #include + + /* +@@ -51,4 +52,8 @@ struct file_operations ext2_file_operati + + struct inode_operations ext2_file_inode_operations = { + truncate: ext2_truncate, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, + }; +--- linux-2.4.20/fs/ext2/ialloc.c~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:15.000000000 +0800 ++++ linux-2.4.20-root/fs/ext2/ialloc.c 2003-05-07 18:08:03.000000000 +0800 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -167,6 +168,7 @@ void ext2_free_inode (struct inode * ino + */ + if (!is_bad_inode(inode)) { + /* Quota is already initialized in iput() */ ++ ext2_xattr_delete_inode(inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + } +--- linux-2.4.20/fs/ext2/inode.c~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:15.000000000 +0800 ++++ linux-2.4.20-root/fs/ext2/inode.c 2003-05-07 18:08:03.000000000 +0800 +@@ -39,6 +39,18 @@ MODULE_LICENSE("GPL"); + static int ext2_update_inode(struct inode * inode, int do_sync); + + /* ++ * Test whether an inode is a fast symlink. ++ */ ++static inline int ext2_inode_is_fast_symlink(struct inode *inode) ++{ ++ int ea_blocks = inode->u.ext2_i.i_file_acl ? 
++ (inode->i_sb->s_blocksize >> 9) : 0; ++ ++ return (S_ISLNK(inode->i_mode) && ++ inode->i_blocks - ea_blocks == 0); ++} ++ ++/* + * Called at each iput() + */ + void ext2_put_inode (struct inode * inode) +@@ -53,9 +65,7 @@ void ext2_delete_inode (struct inode * i + { + lock_kernel(); + +- if (is_bad_inode(inode) || +- inode->i_ino == EXT2_ACL_IDX_INO || +- inode->i_ino == EXT2_ACL_DATA_INO) ++ if (is_bad_inode(inode)) + goto no_delete; + inode->u.ext2_i.i_dtime = CURRENT_TIME; + mark_inode_dirty(inode); +@@ -801,6 +811,8 @@ void ext2_truncate (struct inode * inode + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; ++ if (ext2_inode_is_fast_symlink(inode)) ++ return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +@@ -888,8 +900,7 @@ void ext2_read_inode (struct inode * ino + unsigned long offset; + struct ext2_group_desc * gdp; + +- if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO && +- inode->i_ino != EXT2_ACL_DATA_INO && ++ if ((inode->i_ino != EXT2_ROOT_INO && + inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) { + ext2_error (inode->i_sb, "ext2_read_inode", +@@ -974,10 +985,7 @@ void ext2_read_inode (struct inode * ino + for (block = 0; block < EXT2_N_BLOCKS; block++) + inode->u.ext2_i.i_data[block] = raw_inode->i_block[block]; + +- if (inode->i_ino == EXT2_ACL_IDX_INO || +- inode->i_ino == EXT2_ACL_DATA_INO) +- /* Nothing to do */ ; +- else if (S_ISREG(inode->i_mode)) { ++ if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext2_file_inode_operations; + inode->i_fop = &ext2_file_operations; + inode->i_mapping->a_ops = &ext2_aops; +@@ -986,15 +994,17 @@ void ext2_read_inode (struct inode * ino + inode->i_fop = &ext2_dir_operations; + inode->i_mapping->a_ops = &ext2_aops; + } else if (S_ISLNK(inode->i_mode)) { +- if (!inode->i_blocks) ++ if (ext2_inode_is_fast_symlink(inode)) + inode->i_op = 
&ext2_fast_symlink_inode_operations; + else { +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext2_symlink_inode_operations; + inode->i_mapping->a_ops = &ext2_aops; + } +- } else ++ } else { ++ inode->i_op = &ext2_special_inode_operations; + init_special_inode(inode, inode->i_mode, + le32_to_cpu(raw_inode->i_block[0])); ++ } + brelse (bh); + inode->i_attr_flags = 0; + if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) { +--- linux-2.4.20/fs/ext2/namei.c~linux-2.4.20-xattr-0.8.54 2001-10-04 13:57:36.000000000 +0800 ++++ linux-2.4.20-root/fs/ext2/namei.c 2003-05-07 18:08:03.000000000 +0800 +@@ -31,6 +31,7 @@ + + #include + #include ++#include + #include + + /* +@@ -136,7 +137,7 @@ static int ext2_symlink (struct inode * + + if (l > sizeof (inode->u.ext2_i.i_data)) { + /* slow symlink */ +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext2_symlink_inode_operations; + inode->i_mapping->a_ops = &ext2_aops; + err = block_symlink(inode, symname, l); + if (err) +@@ -345,4 +346,15 @@ struct inode_operations ext2_dir_inode_o + rmdir: ext2_rmdir, + mknod: ext2_mknod, + rename: ext2_rename, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, ++}; ++ ++struct inode_operations ext2_special_inode_operations = { ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, + }; +--- linux-2.4.20/fs/ext2/super.c~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:15.000000000 +0800 ++++ linux-2.4.20-root/fs/ext2/super.c 2003-05-07 18:08:03.000000000 +0800 +@@ -21,6 +21,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -125,6 +126,7 @@ void ext2_put_super (struct super_block + int db_count; + int i; + ++ ext2_xattr_put_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { + struct ext2_super_block *es = EXT2_SB(sb)->s_es; + +@@ -175,6 +177,13 @@ static int parse_options (char * options + this_char = strtok 
(NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; ++#ifdef CONFIG_EXT2_FS_XATTR_USER ++ if (!strcmp (this_char, "user_xattr")) ++ set_opt (*mount_options, XATTR_USER); ++ else if (!strcmp (this_char, "nouser_xattr")) ++ clear_opt (*mount_options, XATTR_USER); ++ else ++#endif + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -424,6 +433,9 @@ struct super_block * ext2_read_super (st + blocksize = BLOCK_SIZE; + + sb->u.ext2_sb.s_mount_opt = 0; ++#ifdef CONFIG_EXT2_FS_XATTR_USER ++ /* set_opt (sb->u.ext2_sb.s_mount_opt, XATTR_USER); */ ++#endif + if (!parse_options ((char *) data, &sb_block, &resuid, &resgid, + &sb->u.ext2_sb.s_mount_opt)) { + return NULL; +@@ -813,12 +825,27 @@ static DECLARE_FSTYPE_DEV(ext2_fs_type, + + static int __init init_ext2_fs(void) + { +- return register_filesystem(&ext2_fs_type); ++ int error = init_ext2_xattr(); ++ if (error) ++ return error; ++ error = init_ext2_xattr_user(); ++ if (error) ++ goto fail; ++ error = register_filesystem(&ext2_fs_type); ++ if (!error) ++ return 0; ++ ++ exit_ext2_xattr_user(); ++fail: ++ exit_ext2_xattr(); ++ return error; + } + + static void __exit exit_ext2_fs(void) + { + unregister_filesystem(&ext2_fs_type); ++ exit_ext2_xattr_user(); ++ exit_ext2_xattr(); + } + + EXPORT_NO_SYMBOLS; +--- linux-2.4.20/fs/ext2/symlink.c~linux-2.4.20-xattr-0.8.54 2000-09-28 04:41:33.000000000 +0800 ++++ linux-2.4.20-root/fs/ext2/symlink.c 2003-05-07 18:08:03.000000000 +0800 +@@ -19,6 +19,7 @@ + + #include + #include ++#include + + static int ext2_readlink(struct dentry *dentry, char *buffer, int buflen) + { +@@ -32,7 +33,20 @@ static int ext2_follow_link(struct dentr + return vfs_follow_link(nd, s); + } + ++struct inode_operations ext2_symlink_inode_operations = { ++ readlink: page_readlink, ++ follow_link: page_follow_link, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ 
removexattr: ext2_removexattr, ++}; ++ + struct inode_operations ext2_fast_symlink_inode_operations = { + readlink: ext2_readlink, + follow_link: ext2_follow_link, ++ setxattr: ext2_setxattr, ++ getxattr: ext2_getxattr, ++ listxattr: ext2_listxattr, ++ removexattr: ext2_removexattr, + }; +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-2.4.20-root/fs/ext2/xattr.c 2003-05-07 18:08:03.000000000 +0800 +@@ -0,0 +1,1212 @@ ++/* ++ * linux/fs/ext2/xattr.c ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ * ++ * Fix by Harrison Xing . ++ * Extended attributes for symlinks and special files added per ++ * suggestion of Luka Renko . ++ */ ++ ++/* ++ * Extended attributes are stored on disk blocks allocated outside of ++ * any inode. The i_file_acl field is then made to point to this allocated ++ * block. If all extended attributes of an inode are identical, these ++ * inodes may share the same extended attribute block. Such situations ++ * are automatically detected by keeping a cache of recent attribute block ++ * numbers and hashes over the block's contents in memory. ++ * ++ * ++ * Extended attribute block layout: ++ * ++ * +------------------+ ++ * | header | ++ * | entry 1 | | ++ * | entry 2 | | growing downwards ++ * | entry 3 | v ++ * | four null bytes | ++ * | . . . | ++ * | value 1 | ^ ++ * | value 3 | | growing upwards ++ * | value 2 | | ++ * +------------------+ ++ * ++ * The block header is followed by multiple entry descriptors. These entry ++ * descriptors are variable in size, and aligned to EXT2_XATTR_PAD ++ * byte boundaries. The entry descriptors are sorted by attribute name, ++ * so that two extended attribute blocks can be compared efficiently. ++ * ++ * Attribute values are aligned to the end of the block, stored in ++ * no specific order. They are also padded to EXT2_XATTR_PAD byte ++ * boundaries. No additional gaps are left between them. 
++ * ++ * Locking strategy ++ * ---------------- ++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of ++ * the xattr inode operations are called, so we are guaranteed that only one ++ * process accesses extended attributes of an inode at any time. ++ * ++ * For writing we also grab the ext2_xattr_sem semaphore. This ensures that ++ * only a single process is modifying an extended attribute block, even ++ * if the block is shared among inodes. ++ * ++ * Note for porting to 2.5 ++ * ----------------------- ++ * The BKL will no longer be held in the xattr inode operations. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* These symbols may be needed by a module. */ ++EXPORT_SYMBOL(ext2_xattr_register); ++EXPORT_SYMBOL(ext2_xattr_unregister); ++EXPORT_SYMBOL(ext2_xattr_get); ++EXPORT_SYMBOL(ext2_xattr_list); ++EXPORT_SYMBOL(ext2_xattr_set); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1) ++#endif ++ ++#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data)) ++#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr)) ++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) ++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) ++ ++#ifdef EXT2_XATTR_DEBUG ++# define ea_idebug(inode, f...) do { \ ++ printk(KERN_DEBUG "inode %s:%ld: ", \ ++ kdevname(inode->i_dev), inode->i_ino); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++# define ea_bdebug(bh, f...) do { \ ++ printk(KERN_DEBUG "block %s:%ld: ", \ ++ kdevname(bh->b_dev), bh->b_blocknr); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++#else ++# define ea_idebug(f...) ++# define ea_bdebug(f...) 
++#endif ++ ++static int ext2_xattr_set2(struct inode *, struct buffer_head *, ++ struct ext2_xattr_header *); ++ ++#ifdef CONFIG_EXT2_FS_XATTR_SHARING ++ ++static int ext2_xattr_cache_insert(struct buffer_head *); ++static struct buffer_head *ext2_xattr_cache_find(struct inode *, ++ struct ext2_xattr_header *); ++static void ext2_xattr_cache_remove(struct buffer_head *); ++static void ext2_xattr_rehash(struct ext2_xattr_header *, ++ struct ext2_xattr_entry *); ++ ++static struct mb_cache *ext2_xattr_cache; ++ ++#else ++# define ext2_xattr_cache_insert(bh) 0 ++# define ext2_xattr_cache_find(inode, header) NULL ++# define ext2_xattr_cache_remove(bh) while(0) {} ++# define ext2_xattr_rehash(header, entry) while(0) {} ++#endif ++ ++/* ++ * If a file system does not share extended attributes among inodes, ++ * we should not need the ext2_xattr_sem semaphore. However, the ++ * filesystem may still contain shared blocks, so we always take ++ * the lock. ++ */ ++ ++DECLARE_MUTEX(ext2_xattr_sem); ++ ++static inline int ++ext2_xattr_new_block(struct inode *inode, int * errp, int force) ++{ ++ struct super_block *sb = inode->i_sb; ++ int goal = le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block) + ++ EXT2_I(inode)->i_block_group * EXT2_BLOCKS_PER_GROUP(sb); ++ ++ /* How can we enforce the allocation? */ ++ int block = ext2_new_block(inode, goal, 0, 0, errp); ++#ifdef OLD_QUOTAS ++ if (!*errp) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#endif ++ return block; ++} ++ ++static inline int ++ext2_xattr_quota_alloc(struct inode *inode, int force) ++{ ++ /* How can we enforce the allocation? 
*/ ++#ifdef OLD_QUOTAS ++ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1); ++ if (!error) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#else ++ int error = DQUOT_ALLOC_BLOCK(inode, 1); ++#endif ++ return error; ++} ++ ++#ifdef OLD_QUOTAS ++ ++static inline void ++ext2_xattr_quota_free(struct inode *inode) ++{ ++ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1); ++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; ++} ++ ++static inline void ++ext2_xattr_free_block(struct inode * inode, unsigned long block) ++{ ++ ext2_free_blocks(inode, block, 1); ++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; ++} ++ ++#else ++# define ext2_xattr_quota_free(inode) \ ++ DQUOT_FREE_BLOCK(inode, 1) ++# define ext2_xattr_free_block(inode, block) \ ++ ext2_free_blocks(inode, block, 1) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) ++ ++static inline struct buffer_head * ++sb_bread(struct super_block *sb, int block) ++{ ++ return bread(sb->s_dev, block, sb->s_blocksize); ++} ++ ++static inline struct buffer_head * ++sb_getblk(struct super_block *sb, int block) ++{ ++ return getblk(sb->s_dev, block, sb->s_blocksize); ++} ++ ++#endif ++ ++struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX]; ++rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED; ++ ++int ++ext2_xattr_register(int name_index, struct ext2_xattr_handler *handler) ++{ ++ int error = -EINVAL; ++ ++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { ++ write_lock(&ext2_handler_lock); ++ if (!ext2_xattr_handlers[name_index-1]) { ++ ext2_xattr_handlers[name_index-1] = handler; ++ error = 0; ++ } ++ write_unlock(&ext2_handler_lock); ++ } ++ return error; ++} ++ ++void ++ext2_xattr_unregister(int name_index, struct ext2_xattr_handler *handler) ++{ ++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { /* both bounds must hold: the slot is name_index-1 */ ++ write_lock(&ext2_handler_lock); ++ ext2_xattr_handlers[name_index-1] = NULL; ++ write_unlock(&ext2_handler_lock); ++ } ++} ++ ++static inline const char * ++strcmp_prefix(const 
char *a, const char *a_prefix) ++{ ++ while (*a_prefix && *a == *a_prefix) { ++ a++; ++ a_prefix++; ++ } ++ return *a_prefix ? NULL : a; /* NULL unless all of a_prefix matched; otherwise the remainder of a */ ++} ++ ++/* ++ * Decode the extended attribute name, and translate it into ++ * the name_index and name suffix. ++ */ ++static struct ext2_xattr_handler * ++ext2_xattr_resolve_name(const char **name) ++{ ++ struct ext2_xattr_handler *handler = NULL; ++ int i; ++ ++ if (!*name) ++ return NULL; ++ read_lock(&ext2_handler_lock); ++ for (i=0; i<EXT2_XATTR_INDEX_MAX; i++) { ++ if (ext2_xattr_handlers[i]) { ++ const char *n = strcmp_prefix(*name, ++ ext2_xattr_handlers[i]->prefix); ++ if (n) { ++ handler = ext2_xattr_handlers[i]; ++ *name = n; /* caller now sees the name with the handler prefix stripped */ ++ break; ++ } ++ } ++ } ++ read_unlock(&ext2_handler_lock); ++ return handler; ++} ++ ++static inline struct ext2_xattr_handler * ++ext2_xattr_handler(int name_index) ++{ ++ struct ext2_xattr_handler *handler = NULL; ++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) { ++ read_lock(&ext2_handler_lock); ++ handler = ext2_xattr_handlers[name_index-1]; ++ read_unlock(&ext2_handler_lock); ++ } ++ return handler; ++} ++ ++/* ++ * Inode operation getxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++ssize_t ++ext2_getxattr(struct dentry *dentry, const char *name, ++ void *buffer, size_t size) ++{ ++ struct ext2_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext2_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->get(inode, name, buffer, size); ++} ++ ++/* ++ * Inode operation listxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++ssize_t ++ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) ++{ ++ return ext2_xattr_list(dentry->d_inode, buffer, size); ++} ++ ++/* ++ * Inode operation setxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext2_setxattr(struct dentry *dentry, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ struct ext2_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ if (size == 0) 
++ value = ""; /* empty EA, do not remove */ ++ handler = ext2_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, value, size, flags); ++} ++ ++/* ++ * Inode operation removexattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext2_removexattr(struct dentry *dentry, const char *name) ++{ ++ struct ext2_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext2_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, NULL, 0, XATTR_REPLACE); ++} ++ ++/* ++ * ext2_xattr_get() ++ * ++ * Copy an extended attribute into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. ++ */ ++int ++ext2_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext2_xattr_entry *entry; ++ unsigned int block, size; ++ char *end; ++ int name_len, error; ++ ++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", ++ name_index, name, buffer, (long)buffer_size); ++ ++ if (name == NULL) ++ return -EINVAL; ++ if (!EXT2_I(inode)->i_file_acl) ++ return -ENOATTR; ++ block = EXT2_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext2_error(inode->i_sb, "ext2_xattr_get", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* find named attribute */ ++ name_len = strlen(name); ++ ++ 
error = -ERANGE; ++ if (name_len > 255) ++ goto cleanup; ++ entry = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext2_xattr_entry *next = ++ EXT2_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (name_index == entry->e_name_index && ++ name_len == entry->e_name_len && ++ memcmp(name, entry->e_name, name_len) == 0) ++ goto found; ++ entry = next; ++ } ++ /* Check the remaining name entries */ ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext2_xattr_entry *next = ++ EXT2_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ entry = next; ++ } ++ if (ext2_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ error = -ENOATTR; ++ goto cleanup; ++found: ++ /* check the buffer size */ ++ if (entry->e_value_block != 0) ++ goto bad_block; ++ size = le32_to_cpu(entry->e_value_size); ++ if (size > inode->i_sb->s_blocksize || ++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) ++ goto bad_block; ++ ++ if (ext2_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (buffer) { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ /* return value of attribute */ ++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), ++ size); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * ext2_xattr_list() ++ * ++ * Copy a list of attribute names into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. 
++ */ ++int ++ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext2_xattr_entry *entry; ++ unsigned int block, size = 0; ++ char *buf, *end; ++ int error; ++ ++ ea_idebug(inode, "buffer=%p, buffer_size=%ld", ++ buffer, (long)buffer_size); ++ ++ if (!EXT2_I(inode)->i_file_acl) ++ return 0; ++ block = EXT2_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* compute the size required for the list of attribute names */ ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT2_XATTR_NEXT(entry)) { ++ struct ext2_xattr_handler *handler; ++ struct ext2_xattr_entry *next = ++ EXT2_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ ++ handler = ext2_xattr_handler(entry->e_name_index); ++ if (handler) ++ size += handler->list(NULL, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ ++ if (ext2_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (!buffer) { ++ error = size; ++ goto cleanup; ++ } else { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ } ++ ++ /* list the attribute names */ ++ buf = buffer; ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT2_XATTR_NEXT(entry)) { ++ struct ext2_xattr_handler *handler; ++ ++ handler = ext2_xattr_handler(entry->e_name_index); ++ if (handler) ++ buf += handler->list(buf, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* 
++ * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is ++ * not set, set it. ++ */ ++static void ext2_xattr_update_super_block(struct super_block *sb) ++{ ++ if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR)) ++ return; ++ ++ lock_super(sb); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++ EXT2_SB(sb)->s_feature_compat |= EXT2_FEATURE_COMPAT_EXT_ATTR; ++#endif ++ EXT2_SB(sb)->s_es->s_feature_compat |= ++ cpu_to_le32(EXT2_FEATURE_COMPAT_EXT_ATTR); ++ sb->s_dirt = 1; ++ mark_buffer_dirty(EXT2_SB(sb)->s_sbh); ++ unlock_super(sb); ++} ++ ++/* ++ * ext2_xattr_set() ++ * ++ * Create, replace or remove an extended attribute for this inode. Buffer ++ * is NULL to remove an existing extended attribute, and non-NULL to ++ * either replace an existing extended attribute, or create a new extended ++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE ++ * specify that an extended attribute must exist and must not exist ++ * previous to the call, respectively. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++int ++ext2_xattr_set(struct inode *inode, int name_index, const char *name, ++ const void *value, size_t value_len, int flags) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *bh = NULL; ++ struct ext2_xattr_header *header = NULL; ++ struct ext2_xattr_entry *here, *last; ++ unsigned int name_len; ++ int block = EXT2_I(inode)->i_file_acl; ++ int min_offs = sb->s_blocksize, not_found = 1, free, error; ++ char *end; ++ ++ /* ++ * header -- Points either into bh, or to a temporarily ++ * allocated buffer. ++ * here -- The named entry found, or the place for inserting, within ++ * the block pointed to by header. ++ * last -- Points right after the last named entry within the block ++ * pointed to by header. ++ * min_offs -- The offset of the first value (values are aligned ++ * towards the end of the block). ++ * end -- Points right after the block pointed to by header. 
++ */ ++ ++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", ++ name_index, name, value, (long)value_len); ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) ++ return -EPERM; ++ if (value == NULL) ++ value_len = 0; ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255 || value_len > sb->s_blocksize) ++ return -ERANGE; ++ down(&ext2_xattr_sem); ++ ++ if (block) { ++ /* The inode already has an extended attribute block. */ ++ ++ bh = sb_bread(sb, block); ++ error = -EIO; ++ if (!bh) ++ goto cleanup; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), ++ le32_to_cpu(HDR(bh)->h_refcount)); ++ header = HDR(bh); ++ end = bh->b_data + bh->b_size; ++ if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ header->h_blocks != cpu_to_le32(1)) { ++bad_block: ext2_error(sb, "ext2_xattr_set", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* Find the named attribute. */ ++ here = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(here)) { ++ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!here->e_value_block && here->e_value_size) { ++ int offs = le16_to_cpu(here->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ not_found = name_index - here->e_name_index; ++ if (!not_found) ++ not_found = name_len - here->e_name_len; ++ if (!not_found) ++ not_found = memcmp(name, here->e_name,name_len); ++ if (not_found <= 0) ++ break; ++ here = next; ++ } ++ last = here; ++ /* We still need to compute min_offs and last. 
*/ ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!last->e_value_block && last->e_value_size) { ++ int offs = le16_to_cpu(last->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ last = next; ++ } ++ ++ /* Check whether we have enough space left. */ ++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); ++ } else { ++ /* We will use a new extended attribute block. */ ++ free = sb->s_blocksize - ++ sizeof(struct ext2_xattr_header) - sizeof(__u32); ++ here = last = NULL; /* avoid gcc uninitialized warning. */ ++ } ++ ++ if (not_found) { ++ /* Request to remove a nonexistent attribute? */ ++ error = -ENOATTR; ++ if (flags & XATTR_REPLACE) ++ goto cleanup; ++ error = 0; ++ if (value == NULL) ++ goto cleanup; ++ else ++ free -= EXT2_XATTR_LEN(name_len); ++ } else { ++ /* Request to create an existing attribute? */ ++ error = -EEXIST; ++ if (flags & XATTR_CREATE) ++ goto cleanup; ++ if (!here->e_value_block && here->e_value_size) { ++ unsigned int size = le32_to_cpu(here->e_value_size); ++ ++ if (le16_to_cpu(here->e_value_offs) + size > ++ sb->s_blocksize || size > sb->s_blocksize) ++ goto bad_block; ++ free += EXT2_XATTR_SIZE(size); ++ } ++ } ++ free -= EXT2_XATTR_SIZE(value_len); ++ error = -ENOSPC; ++ if (free < 0) ++ goto cleanup; ++ ++ /* Here we know that we can set the new attribute. 
*/ ++ ++ if (header) { ++ if (header->h_refcount == cpu_to_le32(1)) { ++ ea_bdebug(bh, "modifying in-place"); ++ ext2_xattr_cache_remove(bh); ++ } else { ++ int offset; ++ ++ ea_bdebug(bh, "cloning"); ++ header = kmalloc(bh->b_size, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memcpy(header, HDR(bh), bh->b_size); ++ header->h_refcount = cpu_to_le32(1); ++ offset = (char *)header - bh->b_data; ++ here = ENTRY((char *)here + offset); ++ last = ENTRY((char *)last + offset); ++ } ++ } else { ++ /* Allocate a buffer where we construct the new block. */ ++ header = kmalloc(sb->s_blocksize, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memset(header, 0, sb->s_blocksize); ++ end = (char *)header + sb->s_blocksize; ++ header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC); ++ header->h_blocks = header->h_refcount = cpu_to_le32(1); ++ last = here = ENTRY(header+1); ++ } ++ ++ if (not_found) { ++ /* Insert the new name. */ ++ int size = EXT2_XATTR_LEN(name_len); ++ int rest = (char *)last - (char *)here; ++ memmove((char *)here + size, here, rest); ++ memset(here, 0, size); ++ here->e_name_index = name_index; ++ here->e_name_len = name_len; ++ memcpy(here->e_name, name, name_len); ++ } else { ++ /* Remove the old value. */ ++ if (!here->e_value_block && here->e_value_size) { ++ char *first_val = (char *)header + min_offs; ++ int offs = le16_to_cpu(here->e_value_offs); ++ char *val = (char *)header + offs; ++ size_t size = EXT2_XATTR_SIZE( ++ le32_to_cpu(here->e_value_size)); ++ memmove(first_val + size, first_val, val - first_val); ++ memset(first_val, 0, size); ++ here->e_value_offs = 0; ++ min_offs += size; ++ ++ /* Adjust all value offsets. 
*/ ++ last = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(last)) { ++ int o = le16_to_cpu(last->e_value_offs); ++ if (!last->e_value_block && o < offs) ++ last->e_value_offs = ++ cpu_to_le16(o + size); ++ last = EXT2_XATTR_NEXT(last); ++ } ++ } ++ if (value == NULL) { ++ /* Remove this attribute. */ ++ if (EXT2_XATTR_NEXT(ENTRY(header+1)) == last) { ++ /* This block is now empty. */ ++ error = ext2_xattr_set2(inode, bh, NULL); ++ goto cleanup; ++ } else { ++ /* Remove the old name. */ ++ int size = EXT2_XATTR_LEN(name_len); ++ last = ENTRY((char *)last - size); ++ memmove(here, (char*)here + size, ++ (char*)last - (char*)here); ++ memset(last, 0, size); ++ } ++ } ++ } ++ ++ if (value != NULL) { ++ /* Insert the new value. */ ++ here->e_value_size = cpu_to_le32(value_len); ++ if (value_len) { ++ size_t size = EXT2_XATTR_SIZE(value_len); ++ char *val = (char *)header + min_offs - size; ++ here->e_value_offs = ++ cpu_to_le16((char *)val - (char *)header); ++ memset(val + size - EXT2_XATTR_PAD, 0, ++ EXT2_XATTR_PAD); /* Clear the pad bytes. */ ++ memcpy(val, value, value_len); ++ } ++ } ++ ext2_xattr_rehash(header, here); ++ ++ error = ext2_xattr_set2(inode, bh, header); ++ ++cleanup: ++ brelse(bh); ++ if (!(bh && header == HDR(bh))) ++ kfree(header); ++ up(&ext2_xattr_sem); ++ ++ return error; ++} ++ ++/* ++ * Second half of ext2_xattr_set(): Update the file system. ++ */ ++static int ++ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, ++ struct ext2_xattr_header *header) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *new_bh = NULL; ++ int error; ++ ++ if (header) { ++ new_bh = ext2_xattr_cache_find(inode, header); ++ if (new_bh) { ++ /* ++ * We found an identical block in the cache. ++ * The old block will be released after updating ++ * the inode. 
++ */ ++ ea_bdebug(old_bh, "reusing block %ld", ++ new_bh->b_blocknr); ++ ++ error = -EDQUOT; ++ if (ext2_xattr_quota_alloc(inode, 1)) ++ goto cleanup; ++ ++ HDR(new_bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); ++ ea_bdebug(new_bh, "refcount now=%d", ++ le32_to_cpu(HDR(new_bh)->h_refcount)); ++ } else if (old_bh && header == HDR(old_bh)) { ++ /* Keep this block. */ ++ new_bh = old_bh; ++ ext2_xattr_cache_insert(new_bh); ++ } else { ++ /* We need to allocate a new block */ ++ int force = EXT2_I(inode)->i_file_acl != 0; ++ int block = ext2_xattr_new_block(inode, &error, force); ++ if (error) ++ goto cleanup; ++ ea_idebug(inode, "creating block %d", block); ++ ++ new_bh = sb_getblk(sb, block); ++ if (!new_bh) { ++ ext2_xattr_free_block(inode, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(new_bh); ++ memcpy(new_bh->b_data, header, new_bh->b_size); ++ mark_buffer_uptodate(new_bh, 1); ++ unlock_buffer(new_bh); ++ ext2_xattr_cache_insert(new_bh); ++ ++ ext2_xattr_update_super_block(sb); ++ } ++ mark_buffer_dirty(new_bh); ++ if (IS_SYNC(inode)) { ++ ll_rw_block(WRITE, 1, &new_bh); ++ wait_on_buffer(new_bh); ++ error = -EIO; ++ if (buffer_req(new_bh) && !buffer_uptodate(new_bh)) ++ goto cleanup; ++ } ++ } ++ ++ /* Update the inode. */ ++ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; ++ inode->i_ctime = CURRENT_TIME; ++ if (IS_SYNC(inode)) { ++ error = ext2_sync_inode (inode); ++ if (error) ++ goto cleanup; ++ } else ++ mark_inode_dirty(inode); ++ ++ error = 0; ++ if (old_bh && old_bh != new_bh) { ++ /* ++ * If there was an old block, and we are not still using it, ++ * we now release the old block. ++ */ ++ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); ++ ++ if (refcount == 1) { ++ /* Free the old block. */ ++ ea_bdebug(old_bh, "freeing"); ++ ext2_xattr_free_block(inode, old_bh->b_blocknr); ++ mark_buffer_clean(old_bh); ++ } else { ++ /* Decrement the refcount only. 
*/ ++ refcount--; ++ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); ++ ext2_xattr_quota_free(inode); ++ mark_buffer_dirty(old_bh); ++ ea_bdebug(old_bh, "refcount now=%d", refcount); ++ } ++ } ++ ++cleanup: ++ if (old_bh != new_bh) ++ brelse(new_bh); ++ ++ return error; ++} ++ ++/* ++ * ext2_xattr_delete_inode() ++ * ++ * Free extended attribute resources associated with this inode. This ++ * is called immediately before an inode is freed. ++ */ ++void ++ext2_xattr_delete_inode(struct inode *inode) ++{ ++ struct buffer_head *bh; ++ unsigned int block = EXT2_I(inode)->i_file_acl; ++ ++ if (!block) ++ return; ++ down(&ext2_xattr_sem); ++ ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) { ++ ext2_error(inode->i_sb, "ext2_xattr_delete_inode", ++ "inode %ld: block %d read error", inode->i_ino, block); ++ goto cleanup; ++ } ++ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++ ext2_error(inode->i_sb, "ext2_xattr_delete_inode", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ goto cleanup; ++ } ++ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { ++ ext2_xattr_cache_remove(bh); ++ ext2_xattr_free_block(inode, block); ++ bforget(bh); ++ bh = NULL; ++ } else { ++ HDR(bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ mark_buffer_dirty(bh); ++ if (IS_SYNC(inode)) { ++ ll_rw_block(WRITE, 1, &bh); ++ wait_on_buffer(bh); ++ } ++ ext2_xattr_quota_free(inode); ++ } ++ EXT2_I(inode)->i_file_acl = 0; ++ ++cleanup: ++ brelse(bh); ++ up(&ext2_xattr_sem); ++} ++ ++/* ++ * ext2_xattr_put_super() ++ * ++ * This is called when a file system is unmounted. 
++ */ ++void ++ext2_xattr_put_super(struct super_block *sb) ++{ ++#ifdef CONFIG_EXT2_FS_XATTR_SHARING ++ mb_cache_shrink(ext2_xattr_cache, sb->s_dev); ++#endif ++} ++ ++#ifdef CONFIG_EXT2_FS_XATTR_SHARING ++ ++/* ++ * ext2_xattr_cache_insert() ++ * ++ * Create a new entry in the extended attribute cache, and insert ++ * it unless such an entry is already in the cache. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++static int ++ext2_xattr_cache_insert(struct buffer_head *bh) ++{ ++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); ++ struct mb_cache_entry *ce; ++ int error; ++ ++ ce = mb_cache_entry_alloc(ext2_xattr_cache); ++ if (!ce) ++ return -ENOMEM; ++ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash); ++ if (error) { ++ mb_cache_entry_free(ce); ++ if (error == -EBUSY) { ++ ea_bdebug(bh, "already in cache (%d cache entries)", ++ atomic_read(&ext2_xattr_cache->c_entry_count)); ++ error = 0; ++ } ++ } else { ++ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, ++ atomic_read(&ext2_xattr_cache->c_entry_count)); ++ mb_cache_entry_release(ce); ++ } ++ return error; ++} ++ ++/* ++ * ext2_xattr_cmp() ++ * ++ * Compare two extended attribute blocks for equality. ++ * ++ * Returns 0 if the blocks are equal, 1 if they differ, and ++ * a negative error number on errors. 
++ */ ++static int ++ext2_xattr_cmp(struct ext2_xattr_header *header1, ++ struct ext2_xattr_header *header2) ++{ ++ struct ext2_xattr_entry *entry1, *entry2; ++ ++ entry1 = ENTRY(header1+1); ++ entry2 = ENTRY(header2+1); ++ while (!IS_LAST_ENTRY(entry1)) { ++ if (IS_LAST_ENTRY(entry2)) ++ return 1; ++ if (entry1->e_hash != entry2->e_hash || entry1->e_name_index != entry2->e_name_index || /* e_hash covers only name and value, so the namespace index is compared explicitly */ ++ entry1->e_name_len != entry2->e_name_len || ++ entry1->e_value_size != entry2->e_value_size || ++ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) ++ return 1; ++ if (entry1->e_value_block != 0 || entry2->e_value_block != 0) ++ return -EIO; ++ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), ++ (char *)header2 + le16_to_cpu(entry2->e_value_offs), ++ le32_to_cpu(entry1->e_value_size))) ++ return 1; ++ ++ entry1 = EXT2_XATTR_NEXT(entry1); ++ entry2 = EXT2_XATTR_NEXT(entry2); ++ } ++ if (!IS_LAST_ENTRY(entry2)) ++ return 1; ++ return 0; ++} ++ ++/* ++ * ext2_xattr_cache_find() ++ * ++ * Find an identical extended attribute block. ++ * ++ * Returns a pointer to the block found, or NULL if such a block was ++ * not found or an error occurred. 
++ */ ++static struct buffer_head * ++ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) ++{ ++ __u32 hash = le32_to_cpu(header->h_hash); ++ struct mb_cache_entry *ce; ++ ++ if (!header->h_hash) ++ return NULL; /* never share */ ++ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ++ ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, inode->i_dev, hash); ++ while (ce) { ++ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block); ++ ++ if (!bh) { ++ ext2_error(inode->i_sb, "ext2_xattr_cache_find", ++ "inode %ld: block %ld read error", ++ inode->i_ino, ce->e_block); ++ } else if (le32_to_cpu(HDR(bh)->h_refcount) > ++ EXT2_XATTR_REFCOUNT_MAX) { ++ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block, ++ le32_to_cpu(HDR(bh)->h_refcount), ++ EXT2_XATTR_REFCOUNT_MAX); ++ } else if (!ext2_xattr_cmp(header, HDR(bh))) { ++ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); ++ mb_cache_entry_release(ce); ++ return bh; ++ } ++ brelse(bh); ++ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash); ++ } ++ return NULL; ++} ++ ++/* ++ * ext2_xattr_cache_remove() ++ * ++ * Remove the cache entry of a block from the cache. Called when a ++ * block becomes invalid. ++ */ ++static void ++ext2_xattr_cache_remove(struct buffer_head *bh) ++{ ++ struct mb_cache_entry *ce; ++ ++ ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_dev, bh->b_blocknr); ++ if (ce) { ++ ea_bdebug(bh, "removing (%d cache entries remaining)", ++ atomic_read(&ext2_xattr_cache->c_entry_count)-1); ++ mb_cache_entry_free(ce); ++ } else ++ ea_bdebug(bh, "no cache entry"); ++} ++ ++#define NAME_HASH_SHIFT 5 ++#define VALUE_HASH_SHIFT 16 ++ ++/* ++ * ext2_xattr_hash_entry() ++ * ++ * Compute the hash of an extended attribute. 
++ */ ++static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header, ++ struct ext2_xattr_entry *entry) ++{ ++ __u32 hash = 0; ++ char *name = entry->e_name; ++ int n; ++ ++ for (n=0; n < entry->e_name_len; n++) { ++ hash = (hash << NAME_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ ++ *name++; ++ } ++ ++ if (entry->e_value_block == 0 && entry->e_value_size != 0) { ++ __u32 *value = (__u32 *)((char *)header + ++ le16_to_cpu(entry->e_value_offs)); ++ for (n = (le32_to_cpu(entry->e_value_size) + ++ EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) { ++ hash = (hash << VALUE_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ ++ le32_to_cpu(*value++); ++ } ++ } ++ entry->e_hash = cpu_to_le32(hash); ++} ++ ++#undef NAME_HASH_SHIFT ++#undef VALUE_HASH_SHIFT ++ ++#define BLOCK_HASH_SHIFT 16 ++ ++/* ++ * ext2_xattr_rehash() ++ * ++ * Re-compute the extended attribute hash value after an entry has changed. ++ */ ++static void ext2_xattr_rehash(struct ext2_xattr_header *header, ++ struct ext2_xattr_entry *entry) ++{ ++ struct ext2_xattr_entry *here; ++ __u32 hash = 0; ++ ++ ext2_xattr_hash_entry(header, entry); ++ here = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(here)) { ++ if (!here->e_hash) { ++ /* Block is not shared if an entry's hash value == 0 */ ++ hash = 0; ++ break; ++ } ++ hash = (hash << BLOCK_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ ++ le32_to_cpu(here->e_hash); ++ here = EXT2_XATTR_NEXT(here); ++ } ++ header->h_hash = cpu_to_le32(hash); ++} ++ ++#undef BLOCK_HASH_SHIFT ++ ++int __init ++init_ext2_xattr(void) ++{ ++ ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL, ++ sizeof(struct mb_cache_entry) + ++ sizeof(struct mb_cache_entry_index), 1, 61); ++ if (!ext2_xattr_cache) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void ++exit_ext2_xattr(void) ++{ ++ mb_cache_destroy(ext2_xattr_cache); ++} ++ ++#else /* CONFIG_EXT2_FS_XATTR_SHARING */ ++ ++int __init ++init_ext2_xattr(void) ++{ ++ return 
0; ++} ++ ++void ++exit_ext2_xattr(void) ++{ ++} ++ ++#endif /* CONFIG_EXT2_FS_XATTR_SHARING */ +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-2.4.20-root/fs/ext2/xattr_user.c 2003-05-07 18:08:03.000000000 +0800 +@@ -0,0 +1,103 @@ ++/* ++ * linux/fs/ext2/xattr_user.c ++ * Handler for extended user attributes. ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_EXT2_FS_POSIX_ACL ++# include ++#endif ++ ++#define XATTR_USER_PREFIX "user." ++ ++static size_t ++ext2_xattr_user_list(char *list, struct inode *inode, ++ const char *name, int name_len) ++{ ++ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1; ++ ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return 0; ++ ++ if (list) { ++ memcpy(list, XATTR_USER_PREFIX, prefix_len); ++ memcpy(list+prefix_len, name, name_len); ++ list[prefix_len + name_len] = '\0'; ++ } ++ return prefix_len + name_len + 1; ++} ++ ++static int ++ext2_xattr_user_get(struct inode *inode, const char *name, ++ void *buffer, size_t size) ++{ ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++#ifdef CONFIG_EXT2_FS_POSIX_ACL ++ error = ext2_permission_locked(inode, MAY_READ); ++#else ++ error = permission(inode, MAY_READ); ++#endif ++ if (error) ++ return error; ++ ++ return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, ++ buffer, size); ++} ++ ++static int ++ext2_xattr_user_set(struct inode *inode, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++ if ( !S_ISREG(inode->i_mode) && ++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) ++ return -EPERM; ++#ifdef CONFIG_EXT2_FS_POSIX_ACL ++ error = ext2_permission_locked(inode, MAY_WRITE); ++#else ++ error = permission(inode, MAY_WRITE); ++#endif ++ if (error) ++ return 
error; ++ ++ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, ++ value, size, flags); ++} ++ ++struct ext2_xattr_handler ext2_xattr_user_handler = { ++ prefix: XATTR_USER_PREFIX, ++ list: ext2_xattr_user_list, ++ get: ext2_xattr_user_get, ++ set: ext2_xattr_user_set, ++}; ++ ++int __init ++init_ext2_xattr_user(void) ++{ ++ return ext2_xattr_register(EXT2_XATTR_INDEX_USER, ++ &ext2_xattr_user_handler); ++} ++ ++void ++exit_ext2_xattr_user(void) ++{ ++ ext2_xattr_unregister(EXT2_XATTR_INDEX_USER, ++ &ext2_xattr_user_handler); ++} +--- linux-2.4.20/fs/ext3/Makefile~linux-2.4.20-xattr-0.8.54 2003-05-05 19:01:02.000000000 +0800 ++++ linux-2.4.20-root/fs/ext3/Makefile 2003-05-07 18:10:33.000000000 +0800 +@@ -1,5 +1,5 @@ + # +-# Makefile for the linux ext2-filesystem routines. ++# Makefile for the linux ext3-filesystem routines. + # + # Note! Dependencies are done automagically by 'make dep', which also + # removes any old dependencies. DON'T put your own dependencies here +@@ -9,10 +9,14 @@ + + O_TARGET := ext3.o + +-export-objs := super.o inode.o ++export-objs := ext3-exports.o + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +- ioctl.o namei.o super.o symlink.o hash.o ++ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o + obj-m := $(O_TARGET) + ++export-objs += xattr.o ++obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o ++obj-$(CONFIG_EXT3_FS_XATTR_USER) += xattr_user.o ++ + include $(TOPDIR)/Rules.make +--- linux-2.4.20/fs/ext3/file.c~linux-2.4.20-xattr-0.8.54 2003-05-05 19:01:02.000000000 +0800 ++++ linux-2.4.20-root/fs/ext3/file.c 2003-05-07 18:08:03.000000000 +0800 +@@ -23,6 +23,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -126,5 +127,9 @@ struct file_operations ext3_file_operati + struct inode_operations ext3_file_inode_operations = { + truncate: ext3_truncate, /* BKL held */ + setattr: ext3_setattr, /* BKL held */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ 
listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ + }; + +--- linux-2.4.20/fs/ext3/ialloc.c~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:15.000000000 +0800 ++++ linux-2.4.20-root/fs/ext3/ialloc.c 2003-05-07 18:08:03.000000000 +0800 +@@ -17,6 +17,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -216,6 +217,7 @@ void ext3_free_inode (handle_t *handle, + * as writing the quota to disk may need the lock as well. + */ + DQUOT_INIT(inode); ++ ext3_xattr_delete_inode(handle, inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + +--- linux-2.4.20/fs/ext3/inode.c~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:15.000000000 +0800 ++++ linux-2.4.20-root/fs/ext3/inode.c 2003-05-07 18:08:03.000000000 +0800 +@@ -39,6 +39,18 @@ + */ + #undef SEARCH_FROM_ZERO + ++/* ++ * Test whether an inode is a fast symlink. ++ */ ++static inline int ext3_inode_is_fast_symlink(struct inode *inode) ++{ ++ int ea_blocks = inode->u.ext3_i.i_file_acl ? ++ (inode->i_sb->s_blocksize >> 9) : 0; ++ ++ return (S_ISLNK(inode->i_mode) && ++ inode->i_blocks - ea_blocks == 0); ++} ++ + /* The ext3 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. +@@ -48,7 +60,7 @@ + * still needs to be revoked. 
+ */ + +-static int ext3_forget(handle_t *handle, int is_metadata, ++int ext3_forget(handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + int blocknr) + { +@@ -164,9 +176,7 @@ void ext3_delete_inode (struct inode * i + { + handle_t *handle; + +- if (is_bad_inode(inode) || +- inode->i_ino == EXT3_ACL_IDX_INO || +- inode->i_ino == EXT3_ACL_DATA_INO) ++ if (is_bad_inode(inode)) + goto no_delete; + + lock_kernel(); +@@ -1855,6 +1865,8 @@ void ext3_truncate(struct inode * inode) + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; ++ if (ext3_inode_is_fast_symlink(inode)) ++ return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + +@@ -2002,8 +2014,6 @@ int ext3_get_inode_loc (struct inode *in + struct ext3_group_desc * gdp; + + if ((inode->i_ino != EXT3_ROOT_INO && +- inode->i_ino != EXT3_ACL_IDX_INO && +- inode->i_ino != EXT3_ACL_DATA_INO && + inode->i_ino != EXT3_JOURNAL_INO && + inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu( +@@ -2130,10 +2140,7 @@ void ext3_read_inode(struct inode * inod + + brelse (iloc.bh); + +- if (inode->i_ino == EXT3_ACL_IDX_INO || +- inode->i_ino == EXT3_ACL_DATA_INO) +- /* Nothing to do */ ; +- else if (S_ISREG(inode->i_mode)) { ++ if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + inode->i_mapping->a_ops = &ext3_aops; +@@ -2141,15 +2148,17 @@ void ext3_read_inode(struct inode * inod + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { +- if (!inode->i_blocks) ++ if (ext3_inode_is_fast_symlink(inode)) + inode->i_op = &ext3_fast_symlink_inode_operations; + else { +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext3_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + } +- } else ++ } else { ++ inode->i_op = &ext3_special_inode_operations; + 
init_special_inode(inode, inode->i_mode, + le32_to_cpu(iloc.raw_inode->i_block[0])); ++ } + /* inode->i_attr_flags = 0; unused */ + if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */ +--- linux-2.4.20/fs/ext3/namei.c~linux-2.4.20-xattr-0.8.54 2003-05-05 19:01:05.000000000 +0800 ++++ linux-2.4.20-root/fs/ext3/namei.c 2003-05-07 18:08:03.000000000 +0800 +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1611,7 +1612,7 @@ static int ext3_mkdir(struct inode * dir + if (IS_SYNC(dir)) + handle->h_sync = 1; + +- inode = ext3_new_inode (handle, dir, S_IFDIR); ++ inode = ext3_new_inode (handle, dir, S_IFDIR | mode); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -1619,7 +1620,6 @@ static int ext3_mkdir(struct inode * dir + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize; +- inode->i_blocks = 0; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { + inode->i_nlink--; /* is this nlink == 0? */ +@@ -1646,9 +1646,6 @@ static int ext3_mkdir(struct inode * dir + BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_block); + brelse (dir_block); +- inode->i_mode = S_IFDIR | mode; +- if (dir->i_mode & S_ISGID) +- inode->i_mode |= S_ISGID; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); + if (err) { +@@ -2017,7 +2014,7 @@ static int ext3_symlink (struct inode * + goto out_stop; + + if (l > sizeof (EXT3_I(inode)->i_data)) { +- inode->i_op = &page_symlink_inode_operations; ++ inode->i_op = &ext3_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + /* + * block_symlink() calls back into ext3_prepare/commit_write. 
+@@ -2244,4 +2241,16 @@ struct inode_operations ext3_dir_inode_o + rmdir: ext3_rmdir, /* BKL held */ + mknod: ext3_mknod, /* BKL held */ + rename: ext3_rename, /* BKL held */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ + }; ++ ++struct inode_operations ext3_special_inode_operations = { ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ ++}; ++ +--- linux-2.4.20/fs/ext3/super.c~linux-2.4.20-xattr-0.8.54 2003-05-05 19:01:02.000000000 +0800 ++++ linux-2.4.20-root/fs/ext3/super.c 2003-05-07 18:08:39.000000000 +0800 +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -404,6 +405,7 @@ void ext3_put_super (struct super_block + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + ++ ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); +@@ -499,6 +501,7 @@ static int parse_options (char * options + int is_remount) + { + unsigned long *mount_options = &sbi->s_mount_opt; ++ + uid_t *resuid = &sbi->s_resuid; + gid_t *resgid = &sbi->s_resgid; + char * this_char; +@@ -511,6 +514,13 @@ static int parse_options (char * options + this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; ++#ifdef CONFIG_EXT3_FS_XATTR_USER ++ if (!strcmp (this_char, "user_xattr")) ++ set_opt (*mount_options, XATTR_USER); ++ else if (!strcmp (this_char, "nouser_xattr")) ++ clear_opt (*mount_options, XATTR_USER); ++ else ++#endif + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { +@@ -928,6 +938,12 @@ struct super_block * ext3_read_super (st + sbi->s_mount_opt = 0; + sbi->s_resuid = 
EXT3_DEF_RESUID; + sbi->s_resgid = EXT3_DEF_RESGID; ++ ++ /* Default extended attribute flags */ ++#ifdef CONFIG_EXT3_FS_XATTR_USER ++ /* set_opt(sbi->s_mount_opt, XATTR_USER); */ ++#endif ++ + if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) { + sb->s_dev = 0; + goto out_fail; +@@ -1767,17 +1783,29 @@ static DECLARE_FSTYPE_DEV(ext3_fs_type, + + static int __init init_ext3_fs(void) + { +- return register_filesystem(&ext3_fs_type); ++ int error = init_ext3_xattr(); ++ if (error) ++ return error; ++ error = init_ext3_xattr_user(); ++ if (error) ++ goto fail; ++ error = register_filesystem(&ext3_fs_type); ++ if (!error) ++ return 0; ++ ++ exit_ext3_xattr_user(); ++fail: ++ exit_ext3_xattr(); ++ return error; + } + + static void __exit exit_ext3_fs(void) + { + unregister_filesystem(&ext3_fs_type); ++ exit_ext3_xattr_user(); ++ exit_ext3_xattr(); + } + +-EXPORT_SYMBOL(ext3_force_commit); +-EXPORT_SYMBOL(ext3_bread); +- + MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); + MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); + MODULE_LICENSE("GPL"); +--- linux-2.4.20/fs/ext3/symlink.c~linux-2.4.20-xattr-0.8.54 2001-11-10 06:25:04.000000000 +0800 ++++ linux-2.4.20-root/fs/ext3/symlink.c 2003-05-07 18:08:03.000000000 +0800 +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) + { +@@ -33,7 +34,20 @@ static int ext3_follow_link(struct dentr + return vfs_follow_link(nd, s); + } + ++struct inode_operations ext3_symlink_inode_operations = { ++ readlink: page_readlink, /* BKL not held. Don't need */ ++ follow_link: page_follow_link, /* BKL not held. 
Don't need */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ ++}; ++ + struct inode_operations ext3_fast_symlink_inode_operations = { + readlink: ext3_readlink, /* BKL not held. Don't need */ + follow_link: ext3_follow_link, /* BKL not held. Don't need */ ++ setxattr: ext3_setxattr, /* BKL held */ ++ getxattr: ext3_getxattr, /* BKL held */ ++ listxattr: ext3_listxattr, /* BKL held */ ++ removexattr: ext3_removexattr, /* BKL held */ + }; +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-2.4.20-root/fs/ext3/xattr.c 2003-05-07 18:09:23.000000000 +0800 +@@ -0,0 +1,1225 @@ ++/* ++ * linux/fs/ext3/xattr.c ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ * ++ * Fix by Harrison Xing . ++ * Ext3 code with a lot of help from Eric Jarman . ++ * Extended attributes for symlinks and special files added per ++ * suggestion of Luka Renko . ++ */ ++ ++/* ++ * Extended attributes are stored on disk blocks allocated outside of ++ * any inode. The i_file_acl field is then made to point to this allocated ++ * block. If all extended attributes of an inode are identical, these ++ * inodes may share the same extended attribute block. Such situations ++ * are automatically detected by keeping a cache of recent attribute block ++ * numbers and hashes over the block's contents in memory. ++ * ++ * ++ * Extended attribute block layout: ++ * ++ * +------------------+ ++ * | header | ++ * | entry 1 | | ++ * | entry 2 | | growing downwards ++ * | entry 3 | v ++ * | four null bytes | ++ * | . . . | ++ * | value 1 | ^ ++ * | value 3 | | growing upwards ++ * | value 2 | | ++ * +------------------+ ++ * ++ * The block header is followed by multiple entry descriptors. These entry ++ * descriptors are variable in size, and aligned to EXT3_XATTR_PAD ++ * byte boundaries.
The entry descriptors are sorted by attribute name, ++ * so that two extended attribute blocks can be compared efficiently. ++ * ++ * Attribute values are aligned to the end of the block, stored in ++ * no specific order. They are also padded to EXT3_XATTR_PAD byte ++ * boundaries. No additional gaps are left between them. ++ * ++ * Locking strategy ++ * ---------------- ++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of ++ * the xattr inode operations are called, so we are guaranteed that only one ++ * process accesses extended attributes of an inode at any time. ++ * ++ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that ++ * only a single process is modifying an extended attribute block, even ++ * if the block is shared among inodes. ++ * ++ * Note for porting to 2.5 ++ * ----------------------- ++ * The BKL will no longer be held in the xattr inode operations. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define EXT3_EA_USER "user." ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1) ++#endif ++ ++#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data)) ++#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr)) ++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1) ++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) ++ ++#ifdef EXT3_XATTR_DEBUG ++# define ea_idebug(inode, f...) do { \ ++ printk(KERN_DEBUG "inode %s:%ld: ", \ ++ kdevname(inode->i_dev), inode->i_ino); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++# define ea_bdebug(bh, f...) do { \ ++ printk(KERN_DEBUG "block %s:%ld: ", \ ++ kdevname(bh->b_dev), bh->b_blocknr); \ ++ printk(f); \ ++ printk("\n"); \ ++ } while (0) ++#else ++# define ea_idebug(f...) ++# define ea_bdebug(f...)
++#endif ++ ++static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *, ++ struct ext3_xattr_header *); ++ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ ++static int ext3_xattr_cache_insert(struct buffer_head *); ++static struct buffer_head *ext3_xattr_cache_find(struct inode *, ++ struct ext3_xattr_header *); ++static void ext3_xattr_cache_remove(struct buffer_head *); ++static void ext3_xattr_rehash(struct ext3_xattr_header *, ++ struct ext3_xattr_entry *); ++ ++static struct mb_cache *ext3_xattr_cache; ++ ++#else ++# define ext3_xattr_cache_insert(bh) 0 ++# define ext3_xattr_cache_find(inode, header) NULL ++# define ext3_xattr_cache_remove(bh) while(0) {} ++# define ext3_xattr_rehash(header, entry) while(0) {} ++#endif ++ ++/* ++ * If a file system does not share extended attributes among inodes, ++ * we should not need the ext3_xattr_sem semaphore. However, the ++ * filesystem may still contain shared blocks, so we always take ++ * the lock. ++ */ ++ ++DECLARE_MUTEX(ext3_xattr_sem); ++ ++static inline int ++ext3_xattr_new_block(handle_t *handle, struct inode *inode, ++ int * errp, int force) ++{ ++ struct super_block *sb = inode->i_sb; ++ int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) + ++ EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb); ++ ++ /* How can we enforce the allocation? */ ++ int block = ext3_new_block(handle, inode, goal, 0, 0, errp); ++#ifdef OLD_QUOTAS ++ if (!*errp) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#endif ++ return block; ++} ++ ++static inline int ++ext3_xattr_quota_alloc(struct inode *inode, int force) ++{ ++ /* How can we enforce the allocation? 
*/ ++#ifdef OLD_QUOTAS ++ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1); ++ if (!error) ++ inode->i_blocks += inode->i_sb->s_blocksize >> 9; ++#else ++ int error = DQUOT_ALLOC_BLOCK(inode, 1); ++#endif ++ return error; ++} ++ ++#ifdef OLD_QUOTAS ++ ++static inline void ++ext3_xattr_quota_free(struct inode *inode) ++{ ++ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1); ++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; ++} ++ ++static inline void ++ext3_xattr_free_block(handle_t *handle, struct inode * inode, ++ unsigned long block) ++{ ++ ext3_free_blocks(handle, inode, block, 1); ++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9; ++} ++ ++#else ++# define ext3_xattr_quota_free(inode) \ ++ DQUOT_FREE_BLOCK(inode, 1) ++# define ext3_xattr_free_block(handle, inode, block) \ ++ ext3_free_blocks(handle, inode, block, 1) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18) ++ ++static inline struct buffer_head * ++sb_bread(struct super_block *sb, int block) ++{ ++ return bread(sb->s_dev, block, sb->s_blocksize); ++} ++ ++static inline struct buffer_head * ++sb_getblk(struct super_block *sb, int block) ++{ ++ return getblk(sb->s_dev, block, sb->s_blocksize); ++} ++ ++#endif ++ ++struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX]; ++rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED; ++ ++int ++ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler) ++{ ++ int error = -EINVAL; ++ ++ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { ++ write_lock(&ext3_handler_lock); ++ if (!ext3_xattr_handlers[name_index-1]) { ++ ext3_xattr_handlers[name_index-1] = handler; ++ error = 0; ++ } ++ write_unlock(&ext3_handler_lock); ++ } ++ return error; ++} ++ ++void ++ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler) ++{ ++ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { ++ write_lock(&ext3_handler_lock); ++ ext3_xattr_handlers[name_index-1] = NULL; ++ write_unlock(&ext3_handler_lock); ++ } ++} ++ 
++static inline const char * ++strcmp_prefix(const char *a, const char *a_prefix) ++{ ++ while (*a_prefix && *a == *a_prefix) { ++ a++; ++ a_prefix++; ++ } ++ return *a_prefix ? NULL : a; ++} ++ ++/* ++ * Decode the extended attribute name, and translate it into ++ * the name_index and name suffix. ++ */ ++static inline struct ext3_xattr_handler * ++ext3_xattr_resolve_name(const char **name) ++{ ++ struct ext3_xattr_handler *handler = NULL; ++ int i; ++ ++ if (!*name) ++ return NULL; ++ read_lock(&ext3_handler_lock); ++ for (i=0; i<EXT3_XATTR_INDEX_MAX; i++) { ++ if (ext3_xattr_handlers[i]) { ++ const char *n = strcmp_prefix(*name, ++ ext3_xattr_handlers[i]->prefix); ++ if (n) { ++ handler = ext3_xattr_handlers[i]; ++ *name = n; ++ break; ++ } ++ } ++ } ++ read_unlock(&ext3_handler_lock); ++ return handler; ++} ++ ++static inline struct ext3_xattr_handler * ++ext3_xattr_handler(int name_index) ++{ ++ struct ext3_xattr_handler *handler = NULL; ++ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) { ++ read_lock(&ext3_handler_lock); ++ handler = ext3_xattr_handlers[name_index-1]; ++ read_unlock(&ext3_handler_lock); ++ } ++ return handler; ++} ++ ++/* ++ * Inode operation getxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++ssize_t ++ext3_getxattr(struct dentry *dentry, const char *name, ++ void *buffer, size_t size) ++{ ++ struct ext3_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext3_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->get(inode, name, buffer, size); ++} ++ ++/* ++ * Inode operation listxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++ssize_t ++ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) ++{ ++ return ext3_xattr_list(dentry->d_inode, buffer, size); ++} ++ ++/* ++ * Inode operation setxattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext3_setxattr(struct dentry *dentry, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ struct ext3_xattr_handler *handler; ++ 
struct inode *inode = dentry->d_inode; ++ ++ if (size == 0) ++ value = ""; /* empty EA, do not remove */ ++ handler = ext3_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, value, size, flags); ++} ++ ++/* ++ * Inode operation removexattr() ++ * ++ * dentry->d_inode->i_sem down ++ * BKL held [before 2.5.x] ++ */ ++int ++ext3_removexattr(struct dentry *dentry, const char *name) ++{ ++ struct ext3_xattr_handler *handler; ++ struct inode *inode = dentry->d_inode; ++ ++ handler = ext3_xattr_resolve_name(&name); ++ if (!handler) ++ return -ENOTSUP; ++ return handler->set(inode, name, NULL, 0, XATTR_REPLACE); ++} ++ ++/* ++ * ext3_xattr_get() ++ * ++ * Copy an extended attribute into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. ++ */ ++int ++ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ unsigned int block, size; ++ char *end; ++ int name_len, error; ++ ++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", ++ name_index, name, buffer, (long)buffer_size); ++ ++ if (name == NULL) ++ return -EINVAL; ++ if (!EXT3_I(inode)->i_file_acl) ++ return -ENOATTR; ++ block = EXT3_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_get", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* 
find named attribute */ ++ name_len = strlen(name); ++ ++ error = -ERANGE; ++ if (name_len > 255) ++ goto cleanup; ++ entry = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (name_index == entry->e_name_index && ++ name_len == entry->e_name_len && ++ memcmp(name, entry->e_name, name_len) == 0) ++ goto found; ++ entry = next; ++ } ++ /* Check the remaining name entries */ ++ while (!IS_LAST_ENTRY(entry)) { ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ entry = next; ++ } ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ error = -ENOATTR; ++ goto cleanup; ++found: ++ /* check the buffer size */ ++ if (entry->e_value_block != 0) ++ goto bad_block; ++ size = le32_to_cpu(entry->e_value_size); ++ if (size > inode->i_sb->s_blocksize || ++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize) ++ goto bad_block; ++ ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (buffer) { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ /* return value of attribute */ ++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs), ++ size); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_list() ++ * ++ * Copy a list of attribute names into the buffer ++ * provided, or compute the buffer size required. ++ * Buffer is NULL to compute the size of the buffer required. ++ * ++ * Returns a negative error number on failure, or the number of bytes ++ * used / required on success. 
++ */ ++int ++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ++{ ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_entry *entry; ++ unsigned int block, size = 0; ++ char *buf, *end; ++ int error; ++ ++ ea_idebug(inode, "buffer=%p, buffer_size=%ld", ++ buffer, (long)buffer_size); ++ ++ if (!EXT3_I(inode)->i_file_acl) ++ return 0; ++ block = EXT3_I(inode)->i_file_acl; ++ ea_idebug(inode, "reading block %d", block); ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) ++ return -EIO; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); ++ end = bh->b_data + bh->b_size; ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(inode->i_sb, "ext3_xattr_list", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* compute the size required for the list of attribute names */ ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT3_XATTR_NEXT(entry)) { ++ struct ext3_xattr_handler *handler; ++ struct ext3_xattr_entry *next = ++ EXT3_XATTR_NEXT(entry); ++ if ((char *)next >= end) ++ goto bad_block; ++ ++ handler = ext3_xattr_handler(entry->e_name_index); ++ if (handler) ++ size += handler->list(NULL, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ ++ if (ext3_xattr_cache_insert(bh)) ++ ea_idebug(inode, "cache insert failed"); ++ if (!buffer) { ++ error = size; ++ goto cleanup; ++ } else { ++ error = -ERANGE; ++ if (size > buffer_size) ++ goto cleanup; ++ } ++ ++ /* list the attribute names */ ++ buf = buffer; ++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); ++ entry = EXT3_XATTR_NEXT(entry)) { ++ struct ext3_xattr_handler *handler; ++ ++ handler = ext3_xattr_handler(entry->e_name_index); ++ if (handler) ++ buf += handler->list(buf, inode, entry->e_name, ++ entry->e_name_len); ++ } ++ error = size; ++ ++cleanup: ++ brelse(bh); ++ ++ return error; ++} ++ ++/* 
++ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is ++ * not set, set it. ++ */ ++static void ext3_xattr_update_super_block(handle_t *handle, ++ struct super_block *sb) ++{ ++ if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR)) ++ return; ++ ++ lock_super(sb); ++ ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0) ++ EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR; ++#endif ++ EXT3_SB(sb)->s_es->s_feature_compat |= ++ cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR); ++ sb->s_dirt = 1; ++ ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); ++ unlock_super(sb); ++} ++ ++/* ++ * ext3_xattr_set() ++ * ++ * Create, replace or remove an extended attribute for this inode. Buffer ++ * is NULL to remove an existing extended attribute, and non-NULL to ++ * either replace an existing extended attribute, or create a new extended ++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE ++ * specify that an extended attribute must exist and must not exist ++ * previous to the call, respectively. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++int ++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, const void *value, size_t value_len, int flags) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *bh = NULL; ++ struct ext3_xattr_header *header = NULL; ++ struct ext3_xattr_entry *here, *last; ++ unsigned int name_len; ++ int block = EXT3_I(inode)->i_file_acl; ++ int min_offs = sb->s_blocksize, not_found = 1, free, error; ++ char *end; ++ ++ /* ++ * header -- Points either into bh, or to a temporarily ++ * allocated buffer. ++ * here -- The named entry found, or the place for inserting, within ++ * the block pointed to by header. ++ * last -- Points right after the last named entry within the block ++ * pointed to by header. 
++ * min_offs -- The offset of the first value (values are aligned ++ * towards the end of the block). ++ * end -- Points right after the block pointed to by header. ++ */ ++ ++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", ++ name_index, name, value, (long)value_len); ++ ++ if (IS_RDONLY(inode)) ++ return -EROFS; ++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) ++ return -EPERM; ++ if (value == NULL) ++ value_len = 0; ++ if (name == NULL) ++ return -EINVAL; ++ name_len = strlen(name); ++ if (name_len > 255 || value_len > sb->s_blocksize) ++ return -ERANGE; ++ down(&ext3_xattr_sem); ++ ++ if (block) { ++ /* The inode already has an extended attribute block. */ ++ bh = sb_bread(sb, block); ++ error = -EIO; ++ if (!bh) ++ goto cleanup; ++ ea_bdebug(bh, "b_count=%d, refcount=%d", ++ atomic_read(&(bh->b_count)), ++ le32_to_cpu(HDR(bh)->h_refcount)); ++ header = HDR(bh); ++ end = bh->b_data + bh->b_size; ++ if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ header->h_blocks != cpu_to_le32(1)) { ++bad_block: ext3_error(sb, "ext3_xattr_set", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ /* Find the named attribute. */ ++ here = FIRST_ENTRY(bh); ++ while (!IS_LAST_ENTRY(here)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!here->e_value_block && here->e_value_size) { ++ int offs = le16_to_cpu(here->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ not_found = name_index - here->e_name_index; ++ if (!not_found) ++ not_found = name_len - here->e_name_len; ++ if (!not_found) ++ not_found = memcmp(name, here->e_name,name_len); ++ if (not_found <= 0) ++ break; ++ here = next; ++ } ++ last = here; ++ /* We still need to compute min_offs and last. 
*/ ++ while (!IS_LAST_ENTRY(last)) { ++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last); ++ if ((char *)next >= end) ++ goto bad_block; ++ if (!last->e_value_block && last->e_value_size) { ++ int offs = le16_to_cpu(last->e_value_offs); ++ if (offs < min_offs) ++ min_offs = offs; ++ } ++ last = next; ++ } ++ ++ /* Check whether we have enough space left. */ ++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32); ++ } else { ++ /* We will use a new extended attribute block. */ ++ free = sb->s_blocksize - ++ sizeof(struct ext3_xattr_header) - sizeof(__u32); ++ here = last = NULL; /* avoid gcc uninitialized warning. */ ++ } ++ ++ if (not_found) { ++ /* Request to remove a nonexistent attribute? */ ++ error = -ENOATTR; ++ if (flags & XATTR_REPLACE) ++ goto cleanup; ++ error = 0; ++ if (value == NULL) ++ goto cleanup; ++ else ++ free -= EXT3_XATTR_LEN(name_len); ++ } else { ++ /* Request to create an existing attribute? */ ++ error = -EEXIST; ++ if (flags & XATTR_CREATE) ++ goto cleanup; ++ if (!here->e_value_block && here->e_value_size) { ++ unsigned int size = le32_to_cpu(here->e_value_size); ++ ++ if (le16_to_cpu(here->e_value_offs) + size > ++ sb->s_blocksize || size > sb->s_blocksize) ++ goto bad_block; ++ free += EXT3_XATTR_SIZE(size); ++ } ++ } ++ free -= EXT3_XATTR_SIZE(value_len); ++ error = -ENOSPC; ++ if (free < 0) ++ goto cleanup; ++ ++ /* Here we know that we can set the new attribute. 
*/ ++ ++ if (header) { ++ if (header->h_refcount == cpu_to_le32(1)) { ++ ea_bdebug(bh, "modifying in-place"); ++ ext3_xattr_cache_remove(bh); ++ error = ext3_journal_get_write_access(handle, bh); ++ if (error) ++ goto cleanup; ++ } else { ++ int offset; ++ ++ ea_bdebug(bh, "cloning"); ++ header = kmalloc(bh->b_size, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memcpy(header, HDR(bh), bh->b_size); ++ header->h_refcount = cpu_to_le32(1); ++ offset = (char *)header - bh->b_data; ++ here = ENTRY((char *)here + offset); ++ last = ENTRY((char *)last + offset); ++ } ++ } else { ++ /* Allocate a buffer where we construct the new block. */ ++ header = kmalloc(sb->s_blocksize, GFP_KERNEL); ++ error = -ENOMEM; ++ if (header == NULL) ++ goto cleanup; ++ memset(header, 0, sb->s_blocksize); ++ end = (char *)header + sb->s_blocksize; ++ header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC); ++ header->h_blocks = header->h_refcount = cpu_to_le32(1); ++ last = here = ENTRY(header+1); ++ } ++ ++ if (not_found) { ++ /* Insert the new name. */ ++ int size = EXT3_XATTR_LEN(name_len); ++ int rest = (char *)last - (char *)here; ++ memmove((char *)here + size, here, rest); ++ memset(here, 0, size); ++ here->e_name_index = name_index; ++ here->e_name_len = name_len; ++ memcpy(here->e_name, name, name_len); ++ } else { ++ /* Remove the old value. */ ++ if (!here->e_value_block && here->e_value_size) { ++ char *first_val = (char *)header + min_offs; ++ int offs = le16_to_cpu(here->e_value_offs); ++ char *val = (char *)header + offs; ++ size_t size = EXT3_XATTR_SIZE( ++ le32_to_cpu(here->e_value_size)); ++ memmove(first_val + size, first_val, val - first_val); ++ memset(first_val, 0, size); ++ here->e_value_offs = 0; ++ min_offs += size; ++ ++ /* Adjust all value offsets. 
*/ ++ last = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(last)) { ++ int o = le16_to_cpu(last->e_value_offs); ++ if (!last->e_value_block && o < offs) ++ last->e_value_offs = ++ cpu_to_le16(o + size); ++ last = EXT3_XATTR_NEXT(last); ++ } ++ } ++ if (value == NULL) { ++ /* Remove this attribute. */ ++ if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) { ++ /* This block is now empty. */ ++ error = ext3_xattr_set2(handle, inode, bh,NULL); ++ goto cleanup; ++ } else { ++ /* Remove the old name. */ ++ int size = EXT3_XATTR_LEN(name_len); ++ last = ENTRY((char *)last - size); ++ memmove(here, (char*)here + size, ++ (char*)last - (char*)here); ++ memset(last, 0, size); ++ } ++ } ++ } ++ ++ if (value != NULL) { ++ /* Insert the new value. */ ++ here->e_value_size = cpu_to_le32(value_len); ++ if (value_len) { ++ size_t size = EXT3_XATTR_SIZE(value_len); ++ char *val = (char *)header + min_offs - size; ++ here->e_value_offs = ++ cpu_to_le16((char *)val - (char *)header); ++ memset(val + size - EXT3_XATTR_PAD, 0, ++ EXT3_XATTR_PAD); /* Clear the pad bytes. */ ++ memcpy(val, value, value_len); ++ } ++ } ++ ext3_xattr_rehash(header, here); ++ ++ error = ext3_xattr_set2(handle, inode, bh, header); ++ ++cleanup: ++ brelse(bh); ++ if (!(bh && header == HDR(bh))) ++ kfree(header); ++ up(&ext3_xattr_sem); ++ ++ return error; ++} ++ ++/* ++ * Second half of ext3_xattr_set(): Update the file system. ++ */ ++static int ++ext3_xattr_set2(handle_t *handle, struct inode *inode, ++ struct buffer_head *old_bh, struct ext3_xattr_header *header) ++{ ++ struct super_block *sb = inode->i_sb; ++ struct buffer_head *new_bh = NULL; ++ int error; ++ ++ if (header) { ++ new_bh = ext3_xattr_cache_find(inode, header); ++ if (new_bh) { ++ /* ++ * We found an identical block in the cache. ++ * The old block will be released after updating ++ * the inode. 
++ */ ++ ea_bdebug(old_bh, "reusing block %ld", ++ new_bh->b_blocknr); ++ ++ error = -EDQUOT; ++ if (ext3_xattr_quota_alloc(inode, 1)) ++ goto cleanup; ++ ++ error = ext3_journal_get_write_access(handle, new_bh); ++ if (error) ++ goto cleanup; ++ HDR(new_bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(new_bh)->h_refcount) + 1); ++ ea_bdebug(new_bh, "refcount now=%d", ++ le32_to_cpu(HDR(new_bh)->h_refcount)); ++ } else if (old_bh && header == HDR(old_bh)) { ++ /* Keep this block. */ ++ new_bh = old_bh; ++ ext3_xattr_cache_insert(new_bh); ++ } else { ++ /* We need to allocate a new block */ ++ int force = EXT3_I(inode)->i_file_acl != 0; ++ int block = ext3_xattr_new_block(handle, inode, ++ &error, force); ++ if (error) ++ goto cleanup; ++ ea_idebug(inode, "creating block %d", block); ++ ++ new_bh = sb_getblk(sb, block); ++ if (!new_bh) { ++getblk_failed: ext3_xattr_free_block(handle, inode, block); ++ error = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(new_bh); ++ error = ext3_journal_get_create_access(handle, new_bh); ++ if (error) { ++ unlock_buffer(new_bh); ++ goto getblk_failed; ++ } ++ memcpy(new_bh->b_data, header, new_bh->b_size); ++ mark_buffer_uptodate(new_bh, 1); ++ unlock_buffer(new_bh); ++ ext3_xattr_cache_insert(new_bh); ++ ++ ext3_xattr_update_super_block(handle, sb); ++ } ++ error = ext3_journal_dirty_metadata(handle, new_bh); ++ if (error) ++ goto cleanup; ++ } ++ ++ /* Update the inode. */ ++ EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; ++ inode->i_ctime = CURRENT_TIME; ++ ext3_mark_inode_dirty(handle, inode); ++ if (IS_SYNC(inode)) ++ handle->h_sync = 1; ++ ++ error = 0; ++ if (old_bh && old_bh != new_bh) { ++ /* ++ * If there was an old block, and we are not still using it, ++ * we now release the old block. ++ */ ++ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); ++ ++ error = ext3_journal_get_write_access(handle, old_bh); ++ if (error) ++ goto cleanup; ++ if (refcount == 1) { ++ /* Free the old block. 
*/ ++ ea_bdebug(old_bh, "freeing"); ++ ext3_xattr_free_block(handle, inode, old_bh->b_blocknr); ++ ++ /* ext3_forget() calls bforget() for us, but we ++ let our caller release old_bh, so we need to ++ duplicate the handle before. */ ++ get_bh(old_bh); ++ ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr); ++ } else { ++ /* Decrement the refcount only. */ ++ refcount--; ++ HDR(old_bh)->h_refcount = cpu_to_le32(refcount); ++ ext3_xattr_quota_free(inode); ++ ext3_journal_dirty_metadata(handle, old_bh); ++ ea_bdebug(old_bh, "refcount now=%d", refcount); ++ } ++ } ++ ++cleanup: ++ if (old_bh != new_bh) ++ brelse(new_bh); ++ ++ return error; ++} ++ ++/* ++ * ext3_xattr_delete_inode() ++ * ++ * Free extended attribute resources associated with this inode. This ++ * is called immediately before an inode is freed. ++ */ ++void ++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) ++{ ++ struct buffer_head *bh; ++ unsigned int block = EXT3_I(inode)->i_file_acl; ++ ++ if (!block) ++ return; ++ down(&ext3_xattr_sem); ++ ++ bh = sb_bread(inode->i_sb, block); ++ if (!bh) { ++ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", ++ "inode %ld: block %d read error", inode->i_ino, block); ++ goto cleanup; ++ } ++ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); ++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || ++ HDR(bh)->h_blocks != cpu_to_le32(1)) { ++ ext3_error(inode->i_sb, "ext3_xattr_delete_inode", ++ "inode %ld: bad block %d", inode->i_ino, block); ++ goto cleanup; ++ } ++ ext3_journal_get_write_access(handle, bh); ++ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ if (HDR(bh)->h_refcount == cpu_to_le32(1)) { ++ ext3_xattr_cache_remove(bh); ++ ext3_xattr_free_block(handle, inode, block); ++ ext3_forget(handle, 1, inode, bh, block); ++ bh = NULL; ++ } else { ++ HDR(bh)->h_refcount = cpu_to_le32( ++ le32_to_cpu(HDR(bh)->h_refcount) - 1); ++ ext3_journal_dirty_metadata(handle, bh); ++ if (IS_SYNC(inode)) ++ 
handle->h_sync = 1; ++ ext3_xattr_quota_free(inode); ++ } ++ EXT3_I(inode)->i_file_acl = 0; ++ ++cleanup: ++ brelse(bh); ++ up(&ext3_xattr_sem); ++} ++ ++/* ++ * ext3_xattr_put_super() ++ * ++ * This is called when a file system is unmounted. ++ */ ++void ++ext3_xattr_put_super(struct super_block *sb) ++{ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ mb_cache_shrink(ext3_xattr_cache, sb->s_dev); ++#endif ++} ++ ++#ifdef CONFIG_EXT3_FS_XATTR_SHARING ++ ++/* ++ * ext3_xattr_cache_insert() ++ * ++ * Create a new entry in the extended attribute cache, and insert ++ * it unless such an entry is already in the cache. ++ * ++ * Returns 0, or a negative error number on failure. ++ */ ++static int ++ext3_xattr_cache_insert(struct buffer_head *bh) ++{ ++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash); ++ struct mb_cache_entry *ce; ++ int error; ++ ++ ce = mb_cache_entry_alloc(ext3_xattr_cache); ++ if (!ce) ++ return -ENOMEM; ++ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash); ++ if (error) { ++ mb_cache_entry_free(ce); ++ if (error == -EBUSY) { ++ ea_bdebug(bh, "already in cache (%d cache entries)", ++ atomic_read(&ext3_xattr_cache->c_entry_count)); ++ error = 0; ++ } ++ } else { ++ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash, ++ atomic_read(&ext3_xattr_cache->c_entry_count)); ++ mb_cache_entry_release(ce); ++ } ++ return error; ++} ++ ++/* ++ * ext3_xattr_cmp() ++ * ++ * Compare two extended attribute blocks for equality. ++ * ++ * Returns 0 if the blocks are equal, 1 if they differ, and ++ * a negative error number on errors. 
++ */ ++static int ++ext3_xattr_cmp(struct ext3_xattr_header *header1, ++ struct ext3_xattr_header *header2) ++{ ++ struct ext3_xattr_entry *entry1, *entry2; ++ ++ entry1 = ENTRY(header1+1); ++ entry2 = ENTRY(header2+1); ++ while (!IS_LAST_ENTRY(entry1)) { ++ if (IS_LAST_ENTRY(entry2)) ++ return 1; ++ if (entry1->e_hash != entry2->e_hash || ++ entry1->e_name_len != entry2->e_name_len || ++ entry1->e_value_size != entry2->e_value_size || ++ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) ++ return 1; ++ if (entry1->e_value_block != 0 || entry2->e_value_block != 0) ++ return -EIO; ++ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), ++ (char *)header2 + le16_to_cpu(entry2->e_value_offs), ++ le32_to_cpu(entry1->e_value_size))) ++ return 1; ++ ++ entry1 = EXT3_XATTR_NEXT(entry1); ++ entry2 = EXT3_XATTR_NEXT(entry2); ++ } ++ if (!IS_LAST_ENTRY(entry2)) ++ return 1; ++ return 0; ++} ++ ++/* ++ * ext3_xattr_cache_find() ++ * ++ * Find an identical extended attribute block. ++ * ++ * Returns a pointer to the block found, or NULL if such a block was ++ * not found or an error occurred. 
++ */
++static struct buffer_head *
++ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
++{
++	__u32 hash = le32_to_cpu(header->h_hash);
++	struct mb_cache_entry *ce;
++
++	if (!header->h_hash)
++		return NULL;  /* never share */
++	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++	/* Walk every cache entry on this device whose index-0 key is hash. */
++	ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash);
++	while (ce) {
++		struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++		if (!bh) {
++			ext3_error(inode->i_sb, "ext3_xattr_cache_find",
++				"inode %ld: block %ld read error",
++				inode->i_ino, ce->e_block);
++		} else if (le32_to_cpu(HDR(bh)->h_refcount) >=
++			   EXT3_XATTR_REFCOUNT_MAX) {
++			/*
++			 * The caller adds one reference to the block we
++			 * return, so a block that already holds
++			 * EXT3_XATTR_REFCOUNT_MAX references must be
++			 * skipped, not only one that exceeds the limit.
++			 */
++			ea_idebug(inode, "block %ld refcount %d>=%d",ce->e_block,
++				le32_to_cpu(HDR(bh)->h_refcount),
++				EXT3_XATTR_REFCOUNT_MAX);
++		} else if (!ext3_xattr_cmp(header, HDR(bh))) {
++			/*
++			 * Identical contents: drop our handle on the cache
++			 * entry and hand the buffer reference obtained from
++			 * sb_bread() to the caller.
++			 */
++			ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++			mb_cache_entry_release(ce);
++			return bh;
++		}
++		/* No match (or read error, bh == NULL): try the next entry. */
++		brelse(bh);
++		ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++	}
++	return NULL;
++}
++
++/*
++ * ext3_xattr_cache_remove()
++ *
++ * Remove the cache entry of a block from the cache. Called when a
++ * block becomes invalid.
++ */
++static void
++ext3_xattr_cache_remove(struct buffer_head *bh)
++{
++	struct mb_cache_entry *ce;
++
++	ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr);
++	if (ce) {
++		ea_bdebug(bh, "removing (%d cache entries remaining)",
++			  atomic_read(&ext3_xattr_cache->c_entry_count)-1);
++		mb_cache_entry_free(ce);
++	} else
++		ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5
++#define VALUE_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ */ ++static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header, ++ struct ext3_xattr_entry *entry) ++{ ++ __u32 hash = 0; ++ char *name = entry->e_name; ++ int n; ++ ++ for (n=0; n < entry->e_name_len; n++) { ++ hash = (hash << NAME_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ ++ *name++; ++ } ++ ++ if (entry->e_value_block == 0 && entry->e_value_size != 0) { ++ __u32 *value = (__u32 *)((char *)header + ++ le16_to_cpu(entry->e_value_offs)); ++ for (n = (le32_to_cpu(entry->e_value_size) + ++ EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) { ++ hash = (hash << VALUE_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ ++ le32_to_cpu(*value++); ++ } ++ } ++ entry->e_hash = cpu_to_le32(hash); ++} ++ ++#undef NAME_HASH_SHIFT ++#undef VALUE_HASH_SHIFT ++ ++#define BLOCK_HASH_SHIFT 16 ++ ++/* ++ * ext3_xattr_rehash() ++ * ++ * Re-compute the extended attribute hash value after an entry has changed. ++ */ ++static void ext3_xattr_rehash(struct ext3_xattr_header *header, ++ struct ext3_xattr_entry *entry) ++{ ++ struct ext3_xattr_entry *here; ++ __u32 hash = 0; ++ ++ ext3_xattr_hash_entry(header, entry); ++ here = ENTRY(header+1); ++ while (!IS_LAST_ENTRY(here)) { ++ if (!here->e_hash) { ++ /* Block is not shared if an entry's hash value == 0 */ ++ hash = 0; ++ break; ++ } ++ hash = (hash << BLOCK_HASH_SHIFT) ^ ++ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ ++ le32_to_cpu(here->e_hash); ++ here = EXT3_XATTR_NEXT(here); ++ } ++ header->h_hash = cpu_to_le32(hash); ++} ++ ++#undef BLOCK_HASH_SHIFT ++ ++int __init ++init_ext3_xattr(void) ++{ ++ ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL, ++ sizeof(struct mb_cache_entry) + ++ sizeof(struct mb_cache_entry_index), 1, 61); ++ if (!ext3_xattr_cache) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void ++exit_ext3_xattr(void) ++{ ++ if (ext3_xattr_cache) ++ mb_cache_destroy(ext3_xattr_cache); ++ ext3_xattr_cache = NULL; ++} ++ ++#else /* CONFIG_EXT3_FS_XATTR_SHARING */ ++ 
++int __init ++init_ext3_xattr(void) ++{ ++ return 0; ++} ++ ++void ++exit_ext3_xattr(void) ++{ ++} ++ ++#endif /* CONFIG_EXT3_FS_XATTR_SHARING */ +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-2.4.20-root/fs/ext3/xattr_user.c 2003-05-07 18:08:03.000000000 +0800 +@@ -0,0 +1,111 @@ ++/* ++ * linux/fs/ext3/xattr_user.c ++ * Handler for extended user attributes. ++ * ++ * Copyright (C) 2001 by Andreas Gruenbacher, ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_EXT3_FS_POSIX_ACL ++# include ++#endif ++ ++#define XATTR_USER_PREFIX "user." ++ ++static size_t ++ext3_xattr_user_list(char *list, struct inode *inode, ++ const char *name, int name_len) ++{ ++ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1; ++ ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return 0; ++ ++ if (list) { ++ memcpy(list, XATTR_USER_PREFIX, prefix_len); ++ memcpy(list+prefix_len, name, name_len); ++ list[prefix_len + name_len] = '\0'; ++ } ++ return prefix_len + name_len + 1; ++} ++ ++static int ++ext3_xattr_user_get(struct inode *inode, const char *name, ++ void *buffer, size_t size) ++{ ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++#ifdef CONFIG_EXT3_FS_POSIX_ACL ++ error = ext3_permission_locked(inode, MAY_READ); ++#else ++ error = permission(inode, MAY_READ); ++#endif ++ if (error) ++ return error; ++ ++ return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, ++ buffer, size); ++} ++ ++static int ++ext3_xattr_user_set(struct inode *inode, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ handle_t *handle; ++ int error; ++ ++ if (strcmp(name, "") == 0) ++ return -EINVAL; ++ if (!test_opt(inode->i_sb, XATTR_USER)) ++ return -ENOTSUP; ++ if ( !S_ISREG(inode->i_mode) && ++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) ++ return -EPERM; ++#ifdef CONFIG_EXT3_FS_POSIX_ACL ++ error = ext3_permission_locked(inode, MAY_WRITE); 
++#else ++ error = permission(inode, MAY_WRITE); ++#endif ++ if (error) ++ return error; ++ ++ handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name, ++ value, size, flags); ++ ext3_journal_stop(handle, inode); ++ ++ return error; ++} ++ ++struct ext3_xattr_handler ext3_xattr_user_handler = { ++ prefix: XATTR_USER_PREFIX, ++ list: ext3_xattr_user_list, ++ get: ext3_xattr_user_get, ++ set: ext3_xattr_user_set, ++}; ++ ++int __init ++init_ext3_xattr_user(void) ++{ ++ return ext3_xattr_register(EXT3_XATTR_INDEX_USER, ++ &ext3_xattr_user_handler); ++} ++ ++void ++exit_ext3_xattr_user(void) ++{ ++ ext3_xattr_unregister(EXT3_XATTR_INDEX_USER, ++ &ext3_xattr_user_handler); ++} +--- linux-2.4.20/fs/jfs/jfs_xattr.h~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:15.000000000 +0800 ++++ linux-2.4.20-root/fs/jfs/jfs_xattr.h 2003-05-07 18:08:03.000000000 +0800 +@@ -52,8 +52,10 @@ struct jfs_ea_list { + #define END_EALIST(ealist) \ + ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist))) + +-extern int __jfs_setxattr(struct inode *, const char *, void *, size_t, int); +-extern int jfs_setxattr(struct dentry *, const char *, void *, size_t, int); ++extern int __jfs_setxattr(struct inode *, const char *, const void *, size_t, ++ int); ++extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t, ++ int); + extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t); + extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t); + extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); +--- linux-2.4.20/fs/jfs/xattr.c~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:15.000000000 +0800 ++++ linux-2.4.20-root/fs/jfs/xattr.c 2003-05-07 18:08:03.000000000 +0800 +@@ -641,7 +641,7 @@ static int ea_put(struct inode *inode, s + } + + static int can_set_xattr(struct inode *inode, const char *name, +- void *value, 
size_t value_len) ++ const void *value, size_t value_len) + { + if (IS_RDONLY(inode)) + return -EROFS; +@@ -660,7 +660,7 @@ static int can_set_xattr(struct inode *i + return permission(inode, MAY_WRITE); + } + +-int __jfs_setxattr(struct inode *inode, const char *name, void *value, ++int __jfs_setxattr(struct inode *inode, const char *name, const void *value, + size_t value_len, int flags) + { + struct jfs_ea_list *ealist; +@@ -799,7 +799,7 @@ int __jfs_setxattr(struct inode *inode, + return rc; + } + +-int jfs_setxattr(struct dentry *dentry, const char *name, void *value, ++int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t value_len, int flags) + { + if (value == NULL) { /* empty EA, do not remove */ +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-2.4.20-root/fs/mbcache.c 2003-05-07 18:08:03.000000000 +0800 +@@ -0,0 +1,648 @@ ++/* ++ * linux/fs/mbcache.c ++ * (C) 2001-2002 Andreas Gruenbacher, ++ */ ++ ++/* ++ * Filesystem Meta Information Block Cache (mbcache) ++ * ++ * The mbcache caches blocks of block devices that need to be located ++ * by their device/block number, as well as by other criteria (such ++ * as the block's contents). ++ * ++ * There can only be one cache entry in a cache per device and block number. ++ * Additional indexes need not be unique in this sense. The number of ++ * additional indexes (=other criteria) can be hardwired at compile time ++ * or specified at cache create time. ++ * ++ * Each cache entry is of fixed size. An entry may be `valid' or `invalid' ++ * in the cache. A valid entry is in the main hash tables of the cache, ++ * and may also be in the lru list. An invalid entry is not in any hashes ++ * or lists. ++ * ++ * A valid cache entry is only in the lru list if no handles refer to it. ++ * Invalid cache entries will be freed when the last handle to the cache ++ * entry is released. Entries that cannot be freed immediately are put ++ * back on the lru list. 
++ */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#ifdef MB_CACHE_DEBUG ++# define mb_debug(f...) do { \ ++ printk(KERN_DEBUG f); \ ++ printk("\n"); \ ++ } while (0) ++#define mb_assert(c) do { if (!(c)) \ ++ printk(KERN_ERR "assertion " #c " failed\n"); \ ++ } while(0) ++#else ++# define mb_debug(f...) do { } while(0) ++# define mb_assert(c) do { } while(0) ++#endif ++#define mb_error(f...) do { \ ++ printk(KERN_ERR f); \ ++ printk("\n"); \ ++ } while(0) ++ ++MODULE_AUTHOR("Andreas Gruenbacher "); ++MODULE_DESCRIPTION("Meta block cache (for extended attributes)"); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) ++MODULE_LICENSE("GPL"); ++#endif ++ ++EXPORT_SYMBOL(mb_cache_create); ++EXPORT_SYMBOL(mb_cache_shrink); ++EXPORT_SYMBOL(mb_cache_destroy); ++EXPORT_SYMBOL(mb_cache_entry_alloc); ++EXPORT_SYMBOL(mb_cache_entry_insert); ++EXPORT_SYMBOL(mb_cache_entry_release); ++EXPORT_SYMBOL(mb_cache_entry_takeout); ++EXPORT_SYMBOL(mb_cache_entry_free); ++EXPORT_SYMBOL(mb_cache_entry_dup); ++EXPORT_SYMBOL(mb_cache_entry_get); ++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) ++EXPORT_SYMBOL(mb_cache_entry_find_first); ++EXPORT_SYMBOL(mb_cache_entry_find_next); ++#endif ++ ++ ++/* ++ * Global data: list of all mbcache's, lru list, and a spinlock for ++ * accessing cache data structures on SMP machines. The lru list is ++ * global across all mbcaches. ++ */ ++ ++static LIST_HEAD(mb_cache_list); ++static LIST_HEAD(mb_cache_lru_list); ++static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED; ++ ++static inline int ++mb_cache_indexes(struct mb_cache *cache) ++{ ++#ifdef MB_CACHE_INDEXES_COUNT ++ return MB_CACHE_INDEXES_COUNT; ++#else ++ return cache->c_indexes_count; ++#endif ++} ++ ++/* ++ * What the mbcache registers as to get shrunk dynamically. 
++ */ ++ ++static void ++mb_cache_memory_pressure(int priority, unsigned int gfp_mask); ++ ++static struct cache_definition mb_cache_definition = { ++ "mb_cache", ++ mb_cache_memory_pressure ++}; ++ ++ ++static inline int ++__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) ++{ ++ return !list_empty(&ce->e_block_list); ++} ++ ++ ++static inline void ++__mb_cache_entry_unhash(struct mb_cache_entry *ce) ++{ ++ int n; ++ ++ if (__mb_cache_entry_is_hashed(ce)) { ++ list_del_init(&ce->e_block_list); ++ for (n=0; ne_cache); n++) ++ list_del(&ce->e_indexes[n].o_list); ++ } ++} ++ ++ ++static inline void ++__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask) ++{ ++ struct mb_cache *cache = ce->e_cache; ++ ++ mb_assert(atomic_read(&ce->e_used) == 0); ++ if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) { ++ /* free failed -- put back on the lru list ++ for freeing later. */ ++ spin_lock(&mb_cache_spinlock); ++ list_add(&ce->e_lru_list, &mb_cache_lru_list); ++ spin_unlock(&mb_cache_spinlock); ++ } else { ++ kmem_cache_free(cache->c_entry_cache, ce); ++ atomic_dec(&cache->c_entry_count); ++ } ++} ++ ++ ++static inline void ++__mb_cache_entry_release_unlock(struct mb_cache_entry *ce) ++{ ++ if (atomic_dec_and_test(&ce->e_used)) { ++ if (__mb_cache_entry_is_hashed(ce)) ++ list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); ++ else { ++ spin_unlock(&mb_cache_spinlock); ++ __mb_cache_entry_forget(ce, GFP_KERNEL); ++ return; ++ } ++ } ++ spin_unlock(&mb_cache_spinlock); ++} ++ ++ ++/* ++ * mb_cache_memory_pressure() memory pressure callback ++ * ++ * This function is called by the kernel memory management when memory ++ * gets low. 
++ * ++ * @priority: Amount by which to shrink the cache (0 = highes priority) ++ * @gfp_mask: (ignored) ++ */ ++static void ++mb_cache_memory_pressure(int priority, unsigned int gfp_mask) ++{ ++ LIST_HEAD(free_list); ++ struct list_head *l, *ltmp; ++ int count = 0; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each(l, &mb_cache_list) { ++ struct mb_cache *cache = ++ list_entry(l, struct mb_cache, c_cache_list); ++ mb_debug("cache %s (%d)", cache->c_name, ++ atomic_read(&cache->c_entry_count)); ++ count += atomic_read(&cache->c_entry_count); ++ } ++ mb_debug("trying to free %d of %d entries", ++ count / (priority ? priority : 1), count); ++ if (priority) ++ count /= priority; ++ while (count-- && !list_empty(&mb_cache_lru_list)) { ++ struct mb_cache_entry *ce = ++ list_entry(mb_cache_lru_list.next, ++ struct mb_cache_entry, e_lru_list); ++ list_del(&ce->e_lru_list); ++ __mb_cache_entry_unhash(ce); ++ list_add_tail(&ce->e_lru_list, &free_list); ++ } ++ spin_unlock(&mb_cache_spinlock); ++ list_for_each_safe(l, ltmp, &free_list) { ++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, ++ e_lru_list), gfp_mask); ++ } ++} ++ ++ ++/* ++ * mb_cache_create() create a new cache ++ * ++ * All entries in one cache are equal size. Cache entries may be from ++ * multiple devices. If this is the first mbcache created, registers ++ * the cache with kernel memory management. Returns NULL if no more ++ * memory was available. ++ * ++ * @name: name of the cache (informal) ++ * @cache_op: contains the callback called when freeing a cache entry ++ * @entry_size: The size of a cache entry, including ++ * struct mb_cache_entry ++ * @indexes_count: number of additional indexes in the cache. Must equal ++ * MB_CACHE_INDEXES_COUNT if the number of indexes is ++ * hardwired. 
++ * @bucket_count: number of hash buckets ++ */ ++struct mb_cache * ++mb_cache_create(const char *name, struct mb_cache_op *cache_op, ++ size_t entry_size, int indexes_count, int bucket_count) ++{ ++ int m=0, n; ++ struct mb_cache *cache = NULL; ++ ++ if(entry_size < sizeof(struct mb_cache_entry) + ++ indexes_count * sizeof(struct mb_cache_entry_index)) ++ return NULL; ++ ++ MOD_INC_USE_COUNT; ++ cache = kmalloc(sizeof(struct mb_cache) + ++ indexes_count * sizeof(struct list_head), GFP_KERNEL); ++ if (!cache) ++ goto fail; ++ cache->c_name = name; ++ cache->c_op.free = NULL; ++ if (cache_op) ++ cache->c_op.free = cache_op->free; ++ atomic_set(&cache->c_entry_count, 0); ++ cache->c_bucket_count = bucket_count; ++#ifdef MB_CACHE_INDEXES_COUNT ++ mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT); ++#else ++ cache->c_indexes_count = indexes_count; ++#endif ++ cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!cache->c_block_hash) ++ goto fail; ++ for (n=0; nc_block_hash[n]); ++ for (m=0; mc_indexes_hash[m] = kmalloc(bucket_count * ++ sizeof(struct list_head), ++ GFP_KERNEL); ++ if (!cache->c_indexes_hash[m]) ++ goto fail; ++ for (n=0; nc_indexes_hash[m][n]); ++ } ++ cache->c_entry_cache = kmem_cache_create(name, entry_size, 0, ++ 0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL); ++ if (!cache->c_entry_cache) ++ goto fail; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_add(&cache->c_cache_list, &mb_cache_list); ++ spin_unlock(&mb_cache_spinlock); ++ return cache; ++ ++fail: ++ if (cache) { ++ while (--m >= 0) ++ kfree(cache->c_indexes_hash[m]); ++ if (cache->c_block_hash) ++ kfree(cache->c_block_hash); ++ kfree(cache); ++ } ++ MOD_DEC_USE_COUNT; ++ return NULL; ++} ++ ++ ++/* ++ * mb_cache_shrink() ++ * ++ * Removes all cache entires of a device from the cache. All cache entries ++ * currently in use cannot be freed, and thus remain in the cache. 
++ *
++ * @cache: which cache to shrink
++ * @dev: which device's cache entries to shrink
++ */
++void
++mb_cache_shrink(struct mb_cache *cache, kdev_t dev)
++{
++	LIST_HEAD(free_list);
++	struct list_head *l, *ltmp;
++
++	/*
++	 * Move the device's unused entries to a private list under the
++	 * lock; they are actually freed after the lock is dropped.
++	 */
++	spin_lock(&mb_cache_spinlock);
++	list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
++		struct mb_cache_entry *ce =
++			list_entry(l, struct mb_cache_entry, e_lru_list);
++		if (ce->e_dev == dev) {
++			list_del(&ce->e_lru_list);
++			list_add_tail(&ce->e_lru_list, &free_list);
++			__mb_cache_entry_unhash(ce);
++		}
++	}
++	spin_unlock(&mb_cache_spinlock);
++	list_for_each_safe(l, ltmp, &free_list) {
++		__mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++						   e_lru_list), GFP_KERNEL);
++	}
++}
++
++
++/*
++ * mb_cache_destroy()
++ *
++ * Shrinks the cache to its minimum possible size (hopefully 0 entries),
++ * and then destroys it. Entries that are still in use cannot be freed;
++ * they are reported as orphaned.
++ */
++void
++mb_cache_destroy(struct mb_cache *cache)
++{
++	LIST_HEAD(free_list);
++	struct list_head *l, *ltmp;
++	int n;
++
++	spin_lock(&mb_cache_spinlock);
++	list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
++		struct mb_cache_entry *ce =
++			list_entry(l, struct mb_cache_entry, e_lru_list);
++		if (ce->e_cache == cache) {
++			list_del(&ce->e_lru_list);
++			list_add_tail(&ce->e_lru_list, &free_list);
++			__mb_cache_entry_unhash(ce);
++		}
++	}
++	list_del(&cache->c_cache_list);
++	spin_unlock(&mb_cache_spinlock);
++	list_for_each_safe(l, ltmp, &free_list) {
++		__mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++						   e_lru_list), GFP_KERNEL);
++	}
++
++	if (atomic_read(&cache->c_entry_count) > 0) {
++		mb_error("cache %s: %d orphaned entries",
++			  cache->c_name,
++			  atomic_read(&cache->c_entry_count));
++	}
++
++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0))
++	/* We don't have kmem_cache_destroy() in 2.2.x */
++	kmem_cache_shrink(cache->c_entry_cache);
++#else
++	kmem_cache_destroy(cache->c_entry_cache);
++#endif
++	for (n=0; n < mb_cache_indexes(cache); n++)
++		kfree(cache->c_indexes_hash[n]);
++	kfree(cache->c_block_hash);
++	kfree(cache);
++
++	MOD_DEC_USE_COUNT;
++}
++
++
++/*
++ * mb_cache_entry_alloc()
++ *
++ * Allocates a new cache entry. The new entry will not be valid initially,
++ * and thus cannot be looked up yet. It should be filled with data, and
++ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
++ * if no more memory was available.
++ *
++ * The returned entry carries one handle (e_used == 1) owned by the caller.
++ */
++struct mb_cache_entry *
++mb_cache_entry_alloc(struct mb_cache *cache)
++{
++	struct mb_cache_entry *ce;
++
++	ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL);
++	if (!ce)
++		return NULL;
++	/*
++	 * Account for the entry only once it exists: the matching
++	 * atomic_dec() is done when the entry is freed, so counting a
++	 * failed allocation would leave c_entry_count permanently too high.
++	 */
++	atomic_inc(&cache->c_entry_count);
++	INIT_LIST_HEAD(&ce->e_lru_list);
++	INIT_LIST_HEAD(&ce->e_block_list);
++	ce->e_cache = cache;
++	atomic_set(&ce->e_used, 1);
++	return ce;
++}
++
++
++/*
++ * mb_cache_entry_insert()
++ *
++ * Inserts an entry that was allocated using mb_cache_entry_alloc() into
++ * the cache. After this, the cache entry can be looked up, but is not yet
++ * in the lru list as the caller still holds a handle to it. Returns 0 on
++ * success, or -EBUSY if a cache entry for that device + block exists
++ * already (this may happen after a failed lookup, if another process has
++ * inserted the same cache entry in the meantime).
++ *
++ * @dev: device the cache entry belongs to
++ * @block: block number
++ * @keys: array of additional keys. There must be indexes_count entries
++ *        in the array (as specified when creating the cache).
++ */
++int
++mb_cache_entry_insert(struct mb_cache_entry *ce, kdev_t dev,
++		      unsigned long block, unsigned int keys[])
++{
++	struct mb_cache *cache = ce->e_cache;
++	unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count;
++	struct list_head *l;
++	int error = -EBUSY, n;
++
++	spin_lock(&mb_cache_spinlock);
++	/* Refuse the insert if this device/block pair is already hashed. */
++	list_for_each(l, &cache->c_block_hash[bucket]) {
++		struct mb_cache_entry *other =
++			list_entry(l, struct mb_cache_entry, e_block_list);
++		if (other->e_dev == dev && other->e_block == block)
++			goto out;
++	}
++	/* Drop any previous hashing of ce before giving it its new keys. */
++	__mb_cache_entry_unhash(ce);
++	ce->e_dev = dev;
++	ce->e_block = block;
++	list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
++	for (n=0; n < mb_cache_indexes(cache); n++) {
++		ce->e_indexes[n].o_key = keys[n];
++		bucket = keys[n] % cache->c_bucket_count;
++		list_add(&ce->e_indexes[n].o_list,
++			 &cache->c_indexes_hash[n][bucket]);
++	}
++	/* The entry is hashed now: report success instead of -EBUSY. */
++	error = 0;
++out:
++	spin_unlock(&mb_cache_spinlock);
++	return error;
++}
++
++
++/*
++ * mb_cache_entry_release()
++ *
++ * Release a handle to a cache entry. When the last handle to a cache entry
++ * is released it is either freed (if it is invalid) or otherwise inserted
++ * in to the lru list.
++ */
++void
++mb_cache_entry_release(struct mb_cache_entry *ce)
++{
++	/* The helper drops mb_cache_spinlock on every path. */
++	spin_lock(&mb_cache_spinlock);
++	__mb_cache_entry_release_unlock(ce);
++}
++
++
++/*
++ * mb_cache_entry_takeout()
++ *
++ * Take a cache entry out of the cache, making it invalid. The entry can later
++ * be re-inserted using mb_cache_entry_insert(), or released using
++ * mb_cache_entry_release().
++ */
++void
++mb_cache_entry_takeout(struct mb_cache_entry *ce)
++{
++	spin_lock(&mb_cache_spinlock);
++	mb_assert(list_empty(&ce->e_lru_list));
++	__mb_cache_entry_unhash(ce);
++	spin_unlock(&mb_cache_spinlock);
++}
++
++
++/*
++ * mb_cache_entry_free()
++ *
++ * This is equivalent to the sequence mb_cache_entry_takeout() --
++ * mb_cache_entry_release().
++ */ ++void ++mb_cache_entry_free(struct mb_cache_entry *ce) ++{ ++ spin_lock(&mb_cache_spinlock); ++ mb_assert(list_empty(&ce->e_lru_list)); ++ __mb_cache_entry_unhash(ce); ++ __mb_cache_entry_release_unlock(ce); ++} ++ ++ ++/* ++ * mb_cache_entry_dup() ++ * ++ * Duplicate a handle to a cache entry (does not duplicate the cache entry ++ * itself). After the call, both the old and the new handle must be released. ++ */ ++struct mb_cache_entry * ++mb_cache_entry_dup(struct mb_cache_entry *ce) ++{ ++ atomic_inc(&ce->e_used); ++ return ce; ++} ++ ++ ++/* ++ * mb_cache_entry_get() ++ * ++ * Get a cache entry by device / block number. (There can only be one entry ++ * in the cache per device and block.) Returns NULL if no such cache entry ++ * exists. ++ */ ++struct mb_cache_entry * ++mb_cache_entry_get(struct mb_cache *cache, kdev_t dev, unsigned long block) ++{ ++ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count; ++ struct list_head *l; ++ struct mb_cache_entry *ce; ++ ++ spin_lock(&mb_cache_spinlock); ++ list_for_each(l, &cache->c_block_hash[bucket]) { ++ ce = list_entry(l, struct mb_cache_entry, e_block_list); ++ if (ce->e_dev == dev && ce->e_block == block) { ++ if (!list_empty(&ce->e_lru_list)) ++ list_del_init(&ce->e_lru_list); ++ atomic_inc(&ce->e_used); ++ goto cleanup; ++ } ++ } ++ ce = NULL; ++ ++cleanup: ++ spin_unlock(&mb_cache_spinlock); ++ return ce; ++} ++ ++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) ++ ++static struct mb_cache_entry * ++__mb_cache_entry_find(struct list_head *l, struct list_head *head, ++ int index, kdev_t dev, unsigned int key) ++{ ++ while (l != head) { ++ struct mb_cache_entry *ce = ++ list_entry(l, struct mb_cache_entry, ++ e_indexes[index].o_list); ++ if (ce->e_dev == dev && ce->e_indexes[index].o_key == key) { ++ if (!list_empty(&ce->e_lru_list)) ++ list_del_init(&ce->e_lru_list); ++ atomic_inc(&ce->e_used); ++ return ce; ++ } ++ l = l->next; ++ } ++ return NULL; ++} ++ ++ ++/* ++ * 
mb_cache_entry_find_first() ++ * ++ * Find the first cache entry on a given device with a certain key in ++ * an additional index. Additonal matches can be found with ++ * mb_cache_entry_find_next(). Returns NULL if no match was found. ++ * ++ * @cache: the cache to search ++ * @index: the number of the additonal index to search (0<=indexc_bucket_count; ++ struct list_head *l; ++ struct mb_cache_entry *ce; ++ ++ mb_assert(index < mb_cache_indexes(cache)); ++ spin_lock(&mb_cache_spinlock); ++ l = cache->c_indexes_hash[index][bucket].next; ++ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], ++ index, dev, key); ++ spin_unlock(&mb_cache_spinlock); ++ return ce; ++} ++ ++ ++/* ++ * mb_cache_entry_find_next() ++ * ++ * Find the next cache entry on a given device with a certain key in an ++ * additional index. Returns NULL if no match could be found. The previous ++ * entry is atomatically released, so that mb_cache_entry_find_next() can ++ * be called like this: ++ * ++ * entry = mb_cache_entry_find_first(); ++ * while (entry) { ++ * ... 
++ * entry = mb_cache_entry_find_next(entry, ...); ++ * } ++ * ++ * @prev: The previous match ++ * @index: the number of the additonal index to search (0<=indexe_cache; ++ unsigned int bucket = key % cache->c_bucket_count; ++ struct list_head *l; ++ struct mb_cache_entry *ce; ++ ++ mb_assert(index < mb_cache_indexes(cache)); ++ spin_lock(&mb_cache_spinlock); ++ l = prev->e_indexes[index].o_list.next; ++ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket], ++ index, dev, key); ++ __mb_cache_entry_release_unlock(prev); ++ return ce; ++} ++ ++#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */ ++ ++static int __init init_mbcache(void) ++{ ++ register_cache(&mb_cache_definition); ++ return 0; ++} ++ ++static void __exit exit_mbcache(void) ++{ ++ unregister_cache(&mb_cache_definition); ++} ++ ++module_init(init_mbcache) ++module_exit(exit_mbcache) ++ +--- linux-2.4.20/include/asm-arm/unistd.h~linux-2.4.20-xattr-0.8.54 2002-08-03 08:39:45.000000000 +0800 ++++ linux-2.4.20-root/include/asm-arm/unistd.h 2003-05-07 18:08:03.000000000 +0800 +@@ -244,7 +244,6 @@ + #define __NR_security (__NR_SYSCALL_BASE+223) + #define __NR_gettid (__NR_SYSCALL_BASE+224) + #define __NR_readahead (__NR_SYSCALL_BASE+225) +-#if 0 /* allocated in 2.5 */ + #define __NR_setxattr (__NR_SYSCALL_BASE+226) + #define __NR_lsetxattr (__NR_SYSCALL_BASE+227) + #define __NR_fsetxattr (__NR_SYSCALL_BASE+228) +@@ -257,7 +256,6 @@ + #define __NR_removexattr (__NR_SYSCALL_BASE+235) + #define __NR_lremovexattr (__NR_SYSCALL_BASE+236) + #define __NR_fremovexattr (__NR_SYSCALL_BASE+237) +-#endif + #define __NR_tkill (__NR_SYSCALL_BASE+238) + /* + * Please check 2.5 _before_ adding calls here, +--- linux-2.4.20/include/asm-ia64/unistd.h~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:15.000000000 +0800 ++++ linux-2.4.20-root/include/asm-ia64/unistd.h 2003-05-07 18:08:03.000000000 +0800 +@@ -206,8 +206,19 @@ + #define __NR_getdents64 1214 + #define __NR_getunwind 1215 + #define 
__NR_readahead 1216 ++#define __NR_setxattr 1217 ++#define __NR_lsetxattr 1218 ++#define __NR_fsetxattr 1219 ++#define __NR_getxattr 1220 ++#define __NR_lgetxattr 1221 ++#define __NR_fgetxattr 1222 ++#define __NR_listxattr 1223 ++#define __NR_llistxattr 1224 ++#define __NR_flistxattr 1225 ++#define __NR_removexattr 1226 ++#define __NR_lremovexattr 1227 ++#define __NR_fremovexattr 1228 + /* +- * 1217-1228: reserved for xattr + * 1230-1232: reserved for futex and sched_[sg]etaffinity. + */ + #define __NR_tkill 1229 +--- linux-2.4.20/include/asm-ppc64/unistd.h~linux-2.4.20-xattr-0.8.54 2002-08-03 08:39:45.000000000 +0800 ++++ linux-2.4.20-root/include/asm-ppc64/unistd.h 2003-05-07 18:08:03.000000000 +0800 +@@ -218,6 +218,7 @@ + #define __NR_gettid 207 + #if 0 /* Reserved syscalls */ + #define __NR_tkill 208 ++#endif + #define __NR_setxattr 209 + #define __NR_lsetxattr 210 + #define __NR_fsetxattr 211 +@@ -230,6 +231,7 @@ + #define __NR_removexattr 218 + #define __NR_lremovexattr 219 + #define __NR_fremovexattr 220 ++#if 0 /* Reserved syscalls */ + #define __NR_futex 221 + #endif + +--- linux-2.4.20/include/asm-s390/unistd.h~linux-2.4.20-xattr-0.8.54 2002-08-03 08:39:45.000000000 +0800 ++++ linux-2.4.20-root/include/asm-s390/unistd.h 2003-05-07 18:08:03.000000000 +0800 +@@ -212,9 +212,18 @@ + #define __NR_madvise 219 + #define __NR_getdents64 220 + #define __NR_fcntl64 221 +-/* +- * Numbers 224-235 are reserved for posix acl +- */ ++#define __NR_setxattr 224 ++#define __NR_lsetxattr 225 ++#define __NR_fsetxattr 226 ++#define __NR_getxattr 227 ++#define __NR_lgetxattr 228 ++#define __NR_fgetxattr 229 ++#define __NR_listxattr 230 ++#define __NR_llistxattr 231 ++#define __NR_flistxattr 232 ++#define __NR_removexattr 233 ++#define __NR_lremovexattr 234 ++#define __NR_fremovexattr 235 + #define __NR_gettid 236 + #define __NR_tkill 237 + +--- linux-2.4.20/include/asm-s390x/unistd.h~linux-2.4.20-xattr-0.8.54 2002-08-03 08:39:45.000000000 +0800 ++++ 
linux-2.4.20-root/include/asm-s390x/unistd.h 2003-05-07 18:08:03.000000000 +0800 +@@ -180,9 +180,18 @@ + #define __NR_pivot_root 217 + #define __NR_mincore 218 + #define __NR_madvise 219 +-/* +- * Numbers 224-235 are reserved for posix acl +- */ ++#define __NR_setxattr 224 ++#define __NR_lsetxattr 225 ++#define __NR_fsetxattr 226 ++#define __NR_getxattr 227 ++#define __NR_lgetxattr 228 ++#define __NR_fgetxattr 229 ++#define __NR_listxattr 230 ++#define __NR_llistxattr 231 ++#define __NR_flistxattr 232 ++#define __NR_removexattr 233 ++#define __NR_lremovexattr 234 ++#define __NR_fremovexattr 235 + #define __NR_gettid 236 + #define __NR_tkill 237 + +--- linux-2.4.20/include/asm-sparc/unistd.h~linux-2.4.20-xattr-0.8.54 2002-08-03 08:39:45.000000000 +0800 ++++ linux-2.4.20-root/include/asm-sparc/unistd.h 2003-05-07 18:08:03.000000000 +0800 +@@ -184,24 +184,24 @@ + /* #define __NR_exportfs 166 SunOS Specific */ + #define __NR_mount 167 /* Common */ + #define __NR_ustat 168 /* Common */ +-/* #define __NR_semsys 169 SunOS Specific */ +-/* #define __NR_msgsys 170 SunOS Specific */ +-/* #define __NR_shmsys 171 SunOS Specific */ +-/* #define __NR_auditsys 172 SunOS Specific */ +-/* #define __NR_rfssys 173 SunOS Specific */ ++#define __NR_setxattr 169 /* SunOS: semsys */ ++#define __NR_lsetxattr 170 /* SunOS: msgsys */ ++#define __NR_fsetxattr 171 /* SunOS: shmsys */ ++#define __NR_getxattr 172 /* SunOS: auditsys */ ++#define __NR_lgetxattr 173 /* SunOS: rfssys */ + #define __NR_getdents 174 /* Common */ + #define __NR_setsid 175 /* Common */ + #define __NR_fchdir 176 /* Common */ +-/* #define __NR_fchroot 177 SunOS Specific */ +-/* #define __NR_vpixsys 178 SunOS Specific */ +-/* #define __NR_aioread 179 SunOS Specific */ +-/* #define __NR_aiowrite 180 SunOS Specific */ +-/* #define __NR_aiowait 181 SunOS Specific */ +-/* #define __NR_aiocancel 182 SunOS Specific */ ++#define __NR_fgetxattr 177 /* SunOS: fchroot */ ++#define __NR_listxattr 178 /* SunOS: vpixsys */ ++#define 
__NR_llistxattr 179 /* SunOS: aioread */ ++#define __NR_flistxattr 180 /* SunOS: aiowrite */ ++#define __NR_removexattr 181 /* SunOS: aiowait */ ++#define __NR_lremovexattr 182 /* SunOS: aiocancel */ + #define __NR_sigpending 183 /* Common */ + #define __NR_query_module 184 /* Linux Specific */ + #define __NR_setpgid 185 /* Common */ +-/* #define __NR_pathconf 186 SunOS Specific */ ++#define __NR_fremovexattr 186 /* SunOS: pathconf */ + #define __NR_tkill 187 /* SunOS: fpathconf */ + /* #define __NR_sysconf 188 SunOS Specific */ + #define __NR_uname 189 /* Linux Specific */ +--- linux-2.4.20/include/asm-sparc64/unistd.h~linux-2.4.20-xattr-0.8.54 2002-08-03 08:39:45.000000000 +0800 ++++ linux-2.4.20-root/include/asm-sparc64/unistd.h 2003-05-07 18:08:03.000000000 +0800 +@@ -184,24 +184,24 @@ + /* #define __NR_exportfs 166 SunOS Specific */ + #define __NR_mount 167 /* Common */ + #define __NR_ustat 168 /* Common */ +-/* #define __NR_semsys 169 SunOS Specific */ +-/* #define __NR_msgsys 170 SunOS Specific */ +-/* #define __NR_shmsys 171 SunOS Specific */ +-/* #define __NR_auditsys 172 SunOS Specific */ +-/* #define __NR_rfssys 173 SunOS Specific */ ++#define __NR_setxattr 169 /* SunOS: semsys */ ++#define __NR_lsetxattr 170 /* SunOS: msgsys */ ++#define __NR_fsetxattr 171 /* SunOS: shmsys */ ++#define __NR_getxattr 172 /* SunOS: auditsys */ ++#define __NR_lgetxattr 173 /* SunOS: rfssys */ + #define __NR_getdents 174 /* Common */ + #define __NR_setsid 175 /* Common */ + #define __NR_fchdir 176 /* Common */ +-/* #define __NR_fchroot 177 SunOS Specific */ +-/* #define __NR_vpixsys 178 SunOS Specific */ +-/* #define __NR_aioread 179 SunOS Specific */ +-/* #define __NR_aiowrite 180 SunOS Specific */ +-/* #define __NR_aiowait 181 SunOS Specific */ +-/* #define __NR_aiocancel 182 SunOS Specific */ ++#define __NR_fgetxattr 177 /* SunOS: fchroot */ ++#define __NR_listxattr 178 /* SunOS: vpixsys */ ++#define __NR_llistxattr 179 /* SunOS: aioread */ ++#define __NR_flistxattr 180 
/* SunOS: aiowrite */ ++#define __NR_removexattr 181 /* SunOS: aiowait */ ++#define __NR_lremovexattr 182 /* SunOS: aiocancel */ + #define __NR_sigpending 183 /* Common */ + #define __NR_query_module 184 /* Linux Specific */ + #define __NR_setpgid 185 /* Common */ +-/* #define __NR_pathconf 186 SunOS Specific */ ++#define __NR_fremovexattr 186 /* SunOS: pathconf */ + #define __NR_tkill 187 /* SunOS: fpathconf */ + /* #define __NR_sysconf 188 SunOS Specific */ + #define __NR_uname 189 /* Linux Specific */ +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-2.4.20-root/include/linux/cache_def.h 2003-05-07 18:08:03.000000000 +0800 +@@ -0,0 +1,15 @@ ++/* ++ * linux/cache_def.h ++ * Handling of caches defined in drivers, filesystems, ... ++ * ++ * Copyright (C) 2002 by Andreas Gruenbacher, ++ */ ++ ++struct cache_definition { ++ const char *name; ++ void (*shrink)(int, unsigned int); ++ struct list_head link; ++}; ++ ++extern void register_cache(struct cache_definition *); ++extern void unregister_cache(struct cache_definition *); +--- linux-2.4.20/include/linux/errno.h~linux-2.4.20-xattr-0.8.54 2003-04-14 16:39:03.000000000 +0800 ++++ linux-2.4.20-root/include/linux/errno.h 2003-05-07 18:08:03.000000000 +0800 +@@ -23,4 +23,8 @@ + + #endif + ++/* Defined for extended attributes */ ++#define ENOATTR ENODATA /* No such attribute */ ++#define ENOTSUP EOPNOTSUPP /* Operation not supported */ ++ + #endif +--- linux-2.4.20/include/linux/ext2_fs.h~linux-2.4.20-xattr-0.8.54 2003-04-14 16:39:08.000000000 +0800 ++++ linux-2.4.20-root/include/linux/ext2_fs.h 2003-05-07 18:08:03.000000000 +0800 +@@ -57,8 +57,6 @@ + */ + #define EXT2_BAD_INO 1 /* Bad blocks inode */ + #define EXT2_ROOT_INO 2 /* Root inode */ +-#define EXT2_ACL_IDX_INO 3 /* ACL inode */ +-#define EXT2_ACL_DATA_INO 4 /* ACL inode */ + #define EXT2_BOOT_LOADER_INO 5 /* Boot loader inode */ + #define EXT2_UNDEL_DIR_INO 6 /* Undelete directory inode */ + +@@ -86,7 +84,6 @@ + #else + # define EXT2_BLOCK_SIZE(s) 
(EXT2_MIN_BLOCK_SIZE << (s)->s_log_block_size) + #endif +-#define EXT2_ACLE_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_acl_entry)) + #define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32)) + #ifdef __KERNEL__ + # define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +@@ -121,28 +118,6 @@ + #endif + + /* +- * ACL structures +- */ +-struct ext2_acl_header /* Header of Access Control Lists */ +-{ +- __u32 aclh_size; +- __u32 aclh_file_count; +- __u32 aclh_acle_count; +- __u32 aclh_first_acle; +-}; +- +-struct ext2_acl_entry /* Access Control List Entry */ +-{ +- __u32 acle_size; +- __u16 acle_perms; /* Access permissions */ +- __u16 acle_type; /* Type of entry */ +- __u16 acle_tag; /* User or group identity */ +- __u16 acle_pad1; +- __u32 acle_next; /* Pointer on next entry for the */ +- /* same inode or on next free entry */ +-}; +- +-/* + * Structure of a blocks group descriptor + */ + struct ext2_group_desc +@@ -314,6 +289,7 @@ struct ext2_inode { + #define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ + #define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */ + #define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */ ++#define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ + + #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt + #define set_opt(o, opt) o |= EXT2_MOUNT_##opt +@@ -397,6 +373,7 @@ struct ext2_super_block { + + #ifdef __KERNEL__ + #define EXT2_SB(sb) (&((sb)->u.ext2_sb)) ++#define EXT2_I(inode) (&((inode)->u.ext2_i)) + #else + /* Assume that user mode programs are passing in an ext2fs superblock, not + * a kernel struct super_block. 
This will allow us to call the feature-test +@@ -466,7 +443,7 @@ struct ext2_super_block { + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 + #define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff + +-#define EXT2_FEATURE_COMPAT_SUPP 0 ++#define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT2_FEATURE_INCOMPAT_SUPP EXT2_FEATURE_INCOMPAT_FILETYPE + #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \ +@@ -623,8 +600,10 @@ extern struct address_space_operations e + + /* namei.c */ + extern struct inode_operations ext2_dir_inode_operations; ++extern struct inode_operations ext2_special_inode_operations; + + /* symlink.c */ ++extern struct inode_operations ext2_symlink_inode_operations; + extern struct inode_operations ext2_fast_symlink_inode_operations; + + #endif /* __KERNEL__ */ +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-2.4.20-root/include/linux/ext2_xattr.h 2003-05-07 18:08:03.000000000 +0800 +@@ -0,0 +1,157 @@ ++/* ++ File: linux/ext2_xattr.h ++ ++ On-disk format of extended attributes for the ext2 filesystem. 
++ ++ (C) 2001 Andreas Gruenbacher, ++*/ ++ ++#include ++#include ++#include ++ ++/* Magic value in attribute blocks */ ++#define EXT2_XATTR_MAGIC 0xEA020000 ++ ++/* Maximum number of references to one attribute block */ ++#define EXT2_XATTR_REFCOUNT_MAX 1024 ++ ++/* Name indexes */ ++#define EXT2_XATTR_INDEX_MAX 10 ++#define EXT2_XATTR_INDEX_USER 1 ++#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS 2 ++#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT 3 ++ ++struct ext2_xattr_header { ++ __u32 h_magic; /* magic number for identification */ ++ __u32 h_refcount; /* reference count */ ++ __u32 h_blocks; /* number of disk blocks used */ ++ __u32 h_hash; /* hash value of all attributes */ ++ __u32 h_reserved[4]; /* zero right now */ ++}; ++ ++struct ext2_xattr_entry { ++ __u8 e_name_len; /* length of name */ ++ __u8 e_name_index; /* attribute name index */ ++ __u16 e_value_offs; /* offset in disk block of value */ ++ __u32 e_value_block; /* disk block attribute is stored on (n/i) */ ++ __u32 e_value_size; /* size of attribute value */ ++ __u32 e_hash; /* hash value of name and value */ ++ char e_name[0]; /* attribute name */ ++}; ++ ++#define EXT2_XATTR_PAD_BITS 2 ++#define EXT2_XATTR_PAD (1<e_name_len)) ) ++#define EXT2_XATTR_SIZE(size) \ ++ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND) ++ ++#ifdef __KERNEL__ ++ ++# ifdef CONFIG_EXT2_FS_XATTR ++ ++struct ext2_xattr_handler { ++ char *prefix; ++ size_t (*list)(char *list, struct inode *inode, const char *name, ++ int name_len); ++ int (*get)(struct inode *inode, const char *name, void *buffer, ++ size_t size); ++ int (*set)(struct inode *inode, const char *name, const void *buffer, ++ size_t size, int flags); ++}; ++ ++extern int ext2_xattr_register(int, struct ext2_xattr_handler *); ++extern void ext2_xattr_unregister(int, struct ext2_xattr_handler *); ++ ++extern int ext2_setxattr(struct dentry *, const char *, const void *, size_t, int); ++extern ssize_t ext2_getxattr(struct dentry *, const char *, void *, size_t); ++extern 
ssize_t ext2_listxattr(struct dentry *, char *, size_t); ++extern int ext2_removexattr(struct dentry *, const char *); ++ ++extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t); ++extern int ext2_xattr_list(struct inode *, char *, size_t); ++extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); ++ ++extern void ext2_xattr_delete_inode(struct inode *); ++extern void ext2_xattr_put_super(struct super_block *); ++ ++extern int init_ext2_xattr(void) __init; ++extern void exit_ext2_xattr(void); ++ ++# else /* CONFIG_EXT2_FS_XATTR */ ++# define ext2_setxattr NULL ++# define ext2_getxattr NULL ++# define ext2_listxattr NULL ++# define ext2_removexattr NULL ++ ++static inline int ++ext2_xattr_get(struct inode *inode, int name_index, ++ const char *name, void *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext2_xattr_list(struct inode *inode, char *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext2_xattr_set(struct inode *inode, int name_index, const char *name, ++ const void *value, size_t size, int flags) ++{ ++ return -ENOTSUP; ++} ++ ++static inline void ++ext2_xattr_delete_inode(struct inode *inode) ++{ ++} ++ ++static inline void ++ext2_xattr_put_super(struct super_block *sb) ++{ ++} ++ ++static inline int ++init_ext2_xattr(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext2_xattr(void) ++{ ++} ++ ++# endif /* CONFIG_EXT2_FS_XATTR */ ++ ++# ifdef CONFIG_EXT2_FS_XATTR_USER ++ ++extern int init_ext2_xattr_user(void) __init; ++extern void exit_ext2_xattr_user(void); ++ ++# else /* CONFIG_EXT2_FS_XATTR_USER */ ++ ++static inline int ++init_ext2_xattr_user(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext2_xattr_user(void) ++{ ++} ++ ++# endif /* CONFIG_EXT2_FS_XATTR_USER */ ++ ++#endif /* __KERNEL__ */ ++ +--- linux-2.4.20/include/linux/ext3_fs.h~linux-2.4.20-xattr-0.8.54 2003-05-05 19:01:04.000000000 +0800 ++++ 
linux-2.4.20-root/include/linux/ext3_fs.h 2003-05-07 18:08:03.000000000 +0800 +@@ -63,8 +63,6 @@ + */ + #define EXT3_BAD_INO 1 /* Bad blocks inode */ + #define EXT3_ROOT_INO 2 /* Root inode */ +-#define EXT3_ACL_IDX_INO 3 /* ACL inode */ +-#define EXT3_ACL_DATA_INO 4 /* ACL inode */ + #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ + #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ + #define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ +@@ -94,7 +92,6 @@ + #else + # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) + #endif +-#define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry)) + #define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) + #ifdef __KERNEL__ + # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +@@ -129,28 +126,6 @@ + #endif + + /* +- * ACL structures +- */ +-struct ext3_acl_header /* Header of Access Control Lists */ +-{ +- __u32 aclh_size; +- __u32 aclh_file_count; +- __u32 aclh_acle_count; +- __u32 aclh_first_acle; +-}; +- +-struct ext3_acl_entry /* Access Control List Entry */ +-{ +- __u32 acle_size; +- __u16 acle_perms; /* Access permissions */ +- __u16 acle_type; /* Type of entry */ +- __u16 acle_tag; /* User or group identity */ +- __u16 acle_pad1; +- __u32 acle_next; /* Pointer on next entry for the */ +- /* same inode or on next free entry */ +-}; +- +-/* + * Structure of a blocks group descriptor + */ + struct ext3_group_desc +@@ -344,6 +319,7 @@ struct ext3_inode { + #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ ++#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -520,7 +496,7 @@ struct ext3_super_block { + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* 
Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + +-#define EXT3_FEATURE_COMPAT_SUPP 0 ++#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ +@@ -703,6 +679,7 @@ extern void ext3_check_inodes_bitmap (st + extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + /* inode.c */ ++extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); + +@@ -771,8 +748,10 @@ extern struct address_space_operations e + + /* namei.c */ + extern struct inode_operations ext3_dir_inode_operations; ++extern struct inode_operations ext3_special_inode_operations; + + /* symlink.c */ ++extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + + +--- linux-2.4.20/include/linux/ext3_jbd.h~linux-2.4.20-xattr-0.8.54 2003-05-05 19:01:02.000000000 +0800 ++++ linux-2.4.20-root/include/linux/ext3_jbd.h 2003-05-07 18:08:03.000000000 +0800 +@@ -30,13 +30,19 @@ + + #define EXT3_SINGLEDATA_TRANS_BLOCKS 8 + ++/* Extended attributes may touch two data buffers, two bitmap buffers, ++ * and two group and summaries. */ ++ ++#define EXT3_XATTR_TRANS_BLOCKS 8 ++ + /* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. 
*/ + +-#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2) ++#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \ ++ EXT3_XATTR_TRANS_BLOCKS - 2) + + extern int ext3_writepage_trans_blocks(struct inode *inode); + +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-2.4.20-root/include/linux/ext3_xattr.h 2003-05-07 18:08:03.000000000 +0800 +@@ -0,0 +1,157 @@ ++/* ++ File: linux/ext3_xattr.h ++ ++ On-disk format of extended attributes for the ext3 filesystem. ++ ++ (C) 2001 Andreas Gruenbacher, ++*/ ++ ++#include ++#include ++#include ++ ++/* Magic value in attribute blocks */ ++#define EXT3_XATTR_MAGIC 0xEA020000 ++ ++/* Maximum number of references to one attribute block */ ++#define EXT3_XATTR_REFCOUNT_MAX 1024 ++ ++/* Name indexes */ ++#define EXT3_XATTR_INDEX_MAX 10 ++#define EXT3_XATTR_INDEX_USER 1 ++#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2 ++#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3 ++ ++struct ext3_xattr_header { ++ __u32 h_magic; /* magic number for identification */ ++ __u32 h_refcount; /* reference count */ ++ __u32 h_blocks; /* number of disk blocks used */ ++ __u32 h_hash; /* hash value of all attributes */ ++ __u32 h_reserved[4]; /* zero right now */ ++}; ++ ++struct ext3_xattr_entry { ++ __u8 e_name_len; /* length of name */ ++ __u8 e_name_index; /* attribute name index */ ++ __u16 e_value_offs; /* offset in disk block of value */ ++ __u32 e_value_block; /* disk block attribute is stored on (n/i) */ ++ __u32 e_value_size; /* size of attribute value */ ++ __u32 e_hash; /* hash value of name and value */ ++ char e_name[0]; /* attribute name */ ++}; ++ ++#define EXT3_XATTR_PAD_BITS 2 ++#define EXT3_XATTR_PAD (1<e_name_len)) ) ++#define EXT3_XATTR_SIZE(size) \ ++ (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND) ++ ++#ifdef __KERNEL__ ++ ++# ifdef CONFIG_EXT3_FS_XATTR ++ ++struct ext3_xattr_handler { ++ char *prefix; ++ size_t (*list)(char *list, struct inode *inode, const char *name, ++ int name_len); ++ 
int (*get)(struct inode *inode, const char *name, void *buffer, ++ size_t size); ++ int (*set)(struct inode *inode, const char *name, const void *buffer, ++ size_t size, int flags); ++}; ++ ++extern int ext3_xattr_register(int, struct ext3_xattr_handler *); ++extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *); ++ ++extern int ext3_setxattr(struct dentry *, const char *, const void *, size_t, int); ++extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t); ++extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); ++extern int ext3_removexattr(struct dentry *, const char *); ++ ++extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); ++extern int ext3_xattr_list(struct inode *, char *, size_t); ++extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int); ++ ++extern void ext3_xattr_delete_inode(handle_t *, struct inode *); ++extern void ext3_xattr_put_super(struct super_block *); ++ ++extern int init_ext3_xattr(void) __init; ++extern void exit_ext3_xattr(void); ++ ++# else /* CONFIG_EXT3_FS_XATTR */ ++# define ext3_setxattr NULL ++# define ext3_getxattr NULL ++# define ext3_listxattr NULL ++# define ext3_removexattr NULL ++ ++static inline int ++ext3_xattr_get(struct inode *inode, int name_index, const char *name, ++ void *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext3_xattr_list(struct inode *inode, void *buffer, size_t size) ++{ ++ return -ENOTSUP; ++} ++ ++static inline int ++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index, ++ const char *name, const void *value, size_t size, int flags) ++{ ++ return -ENOTSUP; ++} ++ ++static inline void ++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) ++{ ++} ++ ++static inline void ++ext3_xattr_put_super(struct super_block *sb) ++{ ++} ++ ++static inline int ++init_ext3_xattr(void) ++{ ++ return 0; ++} ++ ++static inline void 
++exit_ext3_xattr(void) ++{ ++} ++ ++# endif /* CONFIG_EXT3_FS_XATTR */ ++ ++# ifdef CONFIG_EXT3_FS_XATTR_USER ++ ++extern int init_ext3_xattr_user(void) __init; ++extern void exit_ext3_xattr_user(void); ++ ++# else /* CONFIG_EXT3_FS_XATTR_USER */ ++ ++static inline int ++init_ext3_xattr_user(void) ++{ ++ return 0; ++} ++ ++static inline void ++exit_ext3_xattr_user(void) ++{ ++} ++ ++#endif /* CONFIG_EXT3_FS_XATTR_USER */ ++ ++#endif /* __KERNEL__ */ ++ +--- linux-2.4.20/include/linux/fs.h~linux-2.4.20-xattr-0.8.54 2003-05-05 19:00:55.000000000 +0800 ++++ linux-2.4.20-root/include/linux/fs.h 2003-05-07 18:08:03.000000000 +0800 +@@ -888,7 +888,7 @@ struct inode_operations { + int (*setattr) (struct dentry *, struct iattr *); + int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct dentry *, struct iattr *); +- int (*setxattr) (struct dentry *, const char *, void *, size_t, int); ++ int (*setxattr) (struct dentry *, const char *, const void *, size_t, int); + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + ssize_t (*listxattr) (struct dentry *, char *, size_t); + int (*removexattr) (struct dentry *, const char *); +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-2.4.20-root/include/linux/mbcache.h 2003-05-07 18:08:03.000000000 +0800 +@@ -0,0 +1,69 @@ ++/* ++ File: linux/mbcache.h ++ ++ (C) 2001 by Andreas Gruenbacher, ++*/ ++ ++/* Hardwire the number of additional indexes */ ++#define MB_CACHE_INDEXES_COUNT 1 ++ ++struct mb_cache_entry; ++ ++struct mb_cache_op { ++ int (*free)(struct mb_cache_entry *, int); ++}; ++ ++struct mb_cache { ++ struct list_head c_cache_list; ++ const char *c_name; ++ struct mb_cache_op c_op; ++ atomic_t c_entry_count; ++ int c_bucket_count; ++#ifndef MB_CACHE_INDEXES_COUNT ++ int c_indexes_count; ++#endif ++ kmem_cache_t *c_entry_cache; ++ struct list_head *c_block_hash; ++ struct list_head *c_indexes_hash[0]; ++}; ++ ++struct mb_cache_entry_index { ++ struct list_head o_list; ++ 
unsigned int o_key; ++}; ++ ++struct mb_cache_entry { ++ struct list_head e_lru_list; ++ struct mb_cache *e_cache; ++ atomic_t e_used; ++ kdev_t e_dev; ++ unsigned long e_block; ++ struct list_head e_block_list; ++ struct mb_cache_entry_index e_indexes[0]; ++}; ++ ++/* Functions on caches */ ++ ++struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t, ++ int, int); ++void mb_cache_shrink(struct mb_cache *, kdev_t); ++void mb_cache_destroy(struct mb_cache *); ++ ++/* Functions on cache entries */ ++ ++struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *); ++int mb_cache_entry_insert(struct mb_cache_entry *, kdev_t, unsigned long, ++ unsigned int[]); ++void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]); ++void mb_cache_entry_release(struct mb_cache_entry *); ++void mb_cache_entry_takeout(struct mb_cache_entry *); ++void mb_cache_entry_free(struct mb_cache_entry *); ++struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *); ++struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, kdev_t, ++ unsigned long); ++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) ++struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int, ++ kdev_t, unsigned int); ++struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int, ++ kdev_t, unsigned int); ++#endif +--- linux-2.4.20/kernel/ksyms.c~linux-2.4.20-xattr-0.8.54 2003-05-05 17:43:15.000000000 +0800 ++++ linux-2.4.20-root/kernel/ksyms.c 2003-05-07 18:08:03.000000000 +0800 +@@ -11,6 +11,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -89,6 +90,7 @@ EXPORT_SYMBOL(exit_mm); + EXPORT_SYMBOL(exit_files); + EXPORT_SYMBOL(exit_fs); + EXPORT_SYMBOL(exit_sighand); ++EXPORT_SYMBOL(copy_fs_struct); + + /* internal kernel memory management */ + EXPORT_SYMBOL(_alloc_pages); +@@ -107,6 +109,8 @@ EXPORT_SYMBOL(kmem_cache_validate); + EXPORT_SYMBOL(kmem_cache_alloc); + EXPORT_SYMBOL(kmem_cache_free); + 
EXPORT_SYMBOL(kmem_cache_size); ++EXPORT_SYMBOL(register_cache); ++EXPORT_SYMBOL(unregister_cache); + EXPORT_SYMBOL(kmalloc); + EXPORT_SYMBOL(kfree); + EXPORT_SYMBOL(vfree); +--- linux-2.4.20/mm/vmscan.c~linux-2.4.20-xattr-0.8.54 2002-11-29 07:53:15.000000000 +0800 ++++ linux-2.4.20-root/mm/vmscan.c 2003-05-07 18:08:03.000000000 +0800 +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -34,6 +35,39 @@ + */ + #define DEF_PRIORITY (6) + ++static DECLARE_MUTEX(other_caches_sem); ++static LIST_HEAD(cache_definitions); ++ ++void register_cache(struct cache_definition *cache) ++{ ++ down(&other_caches_sem); ++ list_add(&cache->link, &cache_definitions); ++ up(&other_caches_sem); ++} ++ ++void unregister_cache(struct cache_definition *cache) ++{ ++ down(&other_caches_sem); ++ list_del(&cache->link); ++ up(&other_caches_sem); ++} ++ ++static void shrink_other_caches(unsigned int priority, int gfp_mask) ++{ ++ struct list_head *p; ++ ++ if (down_trylock(&other_caches_sem)) ++ return; ++ ++ list_for_each_prev(p, &cache_definitions) { ++ struct cache_definition *cache = ++ list_entry(p, struct cache_definition, link); ++ ++ cache->shrink(priority, gfp_mask); ++ } ++ up(&other_caches_sem); ++} ++ + /* + * The swap-out function returns 1 if it successfully + * scanned all the pages it was asked to (`count'). 
+@@ -577,6 +611,7 @@ static int shrink_caches(zone_t * classz + + shrink_dcache_memory(priority, gfp_mask); + shrink_icache_memory(priority, gfp_mask); ++ shrink_other_caches(priority, gfp_mask); + #ifdef CONFIG_QUOTA + shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); + #endif +--- /dev/null 2003-01-30 18:24:37.000000000 +0800 ++++ linux-root/fs/ext3/ext3-exports.c 2003-05-05 18:19:11.000000000 +0800 +@@ -0,0 +1,13 @@ ++#include ++#include ++#include ++#include ++#include ++ ++EXPORT_SYMBOL(ext3_force_commit); ++EXPORT_SYMBOL(ext3_bread); ++EXPORT_SYMBOL(ext3_xattr_register); ++EXPORT_SYMBOL(ext3_xattr_unregister); ++EXPORT_SYMBOL(ext3_xattr_get); ++EXPORT_SYMBOL(ext3_xattr_list); ++EXPORT_SYMBOL(ext3_xattr_set); + +_ diff --git a/lustre/kernel_patches/patches/lustre-2.5.63.patch b/lustre/kernel_patches/patches/lustre-2.5.63.patch new file mode 100644 index 0000000..40e6a90 --- /dev/null +++ b/lustre/kernel_patches/patches/lustre-2.5.63.patch @@ -0,0 +1,862 @@ + arch/um/kernel/mem.c | 18 ++++++ + fs/dcache.c | 12 +++- + fs/namei.c | 132 ++++++++++++++++++++++++++++++++++++++----------- + fs/namespace.c | 1 + fs/nfsd/vfs.c | 2 + fs/open.c | 39 ++++++++++++-- + fs/stat.c | 2 + fs/sysfs/inode.c | 2 + include/linux/dcache.h | 28 ++++++++++ + include/linux/fs.h | 20 +++++++ + include/linux/namei.h | 3 - + include/linux/slab.h | 1 + kernel/ksyms.c | 7 ++ + mm/slab.c | 5 + + net/unix/af_unix.c | 2 + 15 files changed, 231 insertions(+), 43 deletions(-) + +--- linux-2.5.63-nointent/arch/um/kernel/mem.c~lustre-2.5.63 Tue Mar 18 15:02:10 2003 ++++ linux-2.5.63-nointent-root/arch/um/kernel/mem.c Tue Mar 18 15:02:10 2003 +@@ -660,6 +660,22 @@ struct page *pte_mem_map(pte_t pte) + return(phys_mem_map(pte_val(pte))); + } + ++struct page *check_get_page(unsigned long kaddr) ++{ ++ struct page *page; ++ struct mem_region *mr; ++ unsigned long phys = __pa(kaddr); ++ unsigned int n = phys_region_index(phys); ++ ++ if(regions[n] == NULL) ++ return NULL; ++ ++ mr = regions[n]; ++ page = 
(struct page *) mr->mem_map; ++ return page + ((phys_addr(phys)) >> PAGE_SHIFT); ++} ++ ++ + struct mem_region *page_region(struct page *page, int *index_out) + { + int i; +@@ -747,7 +763,7 @@ extern unsigned long region_pa(void *vir + (addr <= region->start + region->len)) + return(mk_phys(addr - region->start, i)); + } +- panic("region_pa : no region for virtual address"); ++ //panic("region_pa : no region for virtual address"); + return(0); + } + +--- linux-2.5.63-nointent/fs/namei.c~lustre-2.5.63 Tue Mar 18 15:02:10 2003 ++++ linux-2.5.63-nointent-root/fs/namei.c Mon Mar 24 17:08:18 2003 +@@ -101,6 +101,14 @@ + * any extra contention... + */ + ++void intent_release(struct dentry *de, struct lookup_intent *it) ++{ ++ if (it && de->d_op && de->d_op->d_intent_release) ++ de->d_op->d_intent_release(de, it); ++ ++} ++ ++ + /* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. +@@ -273,10 +281,18 @@ void path_release(struct nameidata *nd) + * Internal lookup() using the new generic dcache. + * SMP-safe + */ +-static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) ++static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags, struct lookup_intent *it) + { + struct dentry * dentry = d_lookup(parent, name); + ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) { ++ if (!dentry->d_op->d_revalidate2(dentry, flags, it) && ++ !d_invalidate(dentry)) { ++ dput(dentry); ++ dentry = NULL; ++ } ++ return dentry; ++ } else + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { + dput(dentry); +@@ -330,7 +346,7 @@ ok: + * make sure that nobody added the entry to the dcache in the meantime.. 
+ * SMP-safe + */ +-static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) ++static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags, struct lookup_intent *it) + { + struct dentry * result; + struct inode *dir = parent->d_inode; +@@ -348,7 +364,10 @@ static struct dentry * real_lookup(struc + struct dentry * dentry = d_alloc(parent, name); + result = ERR_PTR(-ENOMEM); + if (dentry) { +- result = dir->i_op->lookup(dir, dentry); ++ if (dir->i_op->lookup2) ++ result = dir->i_op->lookup2(dir, dentry, it); ++ else ++ result = dir->i_op->lookup(dir, dentry); + if (result) + dput(dentry); + else { +@@ -370,6 +389,12 @@ static struct dentry * real_lookup(struc + dput(result); + result = ERR_PTR(-ENOENT); + } ++ } else if (result->d_op && result->d_op->d_revalidate2) { ++ if (!result->d_op->d_revalidate2(result, flags, it) && ++ !d_invalidate(result)) { ++ dput(result); ++ result = ERR_PTR(-ENOENT); ++ } + } + return result; + } +@@ -402,6 +427,7 @@ static inline int do_follow_link(struct + current->link_count--; + return err; + loop: ++ intent_release(dentry, &nd->it); + path_release(nd); + return err; + } +@@ -447,15 +473,26 @@ static int follow_mount(struct vfsmount + return res; + } + +-static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry) ++static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry, ++ struct lookup_intent *it) + { + struct vfsmount *mounted; + + spin_lock(&dcache_lock); + mounted = lookup_mnt(*mnt, *dentry); + if (mounted) { ++ int opc = 0, mode = 0; + *mnt = mntget(mounted); + spin_unlock(&dcache_lock); ++ if (it) { ++ opc = it->it_op; ++ mode = it->it_mode; ++ } ++ intent_release(*dentry, it); ++ if (it) { ++ it->it_op = opc; ++ it->it_mode = mode; ++ } + dput(*dentry); + mntput(mounted->mnt_parent); + *dentry = dget(mounted->mnt_root); +@@ -467,7 +504,7 @@ static inline int __follow_down(struct v + + int follow_down(struct vfsmount **mnt, 
struct dentry **dentry) + { +- return __follow_down(mnt,dentry); ++ return __follow_down(mnt,dentry,NULL); + } + + static inline void follow_dotdot(struct vfsmount **mnt, struct dentry **dentry) +@@ -531,7 +568,7 @@ done: + return 0; + + need_lookup: +- dentry = real_lookup(nd->dentry, name, LOOKUP_CONTINUE); ++ dentry = real_lookup(nd->dentry, name, LOOKUP_CONTINUE, &nd->it); + if (IS_ERR(dentry)) + goto fail; + goto done; +@@ -665,7 +702,7 @@ int link_path_walk(const char * name, st + nd->dentry = next.dentry; + } + err = -ENOTDIR; +- if (!inode->i_op->lookup) ++ if (!inode->i_op->lookup && !inode->i_op->lookup2) + break; + continue; + /* here ends the main loop */ +@@ -716,7 +753,8 @@ last_component: + break; + if (lookup_flags & LOOKUP_DIRECTORY) { + err = -ENOTDIR; +- if (!inode->i_op || !inode->i_op->lookup) ++ if (!inode->i_op || ++ (!inode->i_op->lookup && !inode->i_op->lookup2)) + break; + } + goto return_base; +@@ -735,6 +773,7 @@ out_dput: + dput(next.dentry); + break; + } ++ intent_release(nd->dentry, &nd->it); + path_release(nd); + return_err: + return err; +@@ -857,7 +896,8 @@ int path_lookup(const char *name, unsign + * needs parent already locked. Doesn't follow mounts. + * SMP-safe. 
+ */ +-struct dentry * lookup_hash(struct qstr *name, struct dentry * base) ++struct dentry * lookup_hash(struct qstr *name, struct dentry * base, ++ struct lookup_intent *it) + { + struct dentry * dentry; + struct inode *inode; +@@ -880,13 +920,16 @@ struct dentry * lookup_hash(struct qstr + goto out; + } + +- dentry = cached_lookup(base, name, 0); ++ dentry = cached_lookup(base, name, 0, it); + if (!dentry) { + struct dentry *new = d_alloc(base, name); + dentry = ERR_PTR(-ENOMEM); + if (!new) + goto out; +- dentry = inode->i_op->lookup(inode, new); ++ if (inode->i_op->lookup2) ++ dentry = inode->i_op->lookup2(inode, new, it); ++ else ++ dentry = inode->i_op->lookup(inode, new); + if (!dentry) { + dentry = new; + security_inode_post_lookup(inode, dentry); +@@ -898,7 +941,7 @@ out: + } + + /* SMP-safe */ +-struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct lookup_intent *it) + { + unsigned long hash; + struct qstr this; +@@ -918,11 +961,16 @@ struct dentry * lookup_one_len(const cha + } + this.hash = end_name_hash(hash); + +- return lookup_hash(&this, base); ++ return lookup_hash(&this, base, it); + access: + return ERR_PTR(-EACCES); + } + ++struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) ++{ ++ return lookup_one_len_it(name, base, len, NULL); ++} ++ + /* + * namei() + * +@@ -1224,6 +1272,9 @@ int open_namei(const char * pathname, in + /* + * Create - we need to know the parent. 
+ */ ++ nd->it.it_mode = mode; ++ nd->it.it_op |= IT_CREAT; ++ + error = path_lookup(pathname, LOOKUP_PARENT, nd); + if (error) + return error; +@@ -1239,7 +1290,7 @@ int open_namei(const char * pathname, in + + dir = nd->dentry; + down(&dir->d_inode->i_sem); +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash(&nd->last, nd->dentry, &nd->it); + + do_last: + error = PTR_ERR(dentry); +@@ -1247,7 +1298,8 @@ do_last: + up(&dir->d_inode->i_sem); + goto exit; + } +- ++ ++ nd->it.it_mode = mode; + /* Negative dentry, just create the file */ + if (!dentry->d_inode) { + if (!IS_POSIXACL(dir->d_inode)) +@@ -1277,7 +1329,7 @@ do_last: + error = -ELOOP; + if (flag & O_NOFOLLOW) + goto exit_dput; +- while (__follow_down(&nd->mnt,&dentry) && d_mountpoint(dentry)); ++ while (__follow_down(&nd->mnt,&dentry,&nd->it) && d_mountpoint(dentry)); + } + error = -ENOENT; + if (!dentry->d_inode) +@@ -1297,8 +1349,10 @@ ok: + return 0; + + exit_dput: ++ intent_release(dentry, &nd->it); + dput(dentry); + exit: ++ intent_release(nd->dentry, &nd->it); + path_release(nd); + return error; + +@@ -1320,7 +1374,12 @@ do_link: + if (error) + goto exit_dput; + UPDATE_ATIME(dentry->d_inode); +- error = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (dentry->d_inode->i_op->follow_link2) ++ error = dentry->d_inode->i_op->follow_link2(dentry, nd, &nd->it); ++ else ++ error = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (error) ++ intent_release(dentry, &nd->it); + dput(dentry); + if (error) + return error; +@@ -1342,7 +1401,7 @@ do_link: + } + dir = nd->dentry; + down(&dir->d_inode->i_sem); +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash(&nd->last, nd->dentry, &nd->it); + putname(nd->last.name); + goto do_last; + } +@@ -1356,7 +1415,7 @@ static struct dentry *lookup_create(stru + dentry = ERR_PTR(-EEXIST); + if (nd->last_type != LAST_NORM) + goto fail; +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash(&nd->last, nd->dentry, 
&nd->it); + if (IS_ERR(dentry)) + goto fail; + if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) +@@ -1588,7 +1647,7 @@ asmlinkage long sys_rmdir(const char * p + goto exit1; + } + down(&nd.dentry->d_inode->i_sem); +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash(&nd.last, nd.dentry, &nd.it); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_rmdir(nd.dentry->d_inode, dentry); +@@ -1654,8 +1713,18 @@ asmlinkage long sys_unlink(const char * + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; ++ if (nd.dentry->d_inode->i_op->unlink2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->unlink2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit1; ++ } + down(&nd.dentry->d_inode->i_sem); +- dentry = lookup_hash(&nd.last, nd.dentry); ++// dentry = lookup_hash(&nd.last, nd.dentry, &nd.it); ++ dentry = lookup_hash(&nd.last, nd.dentry, NULL); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + /* Why not before? Because we want correct error value */ +@@ -1859,7 +1928,8 @@ exit: + * locking]. 
+ */ + int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry, ++ struct lookup_intent *it) + { + int error = 0; + struct inode *target; +@@ -1887,6 +1957,7 @@ int vfs_rename_dir(struct inode *old_dir + error = -EBUSY; + else + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); ++ intent_release(new_dentry, it); + if (target) { + if (!error) + target->i_flags |= S_DEAD; +@@ -1904,7 +1975,8 @@ int vfs_rename_dir(struct inode *old_dir + } + + int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry, ++ struct lookup_intent *it) + { + struct inode *target; + int error; +@@ -1921,6 +1993,7 @@ int vfs_rename_other(struct inode *old_d + error = -EBUSY; + else + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); ++ intent_release(new_dentry, it); + if (!error) { + /* The following d_move() should become unconditional */ + if (!(old_dir->i_sb->s_type->fs_flags & FS_ODD_RENAME)) +@@ -1934,7 +2007,8 @@ int vfs_rename_other(struct inode *old_d + } + + int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, +- struct inode *new_dir, struct dentry *new_dentry) ++ struct inode *new_dir, struct dentry *new_dentry, ++ struct lookup_intent *it) + { + int error; + int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); +@@ -1960,9 +2034,9 @@ int vfs_rename(struct inode *old_dir, st + DQUOT_INIT(new_dir); + + if (is_dir) +- error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); ++ error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry, it); + else +- error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); ++ error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry, it); + if (!error) { + if (old_dir == new_dir) + inode_dir_notify(old_dir, DN_RENAME); +@@ -2005,7 +2079,7 @@ 
static inline int do_rename(const char * + + trap = lock_rename(new_dir, old_dir); + +- old_dentry = lookup_hash(&oldnd.last, old_dir); ++ old_dentry = lookup_hash(&oldnd.last, old_dir, &oldnd.it); + error = PTR_ERR(old_dentry); + if (IS_ERR(old_dentry)) + goto exit3; +@@ -2025,7 +2099,7 @@ static inline int do_rename(const char * + error = -EINVAL; + if (old_dentry == trap) + goto exit4; +- new_dentry = lookup_hash(&newnd.last, new_dir); ++ new_dentry = lookup_hash(&newnd.last, new_dir, &newnd.it); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto exit4; +@@ -2035,7 +2109,7 @@ static inline int do_rename(const char * + goto exit5; + + error = vfs_rename(old_dir->d_inode, old_dentry, +- new_dir->d_inode, new_dentry); ++ new_dir->d_inode, new_dentry, NULL); + exit5: + dput(new_dentry); + exit4: +--- linux-2.5.63-nointent/fs/nfsd/vfs.c~lustre-2.5.63 Tue Mar 18 15:02:10 2003 ++++ linux-2.5.63-nointent-root/fs/nfsd/vfs.c Tue Mar 18 15:02:10 2003 +@@ -1337,7 +1337,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru + err = nfserr_perm; + } else + #endif +- err = vfs_rename(fdir, odentry, tdir, ndentry); ++ err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); + if (!err && EX_ISSYNC(tfhp->fh_export)) { + nfsd_sync_dir(tdentry); + nfsd_sync_dir(fdentry); +--- linux-2.5.63-nointent/fs/sysfs/inode.c~lustre-2.5.63 Tue Mar 18 15:02:10 2003 ++++ linux-2.5.63-nointent-root/fs/sysfs/inode.c Tue Mar 18 15:02:10 2003 +@@ -540,7 +540,7 @@ static struct dentry * get_dentry(struct + qstr.name = name; + qstr.len = strlen(name); + qstr.hash = full_name_hash(name,qstr.len); +- return lookup_hash(&qstr,parent); ++ return lookup_hash(&qstr,parent,NULL); + } + + +--- linux-2.5.63-nointent/include/linux/dcache.h~lustre-2.5.63 Tue Mar 18 15:02:10 2003 ++++ linux-2.5.63-nointent-root/include/linux/dcache.h Tue Mar 18 15:02:10 2003 +@@ -12,6 +12,27 @@ + + struct vfsmount; + ++#define IT_OPEN (1) ++#define IT_CREAT (1<<1) ++#define IT_READDIR (1<<2) ++#define IT_GETATTR (1<<3) 
++#define IT_LOOKUP (1<<4) ++#define IT_UNLINK (1<<5) ++ ++ ++struct lookup_intent { ++ int it_op; ++ int it_mode; ++ int it_flags; ++ int it_disposition; ++ int it_status; ++ struct iattr *it_iattr; ++ __u64 it_lock_handle[2]; ++ int it_lock_mode; ++ void *it_data; ++}; ++ ++ + /* + * linux/include/linux/dcache.h + * +@@ -34,6 +55,8 @@ struct qstr { + char name_str[0]; + }; + ++#include ++ + struct dentry_stat_t { + int nr_dentry; + int nr_unused; +@@ -87,6 +110,7 @@ struct dentry { + struct list_head d_subdirs; /* our children */ + struct list_head d_alias; /* inode alias list */ + int d_mounted; ++ struct lookup_intent *d_it; + struct qstr d_name; + struct qstr * d_qstr; /* quick str ptr used in lockless lookup and concurrent d_move */ + unsigned long d_time; /* used by d_revalidate */ +@@ -107,6 +131,8 @@ struct dentry_operations { + int (*d_delete)(struct dentry *); + void (*d_release)(struct dentry *); + void (*d_iput)(struct dentry *, struct inode *); ++ int (*d_revalidate2)(struct dentry *, int, struct lookup_intent *); ++ void (*d_intent_release)(struct dentry *, struct lookup_intent *); + }; + + /* the dentry parameter passed to d_hash and d_compare is the parent +@@ -147,6 +173,8 @@ d_iput: no no yes + + #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. 
*/ + #define DCACHE_UNHASHED 0x0010 ++#define DCACHE_LUSTRE_INVALID 0x0011 /* Lustre invalidated */ ++ + + extern spinlock_t dcache_lock; + extern rwlock_t dparent_lock; +--- linux-2.5.63-nointent/include/linux/fs.h~lustre-2.5.63 Tue Mar 18 15:02:10 2003 ++++ linux-2.5.63-nointent-root/include/linux/fs.h Tue Mar 18 15:02:10 2003 +@@ -234,6 +234,9 @@ typedef int (get_blocks_t)(struct inode + #define ATTR_ATTR_FLAG 1024 + #define ATTR_KILL_SUID 2048 + #define ATTR_KILL_SGID 4096 ++#define ATTR_RAW 8192 /* file system, not vfs will massage attrs */ ++#define ATTR_FROM_OPEN 16384 /* called from open path, ie O_TRUNC */ ++ + + /* + * This is the Inode Attributes structure, used for notify_change(). It +@@ -642,7 +645,7 @@ extern int vfs_symlink(struct inode *, s + extern int vfs_link(struct dentry *, struct inode *, struct dentry *); + extern int vfs_rmdir(struct inode *, struct dentry *); + extern int vfs_unlink(struct inode *, struct dentry *); +-extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); ++extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct lookup_intent *it); + + /* + * File types +@@ -728,19 +731,33 @@ struct file_operations { + struct inode_operations { + int (*create) (struct inode *,struct dentry *,int); + struct dentry * (*lookup) (struct inode *,struct dentry *); ++ struct dentry * (*lookup2) (struct inode *,struct dentry *, ++ struct lookup_intent *); + int (*link) (struct dentry *,struct inode *,struct dentry *); ++ int (*link2) (struct inode *,struct inode *, const char *, int); + int (*unlink) (struct inode *,struct dentry *); ++ int (*unlink2) (struct inode *, const char *, int); + int (*symlink) (struct inode *,struct dentry *,const char *); ++ int (*symlink2) (struct inode *, const char *, int, const char *); + int (*mkdir) (struct inode *,struct dentry *,int); ++ int (*mkdir2) (struct inode *, const char *, int,int); + int (*rmdir) (struct inode *,struct dentry *); 
++ int (*rmdir2) (struct inode *, const char *, int); + int (*mknod) (struct inode *,struct dentry *,int,dev_t); ++ int (*mknod2) (struct inode *, const char *, int,int,int); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *); ++ int (*rename2) (struct inode *, struct inode *, ++ const char *oldname, int oldlen, ++ const char *newname, int newlen); + int (*readlink) (struct dentry *, char *,int); + int (*follow_link) (struct dentry *, struct nameidata *); ++ int (*follow_link2) (struct dentry *, struct nameidata *, ++ struct lookup_intent *it); + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int); + int (*setattr) (struct dentry *, struct iattr *); ++ int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); + int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t,int); +@@ -953,6 +970,7 @@ extern int register_filesystem(struct fi + extern int unregister_filesystem(struct file_system_type *); + extern struct vfsmount *kern_mount(struct file_system_type *); + extern int may_umount(struct vfsmount *); ++struct vfsmount *do_kern_mount(const char *type, int flags, char *name, void *data); + extern long do_mount(char *, char *, char *, unsigned long, void *); + + extern int vfs_statfs(struct super_block *, struct statfs *); +--- linux-2.5.63-nointent/include/linux/namei.h~lustre-2.5.63 Tue Mar 18 15:02:10 2003 ++++ linux-2.5.63-nointent-root/include/linux/namei.h Tue Mar 18 15:02:10 2003 +@@ -11,6 +11,7 @@ struct nameidata { + struct qstr last; + unsigned int flags; + int last_type; ++ struct lookup_intent it; + }; + + /* +@@ -44,7 +45,7 @@ extern int FASTCALL(link_path_walk(const + extern void path_release(struct nameidata *); + + extern struct dentry * lookup_one_len(const char *, struct dentry *, int); +-extern struct dentry * lookup_hash(struct qstr *, 
struct dentry *); ++extern struct dentry * lookup_hash(struct qstr *, struct dentry *, struct lookup_intent *); + + extern int follow_down(struct vfsmount **, struct dentry **); + extern int follow_up(struct vfsmount **, struct dentry **); +--- linux-2.5.63-nointent/include/linux/slab.h~lustre-2.5.63 Tue Mar 18 15:02:10 2003 ++++ linux-2.5.63-nointent-root/include/linux/slab.h Tue Mar 18 15:02:10 2003 +@@ -55,6 +55,7 @@ extern int kmem_cache_destroy(kmem_cache + extern int kmem_cache_shrink(kmem_cache_t *); + extern void *kmem_cache_alloc(kmem_cache_t *, int); + extern void kmem_cache_free(kmem_cache_t *, void *); ++extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp); + extern unsigned int kmem_cache_size(kmem_cache_t *); + + extern void *kmalloc(size_t, int); +--- linux-2.5.63-nointent/kernel/ksyms.c~lustre-2.5.63 Tue Mar 18 15:02:10 2003 ++++ linux-2.5.63-nointent-root/kernel/ksyms.c Tue Mar 18 15:02:10 2003 +@@ -377,6 +377,7 @@ EXPORT_SYMBOL(unregister_filesystem); + EXPORT_SYMBOL(kern_mount); + EXPORT_SYMBOL(__mntput); + EXPORT_SYMBOL(may_umount); ++EXPORT_SYMBOL(reparent_to_init); + + /* executable format registration */ + EXPORT_SYMBOL(register_binfmt); +@@ -407,6 +408,12 @@ EXPORT_SYMBOL(request_irq); + EXPORT_SYMBOL(free_irq); + EXPORT_SYMBOL(irq_stat); + ++/* lustre */ ++EXPORT_SYMBOL(do_kern_mount); ++EXPORT_SYMBOL(exit_files); ++EXPORT_SYMBOL(kmem_cache_validate); ++ ++ + /* waitqueue handling */ + EXPORT_SYMBOL(add_wait_queue); + EXPORT_SYMBOL(add_wait_queue_exclusive); +--- linux-2.5.63-nointent/mm/slab.c~lustre-2.5.63 Tue Mar 18 15:02:10 2003 ++++ linux-2.5.63-nointent-root/mm/slab.c Tue Mar 18 15:02:10 2003 +@@ -1792,6 +1792,11 @@ static inline void __cache_free (kmem_ca + } + } + ++int kmem_cache_validate(kmem_cache_t *cachep, void *objp) ++{ ++ return 1; ++} ++ + /** + * kmem_cache_alloc - Allocate an object + * @cachep: The cache to allocate from. 
+--- linux-2.5.63-nointent/net/unix/af_unix.c~lustre-2.5.63 Tue Mar 18 15:02:10 2003 ++++ linux-2.5.63-nointent-root/net/unix/af_unix.c Tue Mar 18 15:02:10 2003 +@@ -720,7 +720,7 @@ static int unix_bind(struct socket *sock + /* + * Do the final lookup. + */ +- dentry = lookup_hash(&nd.last, nd.dentry); ++ dentry = lookup_hash(&nd.last, nd.dentry, NULL); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_mknod_unlock; +--- linux-2.5.63-nointent/fs/dcache.c~lustre-2.5.63 Tue Mar 18 15:02:10 2003 ++++ linux-2.5.63-nointent-root/fs/dcache.c Tue Mar 18 15:02:10 2003 +@@ -1111,15 +1111,21 @@ void d_delete(struct dentry * dentry) + * Adds a dentry to the hash according to its name. + */ + +-void d_rehash(struct dentry * entry) ++void __d_rehash(struct dentry * entry, int lock) + { + struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); +- spin_lock(&dcache_lock); ++ if (lock) spin_lock(&dcache_lock); + if (!list_empty(&entry->d_hash) && !d_unhashed(entry)) BUG(); + entry->d_vfs_flags &= ~DCACHE_UNHASHED; + entry->d_bucket = list; + list_add_rcu(&entry->d_hash, list); +- spin_unlock(&dcache_lock); ++ if (lock) spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(__d_rehash); ++ ++void d_rehash(struct dentry * entry) ++{ ++ __d_rehash(entry, 1); + } + + #define do_switch(x,y) do { \ +--- linux-2.5.63-nointent/fs/namespace.c~lustre-2.5.63 Tue Mar 18 15:02:10 2003 ++++ linux-2.5.63-nointent-root/fs/namespace.c Tue Mar 18 15:02:10 2003 +@@ -925,6 +925,7 @@ void set_fs_pwd(struct fs_struct *fs, st + mntput(old_pwdmnt); + } + } ++EXPORT_SYMBOL(set_fs_pwd); + + static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) + { +--- linux-2.5.63-nointent/fs/open.c~lustre-2.5.63 Thu Mar 20 12:43:39 2003 ++++ linux-2.5.63-nointent-root/fs/open.c Mon Mar 24 16:25:47 2003 +@@ -97,7 +97,8 @@ static inline long do_sys_truncate(const + struct nameidata nd; + struct inode * inode; + int error; +- ++ struct lookup_intent it = { .it_op = IT_GETATTR }; ++ 
nd.it=it; + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... */ + goto out; +@@ -142,11 +143,13 @@ static inline long do_sys_truncate(const + error = locks_verify_truncate(inode, NULL, length); + if (!error) { + DQUOT_INIT(inode); ++ intent_release(nd.dentry, &nd.it); + error = do_truncate(nd.dentry, length); + } + put_write_access(inode); + + dput_and_out: ++ intent_release(nd.dentry, &nd.it); + path_release(&nd); + out: + return error; +@@ -340,6 +343,8 @@ asmlinkage long sys_access(const char * + int old_fsuid, old_fsgid; + kernel_cap_t old_cap; + int res; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; ++ nd.it=it; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ + return -EINVAL; +@@ -371,6 +376,8 @@ asmlinkage long sys_access(const char * + if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) + && !special_file(nd.dentry->d_inode->i_mode)) + res = -EROFS; ++ ++ intent_release(nd.dentry, &nd.it); + path_release(&nd); + } + +@@ -385,6 +392,8 @@ asmlinkage long sys_chdir(const char * f + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; ++ nd.it=it; + + error = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &nd); + if (error) +@@ -397,6 +406,7 @@ asmlinkage long sys_chdir(const char * f + set_fs_pwd(current->fs, nd.mnt, nd.dentry); + + dput_and_out: ++ intent_release(nd.dentry, &nd.it); + path_release(&nd); + out: + return error; +@@ -436,6 +446,8 @@ asmlinkage long sys_chroot(const char * + { + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; ++ nd.it=it; + + error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + if (error) +@@ -508,6 +520,18 @@ asmlinkage long sys_chmod(const char * f + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_mode = mode; ++ newattrs.ia_valid = 
ATTR_MODE | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } + + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) +@@ -619,7 +643,10 @@ asmlinkage long sys_fchown(unsigned int + struct file *filp_open(const char * filename, int flags, int mode) + { + int namei_flags, error; ++ struct file * temp_filp; + struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = flags }; ++ nd.it=it; + + namei_flags = flags; + if ((namei_flags+1) & O_ACCMODE) +@@ -628,9 +655,11 @@ struct file *filp_open(const char * file + namei_flags |= 2; + + error = open_namei(filename, namei_flags, mode, &nd); +- if (!error) +- return dentry_open(nd.dentry, nd.mnt, flags); +- ++ if (!error) { ++ temp_filp = dentry_open(nd.dentry, nd.mnt, flags); ++ intent_release(nd.dentry,&nd.it); ++ return temp_filp; ++ } + return ERR_PTR(error); + } + +@@ -675,7 +704,7 @@ struct file *dentry_open(struct dentry * + goto cleanup_all; + } + } +- ++ + return f; + + cleanup_all: +--- linux-2.5.63-nointent/fs/stat.c~lustre-2.5.63 Fri Mar 21 21:15:40 2003 ++++ linux-2.5.63-nointent-root/fs/stat.c Fri Mar 21 21:16:53 2003 +@@ -65,6 +65,7 @@ int vfs_stat(char *name, struct kstat *s + error = user_path_walk(name, &nd); + if (!error) { + error = vfs_getattr(nd.mnt, nd.dentry, stat); ++ intent_release(nd.dentry, &nd.it); + path_release(&nd); + } + return error; +@@ -80,6 +81,7 @@ int vfs_lstat(char *name, struct kstat * + error = user_path_walk_link(name, &nd); + if (!error) { + error = vfs_getattr(nd.mnt, nd.dentry, stat); ++ intent_release(nd.dentry, &nd.it); + path_release(&nd); + } + return error; + +_ diff --git a/lustre/kernel_patches/patches/lustre-2.5.patch b/lustre/kernel_patches/patches/lustre-2.5.patch deleted file mode 100644 index 71d372f..0000000 --- a/lustre/kernel_patches/patches/lustre-2.5.patch +++ /dev/null @@ 
-1,507 +0,0 @@ - arch/um/kernel/mem.c | 18 +++++++++++- - fs/namei.c | 71 +++++++++++++++++++++++++++++++++++-------------- - fs/nfsd/vfs.c | 2 - - fs/sysfs/inode.c | 2 - - include/linux/dcache.h | 27 ++++++++++++++++++ - include/linux/fs.h | 20 +++++++++++++ - include/linux/namei.h | 3 +- - include/linux/slab.h | 1 - kernel/ksyms.c | 7 ++++ - mm/slab.c | 5 +++ - net/unix/af_unix.c | 2 - - 11 files changed, 132 insertions(+), 26 deletions(-) - ---- linux-2.5.59/arch/um/kernel/mem.c~lustre-2.5 2003-02-22 21:56:58.000000000 +0800 -+++ linux-2.5.59-root/arch/um/kernel/mem.c 2003-02-22 21:56:58.000000000 +0800 -@@ -639,6 +639,22 @@ struct page *pte_mem_map(pte_t pte) - return(phys_mem_map(pte_val(pte))); - } - -+struct page *check_get_page(unsigned long kaddr) -+{ -+ struct page *page; -+ struct mem_region *mr; -+ unsigned long phys = __pa(kaddr); -+ unsigned int n = phys_region_index(phys); -+ -+ if(regions[n] == NULL) -+ return NULL; -+ -+ mr = regions[n]; -+ page = (struct page *) mr->mem_map; -+ return page + ((phys_addr(phys)) >> PAGE_SHIFT); -+} -+ -+ - struct mem_region *page_region(struct page *page, int *index_out) - { - int i; -@@ -726,7 +742,7 @@ extern unsigned long region_pa(void *vir - (addr <= region->start + region->len)) - return(mk_phys(addr - region->start, i)); - } -- panic("region_pa : no region for virtual address"); -+ //panic("region_pa : no region for virtual address"); - return(0); - } - ---- linux-2.5.59/fs/namei.c~lustre-2.5 2003-02-22 21:56:58.000000000 +0800 -+++ linux-2.5.59-root/fs/namei.c 2003-02-22 21:56:58.000000000 +0800 -@@ -265,6 +265,9 @@ int deny_write_access(struct file * file - - void path_release(struct nameidata *nd) - { -+ if (nd->dentry && nd->dentry->d_op && -+ nd->dentry->d_op->d_intent_release) -+ nd->dentry->d_op->d_intent_release(nd->dentry, &nd->it); - dput(nd->dentry); - mntput(nd->mnt); - } -@@ -273,10 +276,18 @@ void path_release(struct nameidata *nd) - * Internal lookup() using the new generic dcache. 
- * SMP-safe - */ --static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags, struct lookup_intent *it) - { - struct dentry * dentry = d_lookup(parent, name); - -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) { -+ if (!dentry->d_op->d_revalidate2(dentry, flags, it) && -+ !d_invalidate(dentry)) { -+ dput(dentry); -+ dentry = NULL; -+ } -+ return dentry; -+ } else - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { - dput(dentry); -@@ -351,7 +362,7 @@ ok: - * make sure that nobody added the entry to the dcache in the meantime.. - * SMP-safe - */ --static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags, struct lookup_intent *it) - { - struct dentry * result; - struct inode *dir = parent->d_inode; -@@ -369,7 +380,10 @@ static struct dentry * real_lookup(struc - struct dentry * dentry = d_alloc(parent, name); - result = ERR_PTR(-ENOMEM); - if (dentry) { -- result = dir->i_op->lookup(dir, dentry); -+ if (dir->i_op->lookup2) -+ result = dir->i_op->lookup2(dir, dentry, it); -+ else -+ result = dir->i_op->lookup(dir, dentry); - if (result) - dput(dentry); - else { -@@ -391,6 +405,12 @@ static struct dentry * real_lookup(struc - dput(result); - result = ERR_PTR(-ENOENT); - } -+ } else if (result->d_op && result->d_op->d_revalidate2) { -+ if (!result->d_op->d_revalidate2(result, flags, it) && -+ !d_invalidate(result)) { -+ dput(result); -+ result = ERR_PTR(-ENOENT); -+ } - } - return result; - } -@@ -534,7 +554,7 @@ dcache_miss: - unlock_nd(nd); - - need_lookup: -- dentry = real_lookup(nd->dentry, name, LOOKUP_CONTINUE); -+ dentry = real_lookup(nd->dentry, name, LOOKUP_CONTINUE, &nd->it); - if (IS_ERR(dentry)) - goto fail; - 
mntget(mnt); -@@ -684,7 +704,7 @@ int link_path_walk(const char * name, st - nd->dentry = next.dentry; - } - err = -ENOTDIR; -- if (!inode->i_op->lookup) -+ if (!inode->i_op->lookup && !inode->i_op->lookup2) - break; - continue; - /* here ends the main loop */ -@@ -737,7 +757,8 @@ last_component: - break; - if (lookup_flags & LOOKUP_DIRECTORY) { - err = -ENOTDIR; -- if (!inode->i_op || !inode->i_op->lookup) -+ if (!inode->i_op || -+ (!inode->i_op->lookup && !inode->i_op->lookup2)) - break; - } - goto return_base; -@@ -886,7 +907,8 @@ int path_lookup(const char *name, unsign - * needs parent already locked. Doesn't follow mounts. - * SMP-safe. - */ --struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+struct dentry * lookup_hash(struct qstr *name, struct dentry * base, -+ struct lookup_intent *it) - { - struct dentry * dentry; - struct inode *inode; -@@ -909,13 +931,16 @@ struct dentry * lookup_hash(struct qstr - goto out; - } - -- dentry = cached_lookup(base, name, 0); -+ dentry = cached_lookup(base, name, 0, it); - if (!dentry) { - struct dentry *new = d_alloc(base, name); - dentry = ERR_PTR(-ENOMEM); - if (!new) - goto out; -- dentry = inode->i_op->lookup(inode, new); -+ if (inode->i_op->lookup2) -+ dentry = inode->i_op->lookup2(inode, new, it); -+ else -+ dentry = inode->i_op->lookup(inode, new); - if (!dentry) { - dentry = new; - security_inode_post_lookup(inode, dentry); -@@ -927,7 +952,7 @@ out: - } - - /* SMP-safe */ --struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) -+struct dentry * lookup_one_len_it(const char * name, struct dentry * base, int len, struct lookup_intent *it) - { - unsigned long hash; - struct qstr this; -@@ -947,11 +972,16 @@ struct dentry * lookup_one_len(const cha - } - this.hash = end_name_hash(hash); - -- return lookup_hash(&this, base); -+ return lookup_hash(&this, base, it); - access: - return ERR_PTR(-EACCES); - } - -+struct dentry * lookup_one_len(const char * name, struct dentry * 
base, int len) -+{ -+ return lookup_one_len_it(name, base, len, NULL); -+} -+ - /* - * namei() - * -@@ -1268,7 +1298,7 @@ int open_namei(const char * pathname, in - - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash(&nd->last, nd->dentry, &nd->it); - - do_last: - error = PTR_ERR(dentry); -@@ -1371,7 +1401,7 @@ do_link: - } - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash(&nd->last, nd->dentry, &nd->it); - putname(nd->last.name); - goto do_last; - } -@@ -1385,7 +1415,7 @@ static struct dentry *lookup_create(stru - dentry = ERR_PTR(-EEXIST); - if (nd->last_type != LAST_NORM) - goto fail; -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash(&nd->last, nd->dentry, &nd->it); - if (IS_ERR(dentry)) - goto fail; - if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1617,7 +1647,7 @@ asmlinkage long sys_rmdir(const char * p - goto exit1; - } - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash(&nd.last, nd.dentry, &nd.it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_rmdir(nd.dentry->d_inode, dentry); -@@ -1677,7 +1707,7 @@ asmlinkage long sys_unlink(const char * - if (nd.last_type != LAST_NORM) - goto exit1; - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash(&nd.last, nd.dentry, &nd.it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - /* Why not before? 
Because we want correct error value */ -@@ -1951,7 +1981,8 @@ int vfs_rename_other(struct inode *old_d - } - - int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry, -+ struct lookup_intent *it) - { - int error; - int is_dir = S_ISDIR(old_dentry->d_inode->i_mode); -@@ -2022,7 +2053,7 @@ static inline int do_rename(const char * - - trap = lock_rename(new_dir, old_dir); - -- old_dentry = lookup_hash(&oldnd.last, old_dir); -+ old_dentry = lookup_hash(&oldnd.last, old_dir, &oldnd.it); - error = PTR_ERR(old_dentry); - if (IS_ERR(old_dentry)) - goto exit3; -@@ -2042,7 +2073,7 @@ static inline int do_rename(const char * - error = -EINVAL; - if (old_dentry == trap) - goto exit4; -- new_dentry = lookup_hash(&newnd.last, new_dir); -+ new_dentry = lookup_hash(&newnd.last, new_dir, &newnd.it); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; -@@ -2052,7 +2083,7 @@ static inline int do_rename(const char * - goto exit5; - - error = vfs_rename(old_dir->d_inode, old_dentry, -- new_dir->d_inode, new_dentry); -+ new_dir->d_inode, new_dentry, NULL); - exit5: - dput(new_dentry); - exit4: ---- linux-2.5.59/fs/nfsd/vfs.c~lustre-2.5 2003-02-22 21:56:58.000000000 +0800 -+++ linux-2.5.59-root/fs/nfsd/vfs.c 2003-02-22 21:56:58.000000000 +0800 -@@ -1337,7 +1337,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru - err = nfserr_perm; - } else - #endif -- err = vfs_rename(fdir, odentry, tdir, ndentry); -+ err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); - if (!err && EX_ISSYNC(tfhp->fh_export)) { - nfsd_sync_dir(tdentry); - nfsd_sync_dir(fdentry); ---- linux-2.5.59/fs/sysfs/inode.c~lustre-2.5 2003-02-22 21:56:58.000000000 +0800 -+++ linux-2.5.59-root/fs/sysfs/inode.c 2003-02-22 21:56:58.000000000 +0800 -@@ -539,7 +539,7 @@ static struct dentry * get_dentry(struct - qstr.name = name; - qstr.len = strlen(name); - qstr.hash = full_name_hash(name,qstr.len); -- 
return lookup_hash(&qstr,parent); -+ return lookup_hash(&qstr,parent,NULL); - } - - ---- linux-2.5.59/include/linux/dcache.h~lustre-2.5 2003-02-22 21:56:58.000000000 +0800 -+++ linux-2.5.59-root/include/linux/dcache.h 2003-02-22 22:02:55.000000000 +0800 -@@ -11,6 +11,27 @@ - - struct vfsmount; - -+#define IT_OPEN (1) -+#define IT_CREAT (1<<1) -+#define IT_READDIR (1<<2) -+#define IT_GETATTR (1<<3) -+#define IT_LOOKUP (1<<4) -+#define IT_UNLINK (1<<5) -+ -+ -+struct lookup_intent { -+ int it_op; -+ int it_mode; -+ int it_flags; -+ int it_disposition; -+ int it_status; -+ struct iattr *it_iattr; -+ __u64 it_lock_handle[2]; -+ int it_lock_mode; -+ void *it_data; -+}; -+ -+ - /* - * linux/include/linux/dcache.h - * -@@ -32,6 +53,8 @@ struct qstr { - unsigned int hash; - }; - -+#include -+ - struct dentry_stat_t { - int nr_dentry; - int nr_unused; -@@ -81,6 +104,7 @@ struct dentry { - struct list_head d_subdirs; /* our children */ - struct list_head d_alias; /* inode alias list */ - int d_mounted; -+ struct lookup_intent *d_it; - struct qstr d_name; - unsigned long d_time; /* used by d_revalidate */ - struct dentry_operations *d_op; -@@ -100,6 +124,8 @@ struct dentry_operations { - int (*d_delete)(struct dentry *); - void (*d_release)(struct dentry *); - void (*d_iput)(struct dentry *, struct inode *); -+ int (*d_revalidate2)(struct dentry *, int, struct lookup_intent *); -+ void (*d_intent_release)(struct dentry *, struct lookup_intent *); - }; - - /* the dentry parameter passed to d_hash and d_compare is the parent -@@ -139,6 +165,7 @@ d_iput: no no yes - */ - - #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. 
*/ -+#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */ - - extern spinlock_t dcache_lock; - extern rwlock_t dparent_lock; ---- linux-2.5.59/include/linux/fs.h~lustre-2.5 2003-02-22 21:56:58.000000000 +0800 -+++ linux-2.5.59-root/include/linux/fs.h 2003-02-22 22:52:58.000000000 +0800 -@@ -234,6 +234,9 @@ typedef int (get_blocks_t)(struct inode - #define ATTR_ATTR_FLAG 1024 - #define ATTR_KILL_SUID 2048 - #define ATTR_KILL_SGID 4096 -+#define ATTR_RAW 8192 /* file system, not vfs will massage attrs */ -+#define ATTR_FROM_OPEN 16384 /* called from open path, ie O_TRUNC */ -+ - - /* - * This is the Inode Attributes structure, used for notify_change(). It -@@ -676,7 +679,7 @@ extern int vfs_symlink(struct inode *, s - extern int vfs_link(struct dentry *, struct inode *, struct dentry *); - extern int vfs_rmdir(struct inode *, struct dentry *); - extern int vfs_unlink(struct inode *, struct dentry *); --extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -+extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct lookup_intent *it); - - /* - * File types -@@ -762,19 +765,33 @@ struct file_operations { - struct inode_operations { - int (*create) (struct inode *,struct dentry *,int); - struct dentry * (*lookup) (struct inode *,struct dentry *); -+ struct dentry * (*lookup2) (struct inode *,struct dentry *, -+ struct lookup_intent *); - int (*link) (struct dentry *,struct inode *,struct dentry *); -+ int (*link2) (struct inode *,struct inode *, const char *, int); - int (*unlink) (struct inode *,struct dentry *); -+ int (*unlink2) (struct inode *, const char *, int); - int (*symlink) (struct inode *,struct dentry *,const char *); -+ int (*symlink2) (struct inode *, const char *, int, const char *); - int (*mkdir) (struct inode *,struct dentry *,int); -+ int (*mkdir2) (struct inode *, const char *, int,int); - int (*rmdir) (struct inode *,struct dentry *); -+ int (*rmdir2) (struct inode *, 
const char *, int); - int (*mknod) (struct inode *,struct dentry *,int,dev_t); -+ int (*mknod2) (struct inode *, const char *, int,int,int); - int (*rename) (struct inode *, struct dentry *, - struct inode *, struct dentry *); -+ int (*rename2) (struct inode *, struct inode *, -+ const char *oldname, int oldlen, -+ const char *newname, int newlen); - int (*readlink) (struct dentry *, char *,int); - int (*follow_link) (struct dentry *, struct nameidata *); -+ int (*follow_link2) (struct dentry *, struct nameidata *, -+ struct lookup_intent *it); - void (*truncate) (struct inode *); - int (*permission) (struct inode *, int); - int (*setattr) (struct dentry *, struct iattr *); -+ int (*setattr_raw) (struct inode *, struct iattr *); - int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); - int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); - ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); -@@ -987,6 +1004,7 @@ extern int register_filesystem(struct fi - extern int unregister_filesystem(struct file_system_type *); - extern struct vfsmount *kern_mount(struct file_system_type *); - extern int may_umount(struct vfsmount *); -+struct vfsmount *do_kern_mount(const char *type, int flags, char *name, void *data); - extern long do_mount(char *, char *, char *, unsigned long, void *); - - extern int vfs_statfs(struct super_block *, struct statfs *); ---- linux-2.5.59/include/linux/namei.h~lustre-2.5 2003-02-22 21:56:58.000000000 +0800 -+++ linux-2.5.59-root/include/linux/namei.h 2003-02-22 21:56:58.000000000 +0800 -@@ -13,6 +13,7 @@ struct nameidata { - int last_type; - struct dentry *old_dentry; - struct vfsmount *old_mnt; -+ struct lookup_intent it; - }; - - /* -@@ -46,7 +47,7 @@ extern int FASTCALL(link_path_walk(const - extern void path_release(struct nameidata *); - - extern struct dentry * lookup_one_len(const char *, struct dentry *, int); --extern struct dentry * lookup_hash(struct qstr *, struct dentry *); 
-+extern struct dentry * lookup_hash(struct qstr *, struct dentry *, struct lookup_intent *); - - extern int follow_down(struct vfsmount **, struct dentry **); - extern int follow_up(struct vfsmount **, struct dentry **); ---- linux-2.5.59/include/linux/slab.h~lustre-2.5 2003-02-22 21:56:58.000000000 +0800 -+++ linux-2.5.59-root/include/linux/slab.h 2003-02-22 21:56:58.000000000 +0800 -@@ -56,6 +56,7 @@ extern int kmem_cache_destroy(kmem_cache - extern int kmem_cache_shrink(kmem_cache_t *); - extern void *kmem_cache_alloc(kmem_cache_t *, int); - extern void kmem_cache_free(kmem_cache_t *, void *); -+extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp); - extern unsigned int kmem_cache_size(kmem_cache_t *); - - extern void *kmalloc(size_t, int); ---- linux-2.5.59/kernel/ksyms.c~lustre-2.5 2003-02-22 21:56:58.000000000 +0800 -+++ linux-2.5.59-root/kernel/ksyms.c 2003-02-22 21:56:58.000000000 +0800 -@@ -376,6 +376,7 @@ EXPORT_SYMBOL(unregister_filesystem); - EXPORT_SYMBOL(kern_mount); - EXPORT_SYMBOL(__mntput); - EXPORT_SYMBOL(may_umount); -+EXPORT_SYMBOL(reparent_to_init); - - /* executable format registration */ - EXPORT_SYMBOL(register_binfmt); -@@ -406,6 +407,12 @@ EXPORT_SYMBOL(request_irq); - EXPORT_SYMBOL(free_irq); - EXPORT_SYMBOL(irq_stat); - -+/* lustre */ -+EXPORT_SYMBOL(do_kern_mount); -+EXPORT_SYMBOL(exit_files); -+EXPORT_SYMBOL(kmem_cache_validate); -+ -+ - /* waitqueue handling */ - EXPORT_SYMBOL(add_wait_queue); - EXPORT_SYMBOL(add_wait_queue_exclusive); ---- linux-2.5.59/mm/slab.c~lustre-2.5 2003-02-22 21:56:58.000000000 +0800 -+++ linux-2.5.59-root/mm/slab.c 2003-02-22 21:56:58.000000000 +0800 -@@ -1793,6 +1793,11 @@ static inline void __cache_free (kmem_ca - } - } - -+int kmem_cache_validate(kmem_cache_t *cachep, void *objp) -+{ -+ return 1; -+} -+ - /** - * kmem_cache_alloc - Allocate an object - * @cachep: The cache to allocate from. 
---- linux-2.5.59/net/unix/af_unix.c~lustre-2.5 2003-02-22 21:56:58.000000000 +0800 -+++ linux-2.5.59-root/net/unix/af_unix.c 2003-02-22 21:56:58.000000000 +0800 -@@ -719,7 +719,7 @@ static int unix_bind(struct socket *sock - /* - * Do the final lookup. - */ -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash(&nd.last, nd.dentry, NULL); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto out_mknod_unlock; - -_ diff --git a/lustre/kernel_patches/patches/lustre_version.patch b/lustre/kernel_patches/patches/lustre_version.patch index d7b6dce..78855ac 100644 --- a/lustre/kernel_patches/patches/lustre_version.patch +++ b/lustre/kernel_patches/patches/lustre_version.patch @@ -7,6 +7,6 @@ --- /dev/null Fri Aug 30 17:31:37 2002 +++ linux-2.4.18-18.8.0-l12-braam/include/linux/lustre_version.h Thu Feb 13 07:58:33 2003 @@ -0,0 +1 @@ -+#define LUSTRE_KERNEL_VERSION 13 ++#define LUSTRE_KERNEL_VERSION 19 _ diff --git a/lustre/kernel_patches/patches/mcore-2.4.20-8.patch b/lustre/kernel_patches/patches/mcore-2.4.20-8.patch new file mode 100644 index 0000000..c8b80eb --- /dev/null +++ b/lustre/kernel_patches/patches/mcore-2.4.20-8.patch @@ -0,0 +1,2738 @@ +? linux/.config +? linux/include/linux/autoconf.h +? linux/include/linux/modules +Index: linux/Makefile +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/Makefile,v +retrieving revision 1.3.2.1 +retrieving revision 1.3.2.1.2.1 +diff -u -r1.3.2.1 -r1.3.2.1.2.1 +--- linux/Makefile 12 Mar 2003 19:48:52 -0000 1.3.2.1 ++++ linux/Makefile 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1 +@@ -99,6 +99,10 @@ + CFLAGS += -fomit-frame-pointer + endif + AFLAGS := -D__ASSEMBLY__ $(CPPFLAGS) ++ifeq ($(CONFIG_MCL_COREDUMP),y) ++ CFLAGS += -g ++endif ++ + + # + # ROOT_DEV specifies the default root-device when making the image. 
+Index: linux/Documentation/Configure.help +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/Documentation/Configure.help,v +retrieving revision 1.3.2.1 +retrieving revision 1.3.2.1.2.1 +diff -u -r1.3.2.1 -r1.3.2.1.2.1 +--- linux/Documentation/Configure.help 12 Mar 2003 19:48:52 -0000 1.3.2.1 ++++ linux/Documentation/Configure.help 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1 +@@ -21660,6 +21660,35 @@ + This option allows you to run the kernel with data cache disabled. + Say Y if you experience CPM lock-ups. + ++Boot kernel image support ++CONFIG_BOOTIMG ++ Add support for booting a new Linux kernel from a running Linux ++ system. You need to download the bootimg(8) utility from ++ ftp://icaftp.epfl.ch/pub/people/almesber/misc/bootimg-current.tar.gz ++ in order to use this functionality. ++ ++Protect SMP configuration tables ++CONFIG_BOOTIMG_SMP ++ On SMP systems, the BIOS stores tables with configuration data in ++ memory and an SMP-enabled kernel reads these tables. However, a ++ kernel without SMP support will overwrite such tables. If a kernel ++ without SMP support used bootimg to boot an SMP-enabled kernel, the ++ latter will probably crash when trying to read the SMP tables. The ++ CONFIG_BOOTIMG_SMP option enables minimal support for scanning and ++ protecting of SMP configuration tables also for kernels without SMP ++ support. ++ ++In-memory kernel core dump facility ++CONFIG_MCL_COREDUMP ++ In conjunction with bootimg, this allows you to get kernel core dumps ++ of your system at panic() time. The panic call is modified so that it ++ calls the core dump facility and reboots the system. On the way back ++ up, the kernel dump image is written out to disk by the accompanying ++ init script. You can use the crash analysis tool to analyze the core ++ dump. 
This tool can be found at : ++ ++ http://www.missioncriticallinux.com/download ++ + # + # m68k-specific kernel options + # Documented by Chris Lawrence et al. +Index: linux/arch/i386/config.in +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/config.in,v +retrieving revision 1.3.2.1 +retrieving revision 1.3.2.1.2.2 +diff -u -r1.3.2.1 -r1.3.2.1.2.2 +--- linux/arch/i386/config.in 12 Mar 2003 19:49:05 -0000 1.3.2.1 ++++ linux/arch/i386/config.in 1 Apr 2003 19:35:12 -0000 1.3.2.1.2.2 +@@ -502,6 +502,12 @@ + bool ' Magic SysRq key' CONFIG_MAGIC_SYSRQ + bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK + bool ' Compile the kernel with frame pointers' CONFIG_FRAME_POINTER ++ if [ "$CONFIG_FRAME_POINTER " != "n" ]; then ++ bool ' Kernel Core Dump Facility' CONFIG_MCL_COREDUMP ++ if [ "$CONFIG_MCL_COREDUMP" = "y" ]; then ++ bool ' Reboot using bootimg' CONFIG_BOOTIMG ++ fi ++ fi + fi + + endmenu +Index: linux/arch/i386/vmlinux.lds +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/vmlinux.lds,v +retrieving revision 1.1.1.1.4.1 +retrieving revision 1.1.1.1.4.1.2.1 +diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1 +--- linux/arch/i386/vmlinux.lds 12 Mar 2003 19:49:05 -0000 1.1.1.1.4.1 ++++ linux/arch/i386/vmlinux.lds 1 Apr 2003 12:17:40 -0000 1.1.1.1.4.1.2.1 +@@ -19,6 +19,13 @@ + .rodata : { *(.rodata) *(.rodata.*) } + .kstrtab : { *(.kstrtab) } + ++ . = ALIGN(16); /* Relocatable bootimage code */ ++ __bootimg_start = .; ++ .bootimg : { ++ *(.bootimg) ++ } ++ __bootimg_end = .; ++ + . 
= ALIGN(16); /* Exception table */ + __start___ex_table = .; + __ex_table : { *(__ex_table) } +Index: linux/arch/i386/boot/setup.S +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/boot/setup.S,v +retrieving revision 1.2.2.1 +retrieving revision 1.2.2.1.2.1 +diff -u -r1.2.2.1 -r1.2.2.1.2.1 +--- linux/arch/i386/boot/setup.S 12 Mar 2003 19:49:05 -0000 1.2.2.1 ++++ linux/arch/i386/boot/setup.S 1 Apr 2003 12:17:40 -0000 1.2.2.1.2.1 +@@ -105,16 +105,22 @@ + # flags, unused bits must be zero (RFU) bit within loadflags + loadflags: + LOADED_HIGH = 1 # If set, the kernel is loaded high ++RELOADS_GDT = 2 # if set, kernel reloads GDT, such that ++ # boot loader does not have to provide ++ # GDT in a "safe" memory location + CAN_USE_HEAP = 0x80 # If set, the loader also has set + # heap_end_ptr to tell how much + # space behind setup.S can be used for + # heap purposes. + # Only the loader knows what is free +-#ifndef __BIG_KERNEL__ +- .byte 0 +-#else +- .byte LOADED_HIGH ++_FLAGS = 0 ++#ifdef __BIG_KERNEL__ ++ _FLAGS = _FLAGS | LOADED_HIGH + #endif ++#ifdef CONFIG_BOOTIMG ++ _FLAGS = _FLAGS | RELOADS_GDT ++#endif ++ .byte _FLAGS + + setup_move_size: .word 0x8000 # size to move, when setup is not + # loaded at 0x90000. 
We will move setup +Index: linux/arch/i386/kernel/Makefile +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/Makefile,v +retrieving revision 1.2.2.1 +retrieving revision 1.2.2.1.2.1 +diff -u -r1.2.2.1 -r1.2.2.1.2.1 +--- linux/arch/i386/kernel/Makefile 12 Mar 2003 19:49:05 -0000 1.2.2.1 ++++ linux/arch/i386/kernel/Makefile 1 Apr 2003 12:17:40 -0000 1.2.2.1.2.1 +@@ -49,6 +49,7 @@ + obj-$(CONFIG_X86_LONGRUN) += longrun.o + obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o + obj-$(CONFIG_PROFILING) += profile.o ++obj-$(CONFIG_MCL_COREDUMP) += crash.o + + + include $(TOPDIR)/Rules.make +Index: linux/arch/i386/kernel/crash.c +=================================================================== +RCS file: linux/arch/i386/kernel/crash.c +diff -N linux/arch/i386/kernel/crash.c +--- /dev/null 1 Jan 1970 00:00:00 -0000 ++++ linux/arch/i386/kernel/crash.c 1 Apr 2003 12:17:40 -0000 1.1.6.1 +@@ -0,0 +1,82 @@ ++/* ++ * linux/arch/i386/crash.c ++ * ++ * Architecture dependant code for MCL in-memory core dump. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++inline void crash_save_regs(void) { ++ static unsigned long regs[8]; ++ ++ __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs[0])); ++ __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs[1])); ++ __asm__ __volatile__("movl %%edx,%0" : "=m"(regs[2])); ++ __asm__ __volatile__("movl %%esi,%0" : "=m"(regs[3])); ++ __asm__ __volatile__("movl %%edi,%0" : "=m"(regs[4])); ++ __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs[5])); ++ __asm__ __volatile__("movl %%eax,%0" : "=m"(regs[6])); ++ __asm__ __volatile__("movl %%esp,%0" : "=m"(regs[7])); ++ ++ panic_regs = regs; ++} ++ ++/* ++ * Save the current stack pointer and EIP. 
++ */ ++void crash_save_current_state(struct task_struct *tp) ++{ ++ /* ++ * Here we save ebp instead of esp just in case the compiler ++ * decides to put an extra push in before we execute this ++ * instruction (thus invalidating our frame pointer). ++ */ ++ asm volatile("movl %%ebp,%0":"=m" (*(u_long *)&tp->thread.esp)); ++ tp->thread.eip = (u_long)crash_save_current_state; ++ panic_ksp[smp_processor_id()] = tp->thread.esp; ++ mb(); ++ ++ save_core(); ++ ++ crash_halt_or_reboot(1); ++} ++ ++/* ++ * If we are not the panicking thread, we simply halt. Otherwise, ++ * we take care of calling the reboot code. ++ */ ++void crash_halt_or_reboot(int boot_cpu) ++{ ++#ifdef CONFIG_SMP ++ if (!boot_cpu) { ++ stop_this_cpu(NULL); ++ /* NOTREACHED */ ++ } ++#endif ++ machine_restart(NULL); ++} ++ ++void crash_cleanup_smp_state(void) ++{ ++ /* ++ * Here we duplicate smp_send_stop. Crash_halt_or_reboot() calls ++ * stop_this_cpu. We now know that we are the only one running, ++ * so we finish off the smp_send_stop function. 
++ */ ++ __cli(); ++#ifdef CONFIG_SMP ++ disable_local_APIC(); ++#endif ++} ++ ++/* ++ * Core dump IPI ++ */ ++void smp_crash_funnel_cpu(void) ++{ ++ crash_save_current_state(current); ++} +Index: linux/arch/i386/kernel/nmi.c +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/nmi.c,v +retrieving revision 1.2.2.1 +retrieving revision 1.2.2.1.2.1 +diff -u -r1.2.2.1 -r1.2.2.1.2.1 +--- linux/arch/i386/kernel/nmi.c 12 Mar 2003 19:49:06 -0000 1.2.2.1 ++++ linux/arch/i386/kernel/nmi.c 1 Apr 2003 12:17:40 -0000 1.2.2.1.2.1 +@@ -374,11 +374,18 @@ + bust_spinlocks(1); + printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip); + show_registers(regs); ++#ifdef CONFIG_MCL_COREDUMP ++ spin_unlock(&nmi_print_lock); ++ bust_spinlocks(0); ++ panic("die"); ++ /* NOTREACHED */ ++#else + printk("console shuts up ...\n"); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); + do_exit(SIGSEGV); ++#endif + } + } else { + last_irq_sums[cpu] = sum; +Index: linux/arch/i386/kernel/process.c +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/process.c,v +retrieving revision 1.2.2.2 +retrieving revision 1.2.2.2.2.1 +diff -u -r1.2.2.2 -r1.2.2.2.2.1 +--- linux/arch/i386/kernel/process.c 1 Apr 2003 02:11:17 -0000 1.2.2.2 ++++ linux/arch/i386/kernel/process.c 1 Apr 2003 12:17:40 -0000 1.2.2.2.2.1 +@@ -50,6 +50,9 @@ + #ifdef CONFIG_MATH_EMULATION + #include + #endif ++#ifdef CONFIG_BOOTIMG ++#include ++#endif + + #include + +@@ -377,7 +380,21 @@ + + void machine_restart(char * __unused) + { ++#ifdef CONFIG_MCL_COREDUMP ++ extern char *panicmsg; ++ /* ++ * Only call bootimg if we have a valid descriptor and ++ * we are in a panic() context. 
++ */ ++ if (panicmsg) ++#endif ++#ifdef CONFIG_BOOTIMG ++ if (bootimg_dsc.page_dir) ++ boot_image(); ++#endif ++ + #if CONFIG_SMP ++{ + int cpuid; + + cpuid = GET_APIC_ID(apic_read(APIC_ID)); +@@ -413,6 +430,7 @@ + if (!netdump_func) + smp_send_stop(); + disable_IO_APIC(); ++} + #endif + + if(!reboot_thru_bios) { +Index: linux/arch/i386/kernel/setup.c +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/setup.c,v +retrieving revision 1.3.2.1 +retrieving revision 1.3.2.1.2.2 +diff -u -r1.3.2.1 -r1.3.2.1.2.2 +--- linux/arch/i386/kernel/setup.c 12 Mar 2003 19:49:06 -0000 1.3.2.1 ++++ linux/arch/i386/kernel/setup.c 1 Apr 2003 17:55:35 -0000 1.3.2.1.2.2 +@@ -116,6 +116,9 @@ + #include + #include + #include ++#ifdef CONFIG_MCL_COREDUMP ++#include ++#endif + /* + * Machine setup.. + */ +@@ -973,6 +976,7 @@ + static unsigned long __init setup_memory(void) + { + unsigned long bootmap_size, start_pfn, max_low_pfn; ++ unsigned long bootmap_pages = 0UL, crash_pages = 0UL; + + /* + * partially used pages are not usable - thus +@@ -992,6 +996,21 @@ + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); + #endif ++ ++#ifdef CONFIG_MCL_COREDUMP ++ bootmap_pages = bootmem_bootmap_pages(max_low_pfn); ++ crash_pages = crash_pages_needed(); ++ ++ printk("start_pfn: %d, bootmap_pages: %d\n", start_pfn, bootmap_pages); ++ ++ crash_init((u_long)phys_to_virt(PFN_PHYS(start_pfn)), ++ (u_long)phys_to_virt(PFN_PHYS(LOW_OFFSET + start_pfn)), ++ (u_long)phys_to_virt(PFN_PHYS(LOW_OFFSET + start_pfn + ++ crash_pages))); ++ ++ printk("new start_pfn: %08lx\n", PFN_PHYS(start_pfn)); ++ printk("crash map starts at %lx\n",(start_pfn+bootmap_pages)*PAGE_SIZE); ++#endif + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + /* +@@ -1007,8 +1026,8 @@ + * the (very unlikely) case of us accidentally initializing the + * bootmem allocator with an invalid RAM area. 
+ */ +- reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + +- bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); ++ reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(start_pfn) + bootmap_size + ++ ((1+crash_pages)*PAGE_SIZE) + PAGE_SIZE-1) - (HIGH_MEMORY)); + + /* + * reserve physical page 0 - it's a special BIOS page on many boxes, +@@ -1016,6 +1035,16 @@ + */ + reserve_bootmem(0, PAGE_SIZE); + ++#ifdef CONFIG_BOOTIMG ++ /* ++ * bootimg(8) reads the old parameter block. Note that the copy in ++ * empty_zero_page will vanish when mem_init runs. (Should we ++ * memcpy(phys_to_virt(0x90000), PARAM, PAGE_SIZE); ++ * now ?) ++ */ ++ reserve_bootmem(0x90000, PAGE_SIZE); ++#endif ++ + #ifdef CONFIG_SMP + /* + * But first pinch a few for the stack/trampoline stuff +@@ -1032,6 +1061,7 @@ + find_smp_config(); + #endif + #ifdef CONFIG_BLK_DEV_INITRD ++ printk("caution: initrd may overwrite dump\n"); /* phro */ + if (LOADER_TYPE && INITRD_START) { + if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) { + reserve_bootmem(INITRD_START, INITRD_SIZE); +@@ -1172,6 +1202,12 @@ + smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ + #endif + paging_init(); ++#ifdef CONFIG_MCL_COREDUMP ++ /* ++ * Reserve crash pages ++ */ ++ crash_mark_dump_reserved(); ++#endif + #ifdef CONFIG_X86_LOCAL_APIC + /* + * get boot-time SMP configuration: +Index: linux/arch/i386/kernel/smp.c +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/smp.c,v +retrieving revision 1.3.2.1 +retrieving revision 1.3.2.1.2.1 +diff -u -r1.3.2.1 -r1.3.2.1.2.1 +--- linux/arch/i386/kernel/smp.c 12 Mar 2003 19:49:06 -0000 1.3.2.1 ++++ linux/arch/i386/kernel/smp.c 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1 +@@ -23,6 +23,9 @@ + #include + #include + ++#ifdef CONFIG_MCL_COREDUMP ++#include ++#endif + /* + * Some notes on x86 processor bugs affecting SMP operation: + * +@@ -579,7 +582,7 @@ + return 0; + } + +-static void stop_this_cpu (void * dummy) 
++void stop_this_cpu (void * dummy) + { + /* + * Remove this CPU: +Index: linux/arch/i386/kernel/traps.c +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/traps.c,v +retrieving revision 1.3.2.1 +retrieving revision 1.3.2.1.2.1 +diff -u -r1.3.2.1 -r1.3.2.1.2.1 +--- linux/arch/i386/kernel/traps.c 12 Mar 2003 19:49:06 -0000 1.3.2.1 ++++ linux/arch/i386/kernel/traps.c 1 Apr 2003 12:17:40 -0000 1.3.2.1.2.1 +@@ -52,6 +52,10 @@ + #include + #include + ++#ifdef CONFIG_MCL_COREDUMP ++#include ++#endif ++ + asmlinkage int system_call(void); + asmlinkage void lcall7(void); + asmlinkage void lcall27(void); +@@ -309,7 +313,11 @@ + netdump_func(regs); + bust_spinlocks(0); + spin_unlock_irq(&die_lock); +- do_exit(SIGSEGV); ++#ifdef CONFIG_MCL_COREDUMP ++ if(panic_on_oops) ++ panic("die"); ++#endif ++ do_exit(SIGSEGV);/* NOTREACHED */ + } + + static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) +Index: linux/drivers/char/misc.c +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/drivers/char/misc.c,v +retrieving revision 1.2 +retrieving revision 1.2.4.1 +diff -u -r1.2 -r1.2.4.1 +--- linux/drivers/char/misc.c 25 Sep 2002 17:11:05 -0000 1.2 ++++ linux/drivers/char/misc.c 1 Apr 2003 12:17:41 -0000 1.2.4.1 +@@ -78,6 +78,8 @@ + extern int i8k_init(void); + extern int lcd_init(void); + ++extern int crash_init_chrdev(void); ++ + static int misc_read_proc(char *buf, char **start, off_t offset, + int len, int *eof, void *private) + { +@@ -255,6 +257,9 @@ + int __init misc_init(void) + { + create_proc_read_entry("misc", 0, 0, misc_read_proc, NULL); ++#ifdef CONFIG_MCL_COREDUMP ++ crash_init_chrdev(); ++#endif + #ifdef CONFIG_MVME16x + rtc_MK48T08_init(); + #endif +Index: linux/drivers/char/sysrq.c +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/drivers/char/sysrq.c,v 
+retrieving revision 1.2.2.1 +retrieving revision 1.2.2.1.2.2 +diff -u -r1.2.2.1 -r1.2.2.1.2.2 +--- linux/drivers/char/sysrq.c 12 Mar 2003 19:49:47 -0000 1.2.2.1 ++++ linux/drivers/char/sysrq.c 1 Apr 2003 17:55:35 -0000 1.2.2.1.2.2 +@@ -97,7 +97,18 @@ + action_msg: "Resetting", + }; + +- ++#ifdef CONFIG_MCL_COREDUMP ++/* kernel core dump sysrq */ ++static void sysrq_handle_coredump(int key, struct pt_regs *pt_regs, ++ struct kbd_struct *kbd, struct tty_struct *ttty) { ++ panic("sysrq"); ++} ++static struct sysrq_key_op sysrq_coredump_op = { ++ handler: sysrq_handle_coredump, ++ help_msg: "Crash", ++ action_msg: "Dumping core", ++}; ++#endif + + /* SYNC SYSRQ HANDLERS BLOCK */ + +@@ -334,7 +345,11 @@ + it is handled specially on the spark + and will never arive */ + /* b */ &sysrq_reboot_op, ++#ifdef CONFIG_MCL_COREDUMP ++/* c */ &sysrq_coredump_op, ++#else + /* c */ NULL, ++#endif + /* d */ NULL, + /* e */ &sysrq_term_op, + /* f */ NULL, +Index: linux/include/asm-i386/bootimg.h +=================================================================== +RCS file: linux/include/asm-i386/bootimg.h +diff -N linux/include/asm-i386/bootimg.h +--- /dev/null 1 Jan 1970 00:00:00 -0000 ++++ linux/include/asm-i386/bootimg.h 1 Apr 2003 12:17:41 -0000 1.1.6.1 +@@ -0,0 +1,141 @@ ++/* asm-i386/bootimg.h - Boot image, i386-specific code */ ++ ++/* Written 2000 by Werner Almesberger */ ++ ++/* ++ * When porting bootimg(2) to a new architcture, you need to adapt the ++ * functions and definitions in this file. ++ */ ++ ++ ++#ifndef _ASM_I386_BOOTIMG_H ++#define _ASM_I386_BOOTIMG_H ++ ++#include ++#include ++ ++#ifdef CONFIG_SMP ++#include ++#include ++#endif ++ ++ ++/* ++ * The memory page with the code currently executing has been copied from ++ * old_page to new_page. Jump there. ++ * ++ * Note: flush_icache_range has already been called on the new page. 
++ */ ++ ++static inline void jump_relocated(unsigned long old_page,unsigned long new_page) ++{ ++ int tmp; ++ ++ __asm__ __volatile__( ++ "stc\n\t" ++ "call 1f\n" ++ "1:\tjnc 2f\n\t" ++ "popl %0\n\t" ++ "addl %1,%0\n\t" ++ "addl %1,%%esp\n\t" ++ "clc\n\t" ++ "jmp *%0\n" ++ "2:" ++ : "=&r" (tmp) : "r" (new_page-old_page)); ++} ++ ++ ++/* ++ * Stop paging, such that ++ * - page tables can be overwritten ++ * - all physical memory can be accessed ++ * - all physical memory is identity-mapped ++ * ++ * (Other rules are possible, but need to be encoded in bootimg(8).) ++ */ ++ ++static inline void stop_paging(void) ++{ ++ unsigned long msw; ++ ++ __asm__ __volatile__( ++ "movl %%cr0,%0\n\t" ++ "andl $0x7fffffff,%0\n\t" ++ "movl %0,%%cr0\n\t" ++ "jmp 1f\n\t" /* i486 and such */ ++ "1:" ++ ++/* Clear the PAE bit in register %cr4 if we were in PAE mode. The initial ++ * page table set up by the new kernel's bootstrap code is non-PAE regardless ++ * of whether the new kernel is a PAE kernel. By clearing the PAE bit here, ++ * we make sure the bootstrap code doesn't accidentally enable PAE mode when ++ * it turns on address translation. ++ */ ++#ifdef CONFIG_X86_PAE ++ "movl %%cr4,%0\n\t" ++ "andl $0xffffffdf,%0\n\t" ++ "movl %0,%%cr4\n\t" ++#endif ++ ++ : "=&r" (msw) : : "memory"); ++} ++ ++ ++/* ++ * Stop any remaining concurrency in the system. If become_only_thread fails ++ * but the system is still usable, become_only_thread should return an error ++ * code. If no recovery is possible, it may as well panic. ++ */ ++ ++static inline int become_only_thread(void) ++{ ++#ifdef CONFIG_SMP ++ smp_send_stop(); ++ disable_IO_APIC(); ++#endif ++ cli(); ++ return 0; ++} ++ ++ ++/* ++ * A conservative estimate of the number of bytes relocate_and_jump allocated ++ * on the stack. This is only used for sanity checking before running code, ++ * because we can't recover from failure in relocate_and_jump. 
++ */ ++ ++#define RESERVE_MIN_RELOC_STACK 256 ++ ++ ++/* ++ * Change the stack pointer such that stack is at the end of the specified ++ * page. No data on the old stack will be accessed anymore, so no copying is ++ * required. ++ */ ++ ++static inline void stack_on_page(void *page) ++{ ++ __asm__ __volatile__( ++ "push %%ds\n\t" ++ "pop %%ss\n\t" ++ "movl %0,%%esp\n\t" ++ "addl $0x1000,%%esp\n\t" ++ : : "r" (page)); ++} ++ ++/* ++ * Set up things such that the kernel will be comfortable (e.g. some ++ * architectures expect the boot loader to set registers in certain ways), ++ * and then jump to the kernel's entry address. ++ */ ++ ++static inline void jump_to_kernel(void (*kernel_entry)(void)) ++{ ++ __asm__ __volatile__( ++ "mov $0x90000,%%esi\n\t" ++ : : ); ++ ++ kernel_entry(); ++} ++ ++#endif +Index: linux/include/asm-i386/crash.h +=================================================================== +RCS file: linux/include/asm-i386/crash.h +diff -N linux/include/asm-i386/crash.h +--- /dev/null 1 Jan 1970 00:00:00 -0000 ++++ linux/include/asm-i386/crash.h 1 Apr 2003 12:17:41 -0000 1.1.6.1 +@@ -0,0 +1,15 @@ ++#ifndef __ASM_CRASH_H ++#define __ASM_CRASH_H ++ ++#define UPPER_MEM_BACKUP 0 ++#define LOWER_MEM_FORWARD 0 ++#define LOW_OFFSET 100 ++ ++/* ++ * These two functions are inlined on alpha. That's why they appear ++ * in the arch dependent include file. 
++ */ ++void crash_save_current_state(struct task_struct *); ++void crash_halt_or_reboot(int); ++ ++#endif +Index: linux/include/linux/bootimg.h +=================================================================== +RCS file: linux/include/linux/bootimg.h +diff -N linux/include/linux/bootimg.h +--- /dev/null 1 Jan 1970 00:00:00 -0000 ++++ linux/include/linux/bootimg.h 1 Apr 2003 12:17:41 -0000 1.1.6.1 +@@ -0,0 +1,84 @@ ++/* linux/bootimg.h - Boot image, general definitions */ ++ ++/* Written 2000 by Werner Almesberger */ ++ ++ ++#ifndef _LINUX_BOOTIMG_H ++#define _LINUX_BOOTIMG_H ++ ++ ++/* ++ * Constraints on image_map: ++ * - each image_map[n] is the virtual address of a page-sized memory region ++ * readable by the user ++ * - currently, image_map[n] is not required to be page-aligned, but this may ++ * change in the future if we want to map pages directly to lower memory ++ * pressure (NB: mapping works for ELF and plain binary images, but usually ++ * not for (b)zImages, because the prepended boot and setup sectors ++ * mis-align them) ++ * ++ * Constraints on load_map: ++ * - each load_map[] is the physical address of a page in RAM ++ */ ++ ++struct boot_image { ++ void **image_map; /* pointers to image pages in user memory */ ++ int pages; /* length in pages */ ++ unsigned long *load_map;/* list of destination pages (physical addr) */ ++ unsigned long start; /* jump to this physical address */ ++ int flags; /* for future use, must be zero for now */ ++}; ++ ++ ++#ifdef __KERNEL__ ++ ++#define __bootimg __attribute__ ((__section__ (".bootimg"))) ++ ++ ++struct bootimg_dsc { ++ unsigned long self; /* code page ALL ADDRESSES */ ++ unsigned long scratch; /* scratch page ARE PHYSICAL !*/ ++ unsigned long **page_dir; /* src & dst page tables */ ++ void (*jump_to)(void); /* start address */ ++ int pages; /* number of pages */ ++ unsigned long csum; /* Kernel Image checksum */ ++}; ++ ++/* ++ * page_dir contains pointers to pages containing pointers to pages. 
We call ++ * page_dir a "directory" and the page page_dir[n] points to a "table". The ++ * first PAGES_PER_TABLE/2 entries of page_dir are for source pages, and other ++ * half are for destination pages. ++ */ ++ ++/* ++ * Note that the definitions used here do not necessarily correspond to the ++ * architecture-specific PTRS_PER_PTE, __pte_offset, etc. ++ */ ++ ++#define PAGES_PER_TABLE (PAGE_SIZE/sizeof(void *)) ++#define FROM_TABLE(i) ((i)/PAGES_PER_TABLE) ++#define TO_TABLE(i) ((i)/PAGES_PER_TABLE+PAGES_PER_TABLE/2) ++#define PAGE_NR(i) ((i) % PAGES_PER_TABLE) ++ ++ ++extern char __bootimg_start,__bootimg_end; /* linker segment boundaries */ ++extern unsigned long *unity_page; /* unity-mapped page for i386 */ ++ ++/* ++ * relocate_and_jump runs in its own page with its own stack. This makes it ++ * difficult to pass parameters. The solution chosen here is to use the global ++ * variable bootimg_dsc, which is copied into an "auto" variable by ++ * relocate_and_jump before any copying or relocation takes place. 
++ */ ++ ++extern struct bootimg_dsc bootimg_dsc; ++ ++typedef void (*relocate_and_jump_t)(void); ++ ++void relocate_and_jump(void); ++int boot_image(void); ++ ++#endif /* __KERNEL__ */ ++ ++#endif +Index: linux/include/linux/crash.h +=================================================================== +RCS file: linux/include/linux/crash.h +diff -N linux/include/linux/crash.h +--- /dev/null 1 Jan 1970 00:00:00 -0000 ++++ linux/include/linux/crash.h 1 Apr 2003 12:17:41 -0000 1.1.6.1 +@@ -0,0 +1,119 @@ ++#ifndef __LINUX_CRASH_H ++#define __LINUX_CRASH_H ++ ++/* defines for interfacing with user-space (ioctls, etc) */ ++struct ioctl_getdump { ++ unsigned long kva; ++ unsigned long buf; ++}; ++ ++#define CRASH_IOC_MAGIC 'C' ++ ++#define CRASH_IOCFREEDUMP _IO(CRASH_IOC_MAGIC, 0) ++#define CRASH_IOCGETDUMP _IOWR(CRASH_IOC_MAGIC, 1, struct ioctl_getdump) ++#define CRASH_IOCBOOTIMG _IOWR(CRASH_IOC_MAGIC, 2, struct boot_image) ++#define CRASH_IOCVERSION _IO(CRASH_IOC_MAGIC, 3) ++ ++/* kernel-only part of crash.h */ ++#ifdef __KERNEL__ ++#include ++ ++#define CRASH_K_MINOR (1) ++#define CRASH_K_MAJOR (0) ++ ++/* ++ * Crash prototypes. 
++ */ ++void save_core(void); ++void crash_mark_dump_reserved(void); ++void crash_init(u_long bootmap_va, u_long crash_va, u_long end_alloc_va); ++u_long crash_pages_needed(void); ++void smp_crash_funnel_cpu(void); ++void crash_cleanup_smp_state(void); ++ ++/* ++ * Arch-dependent crash.c funcs ++ */ ++void crash_save_current_state(struct task_struct *); ++void crash_halt_or_reboot(int); ++inline void crash_save_regs(void); ++ ++/* ++ * Crash globals ++ */ ++extern u_long crash_dump_header; ++extern volatile u_long panic_ksp[]; ++extern volatile int crash_release; ++extern int panic_on_oops; ++extern char *panicmsg; ++extern int panic_processor; ++extern int crash_perform_sync; ++extern unsigned long *panic_regs; ++ ++/* ++ * symbols not exported by linux header files ++ */ ++extern void stop_this_cpu(void *); ++ ++/* struct crash_map_hdr located at byte offset 0 */ ++/* on-disk formats */ ++ ++#define trunc_page(x) ((void *)(((unsigned long)(x)) & ~((unsigned long)(PAGE_SIZE - 1)))) ++#define round_page(x) trunc_page(((unsigned long)(x)) + ((unsigned long)(PAGE_SIZE - 1))) ++ ++#define CRASH_MAGIC 0x9a8bccdd ++#define CRASH_SOURCE_PAGES 128 ++#define CRASH_SUB_MAP_BYTES ((u_long)round_page((CRASH_SOURCE_PAGES+1)*sizeof(u_long))) ++#define CRASH_SUB_MAP_PAGES (CRASH_SUB_MAP_BYTES / PAGE_SIZE) ++#define CRASH_UNCOMPR_BUF_PAGES (CRASH_SOURCE_PAGES + CRASH_SUB_MAP_PAGES) ++#define CRASH_COMPR_BUF_PAGES (CRASH_UNCOMPR_BUF_PAGES + (CRASH_UNCOMPR_BUF_PAGES/4)) ++#define CRASH_COMPESS_PRIME_PAGES (2*CRASH_COMPR_BUF_PAGES) ++#define CRASH_ZALLOC_PAGES (16*5*2) /* 2 to handle crash in crash; parenthesized so it is safe in any expression */ ++#define CRASH_LOW_WATER_PAGES 100 ++ ++#define CRASH_CPU_TIMEOUT 5000 /* 5 sec wait for other cpus to stop */ ++ ++#define CRASH_MARK_RESERVED(addr) (set_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags)) ++#define CRASH_CLEAR_RESERVED(addr) (clear_bit(PG_reserved,&mem_map[MAP_NR(addr)].flags)) ++#define CRASH_MARK_BOOT_RESERVED(addr) reserve_bootmem(virt_to_phys((void *)(addr)), PAGE_SIZE); /* reserves the one page at addr; the expansion keeps its trailing ';' for existing callers */ 
++ ++typedef int boolean_t; ++ ++#define TRUE 1 ++#define FALSE 0 ++ ++/* mem structure */ ++struct mem_crash_map_hdr { ++ long magic[4]; /* identify crash dump */ ++ u_long map; /* location of map */ ++ u_long map_pages; ++ u_long data_pages; ++ u_long compr_units; ++ u_long boot_reserved_start; ++ u_long boot_reserved_end; ++}; ++struct mem_crash_map_entry { ++ u_long src_va; /* source start of larger non-contig ++ * block. a src_va of -1 means that ++ * the dest_page_va is the location of ++ * the next map page */ ++ u_long dest_page_va; /* dest of this sub block */ ++ u_long check_sum; /* check_sum for dest data */ ++}; ++ ++/* file structure */ ++struct crash_map_hdr { ++ long magic[4]; /* identify crash dump */ ++ int blk_size; /* block size for this device */ ++ int map_block; /* location of map */ ++ int map_blocks; /* number of blocks for map */ ++}; ++struct crash_map_entry { ++ u_long start_va; /* virtual address */ ++ char *exp_data; /* expanded data in memory */ ++ int start_blk; /* device location */ ++ int num_blks; ++}; ++ ++#endif /* __KERNEL__ */ ++#endif /* __LINUX_CRASH_H */ +Index: linux/include/linux/mm.h +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/include/linux/mm.h,v +retrieving revision 1.2.2.1 +retrieving revision 1.2.2.1.2.2 +diff -u -r1.2.2.1 -r1.2.2.1.2.2 +--- linux/include/linux/mm.h 12 Mar 2003 19:51:27 -0000 1.2.2.1 ++++ linux/include/linux/mm.h 1 Apr 2003 17:55:35 -0000 1.2.2.1.2.2 +@@ -331,6 +331,11 @@ + #define PG_lru 18 + #define PG_active_cache 19 + #define PG_fs_1 20 /* Filesystem specific */ ++#ifdef CONFIG_MCL_COREDUMP ++#define PG_free 21 ++#define PG_shm 22 ++#define PG_anon 23 ++#endif + + /* Make it prettier to test the above... 
*/ + #define UnlockPage(page) unlock_page(page) +@@ -452,6 +457,11 @@ + #define PageSetSlab(page) set_bit(PG_slab, &(page)->flags) + #define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags) + #define PageReserved(page) test_bit(PG_reserved, &(page)->flags) ++#ifdef CONFIG_MCL_COREDUMP ++#define PageFree(page) (test_bit(PG_free, &(page)->flags)) ++#define PageAnon(page) (test_bit(PG_anon, &(page)->flags)) ++#define PageShm(page) (test_bit(PG_shm, &(page)->flags)) ++#endif + + #define PageActiveAnon(page) test_bit(PG_active_anon, &(page)->flags) + #define SetPageActiveAnon(page) set_bit(PG_active_anon, &(page)->flags) +Index: linux/include/linux/reboot.h +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/include/linux/reboot.h,v +retrieving revision 1.1.1.1 +retrieving revision 1.1.1.1.10.2 +diff -u -r1.1.1.1 -r1.1.1.1.10.2 +--- linux/include/linux/reboot.h 7 May 2002 21:53:47 -0000 1.1.1.1 ++++ linux/include/linux/reboot.h 1 Apr 2003 17:55:35 -0000 1.1.1.1.10.2 +@@ -20,6 +20,7 @@ + * CAD_OFF Ctrl-Alt-Del sequence sends SIGINT to init task. + * POWER_OFF Stop OS and remove all power from system, if possible. + * RESTART2 Restart system using given command string. ++ * COREDUMP We're taking a core dump, secondary cpus already stopped. 
+ */ + + #define LINUX_REBOOT_CMD_RESTART 0x01234567 +@@ -28,7 +29,9 @@ + #define LINUX_REBOOT_CMD_CAD_OFF 0x00000000 + #define LINUX_REBOOT_CMD_POWER_OFF 0x4321FEDC + #define LINUX_REBOOT_CMD_RESTART2 0xA1B2C3D4 +- ++#ifdef CONFIG_MCL_COREDUMP ++#define LINUX_REBOOT_CMD_COREDUMP 0x9A8BCCDD ++#endif + + #ifdef __KERNEL__ + +Index: linux/include/linux/sysctl.h +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/include/linux/sysctl.h,v +retrieving revision 1.3.2.1 +retrieving revision 1.3.2.1.2.1 +diff -u -r1.3.2.1 -r1.3.2.1.2.1 +--- linux/include/linux/sysctl.h 12 Mar 2003 19:51:30 -0000 1.3.2.1 ++++ linux/include/linux/sysctl.h 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1 +@@ -126,6 +126,7 @@ + KERN_CADPID=54, /* int: PID of the process to notify on CAD */ + KERN_CORE_PATTERN=56, /* string: pattern for core-files */ + KERN_PID_MAX=55, /* int: max PID value of processes */ ++ KERN_PANIC_ON_OOPS=57 /* int: panic on oops enabled; explicit value, since the implicit one (55+1) would duplicate KERN_CORE_PATTERN */ + }; + + +Index: linux/init/main.c +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/init/main.c,v +retrieving revision 1.2.2.1 +retrieving revision 1.2.2.1.2.1 +diff -u -r1.2.2.1 -r1.2.2.1.2.1 +--- linux/init/main.c 12 Mar 2003 19:51:35 -0000 1.2.2.1 ++++ linux/init/main.c 1 Apr 2003 12:17:41 -0000 1.2.2.1.2.1 +@@ -70,6 +70,10 @@ + #include + #endif + ++#ifdef CONFIG_BOOTIMG ++#include ++#endif ++ + /* + * Versions of gcc older than that listed below may actually compile + * and link okay, but the end product can have subtle run time bugs. +@@ -352,10 +356,14 @@ + { + char * command_line; + extern char saved_command_line[]; ++#if defined(CONFIG_BOOTIMG) && defined(CONFIG_X86_LOCAL_APIC) ++ unsigned long value; ++#endif + /* + * Interrupts are still disabled. Do necessary setups, then + * enable them + */ ++ printk("start_kernel\n"); + lock_kernel(); + printk(linux_banner); + setup_arch(&command_line); +@@ -373,12 +381,26 @@ + * this. 
But we do want output early, in case something goes wrong. + */ + console_init(); ++ ++#ifdef CONFIG_BOOTIMG ++ unity_page = alloc_bootmem_pages(PAGE_SIZE); ++ printk("unity_page addr: %p\n",unity_page); ++#endif + #ifdef CONFIG_MODULES + init_modules(); + #endif + profile_init(); + kmem_cache_init(); + sti(); ++#if defined(CONFIG_BOOTIMG) && defined(CONFIG_X86_LOCAL_APIC) ++ /* If we don't make sure the APIC is enabled, AND the LVT0 ++ * register is programmed properly, we won't get timer interrupts ++ */ ++ setup_local_APIC(); ++ ++ value = apic_read(APIC_LVT0); ++ apic_write_around(APIC_LVT0, value & ~APIC_LVT_MASKED); ++#endif + calibrate_delay(); + #ifdef CONFIG_BLK_DEV_INITRD + if (initrd_start && !initrd_below_start_ok && +Index: linux/kernel/Makefile +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/kernel/Makefile,v +retrieving revision 1.1.1.1.4.1 +retrieving revision 1.1.1.1.4.1.2.1 +diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1 +--- linux/kernel/Makefile 12 Mar 2003 19:51:36 -0000 1.1.1.1.4.1 ++++ linux/kernel/Makefile 1 Apr 2003 12:17:41 -0000 1.1.1.1.4.1.2.1 +@@ -22,7 +22,8 @@ + obj-$(CONFIG_PM) += pm.o + obj-$(CONFIG_KALLSYMS) += kallsyms.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o +- ++obj-$(CONFIG_BOOTIMG) += bootimg.o bootimg_pic.o ++obj-$(CONFIG_MCL_COREDUMP) += crash.o + + ifneq ($(CONFIG_IA64),y) + # According to Alan Modra , the -fno-omit-frame-pointer is +Index: linux/kernel/bootimg.c +=================================================================== +RCS file: linux/kernel/bootimg.c +diff -N linux/kernel/bootimg.c +--- /dev/null 1 Jan 1970 00:00:00 -0000 ++++ linux/kernel/bootimg.c 1 Apr 2003 12:17:41 -0000 1.1.6.1 +@@ -0,0 +1,301 @@ ++/* bootimg.c - Boot another (kernel) image */ ++ ++/* Written 2000 by Werner Almesberger */ ++ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#if 0 ++#define 
DPRINTK_CONT(format,args...) printk(format,##args) ++#else ++#define DPRINTK_CONT(format,args...) ++#endif ++#define DPRINTK(format,args...) DPRINTK_CONT(KERN_DEBUG format,##args) ++ ++unsigned long **bootimg_page_dir; ++ ++struct bootimg_dsc bootimg_dsc; /* communication with PIC */ ++unsigned long *unity_page; /* unity-mapped page for i386 */ ++ ++static unsigned long bootimg_checksum(unsigned long **page_dir, int num_pages) ++{ ++ unsigned long checksum, *page; ++ int i, j; ++ ++ checksum = 0; ++ ++ for (i = 0; i < num_pages; i++) { ++ page = __va((unsigned long *) ++ page_dir[FROM_TABLE(i)][PAGE_NR(i)]); ++ ++ for (j = 0; j < PAGES_PER_TABLE; j++) ++ checksum ^= page[j]; ++ ++ checksum ^= page_dir[TO_TABLE(i)][PAGE_NR(i)]; ++ } ++ ++ return checksum; ++} ++ ++#ifdef CONFIG_X86_PAE ++ ++static unsigned long get_identity_mapped_page(void) ++{ ++ pgd_t *pgd; ++ pmd_t *pmd; ++ unsigned long phys_addr, page_base; ++ ++ /* Set up a 2 Mb identity-mapped page. */ ++ ++ phys_addr = virt_to_phys(unity_page); ++ pgd = pgd_offset(current->active_mm, phys_addr); ++ pmd = pmd_offset(pgd, phys_addr); ++ ++ /* We hardcode this rather than using PMD_MASK just in case the PAE ++ * mode setup ever changes so that 2 Mb pages are no longer used. ++ */ ++ page_base = phys_addr & ~((1 << 21) - 1); ++ ++ set_pmd(pmd, __pmd(page_base | _PAGE_PSE | _KERNPG_TABLE)); ++ __flush_tlb_one(phys_addr); ++ ++ return (unsigned long) unity_page; ++} ++ ++#else ++ ++static unsigned long get_identity_mapped_page(void) ++{ ++ set_pgd(pgd_offset(current->active_mm,virt_to_phys(unity_page)), ++ __pgd((_KERNPG_TABLE + _PAGE_PSE + (virt_to_phys(unity_page)&PGDIR_MASK)))); ++ __flush_tlb_one(virt_to_phys(unity_page)); ++ return (unsigned long)unity_page; ++} ++ ++#endif ++ ++#if 0 /* Perhaps we'll need this in the future? 
*/ ++static void unmap_identity_mapped_page(void) ++{ ++ set_pgd(pgd_offset(current->active_mm,virt_to_phys(unity_page)),__pgd(0)); ++ __flush_tlb(); ++} ++#endif ++ ++static int fill_page_dir(unsigned long **page_dir,struct boot_image *image) /* builds the source/destination tables and copies the image in from user space; on error the caller releases everything via free_page_dir() */ ++{ ++ int i, count=0; ++ ++ memset(page_dir,0,PAGE_SIZE); ++ for (i = 0; i < image->pages; i += PAGES_PER_TABLE) { ++ unsigned long **table; ++ int bytes_left; ++ ++ table = page_dir+FROM_TABLE(i); ++ *table = (unsigned long *) get_free_page(GFP_KERNEL); ++ if (!*table) return -ENOMEM; ++ ++ memset(*table,0,PAGE_SIZE); ++ DPRINTK("page %d: from table %p @ %p\n",i,*table,table); ++ table = page_dir+TO_TABLE(i); ++ *table = (unsigned long *) get_free_page(GFP_KERNEL); ++ if (!*table) return -ENOMEM; ++ ++ bytes_left = (image->pages-i)*sizeof(unsigned long); ++ if (copy_from_user(*table,image->load_map+i, ++ bytes_left > PAGE_SIZE ? PAGE_SIZE : bytes_left)) ++ return -EFAULT; ++ DPRINTK("page %d: to table %p @ %p\n",i,*table,table); ++ count+=2; /* 2 pages per loop */ ++ } ++ ++ for (i = 0; i < image->pages; i++) { ++ unsigned long page = get_free_page(GFP_KERNEL); ++ void *src; ++ ++ if (!page) return -ENOMEM; ++ count++; ++ ++ page_dir[FROM_TABLE(i)][PAGE_NR(i)] = ++ virt_to_phys((void *) page); ++ if (get_user(src,image->image_map+i) || ++ copy_from_user((void *) page,src,PAGE_SIZE)) ++ return -EFAULT; ++ ++ DPRINTK("page %d: %p->%p->%p @ %p\n",i,src,(void *) page, ++ (void *) page_dir[FROM_TABLE(i)][PAGE_NR(i)], ++ &page_dir[FROM_TABLE(i)][PAGE_NR(i)]); ++ } ++ ++ DPRINTK("fill_page_dir: %d pages allocated\n", count); ++ ++ return 0; ++} ++ ++ ++static void free_page_dir(unsigned long **page_dir) /* frees the image pages listed in the source tables, then every table page; page_dir itself is freed by the caller */ ++{ ++ int i,j,count=0; ++ ++ for (i = 0; i < PAGES_PER_TABLE/2; i++) ++ if (page_dir[i]) ++ for (j = 0; j < PAGES_PER_TABLE; j++) ++ if (page_dir[i][j]) { ++ free_page((unsigned long) ++ phys_to_virt(page_dir[i][j])); ++ count++; ++ } ++ for (i = 0; i < PAGES_PER_TABLE; i++) ++ if (page_dir[i]) { ++ free_page((unsigned long) 
page_dir[i]); /* the table page itself, not the entry stored in its first slot */ ++ count++; ++ } ++ DPRINTK("free_page_dir: %d pages freed\n", count); ++} ++ ++ ++static void convert_table_refs_to_phys(unsigned long **page_dir) ++{ ++ int i; ++ ++ DPRINTK("PAGES_PER_TABLE: %d\n",PAGES_PER_TABLE); ++ for (i = 0; i < PAGES_PER_TABLE; i++) ++ if (page_dir[i]) { ++ DPRINTK("table %i: mapped %p -> ",i,page_dir[i]); ++ page_dir[i] = (unsigned long *) ++ virt_to_phys(page_dir[i]); ++ DPRINTK_CONT("%p\n",page_dir[i]); ++ } ++} ++ ++ ++ ++static int fill_bootimg_dsc(struct boot_image *image) ++{ ++ unsigned long scratch; ++ int error = -ENOMEM; ++ ++ if(bootimg_page_dir) { ++ /* free previously allocated memory */ ++ free_page_dir(bootimg_page_dir); ++ free_page((unsigned long) bootimg_page_dir); ++ DPRINTK("free_page (bootimg_page_dir)\n"); ++ } ++ ++ bootimg_page_dir = (unsigned long **) get_free_page(GFP_KERNEL); ++ if (!bootimg_page_dir) goto out0; ++ DPRINTK("get_free_page (bootimg_page_dir)\n"); ++ ++ error = fill_page_dir(bootimg_page_dir,image); ++ if (error) goto out1; ++ ++ if(!bootimg_dsc.scratch) { ++ scratch = get_free_page(GFP_KERNEL); ++ DPRINTK("get_free_page (scratch)\n"); ++ } else ++ scratch = 1; /* already allocated */ ++ ++ if (!scratch) { error = -ENOMEM; goto out1; } /* fill_page_dir() left error == 0; report the failed allocation */ ++ /* ++ * Not all architectures need the code to be identity-mapped, but it ++ * can't hurt ... 
++ */ ++ DPRINTK("bootimg_page_dir: mapped %p -> ",bootimg_page_dir); ++ bootimg_dsc.page_dir = (unsigned long **) virt_to_phys(bootimg_page_dir); ++ DPRINTK_CONT("%p\n",bootimg_dsc.page_dir); ++ if(!bootimg_dsc.scratch) ++ bootimg_dsc.scratch = virt_to_phys((void *) scratch); ++ bootimg_dsc.jump_to = (void (*)(void)) image->start; ++ bootimg_dsc.pages = image->pages; ++ bootimg_dsc.csum = bootimg_checksum(bootimg_page_dir, image->pages); ++ ++ return 0; ++ ++out1: ++ free_page_dir(bootimg_page_dir); ++ free_page((unsigned long) bootimg_page_dir); ++ DPRINTK("free_page (bootimg_page_dir)\n"); ++ bootimg_page_dir = 0; ++out0: ++ return error; ++} ++ ++extern char *panicmsg; ++int boot_image() ++{ ++ relocate_and_jump_t code; ++ unsigned long code_page; ++ int error = -ENOMEM; ++ ++ if (bootimg_checksum(__va(bootimg_dsc.page_dir),bootimg_dsc.pages) ++ != bootimg_dsc.csum) ++ printk("Checksum of kernel image failed. Rebooting via BIOS\n"); ++ ++ code_page = get_identity_mapped_page(); ++ if (!code_page) goto out3; ++ code = (relocate_and_jump_t) virt_to_phys((void *) code_page); ++ memcpy(code,&__bootimg_start,&__bootimg_end-&__bootimg_start); ++ flush_icache_range(&__bootimg_start, &__bootimg_end-&__bootimg_start); ++ ++ bootimg_dsc.self = (unsigned long) code; ++ printk(KERN_INFO "Running boot code at 0x%p\n",code); ++ ++ /* ++ * The point of no return. Not even printk may work after a successful ++ * return from become_only_thread. 
++ */ ++ ++ if (!panicmsg) { ++ error = become_only_thread(); ++ if (error) goto out3; ++ } else { ++#ifdef CONFIG_SMP ++ disable_IO_APIC(); ++#endif ++ __cli(); ++ } ++ ++ convert_table_refs_to_phys((unsigned long **)__va(bootimg_dsc.page_dir)); ++ stack_on_page(code); ++ ++ code(); ++ ++ panic("PIC code exec failed"); ++out3: ++ printk("boot_image() failed!\n"); ++ for(;;); ++} ++ ++/* changed from asmlinkage because we're called via an IOCTL on /dev/crash now */ ++int sys_bootimg(struct boot_image *user_dsc) ++{ ++ struct boot_image dsc; ++ ++ if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_MODULE)) return -EPERM; ++ if (&__bootimg_end-&__bootimg_start > PAGE_SIZE-RESERVE_MIN_RELOC_STACK) ++ { ++ printk(KERN_ERR "boot_image: PIC too large (%d bytes)\n", ++ &__bootimg_end-&__bootimg_start); ++ return -EIO; ++ } ++ if ((void *) relocate_and_jump != (void *) &__bootimg_start) { ++ printk(KERN_ERR "boot_image: relocate_and_jump is mis-placed" ++ "(0x%p != 0x%p)\n",relocate_and_jump,&__bootimg_start); ++ return -EIO; ++ } ++ ++ if (copy_from_user(&dsc,user_dsc,sizeof(dsc))) return -EFAULT; ++ if (dsc.pages >= PAGES_PER_TABLE*PAGES_PER_TABLE/2) return -EFBIG; ++ if (dsc.flags) return -EINVAL; /* for future use */ ++ return fill_bootimg_dsc(&dsc); ++} +Index: linux/kernel/bootimg_pic.c +=================================================================== +RCS file: linux/kernel/bootimg_pic.c +diff -N linux/kernel/bootimg_pic.c +--- /dev/null 1 Jan 1970 00:00:00 -0000 ++++ linux/kernel/bootimg_pic.c 1 Apr 2003 12:17:41 -0000 1.1.6.1 +@@ -0,0 +1,91 @@ ++/* bootimg_pic.c - Boot image, position-independent code */ ++ ++/* Written 2000 by Werner Almesberger */ ++ ++/* ++ * Strongly inspired by FiPaBoL designed mainly by Otfried Cheong and Roger ++ * Gammans, and written by the latter. ++ */ ++ ++/* ++ * This code is position-independent and must fit in a single page ! ++ * Furthermore, everything (text+data+stack) has to go into the ++ * .bootimg segment. 
++ */ ++ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define copy_and_swap(from,to) \ ++ ( { my_copy_page(from,to); \ ++ tmp = from; \ ++ from = to; \ ++ to = tmp; } ) ++ ++ ++static inline void my_copy_page(unsigned long from,unsigned long to) ++{ ++ unsigned long end = from+PAGE_SIZE; ++ ++ do *((unsigned long *) to)++ = *((unsigned long *) from)++; ++ while (from != end); ++} ++ ++ ++void __bootimg relocate_and_jump(void) ++{ ++ struct bootimg_dsc dsc = bootimg_dsc; ++ int i; ++ ++ stop_paging(); ++ for (i = 0; i < dsc.pages; i++) { ++ unsigned long from,to,tmp; ++ ++ from = dsc.page_dir[FROM_TABLE(i)][PAGE_NR(i)]; ++ to = dsc.page_dir[TO_TABLE(i)][PAGE_NR(i)]; ++ if (from == to) continue; ++ if (to == dsc.self) { ++ copy_and_swap(dsc.self,dsc.scratch); ++ /* WARNING: flush_icache_range MUST BE INLINED !!! */ ++ flush_icache_range(dsc.self,dsc.self+PAGE_SIZE-1); ++ jump_relocated(dsc.scratch,dsc.self); ++ } ++ else if (to == (unsigned long) dsc.page_dir) ++ copy_and_swap((unsigned long) dsc.page_dir,dsc.scratch); ++ else { ++ /* ++ * O((n^2-n)/2), sigh ... 
++ */ ++ unsigned long **table; ++ int j; ++ ++ for (j = i+1; j < dsc.pages; j++) { ++ table = dsc.page_dir+FROM_TABLE(j); ++ if (((unsigned long) *table) == to) { ++ copy_and_swap(*table,dsc.scratch); ++ break; ++ } ++ if ((*table)[PAGE_NR(j)] == to) { ++ copy_and_swap((*table)[PAGE_NR(j)], ++ dsc.scratch); ++ break; ++ } ++ table = dsc.page_dir+TO_TABLE(j); ++ if (((unsigned long) *table) == to) { ++ copy_and_swap(*table,dsc.scratch); ++ break; ++ } ++ } ++ } ++ my_copy_page(from,to); ++ dsc.scratch = from; ++ } ++ jump_to_kernel(dsc.jump_to); ++} +Index: linux/kernel/crash.c +=================================================================== +RCS file: linux/kernel/crash.c +diff -N linux/kernel/crash.c +--- /dev/null 1 Jan 1970 00:00:00 -0000 ++++ linux/kernel/crash.c 1 Apr 2003 12:17:41 -0000 1.1.6.1 +@@ -0,0 +1,886 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_BOOTIMG ++#include ++#endif ++ ++static void crash_print_data_around(u_long p); ++static void crash_free_page(u_long addr); ++static int crash_chksum_page(u_long pg_addr, u_long * sum_addr); ++static void *czalloc(void *arg, unsigned int items, unsigned int size); ++static void czfree(void *arg, void *ptr); ++static u_long crash_alloc_dest_page(void); ++static void crash_free_dest_page(u_long dest); ++static void init_dest_page_alloc(void); ++static int crash_audit_maps(void); ++static u_long crash_get_source_page(void); ++static u_long crash_update_map(u_long map, u_long src_base, u_long dest, u_long * pages); ++static int crash_reset_stream(z_stream * stream); ++static boolean_t crash_is_kseg(u_long addr); ++static u_long *crash_link(u_long p); ++static int crash_chksum(u_long limit, u_long * sum_addr); ++static int crash_audit_map_page(u_long map); ++static void crash_wait_cpus(void); ++static int crash_is_dir_page(struct page *page); ++ ++/* for the /dev/crash 
interface */ ++int crash_init_chrdev(void); ++static int crashdev_ioctl(struct inode *, struct file *, unsigned int, unsigned long); ++ ++#define CRASH_DEBUG 1 ++ ++#ifdef CONFIG_BOOTIMG ++extern int sys_bootimg(struct boot_image *); ++#endif ++ ++static u_long crash_compr_buf; ++static u_long crash_uncompr_buf; ++static u_long crash_dump_header = 0; ++static u_long crash_dest_free_list = 0; ++static u_long crash_debug = 0; ++ ++static u_long crash_cur_pfn; ++ ++static u_long src_pages_skipped = 0; ++static u_long src_pages_saved = 0; ++static u_long dest_pages_free = 0; ++ ++/* this information is saved from within panic() */ ++char *panicmsg = (char *)0; ++int panic_processor = 0; ++int crash_perform_sync = 0; ++ ++u_int console_crash = 0; /* should be moved to alpha branch */ ++ ++// typedef struct task_struct *task_t; ++ ++/* ++ * Threads active at time of panic: ++ */ ++volatile task_t *panic_threads[NR_CPUS]; ++volatile unsigned long panic_ksp[NR_CPUS]; ++unsigned long *panic_regs = NULL; ++ ++int panic_on_oops; /* for /proc/sys/kernel/panic_on_oops */ ++ ++extern unsigned long max_low_pfn; ++ ++u_long crash_zalloc_start; // , crash_zalloc_end, crash_zalloc_cur; ++ ++/* ++ * Crash Kernel API functions below ++ * crash_pages_needed, computes pages needed for header and compression temp ++ * crash_init, partitions out the allocated pages, sets defaults and ++ * initializes the character device. ++ * crash_mark_dump_reserved, marks pages reserved from a previous dump. ++ * save_core, called at panic time to save a dump to memory. 
++ */ ++u_long crash_pages_needed(void) ++{ ++ /* one for the header */ ++ return (1 + CRASH_ZALLOC_PAGES + CRASH_UNCOMPR_BUF_PAGES + CRASH_COMPR_BUF_PAGES); ++} ++ ++void crash_init(u_long bootmap_va, u_long crash_va, u_long end_alloc_va) ++{ ++ struct mem_crash_map_hdr *header; ++ int i; ++ ++ /* the default behavior is to NOT panic on a kernel OOPS */ ++ panic_on_oops = 0; ++ ++ printk("crash_init (crash_va: %08lx)\n", crash_va); ++ for (i = 0; i < NR_CPUS; i++) ++ panic_threads[i] = 0; ++ crash_dump_header = crash_va; /* carve the region at crash_va in the order crash_pages_needed() sums it: header page first */ ++ crash_va += PAGE_SIZE; ++ crash_zalloc_start = crash_va; /* then CRASH_ZALLOC_PAGES pages */ ++ crash_va += CRASH_ZALLOC_PAGES * PAGE_SIZE; ++ crash_uncompr_buf = crash_va; /* then the uncompressed staging buffer */ ++ crash_va += CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE; ++ crash_compr_buf = crash_va; /* then the compressed output buffer */ ++ crash_va += CRASH_COMPR_BUF_PAGES * PAGE_SIZE; ++#if 0 ++ if (crash_va != end_alloc_va) ++ panic("crash_init inconsistency-1\n"); ++#endif ++ ++ header = (struct mem_crash_map_hdr *)crash_dump_header; ++#ifdef CRASH_DEBUG ++ printk("crash_dump_header %p {\n", header); ++ printk(" magic[0] = %lx\n", header->magic[0]); ++ printk(" map = %lx\n", header->map); ++ printk(" map_pages = %lx\n", header->map_pages); ++ printk(" data_pages = %lx\n", header->data_pages); ++ printk(" compr_units = %lx\n", header->compr_units); ++ printk(" boot_reserved_start = %lx\n", header->boot_reserved_start); ++ printk(" boot_reserved_end = %lx\n", header->boot_reserved_end); ++#endif ++ ++ if (header->magic[0] == CRASH_MAGIC) { /* header page already carries the magic: keep its contents */ ++ printk("crash found\n"); ++ if ((header->boot_reserved_start != bootmap_va) || ++ (header->boot_reserved_end != end_alloc_va)) { ++ /* crash audit will catch the corruption */ ++ printk("crash_init inconsistency, dump may be corrupted\n"); ++ } ++ } else { ++printk("memset..."); ++ memset(header, 0, sizeof(*header)); ++printk("done\n"); ++ } ++ ++ header->boot_reserved_start = bootmap_va; ++ header->boot_reserved_end = end_alloc_va; ++ ++} ++ ++void crash_mark_dump_reserved(void) ++{ ++ struct mem_crash_map_hdr *header; 
++ struct mem_crash_map_entry *m; ++ ++ header = (struct mem_crash_map_hdr *)crash_dump_header; ++ if (header->magic[0] != CRASH_MAGIC) ++ return; ++ m = (struct mem_crash_map_entry *)header->map; ++#ifdef CRASH_DEBUG ++ printk("\n\n\ncrash_mark_dump_reserved\n\n"); ++ printk("crash_dump_header %p {\n", header); ++ printk(" magic[0] = %lx\n", header->magic[0]); ++ printk(" map = %lx\n", header->map); ++ printk(" map_pages = %lx\n", header->map_pages); ++ printk(" data_pages = %lx\n", header->data_pages); ++ printk(" compr_units = %lx\n", header->compr_units); ++ printk(" boot_reserved_start = %lx\n", header->boot_reserved_start); ++ printk(" boot_reserved_end = %lx\n", header->boot_reserved_end); ++ printk("mem_crash_map_entry %p {\n", m); ++ printk(" src_va = %lx\n", m->src_va); ++ printk(" dest_page_va = %lx\n", m->dest_page_va); ++ printk(" check_sum = %lx\n", m->check_sum); ++#endif ++ ++ if (crash_audit_maps()) { ++ header->magic[0] = 0; ++ return; ++ } ++ ++ m = (struct mem_crash_map_entry *)header->map; ++ again: ++ CRASH_MARK_BOOT_RESERVED(m); ++ for (; m->src_va; m++) { ++ if (m->src_va == -1) { ++ m = (struct mem_crash_map_entry *)m->dest_page_va; ++ goto again; ++ } ++ CRASH_MARK_BOOT_RESERVED(m->dest_page_va); ++ } ++ return; ++} ++ ++void save_core(void) ++{ ++ int i, j, k; ++ z_stream stream; ++ int err; ++ struct task_struct *tp; ++ struct mem_crash_map_hdr *header; ++ u_long *sub_map; ++ u_long map; ++ u_long src, dest, unc, cp, src_base, comp_pages; ++ ++ k = 0; ++ dest = 0; ++ __cli(); ++ tp = current; ++ mb(); ++ if (smp_processor_id() != 0) { /* boot_cpu_id is always 0, i think */ ++ panic_threads[smp_processor_id()] = tp; ++ crash_halt_or_reboot(0); ++ } else { ++ if (console_crash) ++ panic_threads[smp_processor_id()] = &init_task_union.task; ++ else ++ panic_threads[smp_processor_id()] = tp; ++ ++ crash_wait_cpus(); ++ } ++ ++ printk("save_core: started on CPU%d\n", smp_processor_id()); ++ if (!crash_dump_header) { ++ printk("save_core: not 
initialized\n"); ++ return; ++ } ++ ++ header = (struct mem_crash_map_hdr *)crash_dump_header; ++ header->magic[0] = 0; ++ header->map_pages = 0; ++ header->data_pages = 0; ++ header->compr_units = 0; ++ header->map = 0; ++ ++ stream.workspace=(void*)crash_zalloc_start; ++ // stream.zalloc = czalloc; ++ // stream.zfree = czfree; ++ // stream.opaque = (voidpf) 0; ++ stream.next_out = (Bytef *) crash_compr_buf; ++ stream.avail_out = (uInt) (CRASH_COMPR_BUF_PAGES * PAGE_SIZE); ++ stream.next_in = (Bytef *) crash_uncompr_buf; ++ stream.avail_in = (uInt) (CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE); ++ err = zlib_deflateInit(&stream, Z_BEST_SPEED); ++ if (err != Z_OK) { ++ printk("save_core: bad return %d from deflateInit\n", err); ++ return; ++ } ++ ++ init_dest_page_alloc(); ++ header->map = map = crash_update_map(0, 0, 0, &header->map_pages); ++ if (!map) { ++ printk("save_core: no dest pages\n"); ++ return; ++ } ++ crash_cur_pfn = 0; ++ src_base = 0; ++ src = 0; ++ for (;;) { ++ sub_map = (u_long *) crash_uncompr_buf; ++ unc = crash_uncompr_buf + CRASH_SUB_MAP_PAGES * PAGE_SIZE; ++ for (i = 0; i < CRASH_SOURCE_PAGES; i++) { ++ src = crash_get_source_page(); ++ if (!src) ++ break; ++ if (!i) ++ src_base = src; ++ if (!crash_is_kseg(unc) || !crash_is_kseg(src)) { ++ printk("unc = 0x%lx, src = 0x%lx, i = %d\n", unc, src, i); ++ i = src = 0; ++ break; ++ } ++ memcpy((void *)unc, (void *)src, PAGE_SIZE); ++ unc += PAGE_SIZE; ++ *sub_map++ = src; ++ } ++ *sub_map = 0; ++ if (!i && !src) ++ break; ++ err = zlib_deflate(&stream, Z_FINISH); ++ if (!(err == Z_STREAM_END)) { ++ zlib_deflateEnd(&stream); ++ printk("save_core: bad return %d from deflate, src_base = 0x%lx\n", err, ++ src_base); ++ return; ++ } ++ comp_pages = (u_long) round_page(stream.total_out) / PAGE_SIZE; ++ if (crash_debug) ++ printk("src_base = 0x%lx compressed data in 0x%lx pages\n", src_base, ++ comp_pages); ++ ++ cp = crash_compr_buf; ++ j = 0; ++ if (crash_debug) ++ printk("\nsrc = %lx\n", src_base); ++ else { 
++ printk("."); ++ if (!(k++ % 64)) ++ printk("\n"); ++ } ++ for (i = 0; i < comp_pages; i++) { ++ dest = crash_alloc_dest_page(); ++ if (crash_debug) { ++ printk("%lx ", dest); ++ if (!(j++ % 8)) ++ printk("\n"); ++ } ++ header->data_pages++; ++ if (!dest) { ++ printk("save_core: no dest pages\n"); ++ return; ++ } ++ if (!crash_is_kseg(dest) || !crash_is_kseg(cp)) { ++ printk("dest = 0x%lx, cp = 0x%lx, i = %d, comp_pages = 0x%lx\n", ++ dest, cp, i, comp_pages); ++ src = 0; ++ break; ++ } ++ memcpy((void *)dest, (void *)cp, PAGE_SIZE); ++ cp += PAGE_SIZE; ++ map = crash_update_map(map, src_base, dest, &header->map_pages); /* links a new map page, if necessary */ ++ if (!map) { ++ printk("save_core: no map\n"); ++ return; ++ } ++ } ++ header->compr_units++; ++ if (!src) ++ break; ++ if (crash_reset_stream(&stream)) ++ return; ++ } ++ ++ map = crash_update_map(map, 0, 0, &header->map_pages); ++ header->magic[0] = CRASH_MAGIC; ++ ++ if (crash_audit_maps()) { ++ header->magic[0] = 0; ++ return; ++ } ++ ++ printk("\nsave_core: src pages skipped = 0x%lx src pages saved = 0x%lx\n", ++ src_pages_skipped, src_pages_saved); ++ printk("save_core: data_pages = 0x%lx map_pages = 0x%lx\n", header->data_pages, ++ header->map_pages); ++ printk("save_core: completed, crash_dump_header = 0x%lx\n", crash_dump_header); ++} ++ ++/* helper functions private to this file */ ++static int crash_reset_stream(z_stream * stream) ++{ ++ int err; ++ ++ stream->workspace=(void*)crash_zalloc_start; ++ // stream->zalloc = czalloc; ++ // stream->zfree = czfree; ++ // stream->opaque = (voidpf) 0; ++ stream->next_out = (Bytef *) crash_compr_buf; ++ stream->avail_out = (uInt) (CRASH_COMPR_BUF_PAGES * PAGE_SIZE); ++ stream->next_in = (Bytef *) crash_uncompr_buf; ++ stream->avail_in = (uInt) (CRASH_UNCOMPR_BUF_PAGES * PAGE_SIZE); ++ err = zlib_deflateReset(stream); ++ if (err != Z_OK) { ++ printk("crash_reset_stream: bad return %d from deflateReset\n", err); ++ return 1; ++ } ++ return 0; ++} ++ 
++static u_long crash_alloc_dest_page(void) ++{ ++ u_long addr; ++ ++ addr = crash_dest_free_list; ++ if (addr) { ++ crash_dest_free_list = *(u_long *) addr; ++ dest_pages_free--; ++ } else ++ printk("crash_alloc_dest_page: free list empty\n"); ++ return addr; ++} ++ ++static void crash_free_dest_page(u_long dest) ++{ ++ if (!dest) { ++ printk("crash_free_dest_page: freeing addr 0\n"); ++ return; ++ } ++ dest_pages_free++; ++ dest = (u_long) trunc_page(dest); ++ *(u_long *) dest = crash_dest_free_list; ++ crash_dest_free_list = dest; ++} ++ ++/* ++ * Stolen from setup.c ++ */ ++#define PFN_PHYS(x) ((x) << PAGE_SHIFT) ++ ++static void init_dest_page_alloc(void) ++{ ++ u_long va; ++ long i; ++ struct page *page; ++ struct mem_crash_map_hdr *header; ++ ++ header = (struct mem_crash_map_hdr *)crash_dump_header; ++ for (i = ((1 << 24) >> PAGE_SHIFT) + LOWER_MEM_FORWARD; ++ i < (max_low_pfn - UPPER_MEM_BACKUP); i++) { ++ va = (u_long) phys_to_virt(PFN_PHYS(i)); ++ if ((va >= header->boot_reserved_start) && (va < header->boot_reserved_end)) ++ continue; ++ page = mem_map + i; ++ if (PageLocked(page) || PageReserved(page)) ++ continue; ++ if (PageFree(page) || PageAnon(page) || PageShm(page) || page->buffers) ++ crash_free_dest_page(va); ++ } ++ if (crash_debug) ++ printk("init_dest_page_alloc: dest_pages_free = 0x%lx\n", dest_pages_free); ++} ++ ++static int crash_is_dir_page(struct page *page) { ++ struct inode *tmp_inode; ++ ++ if(page->mapping && page->mapping->host) { ++ tmp_inode = (struct inode *)page->mapping->host; ++ if((tmp_inode->i_sb->s_magic == EXT2_SUPER_MAGIC) && ++ (S_ISDIR(tmp_inode->i_mode))) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static u_long crash_get_source_page(void) ++{ ++ struct page *page; ++ u_long va; ++ ++ while (crash_cur_pfn < max_low_pfn) { ++ page = mem_map + crash_cur_pfn; ++ if (!(PageFree(page) || PageAnon(page) || PageShm(page) || page->buffers)) ++ break; ++ src_pages_skipped++; ++ crash_cur_pfn++; ++ } ++ if (crash_cur_pfn == 
max_low_pfn) ++ return 0; ++ ++ va = (u_long) phys_to_virt(PFN_PHYS(crash_cur_pfn)); ++ src_pages_saved++; ++ crash_cur_pfn++; ++ return va; ++} ++ ++static u_long crash_update_map(u_long map, u_long src_base, u_long dest, u_long * pages) ++{ ++ struct mem_crash_map_entry *m; ++ ++ ++ if (!map) { ++ (*pages)++; ++ return crash_alloc_dest_page(); ++ } ++ m = (struct mem_crash_map_entry *)map; ++ m->src_va = src_base; ++ m->dest_page_va = dest; ++ if (dest) ++ if (crash_chksum_page(dest, &m->check_sum)) ++ return 0; ++ ++ map += sizeof(struct mem_crash_map_entry); ++ ++ m = (struct mem_crash_map_entry *)map; ++ if (!src_base) { /* end of list */ ++ if (crash_chksum((u_long) m, &m->src_va)) ++ return 0; ++ } else if ((map + 3 * sizeof(struct mem_crash_map_entry)) > (u_long) round_page(map)) { ++ m->src_va = -1; ++ map = m->dest_page_va = crash_alloc_dest_page(); ++ if (crash_debug) ++ printk("\nm = 0x%lx m->src_va = 0x%lx m->dest_page_va = 0x%lx\n", ++ (u_long) trunc_page(m), m->src_va, m->dest_page_va); ++ m++; ++ if (crash_chksum((u_long) m, &m->src_va)) ++ return 0; ++ if (crash_debug) ++ printk("m = 0x%lx chksum = m->src_va = 0x%lx\n", (u_long) trunc_page(m), ++ m->src_va); ++ if (crash_audit_map_page((u_long) m)) ++ return 0; ++ (*pages)++; ++ } ++ return map; ++} ++ ++static int crash_chksum(u_long limit, u_long * sum_addr) ++{ ++ u_long sum; ++ u_long *addr; ++ ++ if (!crash_is_kseg(limit)) { ++ printk("bad addr = 0x%lx to crash_chksum\n", limit); ++ return 1; ++ } ++ sum = 0; ++ addr = (u_long *) trunc_page(limit); ++ for (; (u_long) addr < limit; addr++) ++ sum += *addr; ++ *sum_addr = sum; ++ return 0; ++} ++ ++static int crash_chksum_page(u_long pg_addr, u_long * sum_addr) ++{ ++ u_long sum, limit; ++ u_long *addr; ++ ++ if (!crash_is_kseg(pg_addr)) { ++ printk("bad addr = 0x%lx to crash_chksum_page\n", pg_addr); ++ return 1; ++ } ++ ++ sum = 0; ++ addr = (u_long *) trunc_page(pg_addr); ++ limit = (u_long) addr + PAGE_SIZE; ++ for (; (u_long) addr < limit; 
addr++) ++ sum += *addr; ++ *sum_addr = sum; ++ return 0; ++} ++ ++static int crash_audit_maps(void) ++{ ++ u_long m, count; ++ u_long *link_addr; ++ struct mem_crash_map_hdr *header; ++ ++ header = (struct mem_crash_map_hdr *)crash_dump_header; ++ if (header->magic[0] != CRASH_MAGIC) ++ return 1; ++ ++ link_addr = &header->map; ++ m = header->map; ++ ++ count = 0; ++ for (;;) { ++ if (!crash_is_kseg(m)) { ++ printk("crash_audit_maps: bad link 0x%lx at 0x%lx\n", m, ++ (u_long) link_addr); ++ return 1; ++ } ++ if (crash_audit_map_page(m)) { ++ printk("audit failed while on map page %ld\n", count); ++ return 1; ++ } ++ if (!crash_link(m)) ++ break; ++ link_addr = crash_link(m); ++ m = *link_addr; ++ ++ count++; ++ } ++ return 0; ++} ++ ++static int crash_audit_map_page(u_long map) ++{ ++ struct mem_crash_map_entry *m; ++ u_long sum; ++ ++ if (!map || !crash_is_kseg(map)) { ++ printk("crash_audit_map_page: bad map = 0x%lx\n", map); ++ return 1; ++ } ++ map = (u_long) trunc_page((u_long) map); ++ m = (struct mem_crash_map_entry *)map; ++ for (;;) { ++ if ((m->src_va == -1) || (m->src_va == 0)) { ++ m++; ++ if (crash_chksum((u_long) m, &sum)) ++ return 1; ++ if (m->src_va != sum) { ++ printk("crash_audit_map_page: checksum failure1\n"); ++ printk("m = 0x%lx, sum = 0x%lx, m->src_va = 0x%lx\n", ++ (u_long) m, (u_long) sum, (u_long) m->src_va); ++ crash_print_data_around((u_long) & m->src_va); ++ return 1; ++ } else { ++ return 0; ++ } ++ } else { ++ if (crash_chksum_page((u_long) m->dest_page_va, &sum) ++ || (m->check_sum != sum)) { ++ printk("crash_audit_map_page: checksum failure2\n"); ++ printk ++ ("dest_page_va = 0x%lx, &dest_page_va = 0x%lx, sum = 0x%lx, m->check_sum = 0x%lx\n", ++ (u_long) m->dest_page_va, (u_long) (&m->check_sum), ++ (u_long) sum, (u_long) m->check_sum); ++ crash_print_data_around((u_long) & m->check_sum); ++ return 1; ++ } ++ } ++ m++; ++ } ++} ++ ++static void crash_print_data_around(u_long p) ++{ ++ u_long *a; ++ int i; ++ ++ if 
(!crash_is_kseg(p)) { ++ printk("crash_print_data_around: p = 0x%lx not kseg\n", p); ++ return; ++ } ++ a = (u_long *) p; ++ a -= 20; ++ for (i = 0; i < 40; i++) ++ printk("%lx\n", *a++); ++} ++ ++#ifdef CRASH_DEBUG ++static void crash_print_map_page(u_long map) ++{ ++ struct mem_crash_map_entry *m; ++ int j = 0; ++ u_long sum; ++ ++ map = (u_long) trunc_page((u_long) map); ++ m = (struct mem_crash_map_entry *)map; ++ for (;;) { ++ printk("%lx %lx %lx ", m->src_va, m->dest_page_va, m->check_sum); ++ if (!(j++ % 4)) ++ printk("\n"); ++ if ((m->src_va == -1) || (m->src_va == 0)) { ++ m++; ++ printk("%lx %lx ", m->src_va, m->dest_page_va); ++ if (crash_chksum((u_long) m, &sum)); ++ else ++ printk("\nchksum = 0x%lx\n", sum); ++ return; ++ } ++ m++; ++ } ++} ++#endif /* CRASH_DEBUG */ ++ ++static void crash_wait_cpus(void) ++{ ++ int i; ++ int msecs = 0; ++ ++ for (i = 0; i < smp_num_cpus; i++) { ++ if (i != smp_processor_id()) { ++ while (!panic_threads[i]) { ++ msecs++; ++ mdelay(1); ++ if (msecs > CRASH_CPU_TIMEOUT) { ++ /* if other cpus are still running ++ * we have to halt, otherwise we could ++ * risk using buffer cache pages which ++ * could subsequently get flushed to disk. 
++ */ ++ printk("Unable to halt other CPUs, halting system.\n"); ++ crash_halt_or_reboot(0); ++ } ++ } ++ } ++ } ++ ++ crash_cleanup_smp_state(); ++} ++ ++ ++#if 0 ++static void *czalloc(void *arg, unsigned int items, unsigned int size) ++{ ++ u_long nbytes; ++ u_long addr; ++ ++ nbytes = (u_long) (items * size); ++ nbytes = (u_long) round_page(nbytes); ++ if ((crash_zalloc_cur + nbytes) > crash_zalloc_end) ++ return 0; ++ addr = crash_zalloc_cur; ++ crash_zalloc_cur += nbytes; ++ return ((void *)addr); ++} ++ ++static void czfree(void *arg, void *ptr) ++{ ++ printk("zfree: ptr = 0x%lx\n", (u_long) ptr); ++} ++#endif ++ ++static boolean_t crash_is_kseg(u_long addr) ++{ ++ u_long phys; ++ ++ phys = virt_to_phys((void *)addr); ++ if (phys < PFN_PHYS(max_low_pfn)) ++ return TRUE; ++ else ++ return FALSE; ++} ++ ++static u_long *crash_link(u_long p) ++{ ++ struct mem_crash_map_entry *m; ++ ++ p = (u_long) trunc_page(p); ++ m = (struct mem_crash_map_entry *)p; ++ for (; m->src_va; m++) ++ if (m->src_va == -1) ++ return &m->dest_page_va; ++ ++ return 0; ++} ++ ++/* Call this after data written to disk. 
*/ ++static int crash_free_crashmem(void) ++{ ++ struct mem_crash_map_hdr *header; ++ struct mem_crash_map_entry *m, *last_m; ++ ++ if (crash_debug) ++ printk("crash_free_crashmem: \n"); ++ ++ header = (struct mem_crash_map_hdr *)crash_dump_header; ++ if (crash_audit_maps()) { ++ header->magic[0] = 0; ++ return 1; ++ } ++ m = (struct mem_crash_map_entry *)header->map; ++ again: ++ for (; m->src_va; m++) { ++ if (m->src_va == -1) { ++ last_m = m; ++ m = (struct mem_crash_map_entry *)m->dest_page_va; ++ crash_free_page((unsigned long)last_m); ++ goto again; ++ } ++ crash_free_page(m->dest_page_va); ++ } ++ if (crash_debug) ++ printk("crash_free_crashmem: 0x%lx freed\n", ++ (header->data_pages + header->map_pages) * PAGE_SIZE); ++ header->magic[0] = 0; ++ return 0; ++} ++ ++static void crash_free_page(u_long addr) ++{ ++ struct page *page; ++ ++ page = virt_to_page(addr); ++ ClearPageReserved(page); ++ set_page_count(page, 1); ++ __free_page(page); ++} ++ ++static int get_dump_helper(u_long kva, u_long buf) ++{ ++ struct page *page; ++ struct mem_crash_map_hdr *header; ++ ++ header = (struct mem_crash_map_hdr *)crash_dump_header; ++ if (header->magic[0] != CRASH_MAGIC) ++ return 1; ++ ++ if (!kva) { ++ if (crash_audit_maps()) { ++ printk("get_dump_helper: audit failure\n"); ++ header->magic[0] = 0; ++ return 1; ++ } ++ page = virt_to_page((u_long) crash_dump_header); ++ if (!PageReserved(page)) { ++ printk("not reserved: crash_dump_header = 0x%lx\n", crash_dump_header); ++ return 1; ++ } ++ if (copy_to_user((char *)buf, (char *)crash_dump_header, ++ sizeof(struct mem_crash_map_hdr))) { ++ printk("get_dump_helper: copy_to_user failed1\n"); ++ return 1; ++ } ++ } else { ++ page = virt_to_page(kva); ++ if (!PageReserved(page)) { ++ printk("not reserved: kva = 0x%lx\n", kva); ++ return 1; ++ } ++ if (copy_to_user((char *)buf, (char *)trunc_page(kva), PAGE_SIZE)) { ++ printk("get_dump_helper: copy_to_user failed2\n"); ++ return 1; ++ } ++ } ++ return 0; ++} ++ ++static 
void free_dump_helper(void) ++{ ++ struct mem_crash_map_hdr *header; ++ ++ header = (struct mem_crash_map_hdr *)crash_dump_header; ++ if (header->magic[0] != CRASH_MAGIC) ++ return; ++ if (crash_debug) ++ printk("free_dump_helper\n"); ++ crash_free_crashmem(); ++} ++ ++static int crashdev_open(struct inode *inode, struct file *file) ++{ ++ /* always return success -- nothing to do here */ ++ return 0; ++} ++ ++/* character device implementation */ ++static struct file_operations crashdev_fops = { ++ ioctl:crashdev_ioctl, ++ open:crashdev_open, ++}; ++ ++static struct miscdevice crash_miscdev = { ++ 190, "crash", &crashdev_fops ++}; ++ ++int crash_init_chrdev(void) ++{ ++ int result; ++ ++ result = misc_register(&crash_miscdev); ++ ++ if (result < 0) ++ printk(KERN_WARNING "crash: can't register crash device (c 10 190)\n"); ++ ++ return result; ++} ++ ++/* call the original syscalls, just to get things going */ ++static int crashdev_ioctl(struct inode *inode, struct file *file, ++ unsigned int cmd, unsigned long arg) ++{ ++ int retval = 0; ++ ++ switch (cmd) { ++ case CRASH_IOCFREEDUMP: ++ free_dump_helper(); ++ break; ++ ++ case CRASH_IOCGETDUMP: ++ if (crash_debug) { ++ printk("crashdev_ioctl: get dump\n"); ++ printk("vals: %08lx %08lx\n", ++ ((struct ioctl_getdump *)arg)->kva, ++ ((struct ioctl_getdump *)arg)->buf); ++ } ++ ++ retval = get_dump_helper((u_long) ((struct ioctl_getdump *)arg)->kva, ++ (u_long) ((struct ioctl_getdump *)arg)->buf); ++ break; ++ ++#ifdef CONFIG_BOOTIMG ++ case CRASH_IOCBOOTIMG: ++ if (crash_debug) ++ printk("crashdev_ioctl: bootimg\n"); ++ ++ retval = sys_bootimg((struct boot_image *)arg); ++ break; ++#endif ++ ++ case CRASH_IOCVERSION: ++ if (crash_debug) ++ printk("crashdev_ioctl: version\n"); ++ retval = CRASH_K_MINOR | (CRASH_K_MAJOR << 16); ++ break; ++ ++ default: ++ return -EINVAL; ++ } ++ ++ return retval; ++} +Index: linux/kernel/module.c +=================================================================== +RCS file: 
/chaos/cvs/kernel-rh/linux/kernel/module.c,v +retrieving revision 1.1.1.1.4.1 +retrieving revision 1.1.1.1.4.1.2.1 +diff -u -r1.1.1.1.4.1 -r1.1.1.1.4.1.2.1 +--- linux/kernel/module.c 12 Mar 2003 19:51:36 -0000 1.1.1.1.4.1 ++++ linux/kernel/module.c 1 Apr 2003 12:17:41 -0000 1.1.1.1.4.1.2.1 +@@ -311,7 +311,14 @@ + error = -EEXIST; + goto err1; + } ++#if defined(CONFIG_MCL_COREDUMP) ++ /* Call vmalloc_32 instead of module_map (vmalloc for i386) ++ * to avoid being mapped in highmem where mcore can't see us. ++ */ ++ if ((mod = (struct module *)vmalloc_32(size)) == NULL) { ++#else + if ((mod = (struct module *)module_map(size)) == NULL) { ++#endif + error = -ENOMEM; + goto err1; + } +Index: linux/kernel/panic.c +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/kernel/panic.c,v +retrieving revision 1.3.2.1 +retrieving revision 1.3.2.1.2.1 +diff -u -r1.3.2.1 -r1.3.2.1.2.1 +--- linux/kernel/panic.c 12 Mar 2003 19:51:36 -0000 1.3.2.1 ++++ linux/kernel/panic.c 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1 +@@ -19,6 +19,10 @@ + #include + #include + ++#ifdef CONFIG_MCL_COREDUMP ++#include ++#endif ++ + asmlinkage void sys_sync(void); /* it's really int */ + + int panic_timeout; +@@ -197,20 +201,43 @@ + unsigned long caller = (unsigned long) __builtin_return_address(0); + #endif + ++#ifdef CONFIG_MCL_COREDUMP ++ crash_save_regs(); ++#endif ++ + bust_spinlocks(1); + va_start(args, fmt); + vsprintf(buf, fmt, args); + va_end(args); + printk(KERN_EMERG "Kernel panic: %s\n",buf); ++ ++#ifdef CONFIG_MCL_COREDUMP ++ if (!panicmsg) { ++ panicmsg = buf; ++ panic_processor = smp_processor_id(); ++ mb(); ++ } ++#endif ++ + if (netdump_func) + BUG(); + if (in_interrupt()) + printk(KERN_EMERG "In interrupt handler - not syncing\n"); + else if (!current->pid) + printk(KERN_EMERG "In idle task - not syncing\n"); ++#ifdef CONFIG_MCL_COREDUMP ++ else if (crash_perform_sync) ++#else + else ++#endif + sys_sync(); ++ + bust_spinlocks(0); ++ 
++#ifdef CONFIG_MCL_COREDUMP ++ smp_call_function((void *)smp_crash_funnel_cpu,0,0,0); ++ crash_save_current_state(current); ++#endif + + #ifdef CONFIG_SMP + smp_send_stop(); +Index: linux/kernel/sysctl.c +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/kernel/sysctl.c,v +retrieving revision 1.2.2.1 +retrieving revision 1.2.2.1.2.1 +diff -u -r1.2.2.1 -r1.2.2.1.2.1 +--- linux/kernel/sysctl.c 12 Mar 2003 19:51:36 -0000 1.2.2.1 ++++ linux/kernel/sysctl.c 1 Apr 2003 12:17:41 -0000 1.2.2.1.2.1 +@@ -37,6 +37,10 @@ + #include + #endif + ++#ifdef CONFIG_MCL_COREDUMP ++#include ++#endif ++ + #if defined(CONFIG_SYSCTL) + + /* External variables not in a header file. */ +@@ -247,6 +251,10 @@ + {KERN_SYSRQ, "sysrq", &sysrq_enabled, sizeof (int), + 0644, NULL, &proc_dointvec}, + #endif ++#ifdef CONFIG_MCL_COREDUMP ++ {KERN_PANIC_ON_OOPS, "panic_on_oops", &panic_on_oops, sizeof(int), ++ 0644, NULL, &proc_dointvec}, ++#endif + {KERN_CADPID, "cad_pid", &cad_pid, sizeof (int), + 0600, NULL, &proc_dointvec}, + {KERN_MAX_THREADS, "threads-max", &max_threads, sizeof(int), +Index: linux/lib/Config.in +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/lib/Config.in,v +retrieving revision 1.2 +retrieving revision 1.2.4.1 +diff -u -r1.2 -r1.2.4.1 +--- linux/lib/Config.in 14 Feb 2003 22:59:23 -0000 1.2 ++++ linux/lib/Config.in 1 Apr 2003 12:17:41 -0000 1.2.4.1 +@@ -23,12 +23,14 @@ + fi + fi + +-if [ "$CONFIG_PPP_DEFLATE" = "y" -o \ ++if [ "$CONFIG_MCL_COREDUMP" = "y" -o \ ++ "$CONFIG_PPP_DEFLATE" = "y" -o \ + "$CONFIG_JFFS2_FS" = "y" ]; then + define_tristate CONFIG_ZLIB_DEFLATE y + else + if [ "$CONFIG_PPP_DEFLATE" = "m" -o \ +- "$CONFIG_JFFS2_FS" = "m" ]; then ++ "$CONFIG_JFFS2_FS" = "m" -o \ ++ "$CONFIG_MCL_COREDUMP" = "m" ]; then + define_tristate CONFIG_ZLIB_DEFLATE m + else + tristate 'zlib compression support' CONFIG_ZLIB_DEFLATE +Index: linux/mm/memory.c 
+=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/mm/memory.c,v +retrieving revision 1.3.2.1 +retrieving revision 1.3.2.1.2.1 +diff -u -r1.3.2.1 -r1.3.2.1.2.1 +--- linux/mm/memory.c 12 Mar 2003 19:51:37 -0000 1.3.2.1 ++++ linux/mm/memory.c 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1 +@@ -1381,6 +1381,10 @@ + } + lock_page(page); + ++#ifdef CONFIG_MCL_COREDUMP ++ set_bit(PG_anon, &page->flags); ++#endif ++ + /* + * Back out if somebody else faulted in this pte while we + * released the page table lock. +@@ -1470,6 +1474,9 @@ + mm->rss++; + flush_page_to_ram(page); + entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); ++#ifdef CONFIG_MCL_COREDUMP ++ set_bit(PG_anon, &page->flags); ++#endif + lru_cache_add(page); + } + +Index: linux/mm/page_alloc.c +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/mm/page_alloc.c,v +retrieving revision 1.3.2.1 +retrieving revision 1.3.2.1.2.1 +diff -u -r1.3.2.1 -r1.3.2.1.2.1 +--- linux/mm/page_alloc.c 12 Mar 2003 19:51:37 -0000 1.3.2.1 ++++ linux/mm/page_alloc.c 1 Apr 2003 12:17:41 -0000 1.3.2.1.2.1 +@@ -95,6 +95,10 @@ + struct page *base; + per_cpu_t *per_cpu; + zone_t *zone; ++#ifdef CONFIG_MCL_COREDUMP ++ struct page *pagemap; ++ int count = 1<lock); + ++#ifdef CONFIG_MCL_COREDUMP ++ pagemap = page; ++ do { ++ pagemap->flags |= (1<flags &= ~((1<free_pages -= mask; + + while (mask + (1 << (MAX_ORDER-1))) { +@@ -268,6 +281,16 @@ + zone->free_pages -= 1UL << order; + + page = expand(zone, page, index, order, curr_order, area); ++#ifdef CONFIG_MCL_COREDUMP ++ { ++ struct page *pagemap = page; ++ int count = 1<flags &= ~(1<lock, flags); + + set_page_count(page, 1); +Index: linux/arch/i386//boot/compressed/head.S +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/boot/compressed/head.S,v +retrieving revision 1.1.1.1 +retrieving revision 1.1.1.1.12.6 
+diff -u -r1.1.1.1 -r1.1.1.1.12.6 +--- linux/arch/i386//boot/compressed/head.S 7 May 2002 21:53:54 -0000 1.1.1.1 ++++ linux/arch/i386//boot/compressed/head.S 5 Apr 2003 05:51:27 -0000 1.1.1.1.12.6 +@@ -23,6 +23,7 @@ + */ + .text + ++#include + #include + #include + +@@ -31,6 +32,55 @@ + startup_32: + cld + cli ++ ++#ifdef CONFIG_BOOTIMG ++/* ++ * GDT is invalid if we're booted by bootimg, so reload it now ++ */ ++ lgdt %cs:gdt_descr ++ ljmp $(__KERNEL_CS),$1f ++ ++gdt_table_limit = gdt_table_end - gdt_table - 1 ++gdt_descr: ++ .word gdt_table_limit ++ .long gdt_table ++ ++gdt_table: /* stolen from arch/i386/kernel/head.S */ ++ .quad 0x0000000000000000 /* NULL descriptor */ ++ .quad 0x0000000000000000 /* 0x0b reserved */ ++ .quad 0x0000000000000000 /* 0x13 reserved */ ++ .quad 0x0000000000000000 /* 0x1b reserved */ ++ .quad 0x00cffa000000ffff /* 0x23 user 4GB code at 0x00000000 */ ++ .quad 0x00cff2000000ffff /* 0x2b user 4GB data at 0x00000000 */ ++ .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ ++ .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ ++ .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ ++ .quad 0x0000000000000000 /* 0x4b reserved */ ++ .quad 0x0000000000000000 /* 0x53 reserved */ ++ .quad 0x0000000000000000 /* 0x5b reserved */ ++ ++ .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ ++ .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ ++ .quad 0x0000000000000000 /* 0x70 TSS descriptor */ ++ .quad 0x0000000000000000 /* 0x78 LDT descriptor */ ++ ++ /* Segments used for calling PnP BIOS */ ++ .quad 0x00c09a0000000000 /* 0x80 32-bit code */ ++ .quad 0x00809a0000000000 /* 0x88 16-bit code */ ++ .quad 0x0080920000000000 /* 0x90 16-bit data */ ++ .quad 0x0080920000000000 /* 0x98 16-bit data */ ++ .quad 0x0080920000000000 /* 0xa0 16-bit data */ ++ /* ++ * The APM segments have byte granularity and their bases ++ * and limits are set at run time. 
++ */ ++ .quad 0x00409a0000000000 /* 0xa8 APM CS code */ ++ .quad 0x00009a0000000000 /* 0xb0 APM CS 16 code (16 bit) */ ++ .quad 0x0040920000000000 /* 0xb8 APM DS data */ ++gdt_table_end: ++ ++1: ++#endif + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es +@@ -92,7 +142,6 @@ + cld + rep + movsl +- + popl %esi # discard the address + popl %ebx # real mode pointer + popl %esi # low_buffer_start +@@ -124,5 +173,10 @@ + movsl + movl %ebx,%esi # Restore setup pointer + xorl %ebx,%ebx ++#ifdef CONFIG_BOOTIMG ++ movl $0x100000,%eax ++ jmpl *%eax ++#else + ljmp $(__KERNEL_CS), $0x100000 ++#endif + move_routine_end: +Index: linux/arch/i386//kernel/head.S +=================================================================== +RCS file: /chaos/cvs/kernel-rh/linux/arch/i386/kernel/head.S,v +retrieving revision 1.2.2.1 +retrieving revision 1.2.2.1.2.5 +diff -u -r1.2.2.1 -r1.2.2.1.2.5 +--- linux/arch/i386//kernel/head.S 12 Mar 2003 19:49:06 -0000 1.2.2.1 ++++ linux/arch/i386//kernel/head.S 5 Apr 2003 05:51:27 -0000 1.2.2.1.2.5 +@@ -42,6 +42,21 @@ + * On entry, %esi points to the real-mode code as a 32-bit pointer. 
+ */ + startup_32: ++#ifdef CONFIG_BOOTIMG ++/* ++ * GDT is invalid if we're booted by bootimg, so reload it now ++ */ ++ lgdt %cs:_gdt_descr-__PAGE_OFFSET ++ ljmp $(__KERNEL_CS),$1f-__PAGE_OFFSET ++ ++gdt_limit = SYMBOL_NAME(cpu_gdt_table_end) - SYMBOL_NAME(cpu_gdt_table) - 1 ++ ++_gdt_descr: ++ .word gdt_limit ++ .long SYMBOL_NAME(cpu_gdt_table)-__PAGE_OFFSET ++ ++1: ++#endif + /* + * Set segments to known values + */ +@@ -452,6 +467,7 @@ + .quad 0x00409a0000000000 /* 0xa8 APM CS code */ + .quad 0x00009a0000000000 /* 0xb0 APM CS 16 code (16 bit) */ + .quad 0x0040920000000000 /* 0xb8 APM DS data */ ++ENTRY(cpu_gdt_table_end) + + #if CONFIG_SMP + .fill (NR_CPUS-1)*GDT_ENTRIES,8,0 /* other CPU's GDT */ diff --git a/lustre/kernel_patches/patches/patch-2.4.18-hp1_pnnl18.2.8qsnet.patch b/lustre/kernel_patches/patches/patch-2.4.18-hp1_pnnl18.2.8qsnet.patch deleted file mode 100644 index f25baa4..0000000 --- a/lustre/kernel_patches/patches/patch-2.4.18-hp1_pnnl18.2.8qsnet.patch +++ /dev/null @@ -1,1673 +0,0 @@ ---- linux-pristine/./include/linux/lustre_version.h Wed Dec 31 19:00:00 1969 -+++ linux/./include/linux/lustre_version.h Tue Nov 26 07:02:14 2002 -@@ -0,0 +1 @@ -+#define LUSTRE_KERNEL_VERSION 5 ---- linux-pristine/./arch/ia64/mm/init.c Thu Dec 5 10:47:25 2002 -+++ linux/./arch/ia64/mm/init.c Fri Nov 29 18:06:20 2002 -@@ -44,6 +44,12 @@ - - static struct page *vmem_map; - -+struct page *check_get_page(unsigned long kaddr) -+{ -+#warning FIXME: Lustre team, is this solid? -+ return virt_to_page(kaddr); -+} -+ - int - do_check_pgt_cache (int low, int high) - { ---- linux-pristine/./arch/i386/mm/init.c Thu Dec 5 10:47:24 2002 -+++ linux/./arch/i386/mm/init.c Fri Nov 29 18:06:20 2002 -@@ -43,6 +43,12 @@ - static unsigned long totalram_pages; - static unsigned long totalhigh_pages; - -+struct page *check_get_page(unsigned long kaddr) -+{ -+#warning FIXME: Lustre team, is this solid? 
-+ return virt_to_page(kaddr); -+} -+ - int do_check_pgt_cache(int low, int high) - { - int freed = 0; ---- linux-pristine/./drivers/block/blkpg.c Thu Dec 5 10:47:36 2002 -+++ linux/./drivers/block/blkpg.c Fri Nov 29 18:08:05 2002 -@@ -308,6 +308,41 @@ - - EXPORT_SYMBOL(blk_ioctl); - -+#define NUM_DEV_NO_WRITE 16 -+static int dev_no_write[NUM_DEV_NO_WRITE]; -+ -+/* -+ * Debug code for turning block devices "read-only" (will discard writes -+ * silently). This is for filesystem crash/recovery testing. -+ */ -+void dev_set_rdonly(kdev_t dev, int no_write) -+{ -+ if (dev) { -+ printk(KERN_WARNING "Turning device %s read-only\n", -+ bdevname(dev)); -+ dev_no_write[no_write] = 0xdead0000 + dev; -+ } -+} -+ -+int dev_check_rdonly(kdev_t dev) { -+ int i; -+ -+ for (i = 0; i < NUM_DEV_NO_WRITE; i++) { -+ if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 && -+ dev == (dev_no_write[i] & 0xffff)) -+ return 1; -+ } -+ return 0; -+} -+ -+void dev_clear_rdonly(int no_write) { -+ dev_no_write[no_write] = 0; -+} -+ -+EXPORT_SYMBOL(dev_set_rdonly); -+EXPORT_SYMBOL(dev_check_rdonly); -+EXPORT_SYMBOL(dev_clear_rdonly); -+ - /********************* - * get_last_sector() - * ---- linux-pristine/./drivers/block/loop.c Thu Dec 5 10:47:37 2002 -+++ linux/./drivers/block/loop.c Fri Nov 29 18:06:20 2002 -@@ -471,6 +471,11 @@ - spin_unlock_irq(&lo->lo_lock); - - if (rw == WRITE) { -+#ifdef CONFIG_DEV_RDONLY -+ if (dev_check_rdonly(rbh->b_rdev)) -+ goto err; -+#endif -+ - if (lo->lo_flags & LO_FLAGS_READ_ONLY) - goto err; - } else if (rw == READA) { ---- linux-pristine/./drivers/ide/ide-disk.c Thu Dec 5 10:47:59 2002 -+++ linux/./drivers/ide/ide-disk.c Fri Nov 29 18:06:20 2002 -@@ -367,6 +367,12 @@ - */ - static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) - { -+#ifdef CONFIG_DEV_RDONLY -+ if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) { -+ ide_end_request(1, HWGROUP(drive)); -+ return ide_stopped; -+ } -+#endif - if (IDE_CONTROL_REG) - 
OUT_BYTE(drive->ctl,IDE_CONTROL_REG); - OUT_BYTE(0x00, IDE_FEATURE_REG); ---- linux-pristine/./fs/ext3/Makefile Thu Dec 5 10:49:13 2002 -+++ linux/./fs/ext3/Makefile Fri Nov 29 18:06:20 2002 -@@ -9,6 +9,8 @@ - - O_TARGET := ext3.o - -+export-objs := super.o -+ - obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o - obj-m := $(O_TARGET) ---- linux-pristine/./fs/ext3/super.c Thu Dec 5 10:49:13 2002 -+++ linux/./fs/ext3/super.c Fri Nov 29 18:06:20 2002 -@@ -1744,7 +1744,7 @@ - unregister_filesystem(&ext3_fs_type); - } - --EXPORT_NO_SYMBOLS; -+EXPORT_SYMBOL(ext3_bread); - - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); ---- linux-pristine/./fs/jbd/commit.c Thu Dec 5 10:49:15 2002 -+++ linux/./fs/jbd/commit.c Fri Nov 29 18:06:20 2002 -@@ -475,7 +475,7 @@ - transaction's t_log_list queue, and metadata buffers are on - the t_iobuf_list queue. - -- Wait for the transactions in reverse order. That way we are -+ Wait for the buffers in reverse order. That way we are - less likely to be woken up until all IOs have completed, and - so we incur less scheduling load. - */ -@@ -566,8 +566,10 @@ - - jbd_debug(3, "JBD: commit phase 6\n"); - -- if (is_journal_aborted(journal)) -+ if (is_journal_aborted(journal)) { -+ unlock_journal(journal); - goto skip_commit; -+ } - - /* Done it all: now write the commit record. We should have - * cleaned up our previous buffers by now, so if we are in abort -@@ -577,6 +579,7 @@ - descriptor = journal_get_descriptor_buffer(journal); - if (!descriptor) { - __journal_abort_hard(journal); -+ unlock_journal(journal); - goto skip_commit; - } - -@@ -600,7 +603,6 @@ - put_bh(bh); /* One for getblk() */ - journal_unlock_journal_head(descriptor); - } -- lock_journal(journal); - - /* End of a transaction! 
Finally, we can do checkpoint - processing: any buffers committed as a result of this -@@ -609,6 +611,25 @@ - - skip_commit: - -+ /* Call any callbacks that had been registered for handles in this -+ * transaction. It is up to the callback to free any allocated -+ * memory. -+ */ -+ if (!list_empty(&commit_transaction->t_jcb)) { -+ struct list_head *p, *n; -+ int error = is_journal_aborted(journal); -+ -+ list_for_each_safe(p, n, &commit_transaction->t_jcb) { -+ struct journal_callback *jcb; -+ -+ jcb = list_entry(p, struct journal_callback, jcb_list); -+ list_del(p); -+ jcb->jcb_func(jcb, error); -+ } -+ } -+ -+ lock_journal(journal); -+ - jbd_debug(3, "JBD: commit phase 7\n"); - - J_ASSERT(commit_transaction->t_sync_datalist == NULL); ---- linux-pristine/./fs/jbd/journal.c Thu Dec 5 10:49:15 2002 -+++ linux/./fs/jbd/journal.c Fri Nov 29 18:06:20 2002 -@@ -58,6 +58,7 @@ - #endif - EXPORT_SYMBOL(journal_flush); - EXPORT_SYMBOL(journal_revoke); -+EXPORT_SYMBOL(journal_callback_set); - - EXPORT_SYMBOL(journal_init_dev); - EXPORT_SYMBOL(journal_init_inode); ---- linux-pristine/./fs/jbd/transaction.c Thu Dec 5 10:49:15 2002 -+++ linux/./fs/jbd/transaction.c Fri Nov 29 18:06:20 2002 -@@ -57,6 +57,7 @@ - transaction->t_state = T_RUNNING; - transaction->t_tid = journal->j_transaction_sequence++; - transaction->t_expires = jiffies + journal->j_commit_interval; -+ INIT_LIST_HEAD(&transaction->t_jcb); - - /* Set up the commit timer for the new transaction. */ - J_ASSERT (!journal->j_commit_timer_active); -@@ -201,6 +202,20 @@ - return 0; - } - -+/* Allocate a new handle. This should probably be in a slab... */ -+static handle_t *new_handle(int nblocks) -+{ -+ handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ if (!handle) -+ return NULL; -+ memset(handle, 0, sizeof (handle_t)); -+ handle->h_buffer_credits = nblocks; -+ handle->h_ref = 1; -+ INIT_LIST_HEAD(&handle->h_jcb); -+ -+ return handle; -+} -+ - /* - * Obtain a new handle. 
- * -@@ -227,14 +242,11 @@ - handle->h_ref++; - return handle; - } -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = start_this_handle(journal, handle); -@@ -333,14 +345,11 @@ - - if (is_journal_aborted(journal)) - return ERR_PTR(-EIO); -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = try_start_this_handle(journal, handle); -@@ -1328,6 +1337,28 @@ - #endif - - /* -+ * Register a callback function for this handle. The function will be -+ * called when the transaction that this handle is part of has been -+ * committed to disk with the original callback data struct and the -+ * error status of the journal as parameters. There is no guarantee of -+ * ordering between handles within a single transaction, nor between -+ * callbacks registered on the same handle. -+ * -+ * The caller is responsible for allocating the journal_callback struct. -+ * This is to allow the caller to add as much extra data to the callback -+ * as needed, but reduce the overhead of multiple allocations. The caller -+ * allocated struct must start with a struct journal_callback at offset 0, -+ * and has the caller-specific data afterwards. -+ */ -+void journal_callback_set(handle_t *handle, -+ void (*func)(struct journal_callback *jcb, int error), -+ struct journal_callback *jcb) -+{ -+ list_add(&jcb->jcb_list, &handle->h_jcb); -+ jcb->jcb_func = func; -+} -+ -+/* - * All done for a particular handle. - * - * There is not much action needed here. 
We just return any remaining -@@ -1393,7 +1424,10 @@ - wake_up(&journal->j_wait_transaction_locked); - } - -- /* -+ /* Move callbacks from the handle to the transaction. */ -+ list_splice(&handle->h_jcb, &transaction->t_jcb); -+ -+ /* - * If the handle is marked SYNC, we need to set another commit - * going! We also want to force a commit if the current - * transaction is occupying too much of the log, or if the ---- linux-pristine/./include/linux/blkdev.h Thu Dec 5 10:49:41 2002 -+++ linux/./include/linux/blkdev.h Fri Nov 29 18:30:34 2002 -@@ -228,4 +228,8 @@ - return retval; - } - -+#define CONFIG_DEV_RDONLY -+void dev_set_rdonly(kdev_t, int); -+int dev_check_rdonly(kdev_t); -+void dev_clear_rdonly(int); - #endif ---- linux-pristine/./include/linux/slab.h Thu Dec 5 10:49:53 2002 -+++ linux/./include/linux/slab.h Fri Nov 29 18:30:15 2002 -@@ -58,6 +58,7 @@ - extern void *kmem_cache_alloc(kmem_cache_t *, int); - extern void *kmem_cache_zalloc(kmem_cache_t *, int); - extern void kmem_cache_free(kmem_cache_t *, void *); -+extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp); - - extern void *kmalloc(size_t, int); - extern void kfree(const void *); ---- linux-pristine/./include/linux/jbd.h Thu Dec 5 10:49:43 2002 -+++ linux/./include/linux/jbd.h Fri Nov 29 18:50:01 2002 -@@ -249,6 +249,13 @@ - return bh->b_private; - } - -+#define HAVE_JOURNAL_CALLBACK_STATUS -+struct journal_callback { -+ struct list_head jcb_list; -+ void (*jcb_func)(struct journal_callback *jcb, int error); -+ /* user data goes here */ -+}; -+ - struct jbd_revoke_table_s; - - /* The handle_t type represents a single atomic update being performed -@@ -279,6 +286,12 @@ - operations */ - int h_err; - -+ /* List of application registered callbacks for this handle. -+ * The function(s) will be called after the transaction that -+ * this handle is part of has been committed to disk. 
-+ */ -+ struct list_head h_jcb; -+ - /* Flags */ - unsigned int h_sync: 1; /* sync-on-close */ - unsigned int h_jdata: 1; /* force data journaling */ -@@ -398,6 +411,10 @@ - - /* How many handles used this transaction? */ - int t_handle_count; -+ -+ /* List of registered callback functions for this transaction. -+ * Called when the transaction is committed. */ -+ struct list_head t_jcb; - }; - - -@@ -646,6 +663,9 @@ - extern int journal_try_to_free_buffers(journal_t *, struct page *, int); - extern int journal_stop(handle_t *); - extern int journal_flush (journal_t *); -+extern void journal_callback_set(handle_t *handle, -+ void (*fn)(struct journal_callback *,int), -+ struct journal_callback *jcb); - - extern void journal_lock_updates (journal_t *); - extern void journal_unlock_updates (journal_t *); ---- linux-pristine/./kernel/ksyms.c Thu Dec 5 10:50:01 2002 -+++ linux/./kernel/ksyms.c Fri Nov 29 18:37:23 2002 -@@ -271,6 +271,7 @@ - EXPORT_SYMBOL(set_page_dirty); - EXPORT_SYMBOL(vfs_readlink); - EXPORT_SYMBOL(vfs_follow_link); -+EXPORT_SYMBOL(vfs_follow_link_it); - EXPORT_SYMBOL(page_readlink); - EXPORT_SYMBOL(page_follow_link); - EXPORT_SYMBOL(page_symlink_inode_operations); -@@ -285,6 +286,11 @@ - EXPORT_SYMBOL(nr_free_pages); - EXPORT_SYMBOL(page_cache_size); - -+/* lustre */ -+EXPORT_SYMBOL(pagecache_lock); -+EXPORT_SYMBOL(do_kern_mount); -+EXPORT_SYMBOL(kmem_cache_validate); -+ - /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) 
*/ - EXPORT_SYMBOL(default_llseek); - EXPORT_SYMBOL(dentry_open); ---- linux-pristine/./include/linux/dcache.h Thu Dec 5 10:49:42 2002 -+++ linux/./include/linux/dcache.h Fri Nov 29 18:30:11 2002 -@@ -6,6 +6,34 @@ - #include - #include - -+#define IT_OPEN (1) -+#define IT_CREAT (1<<1) -+#define IT_MKDIR (1<<2) -+#define IT_LINK (1<<3) -+#define IT_LINK2 (1<<4) -+#define IT_SYMLINK (1<<5) -+#define IT_UNLINK (1<<6) -+#define IT_RMDIR (1<<7) -+#define IT_RENAME (1<<8) -+#define IT_RENAME2 (1<<9) -+#define IT_READDIR (1<<10) -+#define IT_GETATTR (1<<11) -+#define IT_SETATTR (1<<12) -+#define IT_READLINK (1<<13) -+#define IT_MKNOD (1<<14) -+#define IT_LOOKUP (1<<15) -+ -+struct lookup_intent { -+ int it_op; -+ int it_mode; -+ int it_disposition; -+ int it_status; -+ struct iattr *it_iattr; -+ __u64 it_lock_handle[2]; -+ int it_lock_mode; -+ void *it_data; -+}; -+ - /* - * linux/include/linux/dcache.h - * -@@ -78,6 +106,7 @@ - unsigned long d_time; /* used by d_revalidate */ - struct dentry_operations *d_op; - struct super_block * d_sb; /* The root of the dentry tree */ -+ struct lookup_intent *d_it; - unsigned long d_vfs_flags; - void * d_fsdata; /* fs-specific data */ - unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ -@@ -90,6 +119,8 @@ - int (*d_delete)(struct dentry *); - void (*d_release)(struct dentry *); - void (*d_iput)(struct dentry *, struct inode *); -+ int (*d_revalidate2)(struct dentry *, int, struct lookup_intent *); -+ void (*d_intent_release)(struct dentry *, struct lookup_intent *); - }; - - /* the dentry parameter passed to d_hash and d_compare is the parent ---- linux-pristine/./include/linux/fs.h Thu Dec 5 10:49:42 2002 -+++ linux/./include/linux/fs.h Fri Nov 29 18:30:15 2002 -@@ -588,6 +588,7 @@ - - /* needed for tty driver, and maybe others */ - void *private_data; -+ struct lookup_intent *f_intent; - - /* preallocated helper kiobuf to speedup O_DIRECT */ - struct kiobuf *f_iobuf; -@@ -849,7 +850,9 @@ - extern int vfs_link(struct dentry 
*, struct inode *, struct dentry *); - extern int vfs_rmdir(struct inode *, struct dentry *); - extern int vfs_unlink(struct inode *, struct dentry *); --extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, -+ struct inode *new_dir, struct dentry *new_dentry, -+ struct lookup_intent *it); - - /* - * File types -@@ -911,6 +914,7 @@ - struct inode_operations { - int (*create) (struct inode *,struct dentry *,int); - struct dentry * (*lookup) (struct inode *,struct dentry *); -+ struct dentry * (*lookup2) (struct inode *,struct dentry *, struct lookup_intent *); - int (*link) (struct dentry *,struct inode *,struct dentry *); - int (*unlink) (struct inode *,struct dentry *); - int (*symlink) (struct inode *,struct dentry *,const char *); -@@ -921,6 +925,8 @@ - struct inode *, struct dentry *); - int (*readlink) (struct dentry *, char *,int); - int (*follow_link) (struct dentry *, struct nameidata *); -+ int (*follow_link2) (struct dentry *, struct nameidata *, -+ struct lookup_intent *it); - void (*truncate) (struct inode *); - int (*permission) (struct inode *, int); - int (*revalidate) (struct dentry *); -@@ -1063,7 +1069,7 @@ - extern struct vfsmount *kern_mount(struct file_system_type *); - extern int may_umount(struct vfsmount *); - extern long do_mount(char *, char *, char *, unsigned long, void *); -- -+struct vfsmount *do_kern_mount(char *type, int flags, char *name, void *data); - #define kern_umount mntput - - extern int vfs_statfs(struct super_block *, struct statfs *); -@@ -1387,6 +1393,7 @@ - extern loff_t default_llseek(struct file *file, loff_t offset, int origin); - - extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); -+extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it)); - extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); - extern int 
FASTCALL(path_walk(const char *, struct nameidata *)); - extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); -@@ -1397,6 +1404,8 @@ - extern struct dentry * lookup_hash(struct qstr *, struct dentry *); - #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) - #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) -+#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it) -+#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it) - - extern void iput(struct inode *); - extern void force_delete(struct inode *); -@@ -1508,6 +1517,8 @@ - - extern int vfs_readlink(struct dentry *, char *, int, const char *); - extern int vfs_follow_link(struct nameidata *, const char *); -+extern int vfs_follow_link_it(struct nameidata *, const char *, -+ struct lookup_intent *it); - extern int page_readlink(struct dentry *, char *, int); - extern int page_follow_link(struct dentry *, struct nameidata *); - extern struct inode_operations page_symlink_inode_operations; ---- linux-pristine/./fs/dcache.c Thu Dec 5 10:49:13 2002 -+++ linux/./fs/dcache.c Fri Nov 29 18:06:20 2002 -@@ -617,6 +617,7 @@ - dentry->d_op = NULL; - dentry->d_fsdata = NULL; - dentry->d_mounted = 0; -+ dentry->d_it = NULL; - INIT_LIST_HEAD(&dentry->d_hash); - INIT_LIST_HEAD(&dentry->d_lru); - INIT_LIST_HEAD(&dentry->d_subdirs); ---- linux-pristine/./fs/nfsd/vfs.c Thu Dec 5 10:49:18 2002 -+++ linux/./fs/nfsd/vfs.c Fri Nov 29 18:06:20 2002 -@@ -1285,7 +1285,7 @@ - err = nfserr_perm; - } else - #endif -- err = vfs_rename(fdir, odentry, tdir, ndentry); -+ err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); - if (!err && EX_ISSYNC(tfhp->fh_export)) { - nfsd_sync_dir(tdentry); - nfsd_sync_dir(fdentry); ---- linux-pristine/./fs/namei.c Thu Dec 5 10:49:16 2002 -+++ linux/./fs/namei.c Fri Nov 29 18:11:18 2002 -@@ -94,6 +94,12 @@ - * XEmacs seems to be relying on it... 
- */ - -+void intent_release(struct dentry *de, struct lookup_intent *it) -+{ -+ if (it && de->d_op && de->d_op->d_intent_release) -+ de->d_op->d_intent_release(de, it); -+} -+ - /* In order to reduce some races, while at the same time doing additional - * checking and hopefully speeding things up, we copy filenames to the - * kernel data space before using them.. -@@ -260,10 +266,19 @@ - * Internal lookup() using the new generic dcache. - * SMP-safe - */ --static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * dentry = d_lookup(parent, name); - -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) { -+ if (!dentry->d_op->d_revalidate2(dentry, flags, it) && -+ !d_invalidate(dentry)) { -+ dput(dentry); -+ dentry = NULL; -+ } -+ return dentry; -+ } else - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { - dput(dentry); -@@ -281,7 +296,8 @@ - * make sure that nobody added the entry to the dcache in the meantime.. 
- * SMP-safe - */ --static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *real_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * result; - struct inode *dir = parent->d_inode; -@@ -300,6 +316,9 @@ - result = ERR_PTR(-ENOMEM); - if (dentry) { - lock_kernel(); -+ if (dir->i_op->lookup2) -+ result = dir->i_op->lookup2(dir, dentry, it); -+ else - result = dir->i_op->lookup(dir, dentry); - unlock_kernel(); - if (result) -@@ -321,6 +340,12 @@ - dput(result); - result = ERR_PTR(-ENOENT); - } -+ } else if (result->d_op && result->d_op->d_revalidate2) { -+ if (!result->d_op->d_revalidate2(result, flags, it) && -+ !d_invalidate(result)) { -+ dput(result); -+ result = ERR_PTR(-ENOENT); -+ } - } - return result; - } -@@ -332,7 +357,8 @@ - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. - */ --static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) -+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, -+ struct lookup_intent *it) - { - int err; - if (current->link_count >= 5) -@@ -346,10 +372,14 @@ - current->link_count++; - current->total_link_count++; - UPDATE_ATIME(dentry->d_inode); -- err = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (dentry->d_inode->i_op->follow_link2) -+ err = dentry->d_inode->i_op->follow_link2(dentry, nd, it); -+ else -+ err = dentry->d_inode->i_op->follow_link(dentry, nd); - current->link_count--; - return err; - loop: -+ intent_release(dentry, it); - path_release(nd); - return -ELOOP; - } -@@ -445,7 +475,8 @@ - * - * We expect 'base' to be positive and a directory. 
- */ --int link_path_walk(const char * name, struct nameidata *nd) -+int link_path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it) - { - struct dentry *dentry; - struct inode *inode; -@@ -518,9 +549,9 @@ - break; - } - /* This does the actual lookups.. */ -- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); - if (!dentry) { -- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -537,8 +568,8 @@ - if (!inode->i_op) - goto out_dput; - -- if (inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ if (inode->i_op->follow_link || inode->i_op->follow_link2) { -+ err = do_follow_link(dentry, nd, NULL); - dput(dentry); - if (err) - goto return_err; -@@ -554,7 +585,7 @@ - nd->dentry = dentry; - } - err = -ENOTDIR; -- if (!inode->i_op->lookup) -+ if (!inode->i_op->lookup && !inode->i_op->lookup2) - break; - continue; - /* here ends the main loop */ -@@ -581,9 +612,9 @@ - if (err < 0) - break; - } -- dentry = cached_lookup(nd->dentry, &this, 0); -+ dentry = cached_lookup(nd->dentry, &this, 0, it); - if (!dentry) { -- dentry = real_lookup(nd->dentry, &this, 0); -+ dentry = real_lookup(nd->dentry, &this, 0, it); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -591,9 +622,9 @@ - while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)) - ; - inode = dentry->d_inode; -- if ((lookup_flags & LOOKUP_FOLLOW) -- && inode && inode->i_op && inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ if ((lookup_flags & LOOKUP_FOLLOW) && inode && inode->i_op && -+ (inode->i_op->follow_link || inode->i_op->follow_link2)) { -+ err = do_follow_link(dentry, nd, it); - dput(dentry); - if (err) - goto return_err; -@@ -607,7 +638,8 @@ - goto no_inode; - if (lookup_flags & LOOKUP_DIRECTORY) { - err = -ENOTDIR; -- if 
(!inode->i_op || !inode->i_op->lookup) -+ if (!inode->i_op || -+ (!inode->i_op->lookup && !inode->i_op->lookup2)) - break; - } - goto return_base; -@@ -636,10 +668,21 @@ - return err; - } - -+int link_path_walk(const char * name, struct nameidata *nd) -+{ -+ return link_path_walk_it(name, nd, NULL); -+} -+ -+int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it) -+{ -+ current->total_link_count = 0; -+ return link_path_walk_it(name, nd, it); -+} -+ - int path_walk(const char * name, struct nameidata *nd) - { - current->total_link_count = 0; -- return link_path_walk(name, nd); -+ return link_path_walk_it(name, nd, NULL); - } - - /* SMP-safe */ -@@ -742,7 +785,8 @@ - * needs parent already locked. Doesn't follow mounts. - * SMP-safe. - */ --struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base, -+ struct lookup_intent *it) - { - struct dentry * dentry; - struct inode *inode; -@@ -765,13 +809,16 @@ - goto out; - } - -- dentry = cached_lookup(base, name, 0); -+ dentry = cached_lookup(base, name, 0, it); - if (!dentry) { - struct dentry *new = d_alloc(base, name); - dentry = ERR_PTR(-ENOMEM); - if (!new) - goto out; - lock_kernel(); -+ if (inode->i_op->lookup2) -+ dentry = inode->i_op->lookup2(inode, new, it); -+ else - dentry = inode->i_op->lookup(inode, new); - unlock_kernel(); - if (!dentry) -@@ -783,6 +830,12 @@ - return dentry; - } - -+struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+{ -+ return lookup_hash_it(name, base, NULL); -+} -+ -+ - /* SMP-safe */ - struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) - { -@@ -804,7 +857,7 @@ - } - this.hash = end_name_hash(hash); - -- return lookup_hash(&this, base); -+ return lookup_hash_it(&this, base, NULL); - access: - return ERR_PTR(-EACCES); - } -@@ -836,6 +889,23 @@ - return err; - } - -+int __user_walk_it(const char *name, unsigned flags, struct 
nameidata *nd, -+ struct lookup_intent *it) -+{ -+ char *tmp; -+ int err; -+ -+ tmp = getname(name); -+ err = PTR_ERR(tmp); -+ if (!IS_ERR(tmp)) { -+ err = 0; -+ if (path_init(tmp, flags, nd)) -+ err = path_walk_it(tmp, nd, it); -+ putname(tmp); -+ } -+ return err; -+} -+ - /* - * It's inline, so penalty for filesystems that don't use sticky bit is - * minimal. -@@ -970,7 +1040,8 @@ - * for symlinks (where the permissions are checked later). - * SMP-safe - */ --int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) -+int open_namei_it(const char *pathname, int flag, int mode, -+ struct nameidata *nd, struct lookup_intent *it) - { - int acc_mode, error = 0; - struct inode *inode; -@@ -985,7 +1056,7 @@ - */ - if (!(flag & O_CREAT)) { - if (path_init(pathname, lookup_flags(flag), nd)) -- error = path_walk(pathname, nd); -+ error = path_walk_it(pathname, nd, it); - if (error) - return error; - dentry = nd->dentry; -@@ -995,6 +1066,10 @@ - /* - * Create - we need to know the parent. 
- */ -+ if (it) { -+ it->it_mode = mode; -+ it->it_op |= IT_CREAT; -+ } - if (path_init(pathname, LOOKUP_PARENT, nd)) - error = path_walk(pathname, nd); - if (error) -@@ -1011,7 +1086,7 @@ - - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - - do_last: - error = PTR_ERR(dentry); -@@ -1020,6 +1095,7 @@ - goto exit; - } - -+ it->it_mode = mode; - /* Negative dentry, just create the file */ - if (!dentry->d_inode) { - if (!IS_POSIX_ACL(dir->d_inode)) -@@ -1054,7 +1130,8 @@ - error = -ENOENT; - if (!dentry->d_inode) - goto exit_dput; -- if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link) -+ if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link || -+ dentry->d_inode->i_op->follow_link2)) - goto do_link; - - dput(nd->dentry); -@@ -1140,8 +1217,10 @@ - return 0; - - exit_dput: -+ intent_release(dentry, it); - dput(dentry); - exit: -+ intent_release(nd->dentry, it); - path_release(nd); - return error; - -@@ -1160,7 +1239,12 @@ - * are done. Procfs-like symlinks just set LAST_BIND. 
- */ - UPDATE_ATIME(dentry->d_inode); -- error = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (dentry->d_inode->i_op->follow_link2) -+ error = dentry->d_inode->i_op->follow_link2(dentry, nd, it); -+ else -+ error = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (error) -+ intent_release(dentry, it); - dput(dentry); - if (error) - return error; -@@ -1182,13 +1266,20 @@ - } - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - putname(nd->last.name); - goto do_last; - } - -+int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) -+{ -+ return open_namei_it(pathname, flag, mode, nd, NULL); -+} -+ -+ - /* SMP-safe */ --static struct dentry *lookup_create(struct nameidata *nd, int is_dir) -+static struct dentry *lookup_create(struct nameidata *nd, int is_dir, -+ struct lookup_intent *it) - { - struct dentry *dentry; - -@@ -1196,7 +1287,7 @@ - dentry = ERR_PTR(-EEXIST); - if (nd->last_type != LAST_NORM) - goto fail; -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - if (IS_ERR(dentry)) - goto fail; - if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1242,6 +1333,7 @@ - char * tmp; - struct dentry * dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_MKNOD, .it_mode = mode }; - - if (S_ISDIR(mode)) - return -EPERM; -@@ -1253,7 +1345,7 @@ - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ dentry = lookup_create(&nd, 0, &it); - error = PTR_ERR(dentry); - - if (!IS_POSIX_ACL(nd.dentry->d_inode)) -@@ -1272,6 +1364,7 @@ - default: - error = -EINVAL; - } -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1312,6 +1405,7 @@ - { - int error = 0; - char * tmp; -+ struct lookup_intent it = { .it_op = IT_MKDIR, .it_mode = mode }; - - tmp = getname(pathname); - error = 
PTR_ERR(tmp); -@@ -1323,12 +1417,13 @@ - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 1); -+ dentry = lookup_create(&nd, 1, &it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - if (!IS_POSIX_ACL(nd.dentry->d_inode)) - mode &= ~current->fs->umask; - error = vfs_mkdir(nd.dentry->d_inode, dentry, mode); -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1410,6 +1505,7 @@ - char * name; - struct dentry *dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_RMDIR }; - - name = getname(pathname); - if(IS_ERR(name)) -@@ -1432,10 +1528,11 @@ - goto exit1; - } - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_rmdir(nd.dentry->d_inode, dentry); -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1479,6 +1576,7 @@ - char * name; - struct dentry *dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_UNLINK }; - - name = getname(pathname); - if(IS_ERR(name)) -@@ -1492,7 +1590,7 @@ - if (nd.last_type != LAST_NORM) - goto exit1; - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - /* Why not before? 
Because we want correct error value */ -@@ -1500,6 +1598,7 @@ - goto slashes; - error = vfs_unlink(nd.dentry->d_inode, dentry); - exit2: -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1546,6 +1645,7 @@ - int error = 0; - char * from; - char * to; -+ struct lookup_intent it = { .it_op = IT_SYMLINK }; - - from = getname(oldname); - if(IS_ERR(from)) -@@ -1560,10 +1660,12 @@ - error = path_walk(to, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ it.it_data = from; -+ dentry = lookup_create(&nd, 0, &it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_symlink(nd.dentry->d_inode, dentry, from); -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1629,6 +1731,7 @@ - int error; - char * from; - char * to; -+ struct lookup_intent it = { .it_op = IT_LINK }; - - from = getname(oldname); - if(IS_ERR(from)) -@@ -1641,7 +1744,7 @@ - - error = 0; - if (path_init(from, LOOKUP_POSITIVE, &old_nd)) -- error = path_walk(from, &old_nd); -+ error = path_walk_it(from, &old_nd, &it); - if (error) - goto exit; - if (path_init(to, LOOKUP_PARENT, &nd)) -@@ -1651,10 +1754,12 @@ - error = -EXDEV; - if (old_nd.mnt != nd.mnt) - goto out_release; -- new_dentry = lookup_create(&nd, 0); -+ it.it_op = IT_LINK2; -+ new_dentry = lookup_create(&nd, 0, &it); - error = PTR_ERR(new_dentry); - if (!IS_ERR(new_dentry)) { - error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); -+ intent_release(new_dentry, &it); - dput(new_dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1697,7 +1802,8 @@ - * locking]. 
- */ - int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry, -+ struct lookup_intent *it) - { - int error; - struct inode *target; -@@ -1757,6 +1863,7 @@ - error = -EBUSY; - else - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); -+ intent_release(new_dentry, it); - if (target) { - if (!error) - target->i_flags |= S_DEAD; -@@ -1778,7 +1885,8 @@ - } - - int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry, -+ struct lookup_intent *it) - { - int error; - -@@ -1809,6 +1917,7 @@ - error = -EBUSY; - else - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); -+ intent_release(new_dentry, it); - double_up(&old_dir->i_zombie, &new_dir->i_zombie); - if (error) - return error; -@@ -1820,13 +1929,14 @@ - } - - int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry, -+ struct lookup_intent *it) - { - int error; - if (S_ISDIR(old_dentry->d_inode->i_mode)) -- error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); -+ error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry,it); - else -- error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); -+ error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,it); - if (!error) { - if (old_dir == new_dir) - inode_dir_notify(old_dir, DN_RENAME); -@@ -1843,6 +1953,7 @@ - int error = 0; - struct dentry * old_dir, * new_dir; - struct dentry * old_dentry, *new_dentry; -+ struct lookup_intent it = { .it_op = IT_RENAME }; - struct nameidata oldnd, newnd; - - if (path_init(oldname, LOOKUP_PARENT, &oldnd)) -@@ -1871,7 +1982,7 @@ - - double_lock(new_dir, old_dir); - -- old_dentry = lookup_hash(&oldnd.last, old_dir); -+ 
old_dentry = lookup_hash_it(&oldnd.last, old_dir, &it); - error = PTR_ERR(old_dentry); - if (IS_ERR(old_dentry)) - goto exit3; -@@ -1887,18 +1998,21 @@ - if (newnd.last.name[newnd.last.len]) - goto exit4; - } -- new_dentry = lookup_hash(&newnd.last, new_dir); -+ it.it_op = IT_RENAME2; -+ new_dentry = lookup_hash_it(&newnd.last, new_dir, &it); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; - - lock_kernel(); - error = vfs_rename(old_dir->d_inode, old_dentry, -- new_dir->d_inode, new_dentry); -+ new_dir->d_inode, new_dentry, &it); - unlock_kernel(); - -+ intent_release(new_dentry, &it); - dput(new_dentry); - exit4: -+ intent_release(old_dentry, &it); - dput(old_dentry); - exit3: - double_up(&new_dir->d_inode->i_sem, &old_dir->d_inode->i_sem); -@@ -1947,7 +2061,8 @@ - } - - static inline int --__vfs_follow_link(struct nameidata *nd, const char *link) -+__vfs_follow_link(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) - { - int res = 0; - char *name; -@@ -1960,7 +2075,7 @@ - /* weird __emul_prefix() stuff did it */ - goto out; - } -- res = link_path_walk(link, nd); -+ res = link_path_walk_it(link, nd, it); - out: - if (current->link_count || res || nd->last_type!=LAST_NORM) - return res; -@@ -1982,7 +2097,13 @@ - - int vfs_follow_link(struct nameidata *nd, const char *link) - { -- return __vfs_follow_link(nd, link); -+ return __vfs_follow_link(nd, link, NULL); -+} -+ -+int vfs_follow_link_it(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) -+{ -+ return __vfs_follow_link(nd, link, it); - } - - /* get the link contents into pagecache */ -@@ -2024,7 +2145,7 @@ - { - struct page *page = NULL; - char *s = page_getlink(dentry, &page); -- int res = __vfs_follow_link(nd, s); -+ int res = __vfs_follow_link(nd, s, NULL); - if (page) { - kunmap(page); - page_cache_release(page); ---- linux-pristine/./fs/open.c Thu Dec 5 10:49:20 2002 -+++ linux/./fs/open.c Fri Nov 29 18:06:21 2002 -@@ -19,6 +19,9 @@ - #include - - 
#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -+extern int path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it); -+extern void intent_release(struct dentry *de, struct lookup_intent *it); - - int vfs_statfs(struct super_block *sb, struct statfs *buf) - { -@@ -94,12 +97,13 @@ - struct nameidata nd; - struct inode * inode; - int error; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - - error = -EINVAL; - if (length < 0) /* sorry, but loff_t says... */ - goto out; - -- error = user_path_walk(path, &nd); -+ error = user_path_walk_it(path, &nd, &it); - if (error) - goto out; - inode = nd.dentry->d_inode; -@@ -144,6 +148,7 @@ - put_write_access(inode); - - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -235,8 +240,9 @@ - struct nameidata nd; - struct inode * inode; - struct iattr newattrs; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (error) - goto out; - inode = nd.dentry->d_inode; -@@ -262,6 +268,7 @@ - } - error = notify_change(nd.dentry, &newattrs); - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -279,8 +286,9 @@ - struct nameidata nd; - struct inode * inode; - struct iattr newattrs; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - - if (error) - goto out; -@@ -307,6 +315,7 @@ - } - error = notify_change(nd.dentry, &newattrs); - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -323,6 +332,7 @@ - int old_fsuid, old_fsgid; - kernel_cap_t old_cap; - int res; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ - return -EINVAL; -@@ -340,13 +350,14 @@ - else - current->cap_effective = current->cap_permitted; - -- res = user_path_walk(filename, &nd); -+ res = user_path_walk_it(filename, &nd, &it); - if (!res) { - res = permission(nd.dentry->d_inode, mode); - /* SuS v2 requires we report a read only fs too */ - if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) - && !special_file(nd.dentry->d_inode->i_mode)) - res = -EROFS; -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - -@@ -362,6 +373,7 @@ - int error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -370,7 +382,7 @@ - - error = 0; - if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd)) -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -382,6 +394,7 @@ - set_fs_pwd(current->fs, nd.mnt, nd.dentry); - - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -422,6 +435,7 @@ - int error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -430,7 +444,7 @@ - - path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -447,6 +461,7 @@ - set_fs_altroot(); - error = 0; - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -491,8 +506,9 @@ - struct inode * inode; - int error; - struct iattr newattrs; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (error) - goto out; - inode = nd.dentry->d_inode; -@@ -512,6 +528,7 @@ - error = notify_change(nd.dentry, &newattrs); - - dput_and_out: -+ 
intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -581,10 +598,12 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { - error = chown_common(nd.dentry, user, group); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -594,10 +613,12 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { - error = chown_common(nd.dentry, user, group); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -631,10 +652,16 @@ - * for the internal routines (ie open_namei()/follow_link() etc). 00 is - * used by symlinks. - */ -+extern int open_namei_it(const char *filename, int namei_flags, int mode, -+ struct nameidata *nd, struct lookup_intent *it); -+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it); -+ - struct file *filp_open(const char * filename, int flags, int mode) - { - int namei_flags, error; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_OPEN }; - - namei_flags = flags; - if ((namei_flags+1) & O_ACCMODE) -@@ -642,14 +669,15 @@ - if (namei_flags & O_TRUNC) - namei_flags |= 2; - -- error = open_namei(filename, namei_flags, mode, &nd); -- if (!error) -- return dentry_open(nd.dentry, nd.mnt, flags); -+ error = open_namei_it(filename, namei_flags, mode, &nd, &it); -+ if (error) -+ return ERR_PTR(error); - -- return ERR_PTR(error); -+ return dentry_open_it(nd.dentry, nd.mnt, flags, &it); - } - --struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it) - { - struct file * 
f; - struct inode *inode; -@@ -692,6 +720,7 @@ - } - f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); - -+ intent_release(dentry, it); - return f; - - cleanup_all: -@@ -706,11 +735,17 @@ - cleanup_file: - put_filp(f); - cleanup_dentry: -+ intent_release(dentry, it); - dput(dentry); - mntput(mnt); - return ERR_PTR(error); - } - -+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+{ -+ return dentry_open_it(dentry, mnt, flags, NULL); -+} -+ - /* - * Find an empty file descriptor entry, and mark it busy. - */ ---- linux-pristine/./fs/stat.c Thu Dec 5 10:49:22 2002 -+++ linux/./fs/stat.c Fri Nov 29 18:06:21 2002 -@@ -13,6 +13,7 @@ - - #include - -+extern void intent_release(struct dentry *de, struct lookup_intent *it); - /* - * Revalidate the inode. This is required for proper NFS attribute caching. - */ -@@ -135,13 +136,15 @@ - asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -151,13 +154,15 @@ - asmlinkage long sys_newstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -172,13 +177,15 @@ - asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - 
int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -189,13 +196,15 @@ - asmlinkage long sys_newlstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -247,20 +256,21 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_READLINK }; - - if (bufsiz <= 0) - return -EINVAL; - -- error = user_path_walk_link(path, &nd); -+ error = user_path_walk_link_it(path, &nd, &it); - if (!error) { - struct inode * inode = nd.dentry->d_inode; -- - error = -EINVAL; - if (inode->i_op && inode->i_op->readlink && - !(error = do_revalidate(nd.dentry))) { - UPDATE_ATIME(inode); - error = inode->i_op->readlink(nd.dentry, buf, bufsiz); - } -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -333,12 +343,14 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -348,12 +360,14 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, 
&it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; ---- linux-pristine/./mm/slab.c Thu Dec 5 10:50:02 2002 -+++ linux/./mm/slab.c Fri Nov 29 18:06:21 2002 -@@ -1187,6 +1187,59 @@ - * Called with the cache-lock held. - */ - -+extern struct page *check_get_page(unsigned long kaddr); -+struct page *page_mem_map(struct page *page); -+static int kmem_check_cache_obj (kmem_cache_t * cachep, -+ slab_t *slabp, void * objp) -+{ -+ int i; -+ unsigned int objnr; -+ -+#if DEBUG -+ if (cachep->flags & SLAB_RED_ZONE) { -+ objp -= BYTES_PER_WORD; -+ if ( *(unsigned long *)objp != RED_MAGIC2) -+ /* Either write before start, or a double free. */ -+ return 0; -+ if (*(unsigned long *)(objp+cachep->objsize - -+ BYTES_PER_WORD) != RED_MAGIC2) -+ /* Either write past end, or a double free. */ -+ return 0; -+ } -+#endif -+ -+ objnr = (objp-slabp->s_mem)/cachep->objsize; -+ if (objnr >= cachep->num) -+ return 0; -+ if (objp != slabp->s_mem + objnr*cachep->objsize) -+ return 0; -+ -+ /* Check slab's freelist to see if this obj is there. */ -+ for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { -+ if (i == objnr) -+ return 0; -+ } -+ return 1; -+} -+ -+ -+int kmem_cache_validate(kmem_cache_t *cachep, void *objp) -+{ -+ struct page *page = check_get_page((unsigned long)objp); -+ -+ if (!VALID_PAGE(page)) -+ return 0; -+ -+ if (!PageSlab(page)) -+ return 0; -+ -+ /* XXX check for freed slab objects ? 
*/ -+ if (!kmem_check_cache_obj(cachep, GET_PAGE_SLAB(page), objp)) -+ return 0; -+ -+ return (cachep == GET_PAGE_CACHE(page)); -+} -+ - #if DEBUG - static int kmem_extra_free_checks (kmem_cache_t * cachep, - slab_t *slabp, void * objp) diff --git a/lustre/kernel_patches/patches/tcp-zero-copy.patch b/lustre/kernel_patches/patches/tcp-zero-copy.patch new file mode 100644 index 0000000..7176eca --- /dev/null +++ b/lustre/kernel_patches/patches/tcp-zero-copy.patch @@ -0,0 +1,455 @@ +diff -u -r1.1.1.1 linux/include/linux/skbuff.h +--- linux/include/linux/skbuff.h 2 Aug 2002 10:59:25 -0000 1.1.1.1 ++++ linux/include/linux/skbuff.h 2 Aug 2002 14:20:00 -0000 +@@ -116,6 +116,30 @@ + __u16 size; + }; + ++/* Support for callback when skb data has been released */ ++typedef struct zccd /* Zero Copy Callback Descriptor */ ++{ /* (embed as first member of custom struct) */ ++ atomic_t zccd_count; /* reference count */ ++ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */ ++} zccd_t; ++ ++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *)) ++{ ++ atomic_set (&d->zccd_count, 1); ++ d->zccd_destructor = callback; ++} ++ ++static inline void zccd_get (zccd_t *d) /* take a reference */ ++{ ++ atomic_inc (&d->zccd_count); ++} ++ ++static inline void zccd_put (zccd_t *d) /* release a reference */ ++{ ++ if (atomic_dec_and_test (&d->zccd_count)) ++ (d->zccd_destructor)(d); ++} ++ + /* This data is invariant across clones and lives at + * the end of the header data, ie. at skb->end. + */ +@@ -123,6 +147,12 @@ + atomic_t dataref; + unsigned int nr_frags; + struct sk_buff *frag_list; ++ zccd_t *zccd; /* zero copy descriptor */ ++ zccd_t *zccd2; /* 2nd zero copy descriptor */ ++ /* NB we expect zero-copy data to be at least 1 packet, so ++ * having 2 zccds means we don't unneccessarily split the packet ++ * where consecutive zero-copy sends abutt. 
++ */ + skb_frag_t frags[MAX_SKB_FRAGS]; + }; + +diff -u -r1.1.1.1 linux/include/net/tcp.h +--- linux/include/net/tcp.h 2 Aug 2002 10:59:29 -0000 1.1.1.1 ++++ linux/include/net/tcp.h 2 Aug 2002 14:03:49 -0000 +@@ -639,6 +639,8 @@ + + extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size); + extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); ++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd); + + extern int tcp_ioctl(struct sock *sk, + int cmd, +@@ -732,6 +734,9 @@ + struct msghdr *msg, + int len, int nonblock, + int flags, int *addr_len); ++extern int tcp_recvpackets(struct sock *sk, ++ struct sk_buff_head *packets, ++ int len, int nonblock); + + extern int tcp_listen_start(struct sock *sk); + +diff -u -r1.1.1.1 linux/net/netsyms.c +--- linux/net/netsyms.c 2 Aug 2002 10:59:31 -0000 1.1.1.1 ++++ linux/net/netsyms.c 2 Aug 2002 14:21:31 -0000 +@@ -395,6 +395,8 @@ + EXPORT_SYMBOL(sysctl_tcp_ecn); + EXPORT_SYMBOL(tcp_cwnd_application_limited); + EXPORT_SYMBOL(tcp_sendpage); ++EXPORT_SYMBOL(tcp_sendpage_zccd); ++EXPORT_SYMBOL(tcp_recvpackets); + + EXPORT_SYMBOL(tcp_write_xmit); + +diff -u -r1.1.1.1 linux/net/core/skbuff.c +--- linux/net/core/skbuff.c 2 Aug 2002 10:59:32 -0000 1.1.1.1 ++++ linux/net/core/skbuff.c 2 Aug 2002 14:07:13 -0000 +@@ -208,6 +208,8 @@ + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */ ++ skb_shinfo(skb)->zccd2 = NULL; + return skb; + + nodata: +@@ -276,6 +278,10 @@ + { + if (!skb->cloned || + atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { ++ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd); /* release hold */ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? 
*/ ++ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */ + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -532,6 +538,8 @@ + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->zccd = NULL; /* copied data => no user zero copy descriptor */ ++ skb_shinfo(skb)->zccd2 = NULL; + + /* We are no longer a clone, even if we were. */ + skb->cloned = 0; +@@ -577,6 +585,14 @@ + + n->data_len = skb->data_len; + n->len = skb->len; ++ ++ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd; ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; + + if (skb_shinfo(skb)->nr_frags) { + int i; +@@ -620,6 +636,8 @@ + u8 *data; + int size = nhead + (skb->end - skb->head) + ntail; + long off; ++ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */ ++ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */ + + if (skb_shared(skb)) + BUG(); +@@ -641,6 +659,11 @@ + if (skb_shinfo(skb)->frag_list) + skb_clone_fraglist(skb); + ++ if (zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (zccd); /* extra ref (pages are shared) */ ++ if (zccd2 != NULL) /* 2nd user zero copy descriptor? 
*/ ++ zccd_get (zccd2); /* extra ref (pages are shared) */ ++ + skb_release_data(skb); + + off = (data+nhead) - skb->head; +@@ -655,6 +678,8 @@ + skb->nh.raw += off; + skb->cloned = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); ++ skb_shinfo(skb)->zccd = zccd; ++ skb_shinfo(skb)->zccd2 = zccd2; + return 0; + + nodata: +diff -u -r1.1.1.1 linux/net/ipv4/tcp.c +--- linux/net/ipv4/tcp.c 2 Aug 2002 10:59:34 -0000 1.1.1.1 ++++ linux/net/ipv4/tcp.c 2 Aug 2002 14:36:30 -0000 +@@ -745,7 +745,7 @@ + goto out; + } + +-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags); ++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd); + + static inline int + can_coalesce(struct sk_buff *skb, int i, struct page *page, int off) +@@ -824,7 +824,8 @@ + return err; + } + +-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) ++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ ++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd) + { + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); + int mss_now; +@@ -872,6 +873,17 @@ + copy = size; + + i = skb_shinfo(skb)->nr_frags; ++ ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */ ++ skb_shinfo(skb)->zccd2 != NULL && ++ skb_shinfo(skb)->zccd != zccd && /* not the same one */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ tcp_mark_push (tp, skb); ++ goto new_segment; ++ } ++ + if (can_coalesce(skb, i, page, offset)) { + skb_shinfo(skb)->frags[i-1].size += copy; + } else if (i < MAX_SKB_FRAGS) { +@@ -881,6 +893,20 @@ + tcp_mark_push(tp, skb); + goto new_segment; + } ++ ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ zccd_get (zccd); /* 
bump ref count */ ++ ++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); ++ ++ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */ ++ skb_shinfo(skb)->zccd = zccd; ++ else ++ skb_shinfo(skb)->zccd2 = zccd; ++ } + + skb->len += copy; + skb->data_len += copy; +@@ -945,7 +971,31 @@ + + lock_sock(sk); + TCP_CHECK_TIMER(sk); +- res = do_tcp_sendpages(sk, &page, offset, size, flags); ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return res; ++} ++ ++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd) ++{ ++ ssize_t res; ++ struct sock *sk = sock->sk; ++ ++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) ++ ++ if (!(sk->route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */ ++ !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */ ++ BUG (); ++ ++#undef TCP_ZC_CSUM_FLAGS ++ ++ lock_sock(sk); ++ TCP_CHECK_TIMER(sk); ++ ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); ++ + TCP_CHECK_TIMER(sk); + release_sock(sk); + return res; +@@ -1767,6 +1817,202 @@ + recv_urg: + err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); + goto out; ++} ++ ++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets, ++ int len, int nonblock) ++{ ++ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); ++ int copied; ++ long timeo; ++ ++ BUG_TRAP (len > 0); ++ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/ ++ ++ lock_sock(sk); ++ ++ TCP_CHECK_TIMER(sk); ++ ++ copied = -ENOTCONN; ++ if (sk->state == TCP_LISTEN) ++ goto out; ++ ++ copied = 0; ++ timeo = sock_rcvtimeo(sk, nonblock); ++ ++ do { ++ struct sk_buff * skb; ++ u32 offset; ++ unsigned long used; ++ int exhausted; ++ int eaten; ++ ++ /* Are we at urgent data? Stop if we have read anything. 
*/ ++ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq) ++ break; ++ ++ /* We need to check signals first, to get correct SIGURG ++ * handling. FIXME: Need to check this doesnt impact 1003.1g ++ * and move it down to the bottom of the loop ++ */ ++ if (signal_pending(current)) { ++ if (copied) ++ break; ++ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; ++ break; ++ } ++ ++ /* Next get a buffer. */ ++ ++ skb = skb_peek(&sk->receive_queue); ++ ++ if (skb == NULL) /* nothing ready */ ++ { ++ if (copied) { ++ if (sk->err || ++ sk->state == TCP_CLOSE || ++ (sk->shutdown & RCV_SHUTDOWN) || ++ !timeo || ++ (0)) ++ break; ++ } else { ++ if (sk->done) ++ break; ++ ++ if (sk->err) { ++ copied = sock_error(sk); ++ break; ++ } ++ ++ if (sk->shutdown & RCV_SHUTDOWN) ++ break; ++ ++ if (sk->state == TCP_CLOSE) { ++ if (!sk->done) { ++ /* This occurs when user tries to read ++ * from never connected socket. ++ */ ++ copied = -ENOTCONN; ++ break; ++ } ++ break; ++ } ++ ++ if (!timeo) { ++ copied = -EAGAIN; ++ break; ++ } ++ } ++ ++ cleanup_rbuf(sk, copied); ++ timeo = tcp_data_wait(sk, timeo); ++ continue; ++ } ++ ++ BUG_TRAP (atomic_read (&skb->users) == 1); ++ ++ exhausted = eaten = 0; ++ ++ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; ++ if (skb->h.th->syn) ++ offset--; ++ ++ used = skb->len - offset; ++ ++ if (tp->urg_data) { ++ u32 urg_offset = tp->urg_seq - tp->copied_seq; ++ if (urg_offset < used) { ++ if (!urg_offset) { /* at urgent date */ ++ if (!sk->urginline) { ++ tp->copied_seq++; /* discard the single byte of urgent data */ ++ offset++; ++ used--; ++ } ++ } else /* truncate read */ ++ used = urg_offset; ++ } ++ } ++ ++ BUG_TRAP (used >= 0); ++ if (len < used) ++ used = len; ++ ++ if (used == 0) ++ exhausted = 1; ++ else ++ { ++ if (skb_is_nonlinear (skb)) ++ { ++ int rc = skb_linearize (skb, GFP_KERNEL); ++ ++ printk ("tcp_recvpackets(): linearising: %d\n", rc); ++ ++ if (rc) ++ { ++ if (!copied) ++ copied = rc; ++ break; ++ } ++ } ++ ++ if ((offset 
+ used) == skb->len) /* consuming the whole packet */ ++ { ++ __skb_unlink (skb, &sk->receive_queue); ++ dst_release (skb->dst); ++ skb_orphan (skb); ++ __skb_pull (skb, offset); ++ __skb_queue_tail (packets, skb); ++ exhausted = eaten = 1; ++ } ++ else /* consuming only part of the packet */ ++ { ++ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL); ++ ++ if (skb2 == NULL) ++ { ++ if (!copied) ++ copied = -ENOMEM; ++ break; ++ } ++ ++ dst_release (skb2->dst); ++ __skb_pull (skb2, offset); ++ __skb_trim (skb2, used); ++ __skb_queue_tail (packets, skb2); ++ } ++ ++ tp->copied_seq += used; ++ copied += used; ++ len -= used; ++ } ++ ++ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) { ++ tp->urg_data = 0; ++ tcp_fast_path_check(sk, tp); ++ } ++ ++ if (!exhausted) ++ continue; ++ ++ if (skb->h.th->fin) ++ { ++ tp->copied_seq++; ++ if (!eaten) ++ tcp_eat_skb (sk, skb); ++ break; ++ } ++ ++ if (!eaten) ++ tcp_eat_skb (sk, skb); ++ ++ } while (len > 0); ++ ++ out: ++ /* Clean up data we have read: This will do ACK frames. 
*/ ++ cleanup_rbuf(sk, copied); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return copied; + } + + /* diff --git a/lustre/kernel_patches/patches/uml-patch-2.4.20-4.patch b/lustre/kernel_patches/patches/uml-patch-2.4.20-4.patch new file mode 100644 index 0000000..b35fee0 --- /dev/null +++ b/lustre/kernel_patches/patches/uml-patch-2.4.20-4.patch @@ -0,0 +1,39358 @@ +diff -Naur -X ../exclude-files orig/CREDITS um/CREDITS +--- orig/CREDITS Thu Feb 27 13:04:11 2003 ++++ um/CREDITS Thu Feb 27 13:05:17 2003 +@@ -432,6 +432,7 @@ + E: lars@nocrew.org + W: http://lars.nocrew.org/ + D: dsp56k device driver ++D: ptrace proxy in user mode kernel port + S: Kopmansg 2 + S: 411 13 Goteborg + S: Sweden +@@ -721,7 +722,7 @@ + E: jdike@karaya.com + W: http://user-mode-linux.sourceforge.net + D: User mode kernel port +-S: RR1 Box 67C ++S: 375 Tubbs Hill Rd + S: Deering NH 03244 + S: USA + +diff -Naur -X ../exclude-files orig/Documentation/Configure.help um/Documentation/Configure.help +--- orig/Documentation/Configure.help Thu Feb 27 13:04:11 2003 ++++ um/Documentation/Configure.help Thu Feb 27 13:05:17 2003 +@@ -14690,19 +14690,23 @@ + The module will be called dsbr100.o. If you want to compile it as a + module, say M here and read . + +-Always do synchronous disk IO for UBD +-CONFIG_BLK_DEV_UBD_SYNC ++CONFIG_BLK_DEV_UBD + The User-Mode Linux port includes a driver called UBD which will let + you access arbitrary files on the host computer as block devices. +- Writes to such a block device are not immediately written to the +- host's disk; this may cause problems if, for example, the User-Mode +- Linux 'Virtual Machine' uses a journalling file system and the host +- computer crashes. ++ Unless you know that you do not need such virtual block devices say ++ Y here. 
++ ++Always do synchronous disk IO for UBD ++CONFIG_BLK_DEV_UBD_SYNC ++ Writes to the virtual block device are not immediately written to the host's ++ disk; this may cause problems if, for example, the User-Mode Linux ++ 'Virtual Machine' uses a journalling filesystem and the host computer ++ crashes. + + Synchronous operation (i.e. always writing data to the host's disk + immediately) is configurable on a per-UBD basis by using a special + kernel command line option. Alternatively, you can say Y here to +- turn on synchronous operation by default for all block. ++ turn on synchronous operation by default for all block devices. + + If you're running a journalling file system (like reiserfs, for + example) in your virtual machine, you will want to say Y here. If +@@ -14714,6 +14718,7 @@ + CONFIG_PT_PROXY + This option enables a debugging interface which allows gdb to debug + the kernel without needing to actually attach to kernel threads. ++ CONFIG_XTERM_CHAN must be enabled in order to enable CONFIG_PT_PROXY. + If you want to do kernel debugging, say Y here; otherwise say N. + + Management console +@@ -14908,25 +14913,173 @@ + + SLIP transport + CONFIG_UML_NET_SLIP +- The Slip User-Mode Linux network transport allows a running UML to ++ The slip User-Mode Linux network transport allows a running UML to + network with its host over a point-to-point link. Unlike Ethertap, + which can carry any Ethernet frame (and hence even non-IP packets), +- the Slip transport can only carry IP packets. ++ the slip transport can only carry IP packets. + +- To use this, your host must support Slip devices. ++ To use this, your host must support slip devices. + + For more information, see + . That site +- has examples of the UML command line to use to enable Slip ++ has examples of the UML command line to use to enable slip + networking, and details of a few quirks with it. + +- The Ethertap Transport is preferred over Slip because of its +- limitation. 
If you prefer Slip, however, say Y here. Otherwise ++ The Ethertap Transport is preferred over slip because of its ++ limitations. If you prefer slip, however, say Y here. Otherwise + choose the Multicast transport (to network multiple UMLs on + multiple hosts), Ethertap (to network with the host and the + outside world), and/or the Daemon transport (to network multiple + UMLs on a single host). You may choose more than one without + conflict. If you don't need UML networking, say N. ++ ++SLiRP transport ++CONFIG_UML_NET_SLIRP ++ The SLiRP User-Mode Linux network transport allows a running UML ++ to network by invoking a program that can handle SLIP encapsulated ++ packets. This is commonly (but not limited to) the application ++ known as SLiRP, a program that can re-socket IP packets back onto ++ the host on which it is run. Only IP packets are supported, ++ unlike other network transports that can handle all Ethernet ++ frames. In general, slirp allows the UML the same IP connectivity ++ to the outside world that the host user is permitted, and unlike ++ other transports, SLiRP works without the need of root level ++ privileges, setuid binaries, or SLIP devices on the host. This ++ also means not every type of connection is possible, but most ++ situations can be accommodated with carefully crafted slirp ++ commands that can be passed along as part of the network device's ++ setup string. The effect of this transport on the UML is similar ++ to that of a host behind a firewall that masquerades all network ++ connections passing through it (but is less secure). ++ ++ To use this you should first have slirp compiled somewhere ++ accessible on the host, and have read its documentation. If you ++ don't need UML networking, say N. 
++ ++ Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp" ++ ++Default main console channel initialization ++CONFIG_CON_ZERO_CHAN ++ This is the string describing the channel to which the main console ++ will be attached by default. This value can be overridden from the ++ command line. The default value is "fd:0,fd:1", which attaches the ++ main console to stdin and stdout. ++ It is safe to leave this unchanged. ++ ++Default console channel initialization ++CONFIG_CON_CHAN ++ This is the string describing the channel to which all consoles ++ except the main console will be attached by default. This value can ++ be overridden from the command line. The default value is "xterm", ++ which brings them up in xterms. ++ It is safe to leave this unchanged, although you may wish to change ++ this if you expect the UML that you build to be run in environments ++ which don't have X or xterm available. ++ ++Default serial line channel initialization ++CONFIG_SSL_CHAN ++ This is the string describing the channel to which the serial lines ++ will be attached by default. This value can be overridden from the ++ command line. The default value is "pty", which attaches them to ++ traditional pseudo-terminals. ++ It is safe to leave this unchanged, although you may wish to change ++ this if you expect the UML that you build to be run in environments ++ which don't have a set of /dev/pty* devices. ++ ++Nesting level ++CONFIG_NEST_LEVEL ++ This is set to the number of layers of UMLs that this UML will be run ++ in. Normally, this is zero, meaning that it will run directly on the ++ host. Setting it to one will build a UML that can run inside a UML ++ that is running on the host. Generally, if you intend this UML to run ++ inside another UML, set CONFIG_NEST_LEVEL to one more than the host UML. 
++ Note that if the hosting UML has its CONFIG_KERNEL_HALF_GIGS set to ++ greater than one, then the guest UML should have its CONFIG_NEST_LEVEL ++ set to the host's CONFIG_NEST_LEVEL + CONFIG_KERNEL_HALF_GIGS. ++ Only change this if you are running nested UMLs. ++ ++Kernel address space size (in .5G units) ++CONFIG_KERNEL_HALF_GIGS ++ This determines the amount of address space that UML will allocate for ++ its own, measured in half Gigabyte units. The default is 1. ++ Change this only if you need to boot UML with an unusually large amount ++ of physical memory. ++ ++UML sound support ++CONFIG_UML_SOUND ++ This option enables UML sound support. If enabled, it will pull in ++ soundcore and the UML hostaudio relay, which acts as an intermediary ++ between the host's dsp and mixer devices and the UML sound system. ++ It is safe to say 'Y' here. ++ ++UML SMP support ++CONFIG_UML_SMP ++ This option enables UML SMP support. UML implements virtual SMP by ++ allowing as many processes to run simultaneously on the host as ++ there are virtual processors configured. Obviously, if the host is ++ a uniprocessor, those processes will timeshare, but, inside UML, ++ will appear to be running simultaneously. If the host is a ++ multiprocessor, then UML processes may run simultaneously, depending ++ on the host scheduler. ++ CONFIG_SMP will be set to whatever this option is set to. ++ It is safe to leave this unchanged. ++ ++file descriptor channel support ++CONFIG_FD_CHAN ++ This option enables support for attaching UML consoles and serial ++ lines to already set up file descriptors. Generally, the main ++ console is attached to file descriptors 0 and 1 (stdin and stdout), ++ so it would be wise to leave this enabled unless you intend to ++ attach it to some other host device. ++ ++null device channel support ++CONFIG_NULL_CHAN ++ This option enables support for attaching UML consoles and serial ++ lines to a device similar to /dev/null. 
Data written to it disappears ++ and there is never any data to be read. ++ ++port channel support ++CONFIG_PORT_CHAN ++ This option enables support for attaching UML consoles and serial ++ lines to host portals. They may be accessed with 'telnet ++ '. Any number of consoles and serial lines may be ++ attached to a single portal, although what UML device you get when ++ you telnet to that portal will be unpredictable. ++ It is safe to say 'Y' here. ++ ++pty channel support ++CONFIG_PTY_CHAN ++ This option enables support for attaching UML consoles and serial ++ lines to host pseudo-terminals. Access to both traditional ++ pseudo-terminals (/dev/pty*) and pts pseudo-terminals are controlled ++ with this option. The assignment of UML devices to host devices ++ will be announced in the kernel message log. ++ It is safe to say 'Y' here. ++ ++tty channel support ++CONFIG_TTY_CHAN ++ This option enables support for attaching UML consoles and serial ++ lines to host terminals. Access to both virtual consoles ++ (/dev/tty*) and the slave side of pseudo-terminals (/dev/ttyp* and ++ /dev/pts/*) are controlled by this option. ++ It is safe to say 'Y' here. ++ ++xterm channel support ++CONFIG_XTERM_CHAN ++ This option enables support for attaching UML consoles and serial ++ lines to xterms. Each UML device so assigned will be brought up in ++ its own xterm. ++ If you disable this option, then CONFIG_PT_PROXY will be disabled as ++ well, since UML's gdb currently requires an xterm. ++ It is safe to say 'Y' here. ++ ++tty logging ++CONFIG_TTY_LOG ++ This option enables logging of all data going through pseudo-terminals ++ to the host. This is primarily useful for honeypots, where you want ++ secure keystroke logging that can't be detected or disabled by root. ++ Say 'N' unless you are setting up a UML honeypot or otherwise know that ++ you want this option. 
+ + Microtek USB scanner support + CONFIG_USB_MICROTEK +diff -Naur -X ../exclude-files orig/MAINTAINERS um/MAINTAINERS +--- orig/MAINTAINERS Thu Feb 27 13:04:12 2003 ++++ um/MAINTAINERS Thu Feb 27 13:05:17 2003 +@@ -1841,6 +1841,14 @@ + L: linux-usb-devel@lists.sourceforge.net + W: http://usb.in.tum.de + S: Maintained ++ ++USER-MODE PORT ++P: Jeff Dike ++M: jdike@karaya.com ++L: user-mode-linux-devel@lists.sourceforge.net ++L: user-mode-linux-user@lists.sourceforge.net ++W: http://user-mode-linux.sourceforge.net ++S: Maintained + + USB "USBNET" DRIVER + P: David Brownell +diff -Naur -X ../exclude-files orig/Makefile um/Makefile +--- orig/Makefile Thu Feb 27 13:04:12 2003 ++++ um/Makefile Thu Feb 27 13:05:17 2003 +@@ -5,7 +5,15 @@ + + KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) + +-ARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/) ++# SUBARCH tells the usermode build what the underlying arch is. That is set ++# first, and if a usermode build is happening, the "ARCH=um" on the command ++# line overrides the setting of ARCH below. If a native build is happening, ++# then ARCH is assigned, getting whatever value it gets normally, and ++# SUBARCH is subsequently ignored. 
++ ++SUBARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/) ++ARCH := $(SUBARCH) ++ + KERNELPATH=kernel-$(shell echo $(KERNELRELEASE) | sed -e "s/-//g") + + CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ +diff -Naur -X ../exclude-files orig/arch/um/Makefile um/arch/um/Makefile +--- orig/arch/um/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/Makefile Fri Mar 28 21:46:54 2003 +@@ -0,0 +1,168 @@ ++# ++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++OS := $(shell uname -s) ++ ++ARCH_DIR = arch/um ++ ++core-y := kernel sys-$(SUBARCH) os-$(OS) ++drivers-y := fs drivers ++subdir-y := $(core-y) $(drivers-y) ++SUBDIRS += $(foreach dir,$(subdir-y),$(ARCH_DIR)/$(dir)) ++ ++CORE_FILES += $(foreach dir,$(core-y),$(ARCH_DIR)/$(dir)/built-in.o) ++DRIVERS += $(foreach dir,$(drivers-y),$(ARCH_DIR)/$(dir)/built-in.o) ++ ++include $(ARCH_DIR)/Makefile-$(SUBARCH) ++include $(ARCH_DIR)/Makefile-os-$(OS) ++ ++MAKEFILE-$(CONFIG_MODE_TT) += Makefile-tt ++MAKEFILE-$(CONFIG_MODE_SKAS) += Makefile-skas ++ ++ifneq ($(MAKEFILE-y),) ++ include $(addprefix $(ARCH_DIR)/,$(MAKEFILE-y)) ++endif ++ ++EXTRAVERSION := $(EXTRAVERSION)-4um ++ ++include/linux/version.h: arch/$(ARCH)/Makefile ++ ++# Recalculate MODLIB to reflect the EXTRAVERSION changes (via KERNELRELEASE) ++# The way the toplevel Makefile is written EXTRAVERSION is not supposed ++# to be changed outside the toplevel Makefile, but recalculating MODLIB is ++# a sufficient workaround until we no longer need architecture dependent ++# EXTRAVERSION... ++MODLIB := $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE) ++ ++ifeq ($(CONFIG_DEBUGSYM),y) ++CFLAGS := $(subst -fomit-frame-pointer,,$(CFLAGS)) ++endif ++ ++CFLAGS-$(CONFIG_DEBUGSYM) += -g ++ ++ARCH_INCLUDE = -I$(TOPDIR)/$(ARCH_DIR)/include ++ ++# -Derrno=kernel_errno - This turns all kernel references to errno into ++# kernel_errno to separate them from the libc errno. 
This allows -fno-common ++# in CFLAGS. Otherwise, it would cause ld to complain about the two different ++# errnos. ++ ++CFLAGS += $(ARCH_CFLAGS) $(CFLAGS-y) -D__arch_um__ -DSUBARCH=\"$(SUBARCH)\" \ ++ -D_LARGEFILE64_SOURCE $(ARCH_INCLUDE) -Derrno=kernel_errno \ ++ $(MODE_INCLUDE) ++ ++LINKFLAGS += -r ++ ++LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc ++ ++SIZE = (($(CONFIG_NEST_LEVEL) + $(CONFIG_KERNEL_HALF_GIGS)) * 0x20000000) ++ ++# These aren't in Makefile-tt because they are needed in the !CONFIG_MODE_TT + ++# CONFIG_MODE_SKAS + CONFIG_STATIC_LINK case. ++ ++LINK_TT = -static ++LD_SCRIPT_TT := link.ld ++ ++ifeq ($(CONFIG_STATIC_LINK),y) ++ LINK-y += $(LINK_TT) ++ LD_SCRIPT-y := $(LD_SCRIPT_TT) ++else ++ifeq ($(CONFIG_MODE_TT),y) ++ LINK-y += $(LINK_TT) ++ LD_SCRIPT-y := $(LD_SCRIPT_TT) ++else ++ifeq ($(CONFIG_MODE_SKAS),y) ++ LINK-y += $(LINK_SKAS) ++ LD_SCRIPT-y := $(LD_SCRIPT_SKAS) ++endif ++endif ++endif ++ ++LD_SCRIPT-y := $(ARCH_DIR)/$(LD_SCRIPT-y) ++M4_MODE_TT := $(shell [ "$(CONFIG_MODE_TT)" = "y" ] && echo -DMODE_TT) ++ ++$(LD_SCRIPT-y): $(LD_SCRIPT-y).in ++ pages=$$(( 1 << $(CONFIG_KERNEL_STACK_ORDER) )) ; \ ++ m4 -DSTART=$$(($(TOP_ADDR) - $(SIZE))) -DELF_ARCH=$(ELF_ARCH) \ ++ -DELF_FORMAT=$(ELF_FORMAT) $(M4_MODE_TT) \ ++ -DKERNEL_STACK_SIZE=$$(( 4096 * $$pages )) $< > $@ ++ ++SYMLINK_HEADERS = include/asm-um/archparam.h include/asm-um/system.h \ ++ include/asm-um/sigcontext.h include/asm-um/processor.h \ ++ include/asm-um/ptrace.h include/asm-um/arch-signal.h ++ ++ARCH_SYMLINKS = include/asm-um/arch arch/um/include/sysdep arch/um/os \ ++ $(SYMLINK_HEADERS) $(ARCH_DIR)/include/uml-config.h ++ ++ifeq ($(CONFIG_MODE_SKAS), y) ++$(SYS_HEADERS) : $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h ++endif ++ ++GEN_HEADERS += $(ARCH_DIR)/include/task.h $(ARCH_DIR)/include/kern_constants.h ++ ++setup: $(ARCH_SYMLINKS) $(SYS_HEADERS) $(GEN_HEADERS) ++ ++linux: setup $(ARCH_DIR)/main.o vmlinux $(LD_SCRIPT-y) ++ mv vmlinux vmlinux.o ++ $(CC) 
-Wl,-T,$(LD_SCRIPT-y) $(LINK-y) $(LINK_WRAPS) \ ++ -o linux $(ARCH_DIR)/main.o vmlinux.o -L/usr/lib -lutil ++ ++USER_CFLAGS := $(patsubst -I%,,$(CFLAGS)) ++USER_CFLAGS := $(patsubst -Derrno=kernel_errno,,$(USER_CFLAGS)) ++USER_CFLAGS := $(patsubst -D__KERNEL__,,$(USER_CFLAGS)) $(ARCH_INCLUDE) \ ++ $(MODE_INCLUDE) ++ ++# To get a definition of F_SETSIG ++USER_CFLAGS += -D_GNU_SOURCE ++ ++CLEAN_FILES += linux x.i gmon.out $(ARCH_DIR)/link.ld $(ARCH_DIR)/dyn_link.ld \ ++ $(GEN_HEADERS) $(ARCH_DIR)/include/uml-config.h ++ ++$(ARCH_DIR)/main.o: $(ARCH_DIR)/main.c ++ $(CC) $(USER_CFLAGS) $(EXTRA_CFLAGS) -c -o $@ $< ++ ++archmrproper: ++ rm -f $(SYMLINK_HEADERS) $(ARCH_SYMLINKS) include/asm \ ++ $(LD_SCRIPT) $(addprefix $(ARCH_DIR)/kernel/,$(KERN_SYMLINKS)) ++ ++archclean: sysclean ++ find . \( -name '*.bb' -o -name '*.bbg' -o -name '*.da' \ ++ -o -name '*.gcov' \) -type f -print | xargs rm -f ++ cd $(ARCH_DIR) ; \ ++ for dir in $(subdir-y) util ; do $(MAKE) -C $$dir clean; done ++ ++archdep: ++ ++$(SYMLINK_HEADERS): ++ cd $(TOPDIR)/$(dir $@) ; \ ++ ln -sf $(basename $(notdir $@))-$(SUBARCH)$(suffix $@) $(notdir $@) ++ ++include/asm-um/arch: ++ cd $(TOPDIR)/include/asm-um && ln -sf ../asm-$(SUBARCH) arch ++ ++arch/um/include/sysdep: ++ cd $(TOPDIR)/arch/um/include && ln -sf sysdep-$(SUBARCH) sysdep ++ ++arch/um/os: ++ cd $(ARCH_DIR) && ln -sf os-$(OS) os ++ ++$(ARCH_DIR)/include/task.h : $(ARCH_DIR)/util/mk_task ++ $< > $@ ++ ++$(ARCH_DIR)/include/kern_constants.h : $(ARCH_DIR)/util/mk_constants ++ $< > $@ ++ ++$(ARCH_DIR)/include/uml-config.h : $(TOPDIR)/include/linux/autoconf.h ++ sed 's/ CONFIG/ UML_CONFIG/' $(TOPDIR)/include/linux/autoconf.h > $@ ++ ++$(ARCH_DIR)/util/mk_task : $(ARCH_DIR)/util/mk_task_user.c \ ++ $(ARCH_DIR)/util/mk_task_kern.c $(SYS_HEADERS) ++ $(MAKE) $(MFLAGS) -C $(ARCH_DIR)/util mk_task ++ ++$(ARCH_DIR)/util/mk_constants : $(ARCH_DIR)/util/mk_constants_user.c \ ++ $(ARCH_DIR)/util/mk_constants_kern.c ++ $(MAKE) $(MFLAGS) -C $(ARCH_DIR)/util 
mk_constants ++ ++export SUBARCH USER_CFLAGS OS +diff -Naur -X ../exclude-files orig/arch/um/Makefile-i386 um/arch/um/Makefile-i386 +--- orig/arch/um/Makefile-i386 Wed Dec 31 19:00:00 1969 ++++ um/arch/um/Makefile-i386 Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,35 @@ ++ifeq ($(CONFIG_HOST_2G_2G), y) ++TOP_ADDR = 0x80000000 ++else ++TOP_ADDR = 0xc0000000 ++endif ++ ++ARCH_CFLAGS = -U__$(SUBARCH)__ -U$(SUBARCH) -DUM_FASTCALL ++ELF_ARCH = $(SUBARCH) ++ELF_FORMAT = elf32-$(SUBARCH) ++ ++I386_H = $(ARCH_DIR)/include/sysdep-i386 ++SYS = $(ARCH_DIR)/sys-i386 ++UTIL = $(SYS)/util ++SUBDIRS += $(UTIL) ++ ++SYS_HEADERS = $(I386_H)/sc.h $(I386_H)/thread.h ++ ++$(I386_H)/sc.h : $(UTIL)/mk_sc ++ $(UTIL)/mk_sc > $@ ++ ++$(I386_H)/thread.h : $(UTIL)/mk_thread ++ $(UTIL)/mk_thread > $@ ++ ++$(UTIL)/mk_sc : $(UTIL)/mk_sc.c ++ $(MAKE) -C $(UTIL) mk_sc ++ ++$(UTIL)/mk_thread : $(UTIL)/mk_thread_user.c $(UTIL)/mk_thread_kern.c \ ++ $(I386_H)/sc.h ++ $(MAKE) -C $(UTIL) mk_thread ++ ++sysclean : ++ rm -f $(SYS_HEADERS) ++ $(MAKE) -C $(UTIL) clean ++ $(MAKE) -C $(SYS) clean ++ +diff -Naur -X ../exclude-files orig/arch/um/Makefile-ia64 um/arch/um/Makefile-ia64 +--- orig/arch/um/Makefile-ia64 Wed Dec 31 19:00:00 1969 ++++ um/arch/um/Makefile-ia64 Wed Oct 23 21:08:04 2002 +@@ -0,0 +1 @@ ++START_ADDR = 0x1000000000000000 +diff -Naur -X ../exclude-files orig/arch/um/Makefile-os-Linux um/arch/um/Makefile-os-Linux +--- orig/arch/um/Makefile-os-Linux Wed Dec 31 19:00:00 1969 ++++ um/arch/um/Makefile-os-Linux Mon Dec 9 14:21:51 2002 +@@ -0,0 +1,7 @@ ++# ++# Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++SUBDIRS += $(ARCH_DIR)/os-$(OS)/drivers ++DRIVERS += $(ARCH_DIR)/os-$(OS)/drivers/drivers.o +diff -Naur -X ../exclude-files orig/arch/um/Makefile-ppc um/arch/um/Makefile-ppc +--- orig/arch/um/Makefile-ppc Wed Dec 31 19:00:00 1969 ++++ um/arch/um/Makefile-ppc Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,9 @@ ++ifeq ($(CONFIG_HOST_2G_2G), y) ++START_ADDR = 0x80000000 ++else 
++START_ADDR = 0xc0000000 ++endif ++ARCH_CFLAGS = -U__powerpc__ -D__UM_PPC__ ++ ++# The arch is ppc, but the elf32 name is powerpc ++ELF_SUBARCH = powerpc +diff -Naur -X ../exclude-files orig/arch/um/Makefile-skas um/arch/um/Makefile-skas +--- orig/arch/um/Makefile-skas Wed Dec 31 19:00:00 1969 ++++ um/arch/um/Makefile-skas Sun Dec 15 22:02:57 2002 +@@ -0,0 +1,20 @@ ++# ++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++PROFILE += -pg ++ ++CFLAGS-$(CONFIG_GCOV) += -fprofile-arcs -ftest-coverage ++CFLAGS-$(CONFIG_GPROF) += $(PROFILE) ++LINK-$(CONFIG_GPROF) += $(PROFILE) ++ ++MODE_INCLUDE += -I$(TOPDIR)/$(ARCH_DIR)/kernel/skas/include ++ ++LINK_SKAS = -Wl,-rpath,/lib ++LD_SCRIPT_SKAS = dyn_link.ld ++ ++GEN_HEADERS += $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h ++ ++$(ARCH_DIR)/kernel/skas/include/skas_ptregs.h : ++ $(MAKE) -C $(ARCH_DIR)/kernel/skas include/skas_ptregs.h +diff -Naur -X ../exclude-files orig/arch/um/Makefile-tt um/arch/um/Makefile-tt +--- orig/arch/um/Makefile-tt Wed Dec 31 19:00:00 1969 ++++ um/arch/um/Makefile-tt Mon Dec 16 20:22:23 2002 +@@ -0,0 +1,7 @@ ++# ++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++MODE_INCLUDE += -I$(TOPDIR)/$(ARCH_DIR)/kernel/tt/include ++ +diff -Naur -X ../exclude-files orig/arch/um/common.ld.in um/arch/um/common.ld.in +--- orig/arch/um/common.ld.in Wed Dec 31 19:00:00 1969 ++++ um/arch/um/common.ld.in Tue Feb 4 19:35:13 2003 +@@ -0,0 +1,53 @@ ++ .kstrtab : { *(.kstrtab) } ++ ++ . = ALIGN(16); /* Exception table */ ++ __start___ex_table = .; ++ __ex_table : { *(__ex_table) } ++ __stop___ex_table = .; ++ ++ __start___ksymtab = .; /* Kernel symbol table */ ++ __ksymtab : { *(__ksymtab) } ++ __stop___ksymtab = .; ++ ++ .unprotected : { *(.unprotected) } ++ . = ALIGN(4096); ++ PROVIDE (_unprotected_end = .); ++ ++ . 
= ALIGN(4096); ++ __uml_setup_start = .; ++ .uml.setup.init : { *(.uml.setup.init) } ++ __uml_setup_end = .; ++ __uml_help_start = .; ++ .uml.help.init : { *(.uml.help.init) } ++ __uml_help_end = .; ++ __uml_postsetup_start = .; ++ .uml.postsetup.init : { *(.uml.postsetup.init) } ++ __uml_postsetup_end = .; ++ __setup_start = .; ++ .setup.init : { *(.setup.init) } ++ __setup_end = .; ++ __initcall_start = .; ++ .initcall.init : { *(.initcall.init) } ++ __initcall_end = .; ++ __uml_initcall_start = .; ++ .uml.initcall.init : { *(.uml.initcall.init) } ++ __uml_initcall_end = .; ++ __init_end = .; ++ __exitcall_begin = .; ++ .exitcall : { *(.exitcall.exit) } ++ __exitcall_end = .; ++ __uml_exitcall_begin = .; ++ .uml.exitcall : { *(.uml.exitcall.exit) } ++ __uml_exitcall_end = .; ++ ++ __preinit_array_start = .; ++ .preinit_array : { *(.preinit_array) } ++ __preinit_array_end = .; ++ __init_array_start = .; ++ .init_array : { *(.init_array) } ++ __init_array_end = .; ++ __fini_array_start = .; ++ .fini_array : { *(.fini_array) } ++ __fini_array_end = .; ++ ++ .data.init : { *(.data.init) } +diff -Naur -X ../exclude-files orig/arch/um/config.in um/arch/um/config.in +--- orig/arch/um/config.in Wed Dec 31 19:00:00 1969 ++++ um/arch/um/config.in Thu Feb 27 13:12:39 2003 +@@ -0,0 +1,104 @@ ++define_bool CONFIG_USERMODE y ++ ++mainmenu_name "Linux/Usermode Kernel Configuration" ++ ++define_bool CONFIG_ISA n ++define_bool CONFIG_SBUS n ++define_bool CONFIG_PCI n ++ ++define_bool CONFIG_UID16 y ++ ++define_bool CONFIG_RWSEM_XCHGADD_ALGORITHM y ++ ++mainmenu_option next_comment ++comment 'Code maturity level options' ++bool 'Prompt for development and/or incomplete code/drivers' CONFIG_EXPERIMENTAL ++endmenu ++ ++mainmenu_option next_comment ++comment 'General Setup' ++ ++bool 'Separate kernel address space support' CONFIG_MODE_SKAS ++ ++# This is to ensure that at least one of the modes is enabled. When neither ++# is present in defconfig, they default to N, which is bad. 
++if [ "$CONFIG_MODE_SKAS" != "y" ]; then ++ define_bool CONFIG_MODE_TT y ++fi ++ ++bool 'Tracing thread support' CONFIG_MODE_TT ++if [ "$CONFIG_MODE_TT" != "y" ]; then ++ bool 'Statically linked binary when CONFIG_MODE_TT is disabled' CONFIG_STATIC_LINK ++fi ++bool 'Networking support' CONFIG_NET ++bool 'System V IPC' CONFIG_SYSVIPC ++bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT ++bool 'Sysctl support' CONFIG_SYSCTL ++tristate 'Kernel support for a.out binaries' CONFIG_BINFMT_AOUT ++tristate 'Kernel support for ELF binaries' CONFIG_BINFMT_ELF ++tristate 'Kernel support for MISC binaries' CONFIG_BINFMT_MISC ++tristate 'Host filesystem' CONFIG_HOSTFS ++tristate 'Honeypot proc filesystem' CONFIG_HPPFS ++bool 'Management console' CONFIG_MCONSOLE ++dep_bool 'Magic SysRq key' CONFIG_MAGIC_SYSRQ $CONFIG_MCONSOLE ++bool '2G/2G host address space split' CONFIG_HOST_2G_2G ++bool 'Symmetric multi-processing support' CONFIG_UML_SMP ++define_bool CONFIG_SMP $CONFIG_UML_SMP ++int 'Nesting level' CONFIG_NEST_LEVEL 0 ++int 'Kernel address space size (in .5G units)' CONFIG_KERNEL_HALF_GIGS 1 ++bool 'Highmem support' CONFIG_HIGHMEM ++bool '/proc/mm' CONFIG_PROC_MM ++int 'Kernel stack size order' CONFIG_KERNEL_STACK_ORDER 2 ++endmenu ++ ++mainmenu_option next_comment ++comment 'Loadable module support' ++bool 'Enable loadable module support' CONFIG_MODULES ++if [ "$CONFIG_MODULES" = "y" ]; then ++# MODVERSIONS does not yet work in this architecture ++# bool ' Set version information on all module symbols' CONFIG_MODVERSIONS ++ bool ' Kernel module loader' CONFIG_KMOD ++fi ++endmenu ++ ++source arch/um/config_char.in ++ ++source arch/um/config_block.in ++ ++define_bool CONFIG_NETDEVICES $CONFIG_NET ++ ++if [ "$CONFIG_NET" = "y" ]; then ++ source arch/um/config_net.in ++ source net/Config.in ++fi ++ ++source fs/Config.in ++ ++mainmenu_option next_comment ++comment 'SCSI support' ++ ++tristate 'SCSI support' CONFIG_SCSI ++ ++if [ "$CONFIG_SCSI" != "n" ]; then ++ source 
arch/um/config_scsi.in ++fi ++endmenu ++ ++source drivers/md/Config.in ++ ++source drivers/mtd/Config.in ++ ++source lib/Config.in ++ ++mainmenu_option next_comment ++comment 'Kernel hacking' ++bool 'Debug memory allocations' CONFIG_DEBUG_SLAB ++bool 'Enable kernel debugging symbols' CONFIG_DEBUGSYM ++if [ "$CONFIG_XTERM_CHAN" = "y" ]; then ++ dep_bool 'Enable ptrace proxy' CONFIG_PT_PROXY $CONFIG_DEBUGSYM ++else ++ define_bool CONFIG_PT_PROXY n ++fi ++dep_bool 'Enable gprof support' CONFIG_GPROF $CONFIG_DEBUGSYM ++dep_bool 'Enable gcov support' CONFIG_GCOV $CONFIG_DEBUGSYM ++endmenu +diff -Naur -X ../exclude-files orig/arch/um/config.release um/arch/um/config.release +--- orig/arch/um/config.release Wed Dec 31 19:00:00 1969 ++++ um/arch/um/config.release Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,302 @@ ++# ++# Automatically generated make config: don't edit ++# ++CONFIG_USERMODE=y ++# CONFIG_ISA is not set ++# CONFIG_SBUS is not set ++# CONFIG_PCI is not set ++CONFIG_UID16=y ++CONFIG_RWSEM_XCHGADD_ALGORITHM=y ++ ++# ++# Code maturity level options ++# ++CONFIG_EXPERIMENTAL=y ++ ++# ++# General Setup ++# ++CONFIG_NET=y ++CONFIG_SYSVIPC=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_SYSCTL=y ++CONFIG_BINFMT_AOUT=y ++CONFIG_BINFMT_ELF=y ++CONFIG_BINFMT_MISC=y ++CONFIG_HOSTFS=y ++# CONFIG_HPPFS is not set ++CONFIG_MCONSOLE=y ++CONFIG_MAGIC_SYSRQ=y ++# CONFIG_HOST_2G_2G is not set ++# CONFIG_UML_SMP is not set ++# CONFIG_SMP is not set ++CONFIG_NEST_LEVEL=0 ++CONFIG_KERNEL_HALF_GIGS=1 ++ ++# ++# Loadable module support ++# ++CONFIG_MODULES=y ++CONFIG_KMOD=y ++ ++# ++# Character Devices ++# ++CONFIG_STDIO_CONSOLE=y ++CONFIG_SSL=y ++CONFIG_FD_CHAN=y ++# CONFIG_NULL_CHAN is not set ++CONFIG_PORT_CHAN=y ++CONFIG_PTY_CHAN=y ++CONFIG_TTY_CHAN=y ++CONFIG_XTERM_CHAN=y ++CONFIG_CON_ZERO_CHAN="fd:0,fd:1" ++CONFIG_CON_CHAN="xterm" ++CONFIG_SSL_CHAN="pty" ++CONFIG_UNIX98_PTYS=y ++CONFIG_UNIX98_PTY_COUNT=256 ++# CONFIG_WATCHDOG is not set ++CONFIG_UML_SOUND=y ++CONFIG_SOUND=y 
++CONFIG_HOSTAUDIO=y ++# CONFIG_TTY_LOG is not set ++ ++# ++# Block Devices ++# ++CONFIG_BLK_DEV_UBD=y ++# CONFIG_BLK_DEV_UBD_SYNC is not set ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_NBD=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=4096 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_MMAPPER is not set ++CONFIG_NETDEVICES=y ++ ++# ++# Network Devices ++# ++CONFIG_UML_NET=y ++CONFIG_UML_NET_ETHERTAP=y ++CONFIG_UML_NET_TUNTAP=y ++CONFIG_UML_NET_SLIP=y ++CONFIG_UML_NET_DAEMON=y ++CONFIG_UML_NET_MCAST=y ++CONFIG_DUMMY=y ++CONFIG_BONDING=m ++CONFIG_EQUALIZER=m ++CONFIG_TUN=y ++CONFIG_PPP=m ++CONFIG_PPP_MULTILINK=y ++# CONFIG_PPP_ASYNC is not set ++CONFIG_PPP_SYNC_TTY=m ++CONFIG_PPP_DEFLATE=m ++CONFIG_PPP_BSDCOMP=m ++CONFIG_PPPOE=m ++CONFIG_SLIP=m ++ ++# ++# Networking options ++# ++CONFIG_PACKET=y ++CONFIG_PACKET_MMAP=y ++# CONFIG_NETLINK_DEV is not set ++# CONFIG_NETFILTER is not set ++# CONFIG_FILTER is not set ++CONFIG_UNIX=y ++CONFIG_INET=y ++# CONFIG_IP_MULTICAST is not set ++# CONFIG_IP_ADVANCED_ROUTER is not set ++# CONFIG_IP_PNP is not set ++# CONFIG_NET_IPIP is not set ++# CONFIG_NET_IPGRE is not set ++# CONFIG_ARPD is not set ++# CONFIG_INET_ECN is not set ++# CONFIG_SYN_COOKIES is not set ++# CONFIG_IPV6 is not set ++# CONFIG_KHTTPD is not set ++# CONFIG_ATM is not set ++# CONFIG_VLAN_8021Q is not set ++ ++# ++# ++# ++# CONFIG_IPX is not set ++# CONFIG_ATALK is not set ++ ++# ++# Appletalk devices ++# ++# CONFIG_DECNET is not set ++# CONFIG_BRIDGE is not set ++# CONFIG_X25 is not set ++# CONFIG_LAPB is not set ++# CONFIG_LLC is not set ++# CONFIG_NET_DIVERT is not set ++# CONFIG_ECONET is not set ++# CONFIG_WAN_ROUTER is not set ++# CONFIG_NET_FASTROUTE is not set ++# CONFIG_NET_HW_FLOWCONTROL is not set ++ ++# ++# QoS and/or fair queueing ++# ++# CONFIG_NET_SCHED is not set ++ ++# ++# Network testing ++# ++# CONFIG_NET_PKTGEN is not set ++ ++# ++# File systems ++# ++CONFIG_QUOTA=y ++CONFIG_AUTOFS_FS=m ++CONFIG_AUTOFS4_FS=m ++CONFIG_REISERFS_FS=m ++# 
CONFIG_REISERFS_CHECK is not set ++# CONFIG_REISERFS_PROC_INFO is not set ++CONFIG_ADFS_FS=m ++# CONFIG_ADFS_FS_RW is not set ++CONFIG_AFFS_FS=m ++CONFIG_HFS_FS=m ++CONFIG_BFS_FS=m ++CONFIG_EXT3_FS=y ++CONFIG_JBD=y ++# CONFIG_JBD_DEBUG is not set ++CONFIG_FAT_FS=y ++CONFIG_MSDOS_FS=y ++CONFIG_UMSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_EFS_FS=m ++CONFIG_CRAMFS=m ++CONFIG_TMPFS=y ++CONFIG_RAMFS=y ++CONFIG_ISO9660_FS=y ++# CONFIG_JOLIET is not set ++# CONFIG_ZISOFS is not set ++CONFIG_MINIX_FS=m ++CONFIG_VXFS_FS=m ++# CONFIG_NTFS_FS is not set ++CONFIG_HPFS_FS=m ++CONFIG_PROC_FS=y ++CONFIG_DEVFS_FS=y ++CONFIG_DEVFS_MOUNT=y ++# CONFIG_DEVFS_DEBUG is not set ++CONFIG_DEVPTS_FS=y ++CONFIG_QNX4FS_FS=m ++# CONFIG_QNX4FS_RW is not set ++CONFIG_ROMFS_FS=m ++CONFIG_EXT2_FS=y ++CONFIG_SYSV_FS=m ++CONFIG_UDF_FS=m ++# CONFIG_UDF_RW is not set ++CONFIG_UFS_FS=m ++# CONFIG_UFS_FS_WRITE is not set ++ ++# ++# Network File Systems ++# ++# CONFIG_CODA_FS is not set ++# CONFIG_INTERMEZZO_FS is not set ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_NFSD=y ++CONFIG_NFSD_V3=y ++CONFIG_SUNRPC=y ++CONFIG_LOCKD=y ++CONFIG_LOCKD_V4=y ++# CONFIG_SMB_FS is not set ++# CONFIG_NCP_FS is not set ++# CONFIG_ZISOFS_FS is not set ++CONFIG_ZLIB_FS_INFLATE=m ++ ++# ++# Partition Types ++# ++# CONFIG_PARTITION_ADVANCED is not set ++CONFIG_MSDOS_PARTITION=y ++# CONFIG_SMB_NLS is not set ++CONFIG_NLS=y ++ ++# ++# Native Language Support ++# ++CONFIG_NLS_DEFAULT="iso8859-1" ++# CONFIG_NLS_CODEPAGE_437 is not set ++# CONFIG_NLS_CODEPAGE_737 is not set ++# CONFIG_NLS_CODEPAGE_775 is not set ++# CONFIG_NLS_CODEPAGE_850 is not set ++# CONFIG_NLS_CODEPAGE_852 is not set ++# CONFIG_NLS_CODEPAGE_855 is not set ++# CONFIG_NLS_CODEPAGE_857 is not set ++# CONFIG_NLS_CODEPAGE_860 is not set ++# CONFIG_NLS_CODEPAGE_861 is not set ++# CONFIG_NLS_CODEPAGE_862 is not set ++# CONFIG_NLS_CODEPAGE_863 is not set ++# CONFIG_NLS_CODEPAGE_864 is not set ++# CONFIG_NLS_CODEPAGE_865 is not set ++# CONFIG_NLS_CODEPAGE_866 is not set 
++# CONFIG_NLS_CODEPAGE_869 is not set ++# CONFIG_NLS_CODEPAGE_936 is not set ++# CONFIG_NLS_CODEPAGE_950 is not set ++# CONFIG_NLS_CODEPAGE_932 is not set ++# CONFIG_NLS_CODEPAGE_949 is not set ++# CONFIG_NLS_CODEPAGE_874 is not set ++# CONFIG_NLS_ISO8859_8 is not set ++# CONFIG_NLS_CODEPAGE_1250 is not set ++# CONFIG_NLS_CODEPAGE_1251 is not set ++# CONFIG_NLS_ISO8859_1 is not set ++# CONFIG_NLS_ISO8859_2 is not set ++# CONFIG_NLS_ISO8859_3 is not set ++# CONFIG_NLS_ISO8859_4 is not set ++# CONFIG_NLS_ISO8859_5 is not set ++# CONFIG_NLS_ISO8859_6 is not set ++# CONFIG_NLS_ISO8859_7 is not set ++# CONFIG_NLS_ISO8859_9 is not set ++# CONFIG_NLS_ISO8859_13 is not set ++# CONFIG_NLS_ISO8859_14 is not set ++# CONFIG_NLS_ISO8859_15 is not set ++# CONFIG_NLS_KOI8_R is not set ++# CONFIG_NLS_KOI8_U is not set ++# CONFIG_NLS_UTF8 is not set ++ ++# ++# SCSI support ++# ++CONFIG_SCSI=y ++ ++# ++# SCSI support type (disk, tape, CD-ROM) ++# ++# CONFIG_BLK_DEV_SD is not set ++# CONFIG_CHR_DEV_ST is not set ++# CONFIG_BLK_DEV_SR is not set ++# CONFIG_CHR_DEV_SG is not set ++ ++# ++# Some SCSI devices (e.g. 
CD jukebox) support multiple LUNs ++# ++# CONFIG_SCSI_DEBUG_QUEUES is not set ++# CONFIG_SCSI_MULTI_LUN is not set ++# CONFIG_SCSI_CONSTANTS is not set ++# CONFIG_SCSI_LOGGING is not set ++CONFIG_SCSI_DEBUG=m ++ ++# ++# Multi-device support (RAID and LVM) ++# ++# CONFIG_MD is not set ++ ++# ++# Memory Technology Devices (MTD) ++# ++# CONFIG_MTD is not set ++ ++# ++# Kernel hacking ++# ++# CONFIG_DEBUG_SLAB is not set ++# CONFIG_DEBUGSYM is not set +diff -Naur -X ../exclude-files orig/arch/um/config_block.in um/arch/um/config_block.in +--- orig/arch/um/config_block.in Wed Dec 31 19:00:00 1969 ++++ um/arch/um/config_block.in Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,16 @@ ++mainmenu_option next_comment ++comment 'Block Devices' ++ ++bool 'Virtual block device' CONFIG_BLK_DEV_UBD ++dep_bool ' Always do synchronous disk IO for UBD' CONFIG_BLK_DEV_UBD_SYNC $CONFIG_BLK_DEV_UBD ++tristate 'Loopback device support' CONFIG_BLK_DEV_LOOP ++dep_tristate 'Network block device support' CONFIG_BLK_DEV_NBD $CONFIG_NET ++tristate 'RAM disk support' CONFIG_BLK_DEV_RAM ++if [ "$CONFIG_BLK_DEV_RAM" = "y" -o "$CONFIG_BLK_DEV_RAM" = "m" ]; then ++ int ' Default RAM disk size' CONFIG_BLK_DEV_RAM_SIZE 4096 ++fi ++dep_bool ' Initial RAM disk (initrd) support' CONFIG_BLK_DEV_INITRD $CONFIG_BLK_DEV_RAM ++ ++tristate 'Example IO memory driver' CONFIG_MMAPPER ++ ++endmenu +diff -Naur -X ../exclude-files orig/arch/um/config_char.in um/arch/um/config_char.in +--- orig/arch/um/config_char.in Wed Dec 31 19:00:00 1969 ++++ um/arch/um/config_char.in Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,37 @@ ++mainmenu_option next_comment ++comment 'Character Devices' ++ ++define_bool CONFIG_STDIO_CONSOLE y ++ ++bool 'Virtual serial line' CONFIG_SSL ++ ++bool 'file descriptor channel support' CONFIG_FD_CHAN ++bool 'null channel support' CONFIG_NULL_CHAN ++bool 'port channel support' CONFIG_PORT_CHAN ++bool 'pty channel support' CONFIG_PTY_CHAN ++bool 'tty channel support' CONFIG_TTY_CHAN ++bool 'xterm channel support' 
CONFIG_XTERM_CHAN ++string 'Default main console channel initialization' CONFIG_CON_ZERO_CHAN \ ++ "fd:0,fd:1" ++string 'Default console channel initialization' CONFIG_CON_CHAN "xterm" ++string 'Default serial line channel initialization' CONFIG_SSL_CHAN "pty" ++ ++ ++bool 'Unix98 PTY support' CONFIG_UNIX98_PTYS ++if [ "$CONFIG_UNIX98_PTYS" = "y" ]; then ++ int 'Maximum number of Unix98 PTYs in use (0-2048)' CONFIG_UNIX98_PTY_COUNT 256 ++fi ++ ++bool 'Watchdog Timer Support' CONFIG_WATCHDOG ++dep_bool ' Disable watchdog shutdown on close' CONFIG_WATCHDOG_NOWAYOUT \ ++ $CONFIG_WATCHDOG ++dep_tristate ' Software Watchdog' CONFIG_SOFT_WATCHDOG $CONFIG_WATCHDOG ++dep_tristate ' UML watchdog' CONFIG_UML_WATCHDOG $CONFIG_WATCHDOG ++ ++tristate 'Sound support' CONFIG_UML_SOUND ++define_tristate CONFIG_SOUND $CONFIG_UML_SOUND ++define_tristate CONFIG_HOSTAUDIO $CONFIG_UML_SOUND ++ ++bool 'Enable tty logging' CONFIG_TTY_LOG ++ ++endmenu +diff -Naur -X ../exclude-files orig/arch/um/config_net.in um/arch/um/config_net.in +--- orig/arch/um/config_net.in Wed Dec 31 19:00:00 1969 ++++ um/arch/um/config_net.in Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,47 @@ ++mainmenu_option next_comment ++comment 'Network Devices' ++ ++# UML virtual driver ++bool 'Virtual network device' CONFIG_UML_NET ++ ++dep_bool ' Ethertap transport' CONFIG_UML_NET_ETHERTAP $CONFIG_UML_NET ++dep_bool ' TUN/TAP transport' CONFIG_UML_NET_TUNTAP $CONFIG_UML_NET ++dep_bool ' SLIP transport' CONFIG_UML_NET_SLIP $CONFIG_UML_NET ++dep_bool ' SLiRP transport' CONFIG_UML_NET_SLIRP $CONFIG_UML_NET ++dep_bool ' Daemon transport' CONFIG_UML_NET_DAEMON $CONFIG_UML_NET ++dep_bool ' Multicast transport' CONFIG_UML_NET_MCAST $CONFIG_UML_NET ++dep_bool ' pcap transport' CONFIG_UML_NET_PCAP $CONFIG_UML_NET ++ ++# Below are hardware-independent drivers mirrored from ++# drivers/net/Config.in. It would be nice if Linux ++# had HW independent drivers separated from the other ++# but it does not. 
Until then each non-ISA/PCI arch ++# needs to provide its own menu of network drivers ++ ++tristate 'Dummy net driver support' CONFIG_DUMMY ++tristate 'Bonding driver support' CONFIG_BONDING ++tristate 'EQL (serial line load balancing) support' CONFIG_EQUALIZER ++tristate 'Universal TUN/TAP device driver support' CONFIG_TUN ++if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then ++ if [ "$CONFIG_NETLINK" = "y" ]; then ++ tristate 'Ethertap network tap (OBSOLETE)' CONFIG_ETHERTAP ++ fi ++fi ++ ++tristate 'PPP (point-to-point protocol) support' CONFIG_PPP ++if [ ! "$CONFIG_PPP" = "n" ]; then ++ dep_bool ' PPP multilink support (EXPERIMENTAL)' CONFIG_PPP_MULTILINK $CONFIG_EXPERIMENTAL ++ dep_bool ' PPP filtering' CONFIG_PPP_FILTER $CONFIG_FILTER ++ dep_tristate ' PPP support for async serial ports' CONFIG_PPP_ASYNC $CONFIG_PPP ++ dep_tristate ' PPP support for sync tty ports' CONFIG_PPP_SYNC_TTY $CONFIG_PPP ++ dep_tristate ' PPP Deflate compression' CONFIG_PPP_DEFLATE $CONFIG_PPP ++ dep_tristate ' PPP BSD-Compress compression' CONFIG_PPP_BSDCOMP $CONFIG_PPP ++ dep_tristate ' PPP over Ethernet (EXPERIMENTAL)' CONFIG_PPPOE $CONFIG_PPP $CONFIG_EXPERIMENTAL ++fi ++ ++tristate 'SLIP (serial line) support' CONFIG_SLIP ++dep_bool ' CSLIP compressed headers' CONFIG_SLIP_COMPRESSED $CONFIG_SLIP ++dep_bool ' Keepalive and linefill' CONFIG_SLIP_SMART $CONFIG_SLIP ++dep_bool ' Six bit SLIP encapsulation' CONFIG_SLIP_MODE_SLIP6 $CONFIG_SLIP ++ ++endmenu +diff -Naur -X ../exclude-files orig/arch/um/config_scsi.in um/arch/um/config_scsi.in +--- orig/arch/um/config_scsi.in Wed Dec 31 19:00:00 1969 ++++ um/arch/um/config_scsi.in Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,30 @@ ++comment 'SCSI support type (disk, tape, CD-ROM)' ++ ++dep_tristate ' SCSI disk support' CONFIG_BLK_DEV_SD $CONFIG_SCSI ++ ++if [ "$CONFIG_BLK_DEV_SD" != "n" ]; then ++ int 'Maximum number of SCSI disks that can be loaded as modules' CONFIG_SD_EXTRA_DEVS 40 ++fi ++ ++dep_tristate ' SCSI tape support' CONFIG_CHR_DEV_ST
$CONFIG_SCSI ++ ++dep_tristate ' SCSI CD-ROM support' CONFIG_BLK_DEV_SR $CONFIG_SCSI ++ ++if [ "$CONFIG_BLK_DEV_SR" != "n" ]; then ++ bool ' Enable vendor-specific extensions (for SCSI CDROM)' CONFIG_BLK_DEV_SR_VENDOR ++ int 'Maximum number of CDROM devices that can be loaded as modules' CONFIG_SR_EXTRA_DEVS 2 ++fi ++dep_tristate ' SCSI generic support' CONFIG_CHR_DEV_SG $CONFIG_SCSI ++ ++comment 'Some SCSI devices (e.g. CD jukebox) support multiple LUNs' ++ ++#if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then ++ bool ' Enable extra checks in new queueing code' CONFIG_SCSI_DEBUG_QUEUES ++#fi ++ ++bool ' Probe all LUNs on each SCSI device' CONFIG_SCSI_MULTI_LUN ++ ++bool ' Verbose SCSI error reporting (kernel size +=12K)' CONFIG_SCSI_CONSTANTS ++bool ' SCSI logging facility' CONFIG_SCSI_LOGGING ++ ++dep_tristate 'SCSI debugging host simulator (EXPERIMENTAL)' CONFIG_SCSI_DEBUG $CONFIG_SCSI +diff -Naur -X ../exclude-files orig/arch/um/defconfig um/arch/um/defconfig +--- orig/arch/um/defconfig Wed Dec 31 19:00:00 1969 ++++ um/arch/um/defconfig Mon Jan 20 11:26:54 2003 +@@ -0,0 +1,396 @@ ++# ++# Automatically generated make config: don't edit ++# ++CONFIG_USERMODE=y ++# CONFIG_ISA is not set ++# CONFIG_SBUS is not set ++# CONFIG_PCI is not set ++CONFIG_UID16=y ++CONFIG_RWSEM_XCHGADD_ALGORITHM=y ++ ++# ++# Code maturity level options ++# ++CONFIG_EXPERIMENTAL=y ++ ++# ++# General Setup ++# ++CONFIG_MODE_TT=y ++CONFIG_MODE_SKAS=y ++CONFIG_NET=y ++CONFIG_SYSVIPC=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_SYSCTL=y ++CONFIG_BINFMT_AOUT=y ++CONFIG_BINFMT_ELF=y ++CONFIG_BINFMT_MISC=y ++CONFIG_HOSTFS=y ++CONFIG_HPPFS=y ++CONFIG_MCONSOLE=y ++CONFIG_MAGIC_SYSRQ=y ++# CONFIG_HOST_2G_2G is not set ++# CONFIG_UML_SMP is not set ++# CONFIG_SMP is not set ++CONFIG_NEST_LEVEL=0 ++CONFIG_KERNEL_HALF_GIGS=1 ++# CONFIG_HIGHMEM is not set ++CONFIG_PROC_MM=y ++CONFIG_KERNEL_STACK_ORDER=2 ++ ++# ++# Loadable module support ++# ++CONFIG_MODULES=y ++# CONFIG_KMOD is not set ++ ++# ++# Character Devices 
++# ++CONFIG_STDIO_CONSOLE=y ++CONFIG_SSL=y ++CONFIG_FD_CHAN=y ++CONFIG_NULL_CHAN=y ++CONFIG_PORT_CHAN=y ++CONFIG_PTY_CHAN=y ++CONFIG_TTY_CHAN=y ++CONFIG_XTERM_CHAN=y ++CONFIG_CON_ZERO_CHAN="fd:0,fd:1" ++CONFIG_CON_CHAN="xterm" ++CONFIG_SSL_CHAN="pty" ++CONFIG_UNIX98_PTYS=y ++CONFIG_UNIX98_PTY_COUNT=256 ++# CONFIG_WATCHDOG is not set ++# CONFIG_WATCHDOG_NOWAYOUT is not set ++# CONFIG_SOFT_WATCHDOG is not set ++# CONFIG_UML_WATCHDOG is not set ++CONFIG_UML_SOUND=y ++CONFIG_SOUND=y ++CONFIG_HOSTAUDIO=y ++# CONFIG_TTY_LOG is not set ++ ++# ++# Block Devices ++# ++CONFIG_BLK_DEV_UBD=y ++# CONFIG_BLK_DEV_UBD_SYNC is not set ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_NBD=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=4096 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_MMAPPER is not set ++CONFIG_NETDEVICES=y ++ ++# ++# Network Devices ++# ++CONFIG_UML_NET=y ++CONFIG_UML_NET_ETHERTAP=y ++CONFIG_UML_NET_TUNTAP=y ++CONFIG_UML_NET_SLIP=y ++CONFIG_UML_NET_SLIRP=y ++CONFIG_UML_NET_DAEMON=y ++CONFIG_UML_NET_MCAST=y ++# CONFIG_UML_NET_PCAP is not set ++CONFIG_DUMMY=y ++# CONFIG_BONDING is not set ++# CONFIG_EQUALIZER is not set ++CONFIG_TUN=y ++CONFIG_PPP=y ++# CONFIG_PPP_MULTILINK is not set ++# CONFIG_PPP_FILTER is not set ++# CONFIG_PPP_ASYNC is not set ++# CONFIG_PPP_SYNC_TTY is not set ++# CONFIG_PPP_DEFLATE is not set ++# CONFIG_PPP_BSDCOMP is not set ++# CONFIG_PPPOE is not set ++CONFIG_SLIP=y ++# CONFIG_SLIP_COMPRESSED is not set ++# CONFIG_SLIP_SMART is not set ++# CONFIG_SLIP_MODE_SLIP6 is not set ++ ++# ++# Networking options ++# ++CONFIG_PACKET=y ++CONFIG_PACKET_MMAP=y ++# CONFIG_NETLINK_DEV is not set ++# CONFIG_NETFILTER is not set ++# CONFIG_FILTER is not set ++CONFIG_UNIX=y ++CONFIG_INET=y ++# CONFIG_IP_MULTICAST is not set ++# CONFIG_IP_ADVANCED_ROUTER is not set ++# CONFIG_IP_PNP is not set ++# CONFIG_NET_IPIP is not set ++# CONFIG_NET_IPGRE is not set ++# CONFIG_ARPD is not set ++# CONFIG_INET_ECN is not set ++# CONFIG_SYN_COOKIES is not set ++# CONFIG_IPV6 is not set 
++# CONFIG_KHTTPD is not set ++# CONFIG_ATM is not set ++# CONFIG_VLAN_8021Q is not set ++ ++# ++# ++# ++# CONFIG_IPX is not set ++# CONFIG_ATALK is not set ++ ++# ++# Appletalk devices ++# ++# CONFIG_DEV_APPLETALK is not set ++# CONFIG_DECNET is not set ++# CONFIG_BRIDGE is not set ++# CONFIG_X25 is not set ++# CONFIG_LAPB is not set ++# CONFIG_LLC is not set ++# CONFIG_NET_DIVERT is not set ++# CONFIG_ECONET is not set ++# CONFIG_WAN_ROUTER is not set ++# CONFIG_NET_FASTROUTE is not set ++# CONFIG_NET_HW_FLOWCONTROL is not set ++ ++# ++# QoS and/or fair queueing ++# ++# CONFIG_NET_SCHED is not set ++ ++# ++# Network testing ++# ++# CONFIG_NET_PKTGEN is not set ++ ++# ++# File systems ++# ++CONFIG_QUOTA=y ++CONFIG_AUTOFS_FS=y ++CONFIG_AUTOFS4_FS=y ++CONFIG_REISERFS_FS=y ++# CONFIG_REISERFS_CHECK is not set ++# CONFIG_REISERFS_PROC_INFO is not set ++# CONFIG_ADFS_FS is not set ++# CONFIG_ADFS_FS_RW is not set ++# CONFIG_AFFS_FS is not set ++# CONFIG_HFS_FS is not set ++# CONFIG_BFS_FS is not set ++# CONFIG_EXT3_FS is not set ++# CONFIG_JBD is not set ++# CONFIG_JBD_DEBUG is not set ++CONFIG_FAT_FS=y ++CONFIG_MSDOS_FS=y ++CONFIG_UMSDOS_FS=y ++CONFIG_VFAT_FS=y ++# CONFIG_EFS_FS is not set ++CONFIG_JFFS_FS=y ++CONFIG_JFFS_FS_VERBOSE=0 ++CONFIG_JFFS_PROC_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_JFFS2_FS_DEBUG=0 ++# CONFIG_CRAMFS is not set ++# CONFIG_TMPFS is not set ++CONFIG_RAMFS=y ++CONFIG_ISO9660_FS=y ++# CONFIG_JOLIET is not set ++# CONFIG_ZISOFS is not set ++CONFIG_MINIX_FS=y ++# CONFIG_VXFS_FS is not set ++# CONFIG_NTFS_FS is not set ++# CONFIG_NTFS_RW is not set ++# CONFIG_HPFS_FS is not set ++CONFIG_PROC_FS=y ++CONFIG_DEVFS_FS=y ++CONFIG_DEVFS_MOUNT=y ++# CONFIG_DEVFS_DEBUG is not set ++CONFIG_DEVPTS_FS=y ++# CONFIG_QNX4FS_FS is not set ++# CONFIG_QNX4FS_RW is not set ++# CONFIG_ROMFS_FS is not set ++CONFIG_EXT2_FS=y ++# CONFIG_SYSV_FS is not set ++# CONFIG_UDF_FS is not set ++# CONFIG_UDF_RW is not set ++# CONFIG_UFS_FS is not set ++# CONFIG_UFS_FS_WRITE is not set 
++ ++# ++# Network File Systems ++# ++# CONFIG_CODA_FS is not set ++# CONFIG_INTERMEZZO_FS is not set ++# CONFIG_NFS_FS is not set ++# CONFIG_NFS_V3 is not set ++# CONFIG_ROOT_NFS is not set ++# CONFIG_NFSD is not set ++# CONFIG_NFSD_V3 is not set ++# CONFIG_SUNRPC is not set ++# CONFIG_LOCKD is not set ++# CONFIG_SMB_FS is not set ++# CONFIG_NCP_FS is not set ++# CONFIG_NCPFS_PACKET_SIGNING is not set ++# CONFIG_NCPFS_IOCTL_LOCKING is not set ++# CONFIG_NCPFS_STRONG is not set ++# CONFIG_NCPFS_NFS_NS is not set ++# CONFIG_NCPFS_OS2_NS is not set ++# CONFIG_NCPFS_SMALLDOS is not set ++# CONFIG_NCPFS_NLS is not set ++# CONFIG_NCPFS_EXTRAS is not set ++# CONFIG_ZISOFS_FS is not set ++# CONFIG_ZLIB_FS_INFLATE is not set ++ ++# ++# Partition Types ++# ++# CONFIG_PARTITION_ADVANCED is not set ++CONFIG_MSDOS_PARTITION=y ++# CONFIG_SMB_NLS is not set ++CONFIG_NLS=y ++ ++# ++# Native Language Support ++# ++CONFIG_NLS_DEFAULT="iso8859-1" ++# CONFIG_NLS_CODEPAGE_437 is not set ++# CONFIG_NLS_CODEPAGE_737 is not set ++# CONFIG_NLS_CODEPAGE_775 is not set ++# CONFIG_NLS_CODEPAGE_850 is not set ++# CONFIG_NLS_CODEPAGE_852 is not set ++# CONFIG_NLS_CODEPAGE_855 is not set ++# CONFIG_NLS_CODEPAGE_857 is not set ++# CONFIG_NLS_CODEPAGE_860 is not set ++# CONFIG_NLS_CODEPAGE_861 is not set ++# CONFIG_NLS_CODEPAGE_862 is not set ++# CONFIG_NLS_CODEPAGE_863 is not set ++# CONFIG_NLS_CODEPAGE_864 is not set ++# CONFIG_NLS_CODEPAGE_865 is not set ++# CONFIG_NLS_CODEPAGE_866 is not set ++# CONFIG_NLS_CODEPAGE_869 is not set ++# CONFIG_NLS_CODEPAGE_936 is not set ++# CONFIG_NLS_CODEPAGE_950 is not set ++# CONFIG_NLS_CODEPAGE_932 is not set ++# CONFIG_NLS_CODEPAGE_949 is not set ++# CONFIG_NLS_CODEPAGE_874 is not set ++# CONFIG_NLS_ISO8859_8 is not set ++# CONFIG_NLS_CODEPAGE_1250 is not set ++# CONFIG_NLS_CODEPAGE_1251 is not set ++# CONFIG_NLS_ISO8859_1 is not set ++# CONFIG_NLS_ISO8859_2 is not set ++# CONFIG_NLS_ISO8859_3 is not set ++# CONFIG_NLS_ISO8859_4 is not set ++# 
CONFIG_NLS_ISO8859_5 is not set ++# CONFIG_NLS_ISO8859_6 is not set ++# CONFIG_NLS_ISO8859_7 is not set ++# CONFIG_NLS_ISO8859_9 is not set ++# CONFIG_NLS_ISO8859_13 is not set ++# CONFIG_NLS_ISO8859_14 is not set ++# CONFIG_NLS_ISO8859_15 is not set ++# CONFIG_NLS_KOI8_R is not set ++# CONFIG_NLS_KOI8_U is not set ++# CONFIG_NLS_UTF8 is not set ++ ++# ++# SCSI support ++# ++CONFIG_SCSI=y ++ ++# ++# SCSI support type (disk, tape, CD-ROM) ++# ++# CONFIG_BLK_DEV_SD is not set ++# CONFIG_CHR_DEV_ST is not set ++# CONFIG_BLK_DEV_SR is not set ++# CONFIG_CHR_DEV_SG is not set ++ ++# ++# Some SCSI devices (e.g. CD jukebox) support multiple LUNs ++# ++# CONFIG_SCSI_DEBUG_QUEUES is not set ++# CONFIG_SCSI_MULTI_LUN is not set ++# CONFIG_SCSI_CONSTANTS is not set ++# CONFIG_SCSI_LOGGING is not set ++CONFIG_SCSI_DEBUG=y ++ ++# ++# Multi-device support (RAID and LVM) ++# ++# CONFIG_MD is not set ++# CONFIG_BLK_DEV_MD is not set ++# CONFIG_MD_LINEAR is not set ++# CONFIG_MD_RAID0 is not set ++# CONFIG_MD_RAID1 is not set ++# CONFIG_MD_RAID5 is not set ++# CONFIG_MD_MULTIPATH is not set ++# CONFIG_BLK_DEV_LVM is not set ++ ++# ++# Memory Technology Devices (MTD) ++# ++CONFIG_MTD=y ++# CONFIG_MTD_DEBUG is not set ++# CONFIG_MTD_PARTITIONS is not set ++# CONFIG_MTD_CONCAT is not set ++# CONFIG_MTD_REDBOOT_PARTS is not set ++ ++# ++# User Modules And Translation Layers ++# ++CONFIG_MTD_CHAR=y ++CONFIG_MTD_BLOCK=y ++# CONFIG_FTL is not set ++# CONFIG_NFTL is not set ++ ++# ++# RAM/ROM/Flash chip drivers ++# ++# CONFIG_MTD_CFI is not set ++# CONFIG_MTD_JEDECPROBE is not set ++# CONFIG_MTD_GEN_PROBE is not set ++# CONFIG_MTD_CFI_INTELEXT is not set ++# CONFIG_MTD_CFI_AMDSTD is not set ++# CONFIG_MTD_RAM is not set ++# CONFIG_MTD_ROM is not set ++# CONFIG_MTD_ABSENT is not set ++# CONFIG_MTD_OBSOLETE_CHIPS is not set ++# CONFIG_MTD_AMDSTD is not set ++# CONFIG_MTD_SHARP is not set ++# CONFIG_MTD_JEDEC is not set ++ ++# ++# Mapping drivers for chip access ++# ++# CONFIG_MTD_PHYSMAP is 
not set ++# CONFIG_MTD_PCI is not set ++ ++# ++# Self-contained MTD device drivers ++# ++# CONFIG_MTD_PMC551 is not set ++# CONFIG_MTD_SLRAM is not set ++# CONFIG_MTD_MTDRAM is not set ++CONFIG_MTD_BLKMTD=y ++ ++# ++# Disk-On-Chip Device Drivers ++# ++# CONFIG_MTD_DOC1000 is not set ++# CONFIG_MTD_DOC2000 is not set ++# CONFIG_MTD_DOC2001 is not set ++# CONFIG_MTD_DOCPROBE is not set ++ ++# ++# NAND Flash Device Drivers ++# ++# CONFIG_MTD_NAND is not set ++ ++# ++# Kernel hacking ++# ++# CONFIG_DEBUG_SLAB is not set ++CONFIG_DEBUGSYM=y ++CONFIG_PT_PROXY=y ++# CONFIG_GPROF is not set ++# CONFIG_GCOV is not set +diff -Naur -X ../exclude-files orig/arch/um/drivers/Makefile um/arch/um/drivers/Makefile +--- orig/arch/um/drivers/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/Makefile Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,94 @@ ++# ++# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++O_TARGET := built-in.o ++ ++CHAN_OBJS := chan_kern.o chan_user.o line.o ++ ++list-multi := slip.o slirp.o daemon.o mcast.o mconsole.o net.o ubd.o \ ++ hostaudio.o pcap.o port.o harddog.o ++ ++slip-objs := slip_kern.o slip_user.o ++slirp-objs := slirp_kern.o slirp_user.o ++daemon-objs := daemon_kern.o daemon_user.o ++mcast-objs := mcast_kern.o mcast_user.o ++pcap-objs := pcap_kern.o pcap_user.o -lpcap -L/usr/lib ++net-objs := net_kern.o net_user.o ++mconsole-objs := mconsole_kern.o mconsole_user.o ++hostaudio-objs := hostaudio_kern.o hostaudio_user.o ++ubd-objs := ubd_kern.o ubd_user.o ++port-objs := port_kern.o port_user.o ++harddog-objs := harddog_kern.o harddog_user.o ++ ++export-objs := mconsole_kern.o ++ ++obj-y = ++obj-$(CONFIG_SSL) += ssl.o ++obj-$(CONFIG_UML_NET_SLIP) += slip.o ++obj-$(CONFIG_UML_NET_SLIRP) += slirp.o ++obj-$(CONFIG_UML_NET_DAEMON) += daemon.o ++obj-$(CONFIG_UML_NET_MCAST) += mcast.o ++obj-$(CONFIG_UML_NET_PCAP) += pcap.o ++obj-$(CONFIG_UML_NET) += net.o ++obj-$(CONFIG_MCONSOLE) += mconsole.o ++obj-$(CONFIG_MMAPPER) += 
mmapper_kern.o ++obj-$(CONFIG_BLK_DEV_UBD) += ubd.o ++obj-$(CONFIG_HOSTAUDIO) += hostaudio.o ++obj-$(CONFIG_FD_CHAN) += fd.o ++obj-$(CONFIG_NULL_CHAN) += null.o ++obj-$(CONFIG_PORT_CHAN) += port.o ++obj-$(CONFIG_PTY_CHAN) += pty.o ++obj-$(CONFIG_TTY_CHAN) += tty.o ++obj-$(CONFIG_XTERM_CHAN) += xterm.o xterm_kern.o ++obj-$(CONFIG_UML_WATCHDOG) += harddog.o ++ ++CFLAGS_pcap_user.o = -I/usr/include/pcap ++ ++obj-y += stdio_console.o $(CHAN_OBJS) ++ ++USER_SINGLE_OBJS = $(foreach f,$(patsubst %.o,%,$(obj-y) $(obj-m)),$($(f)-objs)) ++ ++USER_OBJS = $(filter %_user.o,$(obj-y) $(obj-m) $(USER_SINGLE_OBJS)) fd.o \ ++ null.o pty.o tty.o xterm.o ++ ++include $(TOPDIR)/Rules.make ++ ++$(USER_OBJS) : %.o: %.c ++ $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $< ++ ++clean: ++ ++modules: ++ ++fastdep: ++ ++dep: ++ ++archmrproper: ++ ++daemon.o : $(daemon-objs) ++ ++slip.o : $(slip-objs) ++ ++slirp.o : $(slirp-objs) ++ ++mcast.o : $(mcast-objs) ++ ++pcap.o : $(pcap-objs) ++ ++mconsole.o : $(mconsole-objs) ++ ++net.o : $(net-objs) ++ ++hostaudio.o : $(hostaudio-objs) ++ ++ubd.o : $(ubd-objs) ++ ++port.o : $(port-objs) ++ ++harddog.o : $(harddog-objs) ++ ++$(list-multi) : # This doesn't work, but should : '%.o : $(%-objs)' ++ $(LD) $(LD_RFLAG) -r -o $@ $($(patsubst %.o,%,$@)-objs) +diff -Naur -X ../exclude-files orig/arch/um/drivers/chan_kern.c um/arch/um/drivers/chan_kern.c +--- orig/arch/um/drivers/chan_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/chan_kern.c Thu Mar 6 19:25:16 2003 +@@ -0,0 +1,510 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "chan_kern.h" ++#include "user_util.h" ++#include "kern.h" ++#include "irq_user.h" ++#include "sigio.h" ++#include "line.h" ++ ++static void *not_configged_init(char *str, int device, struct chan_opts *opts) ++{ ++ printk(KERN_ERR "Using a channel type which is configured 
out of " ++ "UML\n"); ++ return(NULL); ++} ++ ++static int not_configged_open(int input, int output, int primary, void *data, ++ char **dev_out) ++{ ++ printk(KERN_ERR "Using a channel type which is configured out of " ++ "UML\n"); ++ return(-ENODEV); ++} ++ ++static void not_configged_close(int fd, void *data) ++{ ++ printk(KERN_ERR "Using a channel type which is configured out of " ++ "UML\n"); ++} ++ ++static int not_configged_read(int fd, char *c_out, void *data) ++{ ++ printk(KERN_ERR "Using a channel type which is configured out of " ++ "UML\n"); ++ return(-EIO); ++} ++ ++static int not_configged_write(int fd, const char *buf, int len, void *data) ++{ ++ printk(KERN_ERR "Using a channel type which is configured out of " ++ "UML\n"); ++ return(-EIO); ++} ++ ++static int not_configged_console_write(int fd, const char *buf, int len, ++ void *data) ++{ ++ printk(KERN_ERR "Using a channel type which is configured out of " ++ "UML\n"); ++ return(-EIO); ++} ++ ++static int not_configged_window_size(int fd, void *data, unsigned short *rows, ++ unsigned short *cols) ++{ ++ printk(KERN_ERR "Using a channel type which is configured out of " ++ "UML\n"); ++ return(-ENODEV); ++} ++ ++static void not_configged_free(void *data) ++{ ++ printk(KERN_ERR "Using a channel type which is configured out of " ++ "UML\n"); ++} ++ ++static struct chan_ops not_configged_ops = { ++ .init = not_configged_init, ++ .open = not_configged_open, ++ .close = not_configged_close, ++ .read = not_configged_read, ++ .write = not_configged_write, ++ .console_write = not_configged_console_write, ++ .window_size = not_configged_window_size, ++ .free = not_configged_free, ++ .winch = 0, ++}; ++ ++static void tty_receive_char(struct tty_struct *tty, char ch) ++{ ++ if(tty == NULL) return; ++ ++ if(I_IXON(tty) && !I_IXOFF(tty) && !tty->raw) { ++ if(ch == STOP_CHAR(tty)){ ++ stop_tty(tty); ++ return; ++ } ++ else if(ch == START_CHAR(tty)){ ++ start_tty(tty); ++ return; ++ } ++ } ++ ++ 
if((tty->flip.flag_buf_ptr == NULL) || ++ (tty->flip.char_buf_ptr == NULL)) ++ return; ++ tty_insert_flip_char(tty, ch, TTY_NORMAL); ++} ++ ++static int open_one_chan(struct chan *chan, int input, int output, int primary) ++{ ++ int fd; ++ ++ if(chan->opened) return(0); ++ if(chan->ops->open == NULL) fd = 0; ++ else fd = (*chan->ops->open)(input, output, primary, chan->data, ++ &chan->dev); ++ if(fd < 0) return(fd); ++ chan->fd = fd; ++ ++ chan->opened = 1; ++ return(0); ++} ++ ++int open_chan(struct list_head *chans) ++{ ++ struct list_head *ele; ++ struct chan *chan; ++ int ret, err = 0; ++ ++ list_for_each(ele, chans){ ++ chan = list_entry(ele, struct chan, list); ++ ret = open_one_chan(chan, chan->input, chan->output, ++ chan->primary); ++ if(chan->primary) err = ret; ++ } ++ return(err); ++} ++ ++void chan_enable_winch(struct list_head *chans, void *line) ++{ ++ struct list_head *ele; ++ struct chan *chan; ++ ++ list_for_each(ele, chans){ ++ chan = list_entry(ele, struct chan, list); ++ if(chan->primary && chan->output && chan->ops->winch){ ++ register_winch(chan->fd, line); ++ return; ++ } ++ } ++} ++ ++void enable_chan(struct list_head *chans, void *data) ++{ ++ struct list_head *ele; ++ struct chan *chan; ++ ++ list_for_each(ele, chans){ ++ chan = list_entry(ele, struct chan, list); ++ if(!chan->opened) continue; ++ ++ line_setup_irq(chan->fd, chan->input, chan->output, data); ++ } ++} ++ ++void close_chan(struct list_head *chans) ++{ ++ struct list_head *ele; ++ struct chan *chan; ++ ++ /* Close in reverse order as open in case more than one of them ++ * refers to the same device and they save and restore that device's ++ * state. Then, the first one opened will have the original state, ++ * so it must be the last closed. 
++ */ ++ for(ele = chans->prev; ele != chans; ele = ele->prev){ ++ chan = list_entry(ele, struct chan, list); ++ if(!chan->opened) continue; ++ if(chan->ops->close != NULL) ++ (*chan->ops->close)(chan->fd, chan->data); ++ chan->opened = 0; ++ chan->fd = -1; ++ } ++} ++ ++int write_chan(struct list_head *chans, const char *buf, int len, ++ int write_irq) ++{ ++ struct list_head *ele; ++ struct chan *chan; ++ int n, ret = 0; ++ ++ list_for_each(ele, chans){ ++ chan = list_entry(ele, struct chan, list); ++ if(!chan->output || (chan->ops->write == NULL)) continue; ++ n = chan->ops->write(chan->fd, buf, len, chan->data); ++ if(chan->primary){ ++ ret = n; ++ if((ret == -EAGAIN) || ((ret >= 0) && (ret < len))){ ++ reactivate_fd(chan->fd, write_irq); ++ if(ret == -EAGAIN) ret = 0; ++ } ++ } ++ } ++ return(ret); ++} ++ ++int console_write_chan(struct list_head *chans, const char *buf, int len) ++{ ++ struct list_head *ele; ++ struct chan *chan; ++ int n, ret = 0; ++ ++ list_for_each(ele, chans){ ++ chan = list_entry(ele, struct chan, list); ++ if(!chan->output || (chan->ops->console_write == NULL)) ++ continue; ++ n = chan->ops->console_write(chan->fd, buf, len, chan->data); ++ if(chan->primary) ret = n; ++ } ++ return(ret); ++} ++ ++int chan_window_size(struct list_head *chans, unsigned short *rows_out, ++ unsigned short *cols_out) ++{ ++ struct list_head *ele; ++ struct chan *chan; ++ ++ list_for_each(ele, chans){ ++ chan = list_entry(ele, struct chan, list); ++ if(chan->primary){ ++ if(chan->ops->window_size == NULL) return(0); ++ return(chan->ops->window_size(chan->fd, chan->data, ++ rows_out, cols_out)); ++ } ++ } ++ return(0); ++} ++ ++void free_one_chan(struct chan *chan) ++{ ++ list_del(&chan->list); ++ if(chan->ops->free != NULL) ++ (*chan->ops->free)(chan->data); ++ free_irq_by_fd(chan->fd); ++ if(chan->primary && chan->output) ignore_sigio_fd(chan->fd); ++ kfree(chan); ++} ++ ++void free_chan(struct list_head *chans) ++{ ++ struct list_head *ele, *next; ++ struct 
chan *chan; ++ ++ list_for_each_safe(ele, next, chans){ ++ chan = list_entry(ele, struct chan, list); ++ free_one_chan(chan); ++ } ++} ++ ++static int one_chan_config_string(struct chan *chan, char *str, int size, ++ char **error_out) ++{ ++ int n = 0; ++ ++ CONFIG_CHUNK(str, size, n, chan->ops->type, 0); ++ ++ if(chan->dev == NULL){ ++ CONFIG_CHUNK(str, size, n, "", 1); ++ return(n); ++ } ++ ++ CONFIG_CHUNK(str, size, n, ":", 0); ++ CONFIG_CHUNK(str, size, n, chan->dev, 0); ++ ++ return(n); ++} ++ ++static int chan_pair_config_string(struct chan *in, struct chan *out, ++ char *str, int size, char **error_out) ++{ ++ int n; ++ ++ n = one_chan_config_string(in, str, size, error_out); ++ str += n; ++ size -= n; ++ ++ if(in == out){ ++ CONFIG_CHUNK(str, size, n, "", 1); ++ return(n); ++ } ++ ++ CONFIG_CHUNK(str, size, n, ",", 1); ++ n = one_chan_config_string(out, str, size, error_out); ++ str += n; ++ size -= n; ++ CONFIG_CHUNK(str, size, n, "", 1); ++ ++ return(n); ++} ++ ++int chan_config_string(struct list_head *chans, char *str, int size, ++ char **error_out) ++{ ++ struct list_head *ele; ++ struct chan *chan, *in = NULL, *out = NULL; ++ ++ list_for_each(ele, chans){ ++ chan = list_entry(ele, struct chan, list); ++ if(!chan->primary) ++ continue; ++ if(chan->input) ++ in = chan; ++ if(chan->output) ++ out = chan; ++ } ++ ++ return(chan_pair_config_string(in, out, str, size, error_out)); ++} ++ ++struct chan_type { ++ char *key; ++ struct chan_ops *ops; ++}; ++ ++struct chan_type chan_table[] = { ++#ifdef CONFIG_FD_CHAN ++ { "fd", &fd_ops }, ++#else ++ { "fd", ¬_configged_ops }, ++#endif ++ ++#ifdef CONFIG_NULL_CHAN ++ { "null", &null_ops }, ++#else ++ { "null", ¬_configged_ops }, ++#endif ++ ++#ifdef CONFIG_PORT_CHAN ++ { "port", &port_ops }, ++#else ++ { "port", ¬_configged_ops }, ++#endif ++ ++#ifdef CONFIG_PTY_CHAN ++ { "pty", &pty_ops }, ++ { "pts", &pts_ops }, ++#else ++ { "pty", ¬_configged_ops }, ++ { "pts", ¬_configged_ops }, ++#endif ++ ++#ifdef 
CONFIG_TTY_CHAN ++ { "tty", &tty_ops }, ++#else ++ { "tty", ¬_configged_ops }, ++#endif ++ ++#ifdef CONFIG_XTERM_CHAN ++ { "xterm", &xterm_ops }, ++#else ++ { "xterm", ¬_configged_ops }, ++#endif ++}; ++ ++static struct chan *parse_chan(char *str, int pri, int device, ++ struct chan_opts *opts) ++{ ++ struct chan_type *entry; ++ struct chan_ops *ops; ++ struct chan *chan; ++ void *data; ++ int i; ++ ++ ops = NULL; ++ data = NULL; ++ for(i = 0; i < sizeof(chan_table)/sizeof(chan_table[0]); i++){ ++ entry = &chan_table[i]; ++ if(!strncmp(str, entry->key, strlen(entry->key))){ ++ ops = entry->ops; ++ str += strlen(entry->key); ++ break; ++ } ++ } ++ if(ops == NULL){ ++ printk(KERN_ERR "parse_chan couldn't parse \"%s\"\n", ++ str); ++ return(NULL); ++ } ++ if(ops->init == NULL) return(NULL); ++ data = (*ops->init)(str, device, opts); ++ if(data == NULL) return(NULL); ++ ++ chan = kmalloc(sizeof(*chan), GFP_KERNEL); ++ if(chan == NULL) return(NULL); ++ *chan = ((struct chan) { .list = LIST_HEAD_INIT(chan->list), ++ .primary = 1, ++ .input = 0, ++ .output = 0, ++ .opened = 0, ++ .fd = -1, ++ .pri = pri, ++ .ops = ops, ++ .data = data }); ++ return(chan); ++} ++ ++int parse_chan_pair(char *str, struct list_head *chans, int pri, int device, ++ struct chan_opts *opts) ++{ ++ struct chan *new, *chan; ++ char *in, *out; ++ ++ if(!list_empty(chans)){ ++ chan = list_entry(chans->next, struct chan, list); ++ if(chan->pri >= pri) return(0); ++ free_chan(chans); ++ INIT_LIST_HEAD(chans); ++ } ++ ++ if((out = strchr(str, ',')) != NULL){ ++ in = str; ++ *out = '\0'; ++ out++; ++ new = parse_chan(in, pri, device, opts); ++ if(new == NULL) return(-1); ++ new->input = 1; ++ list_add(&new->list, chans); ++ ++ new = parse_chan(out, pri, device, opts); ++ if(new == NULL) return(-1); ++ list_add(&new->list, chans); ++ new->output = 1; ++ } ++ else { ++ new = parse_chan(str, pri, device, opts); ++ if(new == NULL) return(-1); ++ list_add(&new->list, chans); ++ new->input = 1; ++ new->output 
= 1; ++ } ++ return(0); ++} ++ ++int chan_out_fd(struct list_head *chans) ++{ ++ struct list_head *ele; ++ struct chan *chan; ++ ++ list_for_each(ele, chans){ ++ chan = list_entry(ele, struct chan, list); ++ if(chan->primary && chan->output) ++ return(chan->fd); ++ } ++ return(-1); ++} ++ ++void chan_interrupt(struct list_head *chans, struct tq_struct *task, ++ struct tty_struct *tty, int irq, void *dev) ++{ ++ struct list_head *ele, *next; ++ struct chan *chan; ++ int err; ++ char c; ++ ++ list_for_each_safe(ele, next, chans){ ++ chan = list_entry(ele, struct chan, list); ++ if(!chan->input || (chan->ops->read == NULL)) continue; ++ do { ++ if((tty != NULL) && ++ (tty->flip.count >= TTY_FLIPBUF_SIZE)){ ++ queue_task(task, &tq_timer); ++ goto out; ++ } ++ err = chan->ops->read(chan->fd, &c, chan->data); ++ if(err > 0) tty_receive_char(tty, c); ++ } while(err > 0); ++ if(err == 0) reactivate_fd(chan->fd, irq); ++ if(err == -EIO){ ++ if(chan->primary){ ++ if(tty != NULL) tty_hangup(tty); ++ line_disable(dev, irq); ++ close_chan(chans); ++ free_chan(chans); ++ return; ++ } ++ else { ++ if(chan->ops->close != NULL) ++ chan->ops->close(chan->fd, chan->data); ++ free_one_chan(chan); ++ } ++ } ++ } ++ out: ++ if(tty) tty_flip_buffer_push(tty); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/chan_user.c um/arch/um/drivers/chan_user.c +--- orig/arch/um/drivers/chan_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/chan_user.c Wed Mar 26 13:23:48 2003 +@@ -0,0 +1,213 @@ ++/* ++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "kern_util.h" ++#include "user_util.h" ++#include "chan_user.h" ++#include "user.h" ++#include "helper.h" ++#include "os.h" ++#include "choose-mode.h" ++#include "mode.h" ++ ++void generic_close(int fd, void *unused) ++{ ++ close(fd); ++} ++ ++int generic_read(int fd, char *c_out, void *unused) ++{ ++ int n; ++ ++ n = read(fd, c_out, sizeof(*c_out)); ++ if(n < 0){ ++ if(errno == EAGAIN) return(0); ++ return(-errno); ++ } ++ else if(n == 0) return(-EIO); ++ return(1); ++} ++ ++int generic_write(int fd, const char *buf, int n, void *unused) ++{ ++ int count; ++ ++ count = write(fd, buf, n); ++ if(count < 0) return(-errno); ++ return(count); ++} ++ ++int generic_console_write(int fd, const char *buf, int n, void *unused) ++{ ++ struct termios save, new; ++ int err; ++ ++ if(isatty(fd)){ ++ tcgetattr(fd, &save); ++ new = save; ++ new.c_oflag |= OPOST; ++ tcsetattr(fd, TCSAFLUSH, &new); ++ } ++ err = generic_write(fd, buf, n, NULL); ++ if(isatty(fd)) tcsetattr(fd, TCSAFLUSH, &save); ++ return(err); ++} ++ ++int generic_window_size(int fd, void *unused, unsigned short *rows_out, ++ unsigned short *cols_out) ++{ ++ struct winsize size; ++ int ret = 0; ++ ++ if(ioctl(fd, TIOCGWINSZ, &size) == 0){ ++ ret = ((*rows_out != size.ws_row) || ++ (*cols_out != size.ws_col)); ++ *rows_out = size.ws_row; ++ *cols_out = size.ws_col; ++ } ++ return(ret); ++} ++ ++void generic_free(void 
*data) ++{ ++ kfree(data); ++} ++ ++static void winch_handler(int sig) ++{ ++} ++ ++struct winch_data { ++ int pty_fd; ++ int pipe_fd; ++ int close_me; ++}; ++ ++static int winch_thread(void *arg) ++{ ++ struct winch_data *data = arg; ++ sigset_t sigs; ++ int pty_fd, pipe_fd; ++ char c = 1; ++ ++ close(data->close_me); ++ pty_fd = data->pty_fd; ++ pipe_fd = data->pipe_fd; ++ if(write(pipe_fd, &c, sizeof(c)) != sizeof(c)) ++ printk("winch_thread : failed to write synchronization " ++ "byte, errno = %d\n", errno); ++ ++ signal(SIGWINCH, winch_handler); ++ sigfillset(&sigs); ++ sigdelset(&sigs, SIGWINCH); ++ if(sigprocmask(SIG_SETMASK, &sigs, NULL) < 0){ ++ printk("winch_thread : sigprocmask failed, errno = %d\n", ++ errno); ++ exit(1); ++ } ++ ++ if(setsid() < 0){ ++ printk("winch_thread : setsid failed, errno = %d\n", errno); ++ exit(1); ++ } ++ ++ if(ioctl(pty_fd, TIOCSCTTY, 0) < 0){ ++ printk("winch_thread : TIOCSCTTY failed, errno = %d\n", errno); ++ exit(1); ++ } ++ if(tcsetpgrp(pty_fd, os_getpid()) < 0){ ++ printk("winch_thread : tcsetpgrp failed, errno = %d\n", errno); ++ exit(1); ++ } ++ ++ if(read(pipe_fd, &c, sizeof(c)) != sizeof(c)) ++ printk("winch_thread : failed to read synchronization byte, " ++ "errno = %d\n", errno); ++ ++ while(1){ ++ pause(); ++ ++ if(write(pipe_fd, &c, sizeof(c)) != sizeof(c)){ ++ printk("winch_thread : write failed, errno = %d\n", ++ errno); ++ } ++ } ++} ++ ++static int winch_tramp(int fd, void *device_data, int *fd_out) ++{ ++ struct winch_data data; ++ unsigned long stack; ++ int fds[2], pid, n, err; ++ char c; ++ ++ err = os_pipe(fds, 1, 1); ++ if(err){ ++ printk("winch_tramp : os_pipe failed, errno = %d\n", -err); ++ return(err); ++ } ++ ++ data = ((struct winch_data) { .pty_fd = fd, ++ .pipe_fd = fds[1], ++ .close_me = fds[0] } ); ++ pid = run_helper_thread(winch_thread, &data, 0, &stack, 0); ++ if(pid < 0){ ++ printk("fork of winch_thread failed - errno = %d\n", errno); ++ return(pid); ++ } ++ ++ close(fds[1]); ++ *fd_out 
= fds[0]; ++ n = read(fds[0], &c, sizeof(c)); ++ if(n != sizeof(c)){ ++ printk("winch_tramp : failed to read synchronization byte\n"); ++ printk("read returned %d, errno = %d\n", n, errno); ++ printk("fd %d will not support SIGWINCH\n", fd); ++ *fd_out = -1; ++ } ++ return(pid); ++} ++ ++void register_winch(int fd, void *device_data) ++{ ++ int pid, thread, thread_fd; ++ char c = 1; ++ ++ if(!isatty(fd)) return; ++ ++ pid = tcgetpgrp(fd); ++ if(!CHOOSE_MODE_PROC(is_tracer_winch, is_skas_winch, pid, fd, ++ device_data) && (pid == -1)){ ++ thread = winch_tramp(fd, device_data, &thread_fd); ++ if(fd != -1){ ++ register_winch_irq(thread_fd, fd, thread, device_data); ++ ++ if(write(thread_fd, &c, sizeof(c)) != sizeof(c)) ++ printk("register_winch : failed to write " ++ "synchronization byte\n"); ++ } ++ } ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/daemon.h um/arch/um/drivers/daemon.h +--- orig/arch/um/drivers/daemon.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/daemon.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,35 @@ ++/* ++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "net_user.h" ++ ++#define SWITCH_VERSION 3 ++ ++struct daemon_data { ++ char *sock_type; ++ char *ctl_sock; ++ void *ctl_addr; ++ void *data_addr; ++ void *local_addr; ++ int fd; ++ int control; ++ void *dev; ++}; ++ ++extern struct net_user_info daemon_user_info; ++ ++extern int daemon_user_write(int fd, void *buf, int len, ++ struct daemon_data *pri); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/daemon_kern.c um/arch/um/drivers/daemon_kern.c +--- orig/arch/um/drivers/daemon_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/daemon_kern.c Sun Dec 15 21:19:17 2002 +@@ -0,0 +1,113 @@ ++/* ++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and ++ * James Leu (jleu@mindspring.net). ++ * Copyright (C) 2001 by various other people who didn't put their name here. ++ * Licensed under the GPL. ++ */ ++ ++#include "linux/kernel.h" ++#include "linux/init.h" ++#include "linux/netdevice.h" ++#include "linux/etherdevice.h" ++#include "net_kern.h" ++#include "net_user.h" ++#include "daemon.h" ++ ++struct daemon_init { ++ char *sock_type; ++ char *ctl_sock; ++}; ++ ++void daemon_init(struct net_device *dev, void *data) ++{ ++ struct uml_net_private *pri; ++ struct daemon_data *dpri; ++ struct daemon_init *init = data; ++ ++ init_etherdev(dev, 0); ++ pri = dev->priv; ++ dpri = (struct daemon_data *) pri->user; ++ *dpri = ((struct daemon_data) ++ { .sock_type = init->sock_type, ++ .ctl_sock = init->ctl_sock, ++ .ctl_addr = NULL, ++ .data_addr = NULL, ++ .local_addr = NULL, ++ .fd = -1, ++ .control = -1, ++ .dev = dev }); ++ ++ printk("daemon backend (uml_switch version %d) - %s:%s", ++ SWITCH_VERSION, dpri->sock_type, dpri->ctl_sock); ++ printk("\n"); ++} ++ ++static int daemon_read(int fd, struct sk_buff **skb, ++ struct uml_net_private *lp) ++{ ++ *skb = ether_adjust_skb(*skb, ETH_HEADER_OTHER); ++ if(*skb == NULL) return(-ENOMEM); ++ return(net_recvfrom(fd, (*skb)->mac.raw, ++ (*skb)->dev->mtu + ETH_HEADER_OTHER)); ++} ++ ++static int daemon_write(int fd, struct sk_buff **skb, ++ struct 
uml_net_private *lp) ++{ ++ return(daemon_user_write(fd, (*skb)->data, (*skb)->len, ++ (struct daemon_data *) &lp->user)); ++} ++ ++static struct net_kern_info daemon_kern_info = { ++ .init = daemon_init, ++ .protocol = eth_protocol, ++ .read = daemon_read, ++ .write = daemon_write, ++}; ++ ++int daemon_setup(char *str, char **mac_out, void *data) ++{ ++ struct daemon_init *init = data; ++ char *remain; ++ ++ *init = ((struct daemon_init) ++ { .sock_type = "unix", ++ .ctl_sock = "/tmp/uml.ctl" }); ++ ++ remain = split_if_spec(str, mac_out, &init->sock_type, &init->ctl_sock, ++ NULL); ++ if(remain != NULL) ++ printk(KERN_WARNING "daemon_setup : Ignoring data socket " ++ "specification\n"); ++ ++ return(1); ++} ++ ++static struct transport daemon_transport = { ++ .list = LIST_HEAD_INIT(daemon_transport.list), ++ .name = "daemon", ++ .setup = daemon_setup, ++ .user = &daemon_user_info, ++ .kern = &daemon_kern_info, ++ .private_size = sizeof(struct daemon_data), ++ .setup_size = sizeof(struct daemon_init), ++}; ++ ++static int register_daemon(void) ++{ ++ register_transport(&daemon_transport); ++ return(1); ++} ++ ++__initcall(register_daemon); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/daemon_user.c um/arch/um/drivers/daemon_user.c +--- orig/arch/um/drivers/daemon_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/daemon_user.c Fri Jan 17 13:48:59 2003 +@@ -0,0 +1,195 @@ ++/* ++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and ++ * James Leu (jleu@mindspring.net). ++ * Copyright (C) 2001 by various other people who didn't put their name here. 
++ * Licensed under the GPL. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "net_user.h" ++#include "daemon.h" ++#include "kern_util.h" ++#include "user_util.h" ++#include "user.h" ++#include "os.h" ++ ++#define MAX_PACKET (ETH_MAX_PACKET + ETH_HEADER_OTHER) ++ ++enum request_type { REQ_NEW_CONTROL }; ++ ++#define SWITCH_MAGIC 0xfeedface ++ ++struct request_v3 { ++ uint32_t magic; ++ uint32_t version; ++ enum request_type type; ++ struct sockaddr_un sock; ++}; ++ ++static struct sockaddr_un *new_addr(void *name, int len) ++{ ++ struct sockaddr_un *sun; ++ ++ sun = um_kmalloc(sizeof(struct sockaddr_un)); ++ if(sun == NULL){ ++ printk("new_addr: allocation of sockaddr_un failed\n"); ++ return(NULL); ++ } ++ sun->sun_family = AF_UNIX; ++ memcpy(sun->sun_path, name, len); ++ return(sun); ++} ++ ++static int connect_to_switch(struct daemon_data *pri) ++{ ++ struct sockaddr_un *ctl_addr = pri->ctl_addr; ++ struct sockaddr_un *local_addr = pri->local_addr; ++ struct sockaddr_un *sun; ++ struct request_v3 req; ++ int fd, n, err; ++ ++ if((pri->control = socket(AF_UNIX, SOCK_STREAM, 0)) < 0){ ++ printk("daemon_open : control socket failed, errno = %d\n", ++ errno); ++ return(-errno); ++ } ++ ++ if(connect(pri->control, (struct sockaddr *) ctl_addr, ++ sizeof(*ctl_addr)) < 0){ ++ printk("daemon_open : control connect failed, errno = %d\n", ++ errno); ++ err = -errno; ++ goto out; ++ } ++ ++ if((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0){ ++ printk("daemon_open : data socket failed, errno = %d\n", ++ errno); ++ err = -errno; ++ goto out; ++ } ++ if(bind(fd, (struct sockaddr *) local_addr, sizeof(*local_addr)) < 0){ ++ printk("daemon_open : data bind failed, errno = %d\n", ++ errno); ++ err = -errno; ++ goto out_close; ++ } ++ ++ sun = um_kmalloc(sizeof(struct sockaddr_un)); ++ if(sun == NULL){ ++ printk("new_addr: allocation of sockaddr_un failed\n"); ++ err = -ENOMEM; ++ goto out_close; ++ } ++ ++ req.magic = SWITCH_MAGIC; ++ req.version 
= SWITCH_VERSION; ++ req.type = REQ_NEW_CONTROL; ++ req.sock = *local_addr; ++ n = write(pri->control, &req, sizeof(req)); ++ if(n != sizeof(req)){ ++ printk("daemon_open : control setup request returned %d, " ++ "errno = %d\n", n, errno); ++ err = -ENOTCONN; ++ goto out; ++ } ++ ++ n = read(pri->control, sun, sizeof(*sun)); ++ if(n != sizeof(*sun)){ ++ printk("daemon_open : read of data socket returned %d, " ++ "errno = %d\n", n, errno); ++ err = -ENOTCONN; ++ goto out_close; ++ } ++ ++ pri->data_addr = sun; ++ return(fd); ++ ++ out_close: ++ close(fd); ++ out: ++ close(pri->control); ++ return(err); ++} ++ ++static void daemon_user_init(void *data, void *dev) ++{ ++ struct daemon_data *pri = data; ++ struct timeval tv; ++ struct { ++ char zero; ++ int pid; ++ int usecs; ++ } name; ++ ++ if(!strcmp(pri->sock_type, "unix")) ++ pri->ctl_addr = new_addr(pri->ctl_sock, ++ strlen(pri->ctl_sock) + 1); ++ name.zero = 0; ++ name.pid = os_getpid(); ++ gettimeofday(&tv, NULL); ++ name.usecs = tv.tv_usec; ++ pri->local_addr = new_addr(&name, sizeof(name)); ++ pri->dev = dev; ++ pri->fd = connect_to_switch(pri); ++ if(pri->fd < 0){ ++ kfree(pri->local_addr); ++ pri->local_addr = NULL; ++ } ++} ++ ++static int daemon_open(void *data) ++{ ++ struct daemon_data *pri = data; ++ return(pri->fd); ++} ++ ++static void daemon_remove(void *data) ++{ ++ struct daemon_data *pri = data; ++ ++ close(pri->fd); ++ close(pri->control); ++ if(pri->data_addr != NULL) kfree(pri->data_addr); ++ if(pri->ctl_addr != NULL) kfree(pri->ctl_addr); ++ if(pri->local_addr != NULL) kfree(pri->local_addr); ++} ++ ++int daemon_user_write(int fd, void *buf, int len, struct daemon_data *pri) ++{ ++ struct sockaddr_un *data_addr = pri->data_addr; ++ ++ return(net_sendto(fd, buf, len, data_addr, sizeof(*data_addr))); ++} ++ ++static int daemon_set_mtu(int mtu, void *data) ++{ ++ return(mtu); ++} ++ ++struct net_user_info daemon_user_info = { ++ .init = daemon_user_init, ++ .open = daemon_open, ++ .close = NULL, 
++ .remove = daemon_remove, ++ .set_mtu = daemon_set_mtu, ++ .add_address = NULL, ++ .delete_address = NULL, ++ .max_packet = MAX_PACKET - ETH_HEADER_OTHER ++}; ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/fd.c um/arch/um/drivers/fd.c +--- orig/arch/um/drivers/fd.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/fd.c Sun Dec 15 20:57:25 2002 +@@ -0,0 +1,96 @@ ++/* ++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include ++#include ++#include ++#include ++#include "user.h" ++#include "user_util.h" ++#include "chan_user.h" ++ ++struct fd_chan { ++ int fd; ++ int raw; ++ struct termios tt; ++ char str[sizeof("1234567890\0")]; ++}; ++ ++void *fd_init(char *str, int device, struct chan_opts *opts) ++{ ++ struct fd_chan *data; ++ char *end; ++ int n; ++ ++ if(*str != ':'){ ++ printk("fd_init : channel type 'fd' must specify a file " ++ "descriptor\n"); ++ return(NULL); ++ } ++ str++; ++ n = strtoul(str, &end, 0); ++ if((*end != '\0') || (end == str)){ ++ printk("fd_init : couldn't parse file descriptor '%s'\n", str); ++ return(NULL); ++ } ++ if((data = um_kmalloc(sizeof(*data))) == NULL) return(NULL); ++ *data = ((struct fd_chan) { .fd = n, ++ .raw = opts->raw }); ++ return(data); ++} ++ ++int fd_open(int input, int output, int primary, void *d, char **dev_out) ++{ ++ struct fd_chan *data = d; ++ ++ if(data->raw && isatty(data->fd)){ ++ tcgetattr(data->fd, &data->tt); ++ raw(data->fd, 0); ++ } ++ sprintf(data->str, "%d", data->fd); ++ *dev_out = data->str; ++ return(data->fd); ++} ++ ++void fd_close(int fd, void *d) ++{ ++ struct fd_chan 
*data = d; ++ ++ if(data->raw && isatty(fd)){ ++ tcsetattr(fd, TCSAFLUSH, &data->tt); ++ data->raw = 0; ++ } ++} ++ ++int fd_console_write(int fd, const char *buf, int n, void *d) ++{ ++ struct fd_chan *data = d; ++ ++ return(generic_console_write(fd, buf, n, &data->tt)); ++} ++ ++struct chan_ops fd_ops = { ++ .type = "fd", ++ .init = fd_init, ++ .open = fd_open, ++ .close = fd_close, ++ .read = generic_read, ++ .write = generic_write, ++ .console_write = fd_console_write, ++ .window_size = generic_window_size, ++ .free = generic_free, ++ .winch = 1, ++}; ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/harddog_kern.c um/arch/um/drivers/harddog_kern.c +--- orig/arch/um/drivers/harddog_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/harddog_kern.c Sun Dec 15 20:57:42 2002 +@@ -0,0 +1,194 @@ ++/* UML hardware watchdog, shamelessly stolen from: ++ * ++ * SoftDog 0.05: A Software Watchdog Device ++ * ++ * (c) Copyright 1996 Alan Cox , All Rights Reserved. ++ * http://www.redhat.com ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ * ++ * Neither Alan Cox nor CymruNet Ltd. admit liability nor provide ++ * warranty for any of this software. This material is provided ++ * "AS-IS" and at no charge. ++ * ++ * (c) Copyright 1995 Alan Cox ++ * ++ * Software only watchdog driver. Unlike its big brother the WDT501P ++ * driver this won't always recover a failed machine. 
++ *
++ *  03/96: Angelo Haritsis :
++ *	Modularised.
++ *	Added soft_margin; use upon insmod to change the timer delay.
++ *	NB: uses same minor as wdt (WATCHDOG_MINOR); we could use separate
++ *	    minors.
++ *
++ *  19980911 Alan Cox
++ *	Made SMP safe for 2.3.x
++ *
++ *  20011127 Joel Becker (jlbec@evilplan.org)
++ *	Added soft_noboot; Allows testing the softdog trigger without
++ *	requiring a recompile.
++ *	Added WDIOC_GETTIMEOUT and WDIOC_SETTIMEOUT.
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include "helper.h"
++#include "mconsole.h"
++
++MODULE_LICENSE("GPL");
++
++/* Locked by the BKL in harddog_open and harddog_release */
++static int timer_alive;
++static int harddog_in_fd = -1;
++static int harddog_out_fd = -1;
++
++/*
++ *	Allow only one person to hold it open
++ */
++
++extern int start_watchdog(int *in_fd_ret, int *out_fd_ret, char *sock);
++/* open(): start the host-side watchdog; -EBUSY while it is already running */
++static int harddog_open(struct inode *inode, struct file *file)
++{
++	int err = -EBUSY;
++	char *sock = NULL;
++
++	lock_kernel();
++	if(timer_alive)
++		goto out;
++#ifdef CONFIG_MCONSOLE
++	sock = mconsole_notify_socket();
++#endif
++	err = start_watchdog(&harddog_in_fd, &harddog_out_fd, sock);
++	if(err)
++		goto out;
++#ifdef CONFIG_HARDDOG_NOWAYOUT
++	MOD_INC_USE_COUNT;	/* pin the module only once the dog really runs */
++#endif
++	timer_alive = 1;
++ out:
++	unlock_kernel();	/* every exit path must drop the BKL taken above */
++	return err;
++}
++
++extern void stop_watchdog(int in_fd, int out_fd);
++/* release(): stop the host-side watchdog and forget its pipe descriptors */
++static int harddog_release(struct inode *inode, struct file *file)
++{
++	/*
++	 *	Shut off the timer.
++	 */
++	lock_kernel();
++
++	stop_watchdog(harddog_in_fd, harddog_out_fd);
++	harddog_in_fd = -1;
++	harddog_out_fd = -1;
++
++	timer_alive=0;
++	unlock_kernel();
++	return 0;
++}
++
++extern int ping_watchdog(int fd);
++/* write(): any non-empty write pings the watchdog; the data itself is unused */
++static ssize_t harddog_write(struct file *file, const char *data, size_t len,
++			     loff_t *ppos)
++{
++	/* Can't seek (pwrite) on this device */
++	if (ppos != &file->f_pos)
++		return -ESPIPE;
++
++	/*
++	 *	Refresh the timer.
++	 */
++	if(len)
++		return(ping_watchdog(harddog_out_fd));
++	return 0;
++}
++/* ioctl(): GETSUPPORT, GETSTATUS/GETBOOTSTATUS (always 0) and KEEPALIVE */
++static int harddog_ioctl(struct inode *inode, struct file *file,
++			 unsigned int cmd, unsigned long arg)
++{
++	static struct watchdog_info ident = {
++		WDIOF_SETTIMEOUT, /* NOTE(review): no WDIOC_SETTIMEOUT case below */
++		0,
++		"UML Hardware Watchdog"
++	};
++	switch (cmd) {
++		default:
++			return -ENOTTY;
++		case WDIOC_GETSUPPORT:
++			if(copy_to_user((struct harddog_info *)arg, &ident,
++					sizeof(ident)))
++				return -EFAULT;
++			return 0;
++		case WDIOC_GETSTATUS:
++		case WDIOC_GETBOOTSTATUS:
++			return put_user(0,(int *)arg);
++		case WDIOC_KEEPALIVE:
++			return(ping_watchdog(harddog_out_fd));
++	}
++}
++
++static struct file_operations harddog_fops = {
++	.owner		= THIS_MODULE,
++	.write		= harddog_write,
++	.ioctl		= harddog_ioctl,
++	.open		= harddog_open,
++	.release	= harddog_release,
++};
++
++static struct miscdevice harddog_miscdev = {
++	.minor		= WATCHDOG_MINOR,
++	.name		= "watchdog",
++	.fops		= &harddog_fops,
++};
++
++static char banner[] __initdata = KERN_INFO "UML Watchdog Timer\n";
++/* Module init: register the misc device, then print the banner */
++static int __init harddog_init(void)
++{
++	int ret;
++
++	ret = misc_register(&harddog_miscdev);
++
++	if (ret)
++		return ret;
++
++	printk(banner);
++
++	return(0);
++}
++/* Module exit: unregister the misc device */
++static void __exit harddog_exit(void)
++{
++	misc_deregister(&harddog_miscdev);
++}
++
++module_init(harddog_init);
++module_exit(harddog_exit);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/harddog_user.c um/arch/um/drivers/harddog_user.c
+--- orig/arch/um/drivers/harddog_user.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/harddog_user.c Wed Dec 4 16:38:05 2002
+@@ -0,0 +1,137 @@
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include
++#include
++#include
++#include "user_util.h"
++#include "user.h"
++#include "helper.h"
++#include "mconsole.h"
++#include "os.h"
++#include "choose-mode.h"
++#include "mode.h"
++/* Descriptors handed to pre_exec() through run_helper() */
++struct dog_data {
++	int stdin;		/* becomes fd 0 of the helper */
++	int stdout;		/* becomes fd 1 and fd 2 of the helper */
++	int close_me[2];	/* closed by pre_exec() after the dup2() calls */
++};
++/* run_helper() callback: install the pipe ends as stdin/stdout/stderr */
++static void pre_exec(void *d)
++{
++	struct dog_data *data = d;
++
++	dup2(data->stdin, 0);
++	dup2(data->stdout, 1);
++	dup2(data->stdout, 2);
++	close(data->stdin);
++	close(data->stdout);
++	close(data->close_me[0]);
++	close(data->close_me[1]);
++}
++/* Start uml_watchdog on two pipes; 0 and both fds on success, else negative */
++int start_watchdog(int *in_fd_ret, int *out_fd_ret, char *sock)
++{
++	struct dog_data data;
++	int in_fds[2], out_fds[2], pid, n, err;
++	char pid_buf[sizeof("-2147483648")], c;	/* room for any int */
++	char *pid_args[] = { "/usr/bin/uml_watchdog", "-pid", pid_buf, NULL };
++	char *mconsole_args[] = { "/usr/bin/uml_watchdog", "-mconsole", NULL,
++				  NULL };
++	char **args = NULL;
++
++	err = os_pipe(in_fds, 1, 0);
++	if(err){
++		printk("harddog_open - os_pipe failed, errno = %d\n", -err);
++		return(err);
++	}
++
++	err = os_pipe(out_fds, 1, 0);
++	if(err){
++		printk("harddog_open - os_pipe failed, errno = %d\n", -err);
++		close(in_fds[0]);	/* don't leak the first pipe */
++		close(in_fds[1]);
++		return(err);
++	}
++	/* the helper reads out_fds[0] and writes in_fds[1]; we keep the rest */
++	data.stdin = out_fds[0];
++	data.stdout = in_fds[1];
++	data.close_me[0] = out_fds[1];
++	data.close_me[1] = in_fds[0];
++
++	if(sock != NULL){
++		mconsole_args[2] = sock;
++		args = mconsole_args;
++	}
++	else {
++		/* XXX The os_getpid() is not SMP correct */
++		sprintf(pid_buf, "%d", CHOOSE_MODE(tracing_pid, os_getpid()));
++		args = pid_args;
++	}
++
++	pid = run_helper(pre_exec, &data, args, NULL);
++	close(out_fds[0]);	/* the parent has no use for the helper's ends */
++	close(in_fds[1]);
++	if(pid < 0){
++		err = pid;	/* keep it negative, like every other failure path */
++		printk("harddog_open - run_helper failed, errno = %d\n", -err);
++		goto out;
++	}
++	/* block until the helper has written its first byte */
++	n = read(in_fds[0], &c, sizeof(c));
++	if(n == 0){
++		printk("harddog_open - EOF on watchdog pipe\n");
++		helper_wait(pid);
++		err = -EIO;
++		goto out;
++	}
++	else if(n < 0){
++		err = -errno;	/* save it before printk/helper_wait clobber it */
++		printk("harddog_open - read of watchdog pipe failed, "
++		       "errno = %d\n", -err);
++		helper_wait(pid);
++		goto out;
++	}
++	*in_fd_ret = in_fds[0];
++	*out_fd_ret = out_fds[1];
++	return(0);
++ out:
++	close(out_fds[1]);
++	close(in_fds[0]);
++	return(err);
++}
++/* Close both of the parent's pipe ends to the helper */
++void stop_watchdog(int in_fd, int out_fd)
++{
++	close(in_fd);
++	close(out_fd);
++}
++/* Ping the helper with a single byte; returns 1, or -errno on failure */
++int ping_watchdog(int fd)
++{
++	int n, err;
++	char c = '\n';
++
++	n = write(fd, &c, sizeof(c));
++	if(n != (int) sizeof(c)){	/* compare as int so that -1 is caught */
++		err = (n < 0) ? errno : EIO;	/* errno is stale on a short write */
++		printk("ping_watchdog - write failed, errno = %d\n", err);
++		return(-err);
++	}
++	return 1;
++
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/hostaudio_kern.c um/arch/um/drivers/hostaudio_kern.c +--- orig/arch/um/drivers/hostaudio_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/hostaudio_kern.c Fri Mar 28 21:57:16 2003 +@@ -0,0 +1,330 @@ ++/* ++ * Copyright (C) 2002 Steve Schmidtke ++ * Licensed under the GPL ++ */ ++ ++#include "linux/config.h" ++#include "linux/module.h" ++#include "linux/version.h" ++#include "linux/init.h" ++#include "linux/slab.h" ++#include "linux/fs.h" ++#include "linux/sound.h" ++#include "linux/soundcard.h" ++#include "asm/uaccess.h" ++#include "kern_util.h" ++#include "init.h" ++#include "hostaudio.h" ++ ++/* Only changed from linux_main at boot time */ ++char *dsp = HOSTAUDIO_DEV_DSP; ++char *mixer = HOSTAUDIO_DEV_MIXER; ++ ++#ifndef MODULE ++static int set_dsp(char *name, int *add) ++{ ++ dsp = name; ++ return(0); ++} ++ ++__uml_setup("dsp=", set_dsp, ++"dsp=\n" ++" This is used to specify the host dsp device to the hostaudio driver.\n" ++" The default is \"" HOSTAUDIO_DEV_DSP "\".\n\n" ++); ++ ++static int set_mixer(char *name, int *add) ++{ ++ mixer = name; ++ return(0); ++} ++ ++__uml_setup("mixer=", set_mixer, ++"mixer=\n" ++" This is used to specify the host mixer device to the hostaudio driver.\n" ++" The default is \"" HOSTAUDIO_DEV_MIXER "\".\n\n" ++); ++#endif ++ ++/* /dev/dsp file operations */ ++ ++static ssize_t hostaudio_read(struct file *file, char *buffer, size_t count, ++ loff_t *ppos) ++{ ++ struct hostaudio_state *state = file->private_data; ++ void *kbuf; ++ int err; ++ ++#ifdef DEBUG ++ printk("hostaudio: read called, count = %d\n", count); ++#endif ++ ++ kbuf = kmalloc(count, GFP_KERNEL); ++ if(kbuf == NULL) ++ return(-ENOMEM); ++ ++ err = hostaudio_read_user(state, kbuf, count, ppos); ++ if(err < 0) ++ goto out; ++ ++ if(copy_to_user(buffer, kbuf, 
err)) ++ err = -EFAULT; ++ ++ out: ++ kfree(kbuf); ++ return(err); ++} ++ ++static ssize_t hostaudio_write(struct file *file, const char *buffer, ++ size_t count, loff_t *ppos) ++{ ++ struct hostaudio_state *state = file->private_data; ++ void *kbuf; ++ int err; ++ ++#ifdef DEBUG ++ printk("hostaudio: write called, count = %d\n", count); ++#endif ++ ++ kbuf = kmalloc(count, GFP_KERNEL); ++ if(kbuf == NULL) ++ return(-ENOMEM); ++ ++ err = -EFAULT; ++ if(copy_from_user(kbuf, buffer, count)) ++ goto out; ++ ++ err = hostaudio_write_user(state, kbuf, count, ppos); ++ if(err < 0) ++ goto out; ++ ++ out: ++ kfree(kbuf); ++ return(err); ++} ++ ++static unsigned int hostaudio_poll(struct file *file, ++ struct poll_table_struct *wait) ++{ ++ unsigned int mask = 0; ++ ++#ifdef DEBUG ++ printk("hostaudio: poll called (unimplemented)\n"); ++#endif ++ ++ return(mask); ++} ++ ++static int hostaudio_ioctl(struct inode *inode, struct file *file, ++ unsigned int cmd, unsigned long arg) ++{ ++ struct hostaudio_state *state = file->private_data; ++ unsigned long data = 0; ++ int err; ++ ++#ifdef DEBUG ++ printk("hostaudio: ioctl called, cmd = %u\n", cmd); ++#endif ++ switch(cmd){ ++ case SNDCTL_DSP_SPEED: ++ case SNDCTL_DSP_STEREO: ++ case SNDCTL_DSP_GETBLKSIZE: ++ case SNDCTL_DSP_CHANNELS: ++ case SNDCTL_DSP_SUBDIVIDE: ++ case SNDCTL_DSP_SETFRAGMENT: ++ if(get_user(data, (int *) arg)) ++ return(-EFAULT); ++ break; ++ default: ++ break; ++ } ++ ++ err = hostaudio_ioctl_user(state, cmd, (unsigned long) &data); ++ ++ switch(cmd){ ++ case SNDCTL_DSP_SPEED: ++ case SNDCTL_DSP_STEREO: ++ case SNDCTL_DSP_GETBLKSIZE: ++ case SNDCTL_DSP_CHANNELS: ++ case SNDCTL_DSP_SUBDIVIDE: ++ case SNDCTL_DSP_SETFRAGMENT: ++ if(put_user(data, (int *) arg)) ++ return(-EFAULT); ++ break; ++ default: ++ break; ++ } ++ ++ return(err); ++} ++ ++static int hostaudio_open(struct inode *inode, struct file *file) ++{ ++ struct hostaudio_state *state; ++ int r = 0, w = 0; ++ int ret; ++ ++#ifdef DEBUG ++ 
printk("hostaudio: open called (host: %s)\n", dsp); ++#endif ++ ++ state = kmalloc(sizeof(struct hostaudio_state), GFP_KERNEL); ++ if(state == NULL) return(-ENOMEM); ++ ++ if(file->f_mode & FMODE_READ) r = 1; ++ if(file->f_mode & FMODE_WRITE) w = 1; ++ ++ ret = hostaudio_open_user(state, r, w, dsp); ++ if(ret < 0){ ++ kfree(state); ++ return(ret); ++ } ++ ++ file->private_data = state; ++ return(0); ++} ++ ++static int hostaudio_release(struct inode *inode, struct file *file) ++{ ++ struct hostaudio_state *state = file->private_data; ++ int ret; ++ ++#ifdef DEBUG ++ printk("hostaudio: release called\n"); ++#endif ++ ++ ret = hostaudio_release_user(state); ++ kfree(state); ++ ++ return(ret); ++} ++ ++/* /dev/mixer file operations */ ++ ++static int hostmixer_ioctl_mixdev(struct inode *inode, struct file *file, ++ unsigned int cmd, unsigned long arg) ++{ ++ struct hostmixer_state *state = file->private_data; ++ ++#ifdef DEBUG ++ printk("hostmixer: ioctl called\n"); ++#endif ++ ++ return(hostmixer_ioctl_mixdev_user(state, cmd, arg)); ++} ++ ++static int hostmixer_open_mixdev(struct inode *inode, struct file *file) ++{ ++ struct hostmixer_state *state; ++ int r = 0, w = 0; ++ int ret; ++ ++#ifdef DEBUG ++ printk("hostmixer: open called (host: %s)\n", mixer); ++#endif ++ ++ state = kmalloc(sizeof(struct hostmixer_state), GFP_KERNEL); ++ if(state == NULL) return(-ENOMEM); ++ ++ if(file->f_mode & FMODE_READ) r = 1; ++ if(file->f_mode & FMODE_WRITE) w = 1; ++ ++ ret = hostmixer_open_mixdev_user(state, r, w, mixer); ++ ++ if(ret < 0){ ++ kfree(state); ++ return(ret); ++ } ++ ++ file->private_data = state; ++ return(0); ++} ++ ++static int hostmixer_release(struct inode *inode, struct file *file) ++{ ++ struct hostmixer_state *state = file->private_data; ++ int ret; ++ ++#ifdef DEBUG ++ printk("hostmixer: release called\n"); ++#endif ++ ++ ret = hostmixer_release_mixdev_user(state); ++ kfree(state); ++ ++ return(ret); ++} ++ ++ ++/* kernel module operations */ ++ ++static 
struct file_operations hostaudio_fops = { ++ .owner = THIS_MODULE, ++ .llseek = no_llseek, ++ .read = hostaudio_read, ++ .write = hostaudio_write, ++ .poll = hostaudio_poll, ++ .ioctl = hostaudio_ioctl, ++ .mmap = NULL, ++ .open = hostaudio_open, ++ .release = hostaudio_release, ++}; ++ ++static struct file_operations hostmixer_fops = { ++ .owner = THIS_MODULE, ++ .llseek = no_llseek, ++ .ioctl = hostmixer_ioctl_mixdev, ++ .open = hostmixer_open_mixdev, ++ .release = hostmixer_release, ++}; ++ ++struct { ++ int dev_audio; ++ int dev_mixer; ++} module_data; ++ ++MODULE_AUTHOR("Steve Schmidtke"); ++MODULE_DESCRIPTION("UML Audio Relay"); ++MODULE_LICENSE("GPL"); ++ ++static int __init hostaudio_init_module(void) ++{ ++ printk(KERN_INFO "UML Audio Relay (host dsp = %s, host mixer = %s)\n", ++ dsp, mixer); ++ ++ module_data.dev_audio = register_sound_dsp(&hostaudio_fops, -1); ++ if(module_data.dev_audio < 0){ ++ printk(KERN_ERR "hostaudio: couldn't register DSP device!\n"); ++ return -ENODEV; ++ } ++ ++ module_data.dev_mixer = register_sound_mixer(&hostmixer_fops, -1); ++ if(module_data.dev_mixer < 0){ ++ printk(KERN_ERR "hostmixer: couldn't register mixer " ++ "device!\n"); ++ unregister_sound_dsp(module_data.dev_audio); ++ return -ENODEV; ++ } ++ ++ return 0; ++} ++ ++static void __exit hostaudio_cleanup_module (void) ++{ ++ unregister_sound_mixer(module_data.dev_mixer); ++ unregister_sound_dsp(module_data.dev_audio); ++} ++ ++module_init(hostaudio_init_module); ++module_exit(hostaudio_cleanup_module); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/hostaudio_user.c um/arch/um/drivers/hostaudio_user.c +--- orig/arch/um/drivers/hostaudio_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/hostaudio_user.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,149 @@ ++/* ++ * Copyright (C) 2002 Steve Schmidtke ++ * Licensed under the GPL ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "hostaudio.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "user.h" ++#include "os.h" ++ ++/* /dev/dsp file operations */ ++ ++ssize_t hostaudio_read_user(struct hostaudio_state *state, char *buffer, ++ size_t count, loff_t *ppos) ++{ ++ ssize_t ret; ++ ++#ifdef DEBUG ++ printk("hostaudio: read_user called, count = %d\n", count); ++#endif ++ ++ ret = read(state->fd, buffer, count); ++ ++ if(ret < 0) return(-errno); ++ return(ret); ++} ++ ++ssize_t hostaudio_write_user(struct hostaudio_state *state, const char *buffer, ++ size_t count, loff_t *ppos) ++{ ++ ssize_t ret; ++ ++#ifdef DEBUG ++ printk("hostaudio: write_user called, count = %d\n", count); ++#endif ++ ++ ret = write(state->fd, buffer, count); ++ ++ if(ret < 0) return(-errno); ++ return(ret); ++} ++ ++int hostaudio_ioctl_user(struct hostaudio_state *state, unsigned int cmd, ++ unsigned long arg) ++{ ++ int ret; ++#ifdef DEBUG ++ printk("hostaudio: ioctl_user called, cmd = %u\n", cmd); ++#endif ++ ++ ret = ioctl(state->fd, cmd, arg); ++ ++ if(ret < 0) return(-errno); ++ return(ret); ++} ++ ++int hostaudio_open_user(struct hostaudio_state *state, int r, int w, char *dsp) ++{ ++#ifdef DEBUG ++ printk("hostaudio: open_user called\n"); ++#endif ++ ++ state->fd = os_open_file(dsp, of_set_rw(OPENFLAGS(), r, w), 0); ++ ++ if(state->fd >= 0) return(0); ++ ++ printk("hostaudio_open_user failed to open '%s', errno = %d\n", ++ dsp, errno); ++ ++ 
return(-errno); ++} ++ ++int hostaudio_release_user(struct hostaudio_state *state) ++{ ++#ifdef DEBUG ++ printk("hostaudio: release called\n"); ++#endif ++ if(state->fd >= 0){ ++ close(state->fd); ++ state->fd=-1; ++ } ++ ++ return(0); ++} ++ ++/* /dev/mixer file operations */ ++ ++int hostmixer_ioctl_mixdev_user(struct hostmixer_state *state, ++ unsigned int cmd, unsigned long arg) ++{ ++ int ret; ++#ifdef DEBUG ++ printk("hostmixer: ioctl_user called cmd = %u\n",cmd); ++#endif ++ ++ ret = ioctl(state->fd, cmd, arg); ++ if(ret < 0) ++ return(-errno); ++ return(ret); ++} ++ ++int hostmixer_open_mixdev_user(struct hostmixer_state *state, int r, int w, ++ char *mixer) ++{ ++#ifdef DEBUG ++ printk("hostmixer: open_user called\n"); ++#endif ++ ++ state->fd = os_open_file(mixer, of_set_rw(OPENFLAGS(), r, w), 0); ++ ++ if(state->fd >= 0) return(0); ++ ++ printk("hostaudio_open_mixdev_user failed to open '%s', errno = %d\n", ++ mixer, errno); ++ ++ return(-errno); ++} ++ ++int hostmixer_release_mixdev_user(struct hostmixer_state *state) ++{ ++#ifdef DEBUG ++ printk("hostmixer: release_user called\n"); ++#endif ++ ++ if(state->fd >= 0){ ++ close(state->fd); ++ state->fd = -1; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/line.c um/arch/um/drivers/line.c +--- orig/arch/um/drivers/line.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/line.c Wed Mar 26 15:09:44 2003 +@@ -0,0 +1,589 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/sched.h" ++#include "linux/slab.h" ++#include "linux/list.h" ++#include "linux/devfs_fs_kernel.h" ++#include "asm/irq.h" ++#include "asm/uaccess.h" ++#include "chan_kern.h" ++#include "irq_user.h" ++#include "line.h" ++#include "kern.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "os.h" ++ ++#define LINE_BUFSIZE 4096 ++ ++void line_interrupt(int irq, void *data, struct pt_regs *unused) ++{ ++ struct line *dev = data; ++ ++ if(dev->count > 0) ++ chan_interrupt(&dev->chan_list, &dev->task, dev->tty, irq, ++ dev); ++} ++ ++void line_timer_cb(void *arg) ++{ ++ struct line *dev = arg; ++ ++ line_interrupt(dev->driver->read_irq, dev, NULL); ++} ++ ++static void buffer_data(struct line *line, const char *buf, int len) ++{ ++ int end; ++ ++ if(line->buffer == NULL){ ++ line->buffer = kmalloc(LINE_BUFSIZE, GFP_ATOMIC); ++ if(line->buffer == NULL){ ++ printk("buffer_data - atomic allocation failed\n"); ++ return; ++ } ++ line->head = line->buffer; ++ line->tail = line->buffer; ++ } ++ end = line->buffer + LINE_BUFSIZE - line->tail; ++ if(len < end){ ++ memcpy(line->tail, buf, len); ++ line->tail += len; ++ } ++ else { ++ memcpy(line->tail, buf, end); ++ buf += end; ++ len -= end; ++ memcpy(line->buffer, buf, len); ++ line->tail = line->buffer + len; ++ } ++} ++ ++static int flush_buffer(struct line *line) ++{ ++ int n, count; ++ ++ if((line->buffer == NULL) || (line->head == line->tail)) return(1); ++ ++ if(line->tail < line->head){ ++ count = line->buffer + LINE_BUFSIZE - 
line->head; ++ n = write_chan(&line->chan_list, line->head, count, ++ line->driver->write_irq); ++ if(n < 0) return(n); ++ if(n == count) line->head = line->buffer; ++ else { ++ line->head += n; ++ return(0); ++ } ++ } ++ ++ count = line->tail - line->head; ++ n = write_chan(&line->chan_list, line->head, count, ++ line->driver->write_irq); ++ if(n < 0) return(n); ++ ++ line->head += n; ++ return(line->head == line->tail); ++} ++ ++int line_write(struct line *lines, struct tty_struct *tty, int from_user, ++ const char *buf, int len) ++{ ++ struct line *line; ++ char *new; ++ unsigned long flags; ++ int n, err, i; ++ ++ if(tty->stopped) return 0; ++ ++ if(from_user){ ++ new = kmalloc(len, GFP_KERNEL); ++ if(new == NULL) ++ return(0); ++ n = copy_from_user(new, buf, len); ++ if(n == len) ++ return(-EFAULT); ++ buf = new; ++ } ++ ++ i = minor(tty->device) - tty->driver.minor_start; ++ line = &lines[i]; ++ ++ down(&line->sem); ++ if(line->head != line->tail){ ++ local_irq_save(flags); ++ buffer_data(line, buf, len); ++ err = flush_buffer(line); ++ local_irq_restore(flags); ++ if(err <= 0) ++ goto out; ++ } ++ else { ++ n = write_chan(&line->chan_list, buf, len, ++ line->driver->write_irq); ++ if(n < 0){ ++ len = n; ++ goto out; ++ } ++ if(n < len) ++ buffer_data(line, buf + n, len - n); ++ } ++ out: ++ up(&line->sem); ++ ++ if(from_user) ++ kfree(buf); ++ return(len); ++} ++ ++void line_write_interrupt(int irq, void *data, struct pt_regs *unused) ++{ ++ struct line *dev = data; ++ struct tty_struct *tty = dev->tty; ++ int err; ++ ++ err = flush_buffer(dev); ++ if(err == 0) return; ++ else if(err < 0){ ++ dev->head = dev->buffer; ++ dev->tail = dev->buffer; ++ } ++ ++ if(tty == NULL) return; ++ ++ if(test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) && ++ (tty->ldisc.write_wakeup != NULL)) ++ (tty->ldisc.write_wakeup)(tty); ++ ++ /* BLOCKING mode ++ * In blocking mode, everything sleeps on tty->write_wait. 
++ * Sleeping in the console driver would break non-blocking ++ * writes. ++ */ ++ ++ if (waitqueue_active(&tty->write_wait)) ++ wake_up_interruptible(&tty->write_wait); ++ ++} ++ ++int line_write_room(struct tty_struct *tty) ++{ ++ struct line *dev = tty->driver_data; ++ int n; ++ ++ if(dev->buffer == NULL) return(LINE_BUFSIZE - 1); ++ ++ n = dev->head - dev->tail; ++ if(n <= 0) n = LINE_BUFSIZE + n; ++ return(n - 1); ++} ++ ++int line_setup_irq(int fd, int input, int output, void *data) ++{ ++ struct line *line = data; ++ struct line_driver *driver = line->driver; ++ int err = 0, flags = SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM; ++ ++ if(input) err = um_request_irq(driver->read_irq, fd, IRQ_READ, ++ line_interrupt, flags, ++ driver->read_irq_name, line); ++ if(err) return(err); ++ if(output) err = um_request_irq(driver->write_irq, fd, IRQ_WRITE, ++ line_write_interrupt, flags, ++ driver->write_irq_name, line); ++ line->have_irq = 1; ++ return(err); ++} ++ ++void line_disable(struct line *line, int current_irq) ++{ ++ if(!line->have_irq) return; ++ ++ if(line->driver->read_irq == current_irq) ++ free_irq_later(line->driver->read_irq, line); ++ else ++ free_irq(line->driver->read_irq, line); ++ ++ if(line->driver->write_irq == current_irq) ++ free_irq_later(line->driver->write_irq, line); ++ else ++ free_irq(line->driver->write_irq, line); ++ ++ line->have_irq = 0; ++} ++ ++int line_open(struct line *lines, struct tty_struct *tty, ++ struct chan_opts *opts) ++{ ++ struct line *line; ++ int n, err = 0; ++ ++ if(tty == NULL) n = 0; ++ else n = minor(tty->device) - tty->driver.minor_start; ++ line = &lines[n]; ++ ++ down(&line->sem); ++ if(line->count == 0){ ++ if(!line->valid){ ++ err = -ENODEV; ++ goto out; ++ } ++ if(list_empty(&line->chan_list)){ ++ err = parse_chan_pair(line->init_str, &line->chan_list, ++ line->init_pri, n, opts); ++ if(err) goto out; ++ err = open_chan(&line->chan_list); ++ if(err) goto out; ++ } ++ enable_chan(&line->chan_list, line); ++ 
INIT_TQUEUE(&line->task, line_timer_cb, line); ++ } ++ ++ if(!line->sigio){ ++ chan_enable_winch(&line->chan_list, line); ++ line->sigio = 1; ++ } ++ ++ /* This is outside the if because the initial console is opened ++ * with tty == NULL ++ */ ++ line->tty = tty; ++ ++ if(tty != NULL){ ++ tty->driver_data = line; ++ chan_window_size(&line->chan_list, &tty->winsize.ws_row, ++ &tty->winsize.ws_col); ++ } ++ ++ line->count++; ++ out: ++ up(&line->sem); ++ return(err); ++} ++ ++void line_close(struct line *lines, struct tty_struct *tty) ++{ ++ struct line *line; ++ int n; ++ ++ if(tty == NULL) n = 0; ++ else n = minor(tty->device) - tty->driver.minor_start; ++ line = &lines[n]; ++ ++ down(&line->sem); ++ line->count--; ++ ++ /* I don't like this, but I can't think of anything better. What's ++ * going on is that the tty is in the process of being closed for ++ * the last time. Its count hasn't been dropped yet, so it's still ++ * at 1. This may happen when line->count != 0 because of the initial ++ * console open (without a tty) bumping it up to 1. ++ */ ++ if((line->tty != NULL) && (line->tty->count == 1)) ++ line->tty = NULL; ++ if(line->count == 0) ++ line_disable(line, -1); ++ up(&line->sem); ++} ++ ++void close_lines(struct line *lines, int nlines) ++{ ++ int i; ++ ++ for(i = 0; i < nlines; i++) ++ close_chan(&lines[i].chan_list); ++} ++ ++int line_setup(struct line *lines, int num, char *init, int all_allowed) ++{ ++ int i, n; ++ char *end; ++ ++ if(*init == '=') n = -1; ++ else { ++ n = simple_strtoul(init, &end, 0); ++ if(*end != '='){ ++ printk(KERN_ERR "line_setup failed to parse \"%s\"\n", ++ init); ++ return(1); ++ } ++ init = end; ++ } ++ init++; ++ if((n >= 0) && (n >= num)){ ++ printk("line_setup - %d out of range ((0 ... 
%d) allowed)\n", ++ n, num); ++ return(1); ++ } ++ else if(n >= 0){ ++ if(lines[n].count > 0){ ++ printk("line_setup - device %d is open\n", n); ++ return(1); ++ } ++ if(lines[n].init_pri <= INIT_ONE){ ++ lines[n].init_pri = INIT_ONE; ++ if(!strcmp(init, "none")) lines[n].valid = 0; ++ else { ++ lines[n].init_str = init; ++ lines[n].valid = 1; ++ } ++ } ++ } ++ else if(!all_allowed){ ++ printk("line_setup - can't configure all devices from " ++ "mconsole\n"); ++ return(1); ++ } ++ else { ++ for(i = 0; i < num; i++){ ++ if(lines[i].init_pri <= INIT_ALL){ ++ lines[i].init_pri = INIT_ALL; ++ if(!strcmp(init, "none")) lines[i].valid = 0; ++ else { ++ lines[i].init_str = init; ++ lines[i].valid = 1; ++ } ++ } ++ } ++ } ++ return(0); ++} ++ ++int line_config(struct line *lines, int num, char *str) ++{ ++ char *new = uml_strdup(str); ++ ++ if(new == NULL){ ++ printk("line_config - uml_strdup failed\n"); ++ return(-ENOMEM); ++ } ++ return(line_setup(lines, num, new, 0)); ++} ++ ++int line_get_config(char *name, struct line *lines, int num, char *str, ++ int size, char **error_out) ++{ ++ struct line *line; ++ char *end; ++ int dev, n = 0; ++ ++ dev = simple_strtoul(name, &end, 0); ++ if((*end != '\0') || (end == name)){ ++ *error_out = "line_get_config failed to parse device number"; ++ return(0); ++ } ++ ++ if((dev < 0) || (dev >= num)){ ++ *error_out = "device number of of range"; ++ return(0); ++ } ++ ++ line = &lines[dev]; ++ ++ down(&line->sem); ++ if(!line->valid) ++ CONFIG_CHUNK(str, size, n, "none", 1); ++ else if(line->count == 0) ++ CONFIG_CHUNK(str, size, n, line->init_str, 1); ++ else n = chan_config_string(&line->chan_list, str, size, error_out); ++ up(&line->sem); ++ ++ return(n); ++} ++ ++int line_remove(struct line *lines, int num, char *str) ++{ ++ char config[sizeof("conxxxx=none\0")]; ++ ++ sprintf(config, "%s=none", str); ++ return(line_setup(lines, num, config, 0)); ++} ++ ++void line_register_devfs(struct lines *set, struct line_driver *line_driver, 
++ struct tty_driver *driver, struct line *lines, ++ int nlines) ++{ ++ int err, i, n; ++ char *from, *to; ++ ++ driver->driver_name = line_driver->name; ++ driver->name = line_driver->devfs_name; ++ driver->major = line_driver->major; ++ driver->minor_start = line_driver->minor_start; ++ driver->type = line_driver->type; ++ driver->subtype = line_driver->subtype; ++ driver->magic = TTY_DRIVER_MAGIC; ++ driver->flags = TTY_DRIVER_REAL_RAW; ++ ++ n = set->num; ++ driver->num = n; ++ driver->table = kmalloc(n * sizeof(driver->table[0]), GFP_KERNEL); ++ driver->termios = kmalloc(n * sizeof(driver->termios[0]), GFP_KERNEL); ++ driver->termios_locked = kmalloc(n * sizeof(driver->termios_locked[0]), ++ GFP_KERNEL); ++ if((driver->table == NULL) || (driver->termios == NULL) || ++ (driver->termios_locked == NULL)) ++ panic("Failed to allocate driver table"); ++ ++ memset(driver->table, 0, n * sizeof(driver->table[0])); ++ memset(driver->termios, 0, n * sizeof(driver->termios[0])); ++ memset(driver->termios_locked, 0, ++ n * sizeof(driver->termios_locked[0])); ++ ++ driver->write_room = line_write_room; ++ driver->init_termios = tty_std_termios; ++ ++ if (tty_register_driver(driver)) ++ panic("line_register_devfs : Couldn't register driver\n"); ++ ++ from = line_driver->symlink_from; ++ to = line_driver->symlink_to; ++ err = devfs_mk_symlink(NULL, from, 0, to, NULL, NULL); ++ if(err) printk("Symlink creation from /dev/%s to /dev/%s " ++ "returned %d\n", from, to, err); ++ ++ for(i = 0; i < nlines; i++){ ++ if(!lines[i].valid) ++ tty_unregister_devfs(driver, driver->minor_start + i); ++ } ++ ++ mconsole_register_dev(&line_driver->mc); ++} ++ ++void lines_init(struct line *lines, int nlines) ++{ ++ struct line *line; ++ int i; ++ ++ for(i = 0; i < nlines; i++){ ++ line = &lines[i]; ++ INIT_LIST_HEAD(&line->chan_list); ++ sema_init(&line->sem, 1); ++ if(line->init_str != NULL){ ++ line->init_str = uml_strdup(line->init_str); ++ if(line->init_str == NULL) ++ printk("lines_init 
- uml_strdup returned " ++ "NULL\n"); ++ } ++ } ++} ++ ++struct winch { ++ struct list_head list; ++ int fd; ++ int tty_fd; ++ int pid; ++ struct line *line; ++}; ++ ++void winch_interrupt(int irq, void *data, struct pt_regs *unused) ++{ ++ struct winch *winch = data; ++ struct tty_struct *tty; ++ int err; ++ char c; ++ ++ if(winch->fd != -1){ ++ err = generic_read(winch->fd, &c, NULL); ++ if(err < 0){ ++ if(err != -EAGAIN){ ++ printk("winch_interrupt : read failed, " ++ "errno = %d\n", -err); ++ printk("fd %d is losing SIGWINCH support\n", ++ winch->tty_fd); ++ return; ++ } ++ goto out; ++ } ++ } ++ tty = winch->line->tty; ++ if(tty != NULL){ ++ chan_window_size(&winch->line->chan_list, ++ &tty->winsize.ws_row, ++ &tty->winsize.ws_col); ++ kill_pg(tty->pgrp, SIGWINCH, 1); ++ } ++ out: ++ if(winch->fd != -1) ++ reactivate_fd(winch->fd, WINCH_IRQ); ++} ++ ++DECLARE_MUTEX(winch_handler_sem); ++LIST_HEAD(winch_handlers); ++ ++void register_winch_irq(int fd, int tty_fd, int pid, void *line) ++{ ++ struct winch *winch; ++ ++ down(&winch_handler_sem); ++ winch = kmalloc(sizeof(*winch), GFP_KERNEL); ++ if(winch == NULL){ ++ printk("register_winch_irq - kmalloc failed\n"); ++ goto out; ++ } ++ *winch = ((struct winch) { .list = LIST_HEAD_INIT(winch->list), ++ .fd = fd, ++ .tty_fd = tty_fd, ++ .pid = pid, ++ .line = line }); ++ list_add(&winch->list, &winch_handlers); ++ if(um_request_irq(WINCH_IRQ, fd, IRQ_READ, winch_interrupt, ++ SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM, ++ "winch", winch) < 0) ++ printk("register_winch_irq - failed to register IRQ\n"); ++ out: ++ up(&winch_handler_sem); ++} ++ ++static void winch_cleanup(void) ++{ ++ struct list_head *ele; ++ struct winch *winch; ++ ++ list_for_each(ele, &winch_handlers){ ++ winch = list_entry(ele, struct winch, list); ++ if(winch->fd != -1){ ++ deactivate_fd(winch->fd, WINCH_IRQ); ++ close(winch->fd); ++ } ++ if(winch->pid != -1) ++ os_kill_process(winch->pid, 1); ++ } ++} ++ ++__uml_exitcall(winch_cleanup); ++ 
++char *add_xterm_umid(char *base) ++{ ++ char *umid, *title; ++ int len; ++ ++ umid = get_umid(1); ++ if(umid == NULL) return(base); ++ ++ len = strlen(base) + strlen(" ()") + strlen(umid) + 1; ++ title = kmalloc(len, GFP_KERNEL); ++ if(title == NULL){ ++ printk("Failed to allocate buffer for xterm title\n"); ++ return(base); ++ } ++ ++ strncpy(title, base, len); ++ len -= strlen(title); ++ snprintf(&title[strlen(title)], len, " (%s)", umid); ++ return(title); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/mcast.h um/arch/um/drivers/mcast.h +--- orig/arch/um/drivers/mcast.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/mcast.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,30 @@ ++/* ++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "net_user.h" ++ ++struct mcast_data { ++ char *addr; ++ unsigned short port; ++ void *mcast_addr; ++ int ttl; ++ void *dev; ++}; ++ ++extern struct net_user_info mcast_user_info; ++ ++extern int mcast_user_write(int fd, void *buf, int len, ++ struct mcast_data *pri); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/mcast_kern.c um/arch/um/drivers/mcast_kern.c +--- orig/arch/um/drivers/mcast_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/mcast_kern.c Sun Dec 15 20:58:55 2002 +@@ -0,0 +1,145 @@ ++/* ++ * user-mode-linux networking multicast transport ++ * Copyright (C) 2001 by Harald Welte ++ * ++ * based on the existing uml-networking code, which is ++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and ++ * James Leu (jleu@mindspring.net). ++ * Copyright (C) 2001 by various other people who didn't put their name here. ++ * ++ * Licensed under the GPL. ++ */ ++ ++#include "linux/kernel.h" ++#include "linux/init.h" ++#include "linux/netdevice.h" ++#include "linux/etherdevice.h" ++#include "linux/in.h" ++#include "linux/inet.h" ++#include "net_kern.h" ++#include "net_user.h" ++#include "mcast.h" ++ ++struct mcast_init { ++ char *addr; ++ int port; ++ int ttl; ++}; ++ ++void mcast_init(struct net_device *dev, void *data) ++{ ++ struct uml_net_private *pri; ++ struct mcast_data *dpri; ++ struct mcast_init *init = data; ++ ++ init_etherdev(dev, 0); ++ pri = dev->priv; ++ dpri = (struct mcast_data *) pri->user; ++ *dpri = ((struct mcast_data) ++ { .addr = init->addr, ++ .port = init->port, ++ .ttl = init->ttl, ++ .mcast_addr = NULL, ++ .dev = dev }); ++ printk("mcast backend "); ++ printk("multicast adddress: %s:%u, TTL:%u ", ++ dpri->addr, dpri->port, dpri->ttl); ++ ++ printk("\n"); ++} ++ ++static int mcast_read(int fd, struct sk_buff **skb, struct uml_net_private *lp) ++{ ++ *skb = ether_adjust_skb(*skb, ETH_HEADER_OTHER); ++ if(*skb == NULL) return(-ENOMEM); ++ return(net_recvfrom(fd, (*skb)->mac.raw, ++ (*skb)->dev->mtu + ETH_HEADER_OTHER)); ++} ++ ++static int mcast_write(int fd, struct sk_buff **skb, ++ struct uml_net_private *lp) ++{ ++ return 
mcast_user_write(fd, (*skb)->data, (*skb)->len, ++ (struct mcast_data *) &lp->user); ++} ++ ++static struct net_kern_info mcast_kern_info = { ++ .init = mcast_init, ++ .protocol = eth_protocol, ++ .read = mcast_read, ++ .write = mcast_write, ++}; ++ ++int mcast_setup(char *str, char **mac_out, void *data) ++{ ++ struct mcast_init *init = data; ++ char *port_str = NULL, *ttl_str = NULL, *remain; ++ char *last; ++ int n; ++ ++ *init = ((struct mcast_init) ++ { .addr = "239.192.168.1", ++ .port = 1102, ++ .ttl = 1 }); ++ ++ remain = split_if_spec(str, mac_out, &init->addr, &port_str, &ttl_str, ++ NULL); ++ if(remain != NULL){ ++ printk(KERN_ERR "mcast_setup - Extra garbage on " ++ "specification : '%s'\n", remain); ++ return(0); ++ } ++ ++ if(port_str != NULL){ ++ n = simple_strtoul(port_str, &last, 10); ++ if((*last != '\0') || (last == port_str)){ ++ printk(KERN_ERR "mcast_setup - Bad port : '%s'\n", ++ port_str); ++ return(0); ++ } ++ init->port = htons(n); ++ } ++ ++ if(ttl_str != NULL){ ++ init->ttl = simple_strtoul(ttl_str, &last, 10); ++ if((*last != '\0') || (last == ttl_str)){ ++ printk(KERN_ERR "mcast_setup - Bad ttl : '%s'\n", ++ ttl_str); ++ return(0); ++ } ++ } ++ ++ printk(KERN_INFO "Configured mcast device: %s:%u-%u\n", init->addr, ++ init->port, init->ttl); ++ ++ return(1); ++} ++ ++static struct transport mcast_transport = { ++ .list = LIST_HEAD_INIT(mcast_transport.list), ++ .name = "mcast", ++ .setup = mcast_setup, ++ .user = &mcast_user_info, ++ .kern = &mcast_kern_info, ++ .private_size = sizeof(struct mcast_data), ++ .setup_size = sizeof(struct mcast_init), ++}; ++ ++static int register_mcast(void) ++{ ++ register_transport(&mcast_transport); ++ return(1); ++} ++ ++__initcall(register_mcast); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/mcast_user.c um/arch/um/drivers/mcast_user.c +--- orig/arch/um/drivers/mcast_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/mcast_user.c Sun Dec 15 21:19:16 2002 +@@ -0,0 +1,175 @@ ++/* ++ * user-mode-linux networking multicast transport ++ * Copyright (C) 2001 by Harald Welte ++ * ++ * based on the existing uml-networking code, which is ++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and ++ * James Leu (jleu@mindspring.net). ++ * Copyright (C) 2001 by various other people who didn't put their name here. ++ * ++ * Licensed under the GPL. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "net_user.h" ++#include "mcast.h" ++#include "kern_util.h" ++#include "user_util.h" ++#include "user.h" ++ ++#define MAX_PACKET (ETH_MAX_PACKET + ETH_HEADER_OTHER) ++ ++static struct sockaddr_in *new_addr(char *addr, unsigned short port) ++{ ++ struct sockaddr_in *sin; ++ ++ sin = um_kmalloc(sizeof(struct sockaddr_in)); ++ if(sin == NULL){ ++ printk("new_addr: allocation of sockaddr_in failed\n"); ++ return(NULL); ++ } ++ sin->sin_family = AF_INET; ++ sin->sin_addr.s_addr = in_aton(addr); ++ sin->sin_port = port; ++ return(sin); ++} ++ ++static void mcast_user_init(void *data, void *dev) ++{ ++ struct mcast_data *pri = data; ++ ++ pri->mcast_addr = new_addr(pri->addr, pri->port); ++ pri->dev = dev; ++} ++ ++static int mcast_open(void *data) ++{ ++ struct mcast_data *pri = data; ++ struct sockaddr_in *sin = pri->mcast_addr; ++ struct ip_mreq mreq; ++ int fd, yes = 1; ++ ++ ++ if ((sin->sin_addr.s_addr == 0) || (sin->sin_port == 0)) { ++ fd = -EINVAL; ++ goto out; ++ } ++ ++ if ((fd = socket(AF_INET, SOCK_DGRAM, 0)) < 0){ ++ printk("mcast_open : data socket failed, errno = %d\n", ++ errno); ++ fd = -ENOMEM; ++ goto 
out; ++ } ++ ++ if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) { ++ printk("mcast_open: SO_REUSEADDR failed, errno = %d\n", ++ errno); ++ close(fd); ++ fd = -EINVAL; ++ goto out; ++ } ++ ++ /* set ttl according to config */ ++ if (setsockopt(fd, SOL_IP, IP_MULTICAST_TTL, &pri->ttl, ++ sizeof(pri->ttl)) < 0) { ++ printk("mcast_open: IP_MULTICAST_TTL failed, error = %d\n", ++ errno); ++ close(fd); ++ fd = -EINVAL; ++ goto out; ++ } ++ ++ /* set LOOP, so data does get fed back to local sockets */ ++ if (setsockopt(fd, SOL_IP, IP_MULTICAST_LOOP, &yes, sizeof(yes)) < 0) { ++ printk("mcast_open: IP_MULTICAST_LOOP failed, error = %d\n", ++ errno); ++ close(fd); ++ fd = -EINVAL; ++ goto out; ++ } ++ ++ /* bind socket to mcast address */ ++ if (bind(fd, (struct sockaddr *) sin, sizeof(*sin)) < 0) { ++ printk("mcast_open : data bind failed, errno = %d\n", errno); ++ close(fd); ++ fd = -EINVAL; ++ goto out; ++ } ++ ++ /* subscribe to the multicast group */ ++ mreq.imr_multiaddr.s_addr = sin->sin_addr.s_addr; ++ mreq.imr_interface.s_addr = 0; ++ if (setsockopt(fd, SOL_IP, IP_ADD_MEMBERSHIP, ++ &mreq, sizeof(mreq)) < 0) { ++ printk("mcast_open: IP_ADD_MEMBERSHIP failed, error = %d\n", ++ errno); ++ printk("There appears not to be a multicast-capable network " ++ "interface on the host.\n"); ++ printk("eth0 should be configured in order to use the " ++ "multicast transport.\n"); ++ close(fd); ++ fd = -EINVAL; ++ } ++ ++ out: ++ return(fd); ++} ++ ++static void mcast_close(int fd, void *data) ++{ ++ struct ip_mreq mreq; ++ struct mcast_data *pri = data; ++ struct sockaddr_in *sin = pri->mcast_addr; ++ ++ mreq.imr_multiaddr.s_addr = sin->sin_addr.s_addr; ++ mreq.imr_interface.s_addr = 0; ++ if (setsockopt(fd, SOL_IP, IP_DROP_MEMBERSHIP, ++ &mreq, sizeof(mreq)) < 0) { ++ printk("mcast_open: IP_DROP_MEMBERSHIP failed, error = %d\n", ++ errno); ++ } ++ ++ close(fd); ++} ++ ++int mcast_user_write(int fd, void *buf, int len, struct mcast_data *pri) ++{ ++ struct 
sockaddr_in *data_addr = pri->mcast_addr; ++ ++ return(net_sendto(fd, buf, len, data_addr, sizeof(*data_addr))); ++} ++ ++static int mcast_set_mtu(int mtu, void *data) ++{ ++ return(mtu); ++} ++ ++struct net_user_info mcast_user_info = { ++ .init = mcast_user_init, ++ .open = mcast_open, ++ .close = mcast_close, ++ .remove = NULL, ++ .set_mtu = mcast_set_mtu, ++ .add_address = NULL, ++ .delete_address = NULL, ++ .max_packet = MAX_PACKET - ETH_HEADER_OTHER ++}; ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/mconsole_kern.c um/arch/um/drivers/mconsole_kern.c +--- orig/arch/um/drivers/mconsole_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/mconsole_kern.c Fri Mar 28 21:58:11 2003 +@@ -0,0 +1,453 @@ ++/* ++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/kernel.h" ++#include "linux/slab.h" ++#include "linux/init.h" ++#include "linux/notifier.h" ++#include "linux/reboot.h" ++#include "linux/utsname.h" ++#include "linux/ctype.h" ++#include "linux/interrupt.h" ++#include "linux/sysrq.h" ++#include "linux/tqueue.h" ++#include "linux/module.h" ++#include "linux/proc_fs.h" ++#include "asm/irq.h" ++#include "asm/uaccess.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "kern.h" ++#include "mconsole.h" ++#include "mconsole_kern.h" ++#include "irq_user.h" ++#include "init.h" ++#include "os.h" ++#include "umid.h" ++ ++static int do_unlink_socket(struct notifier_block *notifier, ++ unsigned long what, void *data) ++{ ++ 
return(mconsole_unlink_socket()); ++} ++ ++ ++static struct notifier_block reboot_notifier = { ++ .notifier_call = do_unlink_socket, ++ .priority = 0, ++}; ++ ++/* Safe without explicit locking for now. Tasklets provide their own ++ * locking, and the interrupt handler is safe because it can't interrupt ++ * itself and it can only happen on CPU 0. ++ */ ++ ++LIST_HEAD(mc_requests); ++ ++void mc_task_proc(void *unused) ++{ ++ struct mconsole_entry *req; ++ unsigned long flags; ++ int done; ++ ++ do { ++ save_flags(flags); ++ req = list_entry(mc_requests.next, struct mconsole_entry, ++ list); ++ list_del(&req->list); ++ done = list_empty(&mc_requests); ++ restore_flags(flags); ++ req->request.cmd->handler(&req->request); ++ kfree(req); ++ } while(!done); ++} ++ ++struct tq_struct mconsole_task = { ++ .routine = mc_task_proc, ++ .data = NULL ++}; ++ ++void mconsole_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ int fd; ++ struct mconsole_entry *new; ++ struct mc_request req; ++ ++ fd = (int) dev_id; ++ while (mconsole_get_request(fd, &req)){ ++ if(req.cmd->as_interrupt) (*req.cmd->handler)(&req); ++ else { ++ new = kmalloc(sizeof(req), GFP_ATOMIC); ++ if(new == NULL) ++ mconsole_reply(&req, "Out of memory", 1, 0); ++ else { ++ new->request = req; ++ list_add(&new->list, &mc_requests); ++ } ++ } ++ } ++ if(!list_empty(&mc_requests)) schedule_task(&mconsole_task); ++ reactivate_fd(fd, MCONSOLE_IRQ); ++} ++ ++void mconsole_version(struct mc_request *req) ++{ ++ char version[256]; ++ ++ sprintf(version, "%s %s %s %s %s", system_utsname.sysname, ++ system_utsname.nodename, system_utsname.release, ++ system_utsname.version, system_utsname.machine); ++ mconsole_reply(req, version, 0, 0); ++} ++ ++#define UML_MCONSOLE_HELPTEXT \ ++"Commands: \n\ ++ version - Get kernel version \n\ ++ help - Print this message \n\ ++ halt - Halt UML \n\ ++ reboot - Reboot UML \n\ ++ config = - Add a new device to UML; \n\ ++ same syntax as command line \n\ ++ config - Query the 
configuration of a device \n\ ++ remove - Remove a device from UML \n\ ++ sysrq - Performs the SysRq action controlled by the letter \n\ ++ cad - invoke the Ctl-Alt-Del handler \n\ ++ stop - pause the UML; it will do nothing until it receives a 'go' \n\ ++ go - continue the UML after a 'stop' \n\ ++" ++ ++void mconsole_help(struct mc_request *req) ++{ ++ mconsole_reply(req, UML_MCONSOLE_HELPTEXT, 0, 0); ++} ++ ++void mconsole_halt(struct mc_request *req) ++{ ++ mconsole_reply(req, "", 0, 0); ++ machine_halt(); ++} ++ ++void mconsole_reboot(struct mc_request *req) ++{ ++ mconsole_reply(req, "", 0, 0); ++ machine_restart(NULL); ++} ++ ++extern void ctrl_alt_del(void); ++ ++void mconsole_cad(struct mc_request *req) ++{ ++ mconsole_reply(req, "", 0, 0); ++ ctrl_alt_del(); ++} ++ ++void mconsole_go(struct mc_request *req) ++{ ++ mconsole_reply(req, "Not stopped", 1, 0); ++} ++ ++void mconsole_stop(struct mc_request *req) ++{ ++ deactivate_fd(req->originating_fd, MCONSOLE_IRQ); ++ os_set_fd_block(req->originating_fd, 1); ++ mconsole_reply(req, "", 0, 0); ++ while(mconsole_get_request(req->originating_fd, req)){ ++ if(req->cmd->handler == mconsole_go) break; ++ (*req->cmd->handler)(req); ++ } ++ os_set_fd_block(req->originating_fd, 0); ++ reactivate_fd(req->originating_fd, MCONSOLE_IRQ); ++ mconsole_reply(req, "", 0, 0); ++} ++ ++/* This list is populated by __initcall routines. 
*/ ++ ++LIST_HEAD(mconsole_devices); ++ ++void mconsole_register_dev(struct mc_device *new) ++{ ++ list_add(&new->list, &mconsole_devices); ++} ++ ++static struct mc_device *mconsole_find_dev(char *name) ++{ ++ struct list_head *ele; ++ struct mc_device *dev; ++ ++ list_for_each(ele, &mconsole_devices){ ++ dev = list_entry(ele, struct mc_device, list); ++ if(!strncmp(name, dev->name, strlen(dev->name))) ++ return(dev); ++ } ++ return(NULL); ++} ++ ++#define CONFIG_BUF_SIZE 64 ++ ++static void mconsole_get_config(int (*get_config)(char *, char *, int, ++ char **), ++ struct mc_request *req, char *name) ++{ ++ char default_buf[CONFIG_BUF_SIZE], *error, *buf; ++ int n, size; ++ ++ if(get_config == NULL){ ++ mconsole_reply(req, "No get_config routine defined", 1, 0); ++ return; ++ } ++ ++ error = NULL; ++ size = sizeof(default_buf)/sizeof(default_buf[0]); ++ buf = default_buf; ++ ++ while(1){ ++ n = (*get_config)(name, buf, size, &error); ++ if(error != NULL){ ++ mconsole_reply(req, error, 1, 0); ++ goto out; ++ } ++ ++ if(n <= size){ ++ mconsole_reply(req, buf, 0, 0); ++ goto out; ++ } ++ ++ if(buf != default_buf) ++ kfree(buf); ++ ++ size = n; ++ buf = kmalloc(size, GFP_KERNEL); ++ if(buf == NULL){ ++ mconsole_reply(req, "Failed to allocate buffer", 1, 0); ++ return; ++ } ++ } ++ out: ++ if(buf != default_buf) ++ kfree(buf); ++ ++} ++ ++void mconsole_config(struct mc_request *req) ++{ ++ struct mc_device *dev; ++ char *ptr = req->request.data, *name; ++ int err; ++ ++ ptr += strlen("config"); ++ while(isspace(*ptr)) ptr++; ++ dev = mconsole_find_dev(ptr); ++ if(dev == NULL){ ++ mconsole_reply(req, "Bad configuration option", 1, 0); ++ return; ++ } ++ ++ name = &ptr[strlen(dev->name)]; ++ ptr = name; ++ while((*ptr != '=') && (*ptr != '\0')) ++ ptr++; ++ ++ if(*ptr == '='){ ++ err = (*dev->config)(name); ++ mconsole_reply(req, "", err, 0); ++ } ++ else mconsole_get_config(dev->get_config, req, name); ++} ++ ++void mconsole_remove(struct mc_request *req) ++{ ++ struct 
mc_device *dev; ++ char *ptr = req->request.data; ++ int err; ++ ++ ptr += strlen("remove"); ++ while(isspace(*ptr)) ptr++; ++ dev = mconsole_find_dev(ptr); ++ if(dev == NULL){ ++ mconsole_reply(req, "Bad remove option", 1, 0); ++ return; ++ } ++ err = (*dev->remove)(&ptr[strlen(dev->name)]); ++ mconsole_reply(req, "", err, 0); ++} ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++void mconsole_sysrq(struct mc_request *req) ++{ ++ char *ptr = req->request.data; ++ ++ ptr += strlen("sysrq"); ++ while(isspace(*ptr)) ptr++; ++ ++ handle_sysrq(*ptr, ¤t->thread.regs, NULL, NULL); ++ mconsole_reply(req, "", 0, 0); ++} ++#else ++void mconsole_sysrq(struct mc_request *req) ++{ ++ mconsole_reply(req, "Sysrq not compiled in", 1, 0); ++} ++#endif ++ ++/* Changed by mconsole_setup, which is __setup, and called before SMP is ++ * active. ++ */ ++static char *notify_socket = NULL; ++ ++int mconsole_init(void) ++{ ++ int err, sock; ++ char file[256]; ++ ++ if(umid_file_name("mconsole", file, sizeof(file))) return(-1); ++ snprintf(mconsole_socket_name, sizeof(file), "%s", file); ++ ++ sock = create_unix_socket(file, sizeof(file)); ++ if (sock < 0){ ++ printk("Failed to initialize management console\n"); ++ return(1); ++ } ++ ++ register_reboot_notifier(&reboot_notifier); ++ ++ err = um_request_irq(MCONSOLE_IRQ, sock, IRQ_READ, mconsole_interrupt, ++ SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM, ++ "mconsole", (void *)sock); ++ if (err){ ++ printk("Failed to get IRQ for management console\n"); ++ return(1); ++ } ++ ++ if(notify_socket != NULL){ ++ notify_socket = uml_strdup(notify_socket); ++ if(notify_socket != NULL) ++ mconsole_notify(notify_socket, MCONSOLE_SOCKET, ++ mconsole_socket_name, ++ strlen(mconsole_socket_name) + 1); ++ else printk(KERN_ERR "mconsole_setup failed to strdup " ++ "string\n"); ++ } ++ ++ printk("mconsole (version %d) initialized on %s\n", ++ MCONSOLE_VERSION, mconsole_socket_name); ++ return(0); ++} ++ ++__initcall(mconsole_init); ++ ++static int write_proc_mconsole(struct 
file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char *buf; ++ ++ buf = kmalloc(count + 1, GFP_KERNEL); ++ if(buf == NULL) ++ return(-ENOMEM); ++ ++ if(copy_from_user(buf, buffer, count)) ++ return(-EFAULT); ++ buf[count] = '\0'; ++ ++ mconsole_notify(notify_socket, MCONSOLE_USER_NOTIFY, buf, count); ++ return(count); ++} ++ ++static int create_proc_mconsole(void) ++{ ++ struct proc_dir_entry *ent; ++ ++ if(notify_socket == NULL) return(0); ++ ++ ent = create_proc_entry("mconsole", S_IFREG | 0200, NULL); ++ if(ent == NULL){ ++ printk("create_proc_mconsole : create_proc_entry failed\n"); ++ return(0); ++ } ++ ++ ent->read_proc = NULL; ++ ent->write_proc = write_proc_mconsole; ++ return(0); ++} ++ ++static spinlock_t notify_spinlock = SPIN_LOCK_UNLOCKED; ++ ++void lock_notify(void) ++{ ++ spin_lock(¬ify_spinlock); ++} ++ ++void unlock_notify(void) ++{ ++ spin_unlock(¬ify_spinlock); ++} ++ ++__initcall(create_proc_mconsole); ++ ++#define NOTIFY "=notify:" ++ ++static int mconsole_setup(char *str) ++{ ++ if(!strncmp(str, NOTIFY, strlen(NOTIFY))){ ++ str += strlen(NOTIFY); ++ notify_socket = str; ++ } ++ else printk(KERN_ERR "mconsole_setup : Unknown option - '%s'\n", str); ++ return(1); ++} ++ ++__setup("mconsole", mconsole_setup); ++ ++__uml_help(mconsole_setup, ++"mconsole=notify:\n" ++" Requests that the mconsole driver send a message to the named Unix\n" ++" socket containing the name of the mconsole socket. 
This also serves\n" ++" to notify outside processes when UML has booted far enough to respond\n" ++" to mconsole requests.\n\n" ++); ++ ++static int notify_panic(struct notifier_block *self, unsigned long unused1, ++ void *ptr) ++{ ++ char *message = ptr; ++ ++ if(notify_socket == NULL) return(0); ++ ++ mconsole_notify(notify_socket, MCONSOLE_PANIC, message, ++ strlen(message) + 1); ++ return(0); ++} ++ ++static struct notifier_block panic_exit_notifier = { ++ .notifier_call = notify_panic, ++ .next = NULL, ++ .priority = 1 ++}; ++ ++static int add_notifier(void) ++{ ++ notifier_chain_register(&panic_notifier_list, &panic_exit_notifier); ++ return(0); ++} ++ ++__initcall(add_notifier); ++ ++char *mconsole_notify_socket(void) ++{ ++ return(notify_socket); ++} ++ ++EXPORT_SYMBOL(mconsole_notify_socket); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/mconsole_user.c um/arch/um/drivers/mconsole_user.c +--- orig/arch/um/drivers/mconsole_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/mconsole_user.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,212 @@ ++/* ++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "user.h" ++#include "mconsole.h" ++#include "umid.h" ++ ++static struct mconsole_command commands[] = { ++ { "version", mconsole_version, 1 }, ++ { "halt", mconsole_halt, 0 }, ++ { "reboot", mconsole_reboot, 0 }, ++ { "config", mconsole_config, 0 }, ++ { "remove", mconsole_remove, 0 }, ++ { "sysrq", mconsole_sysrq, 1 }, ++ { "help", mconsole_help, 1 }, ++ { "cad", mconsole_cad, 1 }, ++ { "stop", mconsole_stop, 0 }, ++ { "go", mconsole_go, 1 }, ++}; ++ ++/* Initialized in mconsole_init, which is an initcall */ ++char mconsole_socket_name[256]; ++ ++int mconsole_reply_v0(struct mc_request *req, char *reply) ++{ ++ struct iovec iov; ++ struct msghdr msg; ++ ++ iov.iov_base = reply; ++ iov.iov_len = strlen(reply); ++ ++ msg.msg_name = &(req->origin); ++ msg.msg_namelen = req->originlen; ++ msg.msg_iov = &iov; ++ msg.msg_iovlen = 1; ++ msg.msg_control = NULL; ++ msg.msg_controllen = 0; ++ msg.msg_flags = 0; ++ ++ return sendmsg(req->originating_fd, &msg, 0); ++} ++ ++static struct mconsole_command *mconsole_parse(struct mc_request *req) ++{ ++ struct mconsole_command *cmd; ++ int i; ++ ++ for(i=0;irequest.data, cmd->command, ++ strlen(cmd->command))){ ++ return(cmd); ++ } ++ } ++ return(NULL); ++} ++ ++#define MIN(a,b) ((a)<(b) ? 
(a):(b)) ++ ++#define STRINGX(x) #x ++#define STRING(x) STRINGX(x) ++ ++int mconsole_get_request(int fd, struct mc_request *req) ++{ ++ int len; ++ ++ req->originlen = sizeof(req->origin); ++ req->len = recvfrom(fd, &req->request, sizeof(req->request), 0, ++ (struct sockaddr *) req->origin, &req->originlen); ++ if (req->len < 0) ++ return 0; ++ ++ req->originating_fd = fd; ++ ++ if(req->request.magic != MCONSOLE_MAGIC){ ++ /* Unversioned request */ ++ len = MIN(sizeof(req->request.data) - 1, ++ strlen((char *) &req->request)); ++ memmove(req->request.data, &req->request, len); ++ req->request.data[len] = '\0'; ++ ++ req->request.magic = MCONSOLE_MAGIC; ++ req->request.version = 0; ++ req->request.len = len; ++ ++ mconsole_reply_v0(req, "ERR Version 0 mconsole clients are " ++ "not supported by this driver"); ++ return(0); ++ } ++ ++ if(req->request.len >= MCONSOLE_MAX_DATA){ ++ mconsole_reply(req, "Request too large", 1, 0); ++ return(0); ++ } ++ if(req->request.version != MCONSOLE_VERSION){ ++ mconsole_reply(req, "This driver only supports version " ++ STRING(MCONSOLE_VERSION) " clients", 1, 0); ++ } ++ ++ req->request.data[req->request.len] = '\0'; ++ req->cmd = mconsole_parse(req); ++ if(req->cmd == NULL){ ++ mconsole_reply(req, "Unknown command", 1, 0); ++ return(0); ++ } ++ ++ return(1); ++} ++ ++int mconsole_reply(struct mc_request *req, char *str, int err, int more) ++{ ++ struct mconsole_reply reply; ++ int total, len, n; ++ ++ total = strlen(str); ++ do { ++ reply.err = err; ++ ++ /* err can only be true on the first packet */ ++ err = 0; ++ ++ len = MIN(total, MCONSOLE_MAX_DATA - 1); ++ ++ if(len == total) reply.more = more; ++ else reply.more = 1; ++ ++ memcpy(reply.data, str, len); ++ reply.data[len] = '\0'; ++ total -= len; ++ reply.len = len + 1; ++ ++ len = sizeof(reply) + reply.len - sizeof(reply.data); ++ ++ n = sendto(req->originating_fd, &reply, len, 0, ++ (struct sockaddr *) req->origin, req->originlen); ++ ++ if(n < 0) return(-errno); ++ } 
while(total > 0); ++ return(0); ++} ++ ++int mconsole_unlink_socket(void) ++{ ++ unlink(mconsole_socket_name); ++ return 0; ++} ++ ++static int notify_sock = -1; ++ ++int mconsole_notify(char *sock_name, int type, const void *data, int len) ++{ ++ struct sockaddr_un target; ++ struct mconsole_notify packet; ++ int n, err = 0; ++ ++ lock_notify(); ++ if(notify_sock < 0){ ++ notify_sock = socket(PF_UNIX, SOCK_DGRAM, 0); ++ if(notify_sock < 0){ ++ printk("mconsole_notify - socket failed, errno = %d\n", ++ errno); ++ err = -errno; ++ } ++ } ++ unlock_notify(); ++ ++ if(err) ++ return(err); ++ ++ target.sun_family = AF_UNIX; ++ strcpy(target.sun_path, sock_name); ++ ++ packet.magic = MCONSOLE_MAGIC; ++ packet.version = MCONSOLE_VERSION; ++ packet.type = type; ++ len = (len > sizeof(packet.data)) ? sizeof(packet.data) : len; ++ packet.len = len; ++ memcpy(packet.data, data, len); ++ ++ err = 0; ++ len = sizeof(packet) + packet.len - sizeof(packet.data); ++ n = sendto(notify_sock, &packet, len, 0, (struct sockaddr *) &target, ++ sizeof(target)); ++ if(n < 0){ ++ printk("mconsole_notify - sendto failed, errno = %d\n", errno); ++ err = -errno; ++ } ++ return(err); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/mmapper_kern.c um/arch/um/drivers/mmapper_kern.c +--- orig/arch/um/drivers/mmapper_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/mmapper_kern.c Sun Dec 15 21:03:08 2002 +@@ -0,0 +1,148 @@ ++/* ++ * arch/um/drivers/mmapper_kern.c ++ * ++ * BRIEF MODULE DESCRIPTION ++ * ++ * Copyright (C) 2000 RidgeRun, Inc. ++ * Author: RidgeRun, Inc. 
++ * Greg Lonnon glonnon@ridgerun.com or info@ridgerun.com ++ * ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "mem_user.h" ++#include "user_util.h" ++ ++/* These are set in mmapper_init, which is called at boot time */ ++static unsigned long mmapper_size; ++static unsigned long p_buf = 0; ++static char *v_buf = NULL; ++ ++static ssize_t ++mmapper_read(struct file *file, char *buf, size_t count, loff_t *ppos) ++{ ++ if(*ppos > mmapper_size) ++ return -EINVAL; ++ ++ if(count + *ppos > mmapper_size) ++ count = count + *ppos - mmapper_size; ++ ++ if(count < 0) ++ return -EINVAL; ++ ++ copy_to_user(buf,&v_buf[*ppos],count); ++ ++ return count; ++} ++ ++static ssize_t ++mmapper_write(struct file *file, const char *buf, size_t count, loff_t *ppos) ++{ ++ if(*ppos > mmapper_size) ++ return -EINVAL; ++ ++ if(count + *ppos > mmapper_size) ++ count = count + *ppos - mmapper_size; ++ ++ if(count < 0) ++ return -EINVAL; ++ ++ copy_from_user(&v_buf[*ppos],buf,count); ++ ++ return count; ++} ++ ++static int ++mmapper_ioctl(struct inode *inode, struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ return(-ENOIOCTLCMD); ++} ++ ++static int ++mmapper_mmap(struct file *file, struct vm_area_struct * vma) ++{ ++ int ret = -EINVAL; ++ int size; ++ ++ lock_kernel(); ++ if (vma->vm_pgoff != 0) ++ goto out; ++ ++ size = vma->vm_end - vma->vm_start; ++ if(size > mmapper_size) return(-EFAULT); ++ ++ /* XXX A comment above remap_page_range says it should only be ++ * called when the mm semaphore is held ++ */ ++ if (remap_page_range(vma->vm_start, p_buf, size, vma->vm_page_prot)) ++ goto out; ++ ret = 0; ++out: ++ unlock_kernel(); ++ return ret; ++} ++ ++static int ++mmapper_open(struct inode *inode, struct file *file) ++{ ++ return 0; ++} ++ ++static int ++mmapper_release(struct inode *inode, struct file *file) ++{ ++ return 0; ++} ++ ++static struct file_operations mmapper_fops = { ++ 
.owner = THIS_MODULE, ++ .read = mmapper_read, ++ .write = mmapper_write, ++ .ioctl = mmapper_ioctl, ++ .mmap = mmapper_mmap, ++ .open = mmapper_open, ++ .release = mmapper_release, ++}; ++ ++static int __init mmapper_init(void) ++{ ++ printk(KERN_INFO "Mapper v0.1\n"); ++ ++ v_buf = (char *) find_iomem("mmapper", &mmapper_size); ++ if(mmapper_size == 0) return(0); ++ ++ p_buf = __pa(v_buf); ++ ++ devfs_register (NULL, "mmapper", DEVFS_FL_DEFAULT, ++ 30, 0, S_IFCHR | S_IRUGO | S_IWUGO, ++ &mmapper_fops, NULL); ++ devfs_mk_symlink(NULL, "mmapper0", DEVFS_FL_DEFAULT, "mmapper", ++ NULL, NULL); ++ return(0); ++} ++ ++static void mmapper_exit(void) ++{ ++} ++ ++module_init(mmapper_init); ++module_exit(mmapper_exit); ++ ++MODULE_AUTHOR("Greg Lonnon "); ++MODULE_DESCRIPTION("DSPLinux simulator mmapper driver"); ++/* ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/net_kern.c um/arch/um/drivers/net_kern.c +--- orig/arch/um/drivers/net_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/net_kern.c Sun Dec 15 21:19:16 2002 +@@ -0,0 +1,870 @@ ++/* ++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and ++ * James Leu (jleu@mindspring.net). ++ * Copyright (C) 2001 by various other people who didn't put their name here. ++ * Licensed under the GPL. 
++ */ ++ ++#include "linux/config.h" ++#include "linux/kernel.h" ++#include "linux/netdevice.h" ++#include "linux/rtnetlink.h" ++#include "linux/skbuff.h" ++#include "linux/socket.h" ++#include "linux/spinlock.h" ++#include "linux/module.h" ++#include "linux/init.h" ++#include "linux/etherdevice.h" ++#include "linux/list.h" ++#include "linux/inetdevice.h" ++#include "linux/ctype.h" ++#include "linux/bootmem.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "net_kern.h" ++#include "net_user.h" ++#include "mconsole_kern.h" ++#include "init.h" ++#include "irq_user.h" ++ ++static spinlock_t opened_lock = SPIN_LOCK_UNLOCKED; ++LIST_HEAD(opened); ++ ++static int uml_net_rx(struct net_device *dev) ++{ ++ struct uml_net_private *lp = dev->priv; ++ int pkt_len; ++ struct sk_buff *skb; ++ ++ /* If we can't allocate memory, try again next round. */ ++ if ((skb = dev_alloc_skb(dev->mtu)) == NULL) { ++ lp->stats.rx_dropped++; ++ return 0; ++ } ++ ++ skb->dev = dev; ++ skb_put(skb, dev->mtu); ++ skb->mac.raw = skb->data; ++ pkt_len = (*lp->read)(lp->fd, &skb, lp); ++ ++ if (pkt_len > 0) { ++ skb_trim(skb, pkt_len); ++ skb->protocol = (*lp->protocol)(skb); ++ netif_rx(skb); ++ ++ lp->stats.rx_bytes += skb->len; ++ lp->stats.rx_packets++; ++ return pkt_len; ++ } ++ ++ kfree_skb(skb); ++ return pkt_len; ++} ++ ++void uml_net_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ struct net_device *dev = dev_id; ++ struct uml_net_private *lp = dev->priv; ++ int err; ++ ++ if(!netif_running(dev)) ++ return; ++ ++ spin_lock(&lp->lock); ++ while((err = uml_net_rx(dev)) > 0) ; ++ if(err < 0) { ++ printk(KERN_ERR ++ "Device '%s' read returned %d, shutting it down\n", ++ dev->name, err); ++ dev_close(dev); ++ goto out; ++ } ++ reactivate_fd(lp->fd, UM_ETH_IRQ); ++ ++ out: ++ spin_unlock(&lp->lock); ++} ++ ++static int uml_net_open(struct net_device *dev) ++{ ++ struct uml_net_private *lp = dev->priv; ++ char addr[sizeof("255.255.255.255\0")]; ++ int err; ++ ++ 
spin_lock(&lp->lock); ++ ++ if(lp->fd >= 0){ ++ err = -ENXIO; ++ goto out; ++ } ++ ++ if(!lp->have_mac){ ++ dev_ip_addr(dev, addr, &lp->mac[2]); ++ set_ether_mac(dev, lp->mac); ++ } ++ ++ lp->fd = (*lp->open)(&lp->user); ++ if(lp->fd < 0){ ++ err = lp->fd; ++ goto out; ++ } ++ ++ err = um_request_irq(dev->irq, lp->fd, IRQ_READ, uml_net_interrupt, ++ SA_INTERRUPT | SA_SHIRQ, dev->name, dev); ++ if(err != 0){ ++ printk(KERN_ERR "uml_net_open: failed to get irq(%d)\n", err); ++ if(lp->close != NULL) (*lp->close)(lp->fd, &lp->user); ++ lp->fd = -1; ++ err = -ENETUNREACH; ++ } ++ ++ lp->tl.data = (unsigned long) &lp->user; ++ netif_start_queue(dev); ++ ++ spin_lock(&opened_lock); ++ list_add(&lp->list, &opened); ++ spin_unlock(&opened_lock); ++ MOD_INC_USE_COUNT; ++ out: ++ spin_unlock(&lp->lock); ++ return(err); ++} ++ ++static int uml_net_close(struct net_device *dev) ++{ ++ struct uml_net_private *lp = dev->priv; ++ ++ netif_stop_queue(dev); ++ spin_lock(&lp->lock); ++ ++ free_irq(dev->irq, dev); ++ if(lp->close != NULL) (*lp->close)(lp->fd, &lp->user); ++ lp->fd = -1; ++ spin_lock(&opened_lock); ++ list_del(&lp->list); ++ spin_unlock(&opened_lock); ++ ++ MOD_DEC_USE_COUNT; ++ spin_unlock(&lp->lock); ++ return 0; ++} ++ ++static int uml_net_start_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct uml_net_private *lp = dev->priv; ++ unsigned long flags; ++ int len; ++ ++ netif_stop_queue(dev); ++ ++ spin_lock_irqsave(&lp->lock, flags); ++ ++ len = (*lp->write)(lp->fd, &skb, lp); ++ ++ if(len == skb->len) { ++ lp->stats.tx_packets++; ++ lp->stats.tx_bytes += skb->len; ++ dev->trans_start = jiffies; ++ netif_start_queue(dev); ++ ++ /* this is normally done in the interrupt when tx finishes */ ++ netif_wake_queue(dev); ++ } ++ else if(len == 0){ ++ netif_start_queue(dev); ++ lp->stats.tx_dropped++; ++ } ++ else { ++ netif_start_queue(dev); ++ printk(KERN_ERR "uml_net_start_xmit: failed(%d)\n", len); ++ } ++ ++ spin_unlock_irqrestore(&lp->lock, flags); ++ ++ 
dev_kfree_skb(skb); ++ ++ return 0; ++} ++ ++static struct net_device_stats *uml_net_get_stats(struct net_device *dev) ++{ ++ struct uml_net_private *lp = dev->priv; ++ return &lp->stats; ++} ++ ++static void uml_net_set_multicast_list(struct net_device *dev) ++{ ++ if (dev->flags & IFF_PROMISC) return; ++ else if (dev->mc_count) dev->flags |= IFF_ALLMULTI; ++ else dev->flags &= ~IFF_ALLMULTI; ++} ++ ++static void uml_net_tx_timeout(struct net_device *dev) ++{ ++ dev->trans_start = jiffies; ++ netif_wake_queue(dev); ++} ++ ++static int uml_net_set_mac(struct net_device *dev, void *addr) ++{ ++ struct uml_net_private *lp = dev->priv; ++ struct sockaddr *hwaddr = addr; ++ ++ spin_lock(&lp->lock); ++ memcpy(dev->dev_addr, hwaddr->sa_data, ETH_ALEN); ++ spin_unlock(&lp->lock); ++ ++ return(0); ++} ++ ++static int uml_net_change_mtu(struct net_device *dev, int new_mtu) ++{ ++ struct uml_net_private *lp = dev->priv; ++ int err = 0; ++ ++ spin_lock(&lp->lock); ++ ++ new_mtu = (*lp->set_mtu)(new_mtu, &lp->user); ++ if(new_mtu < 0){ ++ err = new_mtu; ++ goto out; ++ } ++ ++ dev->mtu = new_mtu; ++ ++ out: ++ spin_unlock(&lp->lock); ++ return err; ++} ++ ++static int uml_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) ++{ ++ return(-EINVAL); ++} ++ ++void uml_net_user_timer_expire(unsigned long _conn) ++{ ++#ifdef undef ++ struct connection *conn = (struct connection *)_conn; ++ ++ dprintk(KERN_INFO "uml_net_user_timer_expire [%p]\n", conn); ++ do_connect(conn); ++#endif ++} ++ ++/* ++ * default do nothing hard header packet routines for struct net_device init. ++ * real ethernet transports will overwrite with real routines. 
++ */ ++static int uml_net_hard_header(struct sk_buff *skb, struct net_device *dev, ++ unsigned short type, void *daddr, void *saddr, unsigned len) ++{ ++ return(0); /* no change */ ++} ++ ++static int uml_net_rebuild_header(struct sk_buff *skb) ++{ ++ return(0); /* ignore */ ++} ++ ++static int uml_net_header_cache(struct neighbour *neigh, struct hh_cache *hh) ++{ ++ return(-1); /* fail */ ++} ++ ++static void uml_net_header_cache_update(struct hh_cache *hh, ++ struct net_device *dev, unsigned char * haddr) ++{ ++ /* ignore */ ++} ++ ++static int uml_net_header_parse(struct sk_buff *skb, unsigned char *haddr) ++{ ++ return(0); /* nothing */ ++} ++ ++static spinlock_t devices_lock = SPIN_LOCK_UNLOCKED; ++static struct list_head devices = LIST_HEAD_INIT(devices); ++ ++static int eth_configure(int n, void *init, char *mac, ++ struct transport *transport) ++{ ++ struct uml_net *device; ++ struct net_device *dev; ++ struct uml_net_private *lp; ++ int save, err, size; ++ ++ size = transport->private_size + sizeof(struct uml_net_private) + ++ sizeof(((struct uml_net_private *) 0)->user); ++ ++ device = kmalloc(sizeof(*device), GFP_KERNEL); ++ if(device == NULL){ ++ printk(KERN_ERR "eth_configure failed to allocate uml_net\n"); ++ return(1); ++ } ++ ++ *device = ((struct uml_net) { .list = LIST_HEAD_INIT(device->list), ++ .dev = NULL, ++ .index = n, ++ .mac = { [ 0 ... 
5 ] = 0 }, ++ .have_mac = 0 }); ++ ++ spin_lock(&devices_lock); ++ list_add(&device->list, &devices); ++ spin_unlock(&devices_lock); ++ ++ if(setup_etheraddr(mac, device->mac)) ++ device->have_mac = 1; ++ ++ printk(KERN_INFO "Netdevice %d ", n); ++ if(device->have_mac) printk("(%02x:%02x:%02x:%02x:%02x:%02x) ", ++ device->mac[0], device->mac[1], ++ device->mac[2], device->mac[3], ++ device->mac[4], device->mac[5]); ++ printk(": "); ++ dev = kmalloc(sizeof(*dev) + size, GFP_KERNEL); ++ if(dev == NULL){ ++ printk(KERN_ERR "eth_configure: failed to allocate device\n"); ++ return(1); ++ } ++ memset(dev, 0, sizeof(*dev) + size); ++ ++ snprintf(dev->name, sizeof(dev->name), "eth%d", n); ++ dev->priv = (void *) &dev[1]; ++ device->dev = dev; ++ ++ dev->hard_header = uml_net_hard_header; ++ dev->rebuild_header = uml_net_rebuild_header; ++ dev->hard_header_cache = uml_net_header_cache; ++ dev->header_cache_update= uml_net_header_cache_update; ++ dev->hard_header_parse = uml_net_header_parse; ++ ++ (*transport->kern->init)(dev, init); ++ ++ dev->mtu = transport->user->max_packet; ++ dev->open = uml_net_open; ++ dev->hard_start_xmit = uml_net_start_xmit; ++ dev->stop = uml_net_close; ++ dev->get_stats = uml_net_get_stats; ++ dev->set_multicast_list = uml_net_set_multicast_list; ++ dev->tx_timeout = uml_net_tx_timeout; ++ dev->set_mac_address = uml_net_set_mac; ++ dev->change_mtu = uml_net_change_mtu; ++ dev->do_ioctl = uml_net_ioctl; ++ dev->watchdog_timeo = (HZ >> 1); ++ dev->irq = UM_ETH_IRQ; ++ ++ rtnl_lock(); ++ err = register_netdevice(dev); ++ rtnl_unlock(); ++ if(err) ++ return(1); ++ lp = dev->priv; ++ ++ /* lp.user is the first four bytes of the transport data, which ++ * has already been initialized. This structure assignment will ++ * overwrite that, so we make sure that .user gets overwritten with ++ * what it already has. 
++ */ ++ save = lp->user[0]; ++ *lp = ((struct uml_net_private) ++ { .list = LIST_HEAD_INIT(lp->list), ++ .lock = SPIN_LOCK_UNLOCKED, ++ .dev = dev, ++ .fd = -1, ++ .mac = { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0}, ++ .have_mac = device->have_mac, ++ .protocol = transport->kern->protocol, ++ .open = transport->user->open, ++ .close = transport->user->close, ++ .remove = transport->user->remove, ++ .read = transport->kern->read, ++ .write = transport->kern->write, ++ .add_address = transport->user->add_address, ++ .delete_address = transport->user->delete_address, ++ .set_mtu = transport->user->set_mtu, ++ .user = { save } }); ++ init_timer(&lp->tl); ++ lp->tl.function = uml_net_user_timer_expire; ++ memset(&lp->stats, 0, sizeof(lp->stats)); ++ if(lp->have_mac) memcpy(lp->mac, device->mac, sizeof(lp->mac)); ++ ++ if(transport->user->init) ++ (*transport->user->init)(&lp->user, dev); ++ ++ if(device->have_mac) ++ set_ether_mac(dev, device->mac); ++ return(0); ++} ++ ++static struct uml_net *find_device(int n) ++{ ++ struct uml_net *device; ++ struct list_head *ele; ++ ++ spin_lock(&devices_lock); ++ list_for_each(ele, &devices){ ++ device = list_entry(ele, struct uml_net, list); ++ if(device->index == n) ++ goto out; ++ } ++ device = NULL; ++ out: ++ spin_unlock(&devices_lock); ++ return(device); ++} ++ ++static int eth_parse(char *str, int *index_out, char **str_out) ++{ ++ char *end; ++ int n; ++ ++ n = simple_strtoul(str, &end, 0); ++ if(end == str){ ++ printk(KERN_ERR "eth_setup: Failed to parse '%s'\n", str); ++ return(1); ++ } ++ if(n < 0){ ++ printk(KERN_ERR "eth_setup: device %d is negative\n", n); ++ return(1); ++ } ++ str = end; ++ if(*str != '='){ ++ printk(KERN_ERR ++ "eth_setup: expected '=' after device number\n"); ++ return(1); ++ } ++ str++; ++ if(find_device(n)){ ++ printk(KERN_ERR "eth_setup: Device %d already configured\n", ++ n); ++ return(1); ++ } ++ if(index_out) *index_out = n; ++ *str_out = str; ++ return(0); ++} ++ ++struct eth_init { ++ struct 
list_head list; ++ char *init; ++ int index; ++}; ++ ++/* Filled in at boot time. Will need locking if the transports become ++ * modular. ++ */ ++struct list_head transports = LIST_HEAD_INIT(transports); ++ ++/* Filled in during early boot */ ++struct list_head eth_cmd_line = LIST_HEAD_INIT(eth_cmd_line); ++ ++static int check_transport(struct transport *transport, char *eth, int n, ++ void **init_out, char **mac_out) ++{ ++ int len; ++ ++ len = strlen(transport->name); ++ if(strncmp(eth, transport->name, len)) ++ return(0); ++ ++ eth += len; ++ if(*eth == ',') ++ eth++; ++ else if(*eth != '\0') ++ return(0); ++ ++ *init_out = kmalloc(transport->setup_size, GFP_KERNEL); ++ if(*init_out == NULL) ++ return(1); ++ ++ if(!transport->setup(eth, mac_out, *init_out)){ ++ kfree(*init_out); ++ *init_out = NULL; ++ } ++ return(1); ++} ++ ++void register_transport(struct transport *new) ++{ ++ struct list_head *ele, *next; ++ struct eth_init *eth; ++ void *init; ++ char *mac = NULL; ++ int match; ++ ++ list_add(&new->list, &transports); ++ ++ list_for_each_safe(ele, next, ð_cmd_line){ ++ eth = list_entry(ele, struct eth_init, list); ++ match = check_transport(new, eth->init, eth->index, &init, ++ &mac); ++ if(!match) ++ continue; ++ else if(init != NULL){ ++ eth_configure(eth->index, init, mac, new); ++ kfree(init); ++ } ++ list_del(ð->list); ++ } ++} ++ ++static int eth_setup_common(char *str, int index) ++{ ++ struct list_head *ele; ++ struct transport *transport; ++ void *init; ++ char *mac = NULL; ++ ++ list_for_each(ele, &transports){ ++ transport = list_entry(ele, struct transport, list); ++ if(!check_transport(transport, str, index, &init, &mac)) ++ continue; ++ if(init != NULL){ ++ eth_configure(index, init, mac, transport); ++ kfree(init); ++ } ++ return(1); ++ } ++ return(0); ++} ++ ++static int eth_setup(char *str) ++{ ++ struct eth_init *new; ++ int n, err; ++ ++ err = eth_parse(str, &n, &str); ++ if(err) return(1); ++ ++ new = alloc_bootmem(sizeof(new)); ++ 
if(new == NULL){ ++ printk("eth_init : alloc_bootmem failed\n"); ++ return(1); ++ } ++ *new = ((struct eth_init) { .list = LIST_HEAD_INIT(new->list), ++ .index = n, ++ .init = str }); ++ list_add_tail(&new->list, ð_cmd_line); ++ return(1); ++} ++ ++__setup("eth", eth_setup); ++__uml_help(eth_setup, ++"eth[0-9]+=,\n" ++" Configure a network device.\n\n" ++); ++ ++static int eth_init(void) ++{ ++ struct list_head *ele, *next; ++ struct eth_init *eth; ++ ++ list_for_each_safe(ele, next, ð_cmd_line){ ++ eth = list_entry(ele, struct eth_init, list); ++ ++ if(eth_setup_common(eth->init, eth->index)) ++ list_del(ð->list); ++ } ++ ++ return(1); ++} ++ ++__initcall(eth_init); ++ ++static int net_config(char *str) ++{ ++ int n, err; ++ ++ err = eth_parse(str, &n, &str); ++ if(err) return(err); ++ ++ str = uml_strdup(str); ++ if(str == NULL){ ++ printk(KERN_ERR "net_config failed to strdup string\n"); ++ return(-1); ++ } ++ err = !eth_setup_common(str, n); ++ if(err) ++ kfree(str); ++ return(err); ++} ++ ++static int net_remove(char *str) ++{ ++ struct uml_net *device; ++ struct net_device *dev; ++ struct uml_net_private *lp; ++ char *end; ++ int n; ++ ++ n = simple_strtoul(str, &end, 0); ++ if((*end != '\0') || (end == str)) ++ return(-1); ++ ++ device = find_device(n); ++ if(device == NULL) ++ return(0); ++ ++ dev = device->dev; ++ lp = dev->priv; ++ if(lp->fd > 0) return(-1); ++ if(lp->remove != NULL) (*lp->remove)(&lp->user); ++ unregister_netdev(dev); ++ ++ list_del(&device->list); ++ kfree(device); ++ return(0); ++} ++ ++static struct mc_device net_mc = { ++ .name = "eth", ++ .config = net_config, ++ .get_config = NULL, ++ .remove = net_remove, ++}; ++ ++static int uml_inetaddr_event(struct notifier_block *this, unsigned long event, ++ void *ptr) ++{ ++ struct in_ifaddr *ifa = ptr; ++ u32 addr = ifa->ifa_address; ++ u32 netmask = ifa->ifa_mask; ++ struct net_device *dev = ifa->ifa_dev->dev; ++ struct uml_net_private *lp; ++ void (*proc)(unsigned char *, unsigned char *, 
void *); ++ unsigned char addr_buf[4], netmask_buf[4]; ++ ++ if(dev->open != uml_net_open) return(NOTIFY_DONE); ++ ++ lp = dev->priv; ++ ++ proc = NULL; ++ switch (event){ ++ case NETDEV_UP: ++ proc = lp->add_address; ++ break; ++ case NETDEV_DOWN: ++ proc = lp->delete_address; ++ break; ++ } ++ if(proc != NULL){ ++ addr_buf[0] = addr & 0xff; ++ addr_buf[1] = (addr >> 8) & 0xff; ++ addr_buf[2] = (addr >> 16) & 0xff; ++ addr_buf[3] = addr >> 24; ++ netmask_buf[0] = netmask & 0xff; ++ netmask_buf[1] = (netmask >> 8) & 0xff; ++ netmask_buf[2] = (netmask >> 16) & 0xff; ++ netmask_buf[3] = netmask >> 24; ++ (*proc)(addr_buf, netmask_buf, &lp->user); ++ } ++ return(NOTIFY_DONE); ++} ++ ++struct notifier_block uml_inetaddr_notifier = { ++ .notifier_call = uml_inetaddr_event, ++}; ++ ++static int uml_net_init(void) ++{ ++ struct list_head *ele; ++ struct uml_net_private *lp; ++ struct in_device *ip; ++ struct in_ifaddr *in; ++ ++ mconsole_register_dev(&net_mc); ++ register_inetaddr_notifier(¨_inetaddr_notifier); ++ ++ /* Devices may have been opened already, so the uml_inetaddr_notifier ++ * didn't get a chance to run for them. This fakes it so that ++ * addresses which have already been set up get handled properly. 
++ */ ++ list_for_each(ele, &opened){ ++ lp = list_entry(ele, struct uml_net_private, list); ++ ip = lp->dev->ip_ptr; ++ if(ip == NULL) continue; ++ in = ip->ifa_list; ++ while(in != NULL){ ++ uml_inetaddr_event(NULL, NETDEV_UP, in); ++ in = in->ifa_next; ++ } ++ } ++ ++ return(0); ++} ++ ++__initcall(uml_net_init); ++ ++static void close_devices(void) ++{ ++ struct list_head *ele; ++ struct uml_net_private *lp; ++ ++ list_for_each(ele, &opened){ ++ lp = list_entry(ele, struct uml_net_private, list); ++ if(lp->close != NULL) (*lp->close)(lp->fd, &lp->user); ++ if(lp->remove != NULL) (*lp->remove)(&lp->user); ++ } ++} ++ ++__uml_exitcall(close_devices); ++ ++int setup_etheraddr(char *str, unsigned char *addr) ++{ ++ char *end; ++ int i; ++ ++ if(str == NULL) ++ return(0); ++ for(i=0;i<6;i++){ ++ addr[i] = simple_strtoul(str, &end, 16); ++ if((end == str) || ++ ((*end != ':') && (*end != ',') && (*end != '\0'))){ ++ printk(KERN_ERR ++ "setup_etheraddr: failed to parse '%s' " ++ "as an ethernet address\n", str); ++ return(0); ++ } ++ str = end + 1; ++ } ++ if(addr[0] & 1){ ++ printk(KERN_ERR ++ "Attempt to assign a broadcast ethernet address to a " ++ "device disallowed\n"); ++ return(0); ++ } ++ return(1); ++} ++ ++void dev_ip_addr(void *d, char *buf, char *bin_buf) ++{ ++ struct net_device *dev = d; ++ struct in_device *ip = dev->ip_ptr; ++ struct in_ifaddr *in; ++ u32 addr; ++ ++ if((ip == NULL) || ((in = ip->ifa_list) == NULL)){ ++ printk(KERN_WARNING "dev_ip_addr - device not assigned an " ++ "IP address\n"); ++ return; ++ } ++ addr = in->ifa_address; ++ sprintf(buf, "%d.%d.%d.%d", addr & 0xff, (addr >> 8) & 0xff, ++ (addr >> 16) & 0xff, addr >> 24); ++ if(bin_buf){ ++ bin_buf[0] = addr & 0xff; ++ bin_buf[1] = (addr >> 8) & 0xff; ++ bin_buf[2] = (addr >> 16) & 0xff; ++ bin_buf[3] = addr >> 24; ++ } ++} ++ ++void set_ether_mac(void *d, unsigned char *addr) ++{ ++ struct net_device *dev = d; ++ ++ memcpy(dev->dev_addr, addr, ETH_ALEN); ++} ++ ++struct sk_buff 
*ether_adjust_skb(struct sk_buff *skb, int extra) ++{ ++ if((skb != NULL) && (skb_tailroom(skb) < extra)){ ++ struct sk_buff *skb2; ++ ++ skb2 = skb_copy_expand(skb, 0, extra, GFP_ATOMIC); ++ dev_kfree_skb(skb); ++ skb = skb2; ++ } ++ if(skb != NULL) skb_put(skb, extra); ++ return(skb); ++} ++ ++void iter_addresses(void *d, void (*cb)(unsigned char *, unsigned char *, ++ void *), ++ void *arg) ++{ ++ struct net_device *dev = d; ++ struct in_device *ip = dev->ip_ptr; ++ struct in_ifaddr *in; ++ unsigned char address[4], netmask[4]; ++ ++ if(ip == NULL) return; ++ in = ip->ifa_list; ++ while(in != NULL){ ++ address[0] = in->ifa_address & 0xff; ++ address[1] = (in->ifa_address >> 8) & 0xff; ++ address[2] = (in->ifa_address >> 16) & 0xff; ++ address[3] = in->ifa_address >> 24; ++ netmask[0] = in->ifa_mask & 0xff; ++ netmask[1] = (in->ifa_mask >> 8) & 0xff; ++ netmask[2] = (in->ifa_mask >> 16) & 0xff; ++ netmask[3] = in->ifa_mask >> 24; ++ (*cb)(address, netmask, arg); ++ in = in->ifa_next; ++ } ++} ++ ++int dev_netmask(void *d, void *m) ++{ ++ struct net_device *dev = d; ++ struct in_device *ip = dev->ip_ptr; ++ struct in_ifaddr *in; ++ __u32 *mask_out = m; ++ ++ if(ip == NULL) ++ return(1); ++ ++ in = ip->ifa_list; ++ if(in == NULL) ++ return(1); ++ ++ *mask_out = in->ifa_mask; ++ return(0); ++} ++ ++void *get_output_buffer(int *len_out) ++{ ++ void *ret; ++ ++ ret = (void *) __get_free_pages(GFP_KERNEL, 0); ++ if(ret) *len_out = PAGE_SIZE; ++ else *len_out = 0; ++ return(ret); ++} ++ ++void free_output_buffer(void *buffer) ++{ ++ free_pages((unsigned long) buffer, 0); ++} ++ ++int tap_setup_common(char *str, char *type, char **dev_name, char **mac_out, ++ char **gate_addr) ++{ ++ char *remain; ++ ++ remain = split_if_spec(str, dev_name, mac_out, gate_addr, NULL); ++ if(remain != NULL){ ++ printk("tap_setup_common - Extra garbage on specification : " ++ "'%s'\n", remain); ++ return(1); ++ } ++ ++ return(0); ++} ++ ++unsigned short eth_protocol(struct sk_buff *skb) ++{ 
++ return(eth_type_trans(skb, skb->dev)); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/net_user.c um/arch/um/drivers/net_user.c +--- orig/arch/um/drivers/net_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/net_user.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,254 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "user.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "net_user.h" ++#include "helper.h" ++#include "os.h" ++ ++int tap_open_common(void *dev, char *gate_addr) ++{ ++ int tap_addr[4]; ++ ++ if(gate_addr == NULL) return(0); ++ if(sscanf(gate_addr, "%d.%d.%d.%d", &tap_addr[0], ++ &tap_addr[1], &tap_addr[2], &tap_addr[3]) != 4){ ++ printk("Invalid tap IP address - '%s'\n", ++ gate_addr); ++ return(-EINVAL); ++ } ++ return(0); ++} ++ ++void tap_check_ips(char *gate_addr, char *eth_addr) ++{ ++ int tap_addr[4]; ++ ++ if((gate_addr != NULL) && ++ (sscanf(gate_addr, "%d.%d.%d.%d", &tap_addr[0], ++ &tap_addr[1], &tap_addr[2], &tap_addr[3]) == 4) && ++ (eth_addr[0] == tap_addr[0]) && ++ (eth_addr[1] == tap_addr[1]) && ++ (eth_addr[2] == tap_addr[2]) && ++ (eth_addr[3] == tap_addr[3])){ ++ printk("The tap IP address and the UML eth IP address" ++ " must be different\n"); ++ } ++} ++ ++void read_output(int fd, char *output, int len) ++{ ++ int remain, n, actual; ++ char c; ++ ++ if(output == NULL){ ++ output = &c; ++ len = sizeof(c); ++ } ++ ++ *output = '\0'; ++ if(read(fd, &remain, 
sizeof(remain)) != sizeof(remain)){ ++ printk("read_output - read of length failed, errno = %d\n", ++ errno); ++ return; ++ } ++ ++ while(remain != 0){ ++ n = (remain < len) ? remain : len; ++ actual = read(fd, output, n); ++ if(actual != n){ ++ printk("read_output - read of data failed, " ++ "errno = %d\n", errno); ++ return; ++ } ++ remain -= actual; ++ } ++ return; ++} ++ ++int net_read(int fd, void *buf, int len) ++{ ++ int n; ++ ++ while(((n = read(fd, buf, len)) < 0) && (errno == EINTR)) ; ++ ++ if(n < 0){ ++ if(errno == EAGAIN) return(0); ++ return(-errno); ++ } ++ else if(n == 0) return(-ENOTCONN); ++ return(n); ++} ++ ++int net_recvfrom(int fd, void *buf, int len) ++{ ++ int n; ++ ++ while(((n = recvfrom(fd, buf, len, 0, NULL, NULL)) < 0) && ++ (errno == EINTR)) ; ++ ++ if(n < 0){ ++ if(errno == EAGAIN) return(0); ++ return(-errno); ++ } ++ else if(n == 0) return(-ENOTCONN); ++ return(n); ++} ++ ++int net_write(int fd, void *buf, int len) ++{ ++ int n; ++ ++ while(((n = write(fd, buf, len)) < 0) && (errno == EINTR)) ; ++ if(n < 0){ ++ if(errno == EAGAIN) return(0); ++ return(-errno); ++ } ++ else if(n == 0) return(-ENOTCONN); ++ return(n); ++} ++ ++int net_send(int fd, void *buf, int len) ++{ ++ int n; ++ ++ while(((n = send(fd, buf, len, 0)) < 0) && (errno == EINTR)) ; ++ if(n < 0){ ++ if(errno == EAGAIN) return(0); ++ return(-errno); ++ } ++ else if(n == 0) return(-ENOTCONN); ++ return(n); ++} ++ ++int net_sendto(int fd, void *buf, int len, void *to, int sock_len) ++{ ++ int n; ++ ++ while(((n = sendto(fd, buf, len, 0, (struct sockaddr *) to, ++ sock_len)) < 0) && (errno == EINTR)) ; ++ if(n < 0){ ++ if(errno == EAGAIN) return(0); ++ return(-errno); ++ } ++ else if(n == 0) return(-ENOTCONN); ++ return(n); ++} ++ ++struct change_pre_exec_data { ++ int close_me; ++ int stdout; ++}; ++ ++static void change_pre_exec(void *arg) ++{ ++ struct change_pre_exec_data *data = arg; ++ ++ close(data->close_me); ++ dup2(data->stdout, 1); ++} ++ ++static int 
change_tramp(char **argv, char *output, int output_len) ++{ ++ int pid, fds[2], err; ++ struct change_pre_exec_data pe_data; ++ ++ err = os_pipe(fds, 1, 0); ++ if(err){ ++ printk("change_tramp - pipe failed, errno = %d\n", -err); ++ return(err); ++ } ++ pe_data.close_me = fds[0]; ++ pe_data.stdout = fds[1]; ++ pid = run_helper(change_pre_exec, &pe_data, argv, NULL); ++ ++ close(fds[1]); ++ read_output(fds[0], output, output_len); ++ waitpid(pid, NULL, 0); ++ return(pid); ++} ++ ++static void change(char *dev, char *what, unsigned char *addr, ++ unsigned char *netmask) ++{ ++ char addr_buf[sizeof("255.255.255.255\0")]; ++ char netmask_buf[sizeof("255.255.255.255\0")]; ++ char version[sizeof("nnnnn\0")]; ++ char *argv[] = { "uml_net", version, what, dev, addr_buf, ++ netmask_buf, NULL }; ++ char *output; ++ int output_len, pid; ++ ++ sprintf(version, "%d", UML_NET_VERSION); ++ sprintf(addr_buf, "%d.%d.%d.%d", addr[0], addr[1], addr[2], addr[3]); ++ sprintf(netmask_buf, "%d.%d.%d.%d", netmask[0], netmask[1], ++ netmask[2], netmask[3]); ++ ++ output_len = page_size(); ++ output = um_kmalloc(output_len); ++ if(output == NULL) ++ printk("change : failed to allocate output buffer\n"); ++ ++ pid = change_tramp(argv, output, output_len); ++ if(pid < 0) return; ++ ++ if(output != NULL){ ++ printk("%s", output); ++ kfree(output); ++ } ++} ++ ++void open_addr(unsigned char *addr, unsigned char *netmask, void *arg) ++{ ++ change(arg, "add", addr, netmask); ++} ++ ++void close_addr(unsigned char *addr, unsigned char *netmask, void *arg) ++{ ++ change(arg, "del", addr, netmask); ++} ++ ++char *split_if_spec(char *str, ...) 
++{ ++ char **arg, *end; ++ va_list ap; ++ ++ va_start(ap, str); ++ while((arg = va_arg(ap, char **)) != NULL){ ++ if(*str == '\0') ++ return(NULL); ++ end = strchr(str, ','); ++ if(end != str) ++ *arg = str; ++ if(end == NULL) ++ return(NULL); ++ *end++ = '\0'; ++ str = end; ++ } ++ va_end(ap); ++ return(str); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/null.c um/arch/um/drivers/null.c +--- orig/arch/um/drivers/null.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/null.c Sun Dec 15 21:04:00 2002 +@@ -0,0 +1,56 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include ++#include ++#include ++#include "chan_user.h" ++#include "os.h" ++ ++static int null_chan; ++ ++void *null_init(char *str, int device, struct chan_opts *opts) ++{ ++ return(&null_chan); ++} ++ ++int null_open(int input, int output, int primary, void *d, char **dev_out) ++{ ++ *dev_out = NULL; ++ return(os_open_file(DEV_NULL, of_rdwr(OPENFLAGS()), 0)); ++} ++ ++int null_read(int fd, char *c_out, void *unused) ++{ ++ return(-ENODEV); ++} ++ ++void null_free(void *data) ++{ ++} ++ ++struct chan_ops null_ops = { ++ .type = "null", ++ .init = null_init, ++ .open = null_open, ++ .close = generic_close, ++ .read = null_read, ++ .write = generic_write, ++ .console_write = generic_console_write, ++ .window_size = generic_window_size, ++ .free = null_free, ++ .winch = 0, ++}; ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/pcap_kern.c um/arch/um/drivers/pcap_kern.c +--- orig/arch/um/drivers/pcap_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/pcap_kern.c Sun Dec 15 21:19:15 2002 +@@ -0,0 +1,127 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike ++ * Licensed under the GPL. ++ */ ++ ++#include "linux/init.h" ++#include "linux/netdevice.h" ++#include "linux/etherdevice.h" ++#include "net_kern.h" ++#include "net_user.h" ++#include "pcap_user.h" ++ ++struct pcap_init { ++ char *host_if; ++ int promisc; ++ int optimize; ++ char *filter; ++}; ++ ++void pcap_init(struct net_device *dev, void *data) ++{ ++ struct uml_net_private *pri; ++ struct pcap_data *ppri; ++ struct pcap_init *init = data; ++ ++ init_etherdev(dev, 0); ++ pri = dev->priv; ++ ppri = (struct pcap_data *) pri->user; ++ *ppri = ((struct pcap_data) ++ { .host_if = init->host_if, ++ .promisc = init->promisc, ++ .optimize = init->optimize, ++ .filter = init->filter, ++ .compiled = NULL, ++ .pcap = NULL }); ++} ++ ++static int pcap_read(int fd, struct sk_buff **skb, ++ struct uml_net_private *lp) ++{ ++ *skb = ether_adjust_skb(*skb, ETH_HEADER_OTHER); ++ if(*skb == NULL) return(-ENOMEM); ++ return(pcap_user_read(fd, (*skb)->mac.raw, ++ (*skb)->dev->mtu + ETH_HEADER_OTHER, ++ (struct pcap_data *) &lp->user)); ++} ++ ++static int pcap_write(int fd, struct sk_buff **skb, struct uml_net_private *lp) ++{ ++ return(-EPERM); ++} ++ ++static struct net_kern_info pcap_kern_info = { ++ .init = pcap_init, ++ .protocol = eth_protocol, ++ .read = pcap_read, ++ .write = pcap_write, ++}; ++ ++int pcap_setup(char *str, char **mac_out, void *data) ++{ ++ struct pcap_init *init = data; 
++ char *remain, *host_if = NULL, *options[2] = { NULL, NULL }; ++ int i; ++ ++ *init = ((struct pcap_init) ++ { .host_if = "eth0", ++ .promisc = 1, ++ .optimize = 0, ++ .filter = NULL }); ++ ++ remain = split_if_spec(str, &host_if, &init->filter, ++ &options[0], &options[1], NULL); ++ if(remain != NULL){ ++ printk(KERN_ERR "pcap_setup - Extra garbage on " ++ "specification : '%s'\n", remain); ++ return(0); ++ } ++ ++ if(host_if != NULL) ++ init->host_if = host_if; ++ ++ for(i = 0; i < sizeof(options)/sizeof(options[0]); i++){ ++ if(options[i] == NULL) ++ continue; ++ if(!strcmp(options[i], "promisc")) ++ init->promisc = 1; ++ else if(!strcmp(options[i], "nopromisc")) ++ init->promisc = 0; ++ else if(!strcmp(options[i], "optimize")) ++ init->optimize = 1; ++ else if(!strcmp(options[i], "nooptimize")) ++ init->optimize = 0; ++ else printk("pcap_setup : bad option - '%s'\n", options[i]); ++ } ++ ++ return(1); ++} ++ ++static struct transport pcap_transport = { ++ .list = LIST_HEAD_INIT(pcap_transport.list), ++ .name = "pcap", ++ .setup = pcap_setup, ++ .user = &pcap_user_info, ++ .kern = &pcap_kern_info, ++ .private_size = sizeof(struct pcap_data), ++ .setup_size = sizeof(struct pcap_init), ++}; ++ ++static int register_pcap(void) ++{ ++ register_transport(&pcap_transport); ++ return(1); ++} ++ ++__initcall(register_pcap); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/pcap_user.c um/arch/um/drivers/pcap_user.c +--- orig/arch/um/drivers/pcap_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/pcap_user.c Sun Dec 15 21:04:39 2002 +@@ -0,0 +1,143 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike ++ * Licensed under the GPL. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "net_user.h" ++#include "pcap_user.h" ++#include "user.h" ++ ++#define MAX_PACKET (ETH_MAX_PACKET + ETH_HEADER_OTHER) ++ ++#define PCAP_FD(p) (*(int *)(p)) ++ ++static void pcap_user_init(void *data, void *dev) ++{ ++ struct pcap_data *pri = data; ++ pcap_t *p; ++ char errors[PCAP_ERRBUF_SIZE]; ++ ++ p = pcap_open_live(pri->host_if, MAX_PACKET, pri->promisc, 0, errors); ++ if(p == NULL){ ++ printk("pcap_user_init : pcap_open_live failed - '%s'\n", ++ errors); ++ return; ++ } ++ ++ pri->dev = dev; ++ pri->pcap = p; ++} ++ ++static int pcap_open(void *data) ++{ ++ struct pcap_data *pri = data; ++ __u32 netmask; ++ int err; ++ ++ if(pri->pcap == NULL) ++ return(-ENODEV); ++ ++ if(pri->filter != NULL){ ++ err = dev_netmask(pri->dev, &netmask); ++ if(err < 0){ ++ printk("pcap_open : dev_netmask failed\n"); ++ return(-EIO); ++ } ++ ++ pri->compiled = um_kmalloc(sizeof(struct bpf_program)); ++ if(pri->compiled == NULL){ ++ printk("pcap_open : kmalloc failed\n"); ++ return(-ENOMEM); ++ } ++ ++ err = pcap_compile(pri->pcap, ++ (struct bpf_program *) pri->compiled, ++ pri->filter, pri->optimize, netmask); ++ if(err < 0){ ++ printk("pcap_open : pcap_compile failed - '%s'\n", ++ pcap_geterr(pri->pcap)); ++ return(-EIO); ++ } ++ ++ err = pcap_setfilter(pri->pcap, pri->compiled); ++ if(err < 0){ ++ printk("pcap_open : pcap_setfilter failed - '%s'\n", ++ pcap_geterr(pri->pcap)); ++ return(-EIO); ++ } ++ } ++ ++ return(PCAP_FD(pri->pcap)); ++} ++ 
++static void pcap_remove(void *data) ++{ ++ struct pcap_data *pri = data; ++ ++ if(pri->compiled != NULL) ++ pcap_freecode(pri->compiled); ++ ++ pcap_close(pri->pcap); ++} ++ ++struct pcap_handler_data { ++ char *buffer; ++ int len; ++}; ++ ++static void handler(u_char *data, const struct pcap_pkthdr *header, ++ const u_char *packet) ++{ ++ int len; ++ ++ struct pcap_handler_data *hdata = (struct pcap_handler_data *) data; ++ ++ len = hdata->len < header->caplen ? hdata->len : header->caplen; ++ memcpy(hdata->buffer, packet, len); ++ hdata->len = len; ++} ++ ++int pcap_user_read(int fd, void *buffer, int len, struct pcap_data *pri) ++{ ++ struct pcap_handler_data hdata = ((struct pcap_handler_data) ++ { .buffer = buffer, ++ .len = len }); ++ int n; ++ ++ n = pcap_dispatch(pri->pcap, 1, handler, (u_char *) &hdata); ++ if(n < 0){ ++ printk("pcap_dispatch failed - %s\n", pcap_geterr(pri->pcap)); ++ return(-EIO); ++ } ++ else if(n == 0) ++ return(0); ++ return(hdata.len); ++} ++ ++struct net_user_info pcap_user_info = { ++ .init = pcap_user_init, ++ .open = pcap_open, ++ .close = NULL, ++ .remove = pcap_remove, ++ .set_mtu = NULL, ++ .add_address = NULL, ++ .delete_address = NULL, ++ .max_packet = MAX_PACKET - ETH_HEADER_OTHER ++}; ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/pcap_user.h um/arch/um/drivers/pcap_user.h +--- orig/arch/um/drivers/pcap_user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/pcap_user.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,31 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "net_user.h" ++ ++struct pcap_data { ++ char *host_if; ++ int promisc; ++ int optimize; ++ char *filter; ++ void *compiled; ++ void *pcap; ++ void *dev; ++}; ++ ++extern struct net_user_info pcap_user_info; ++ ++extern int pcap_user_read(int fd, void *buf, int len, struct pcap_data *pri); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/port.h um/arch/um/drivers/port.h +--- orig/arch/um/drivers/port.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/port.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,30 @@ ++/* ++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __PORT_H__ ++#define __PORT_H__ ++ ++extern void *port_data(int port); ++extern int port_wait(void *data); ++extern void port_kern_close(void *d); ++extern int port_connection(int fd, int *socket_out, int *pid_out); ++extern int port_listen_fd(int port); ++extern void port_read(int fd, void *data); ++extern void port_kern_free(void *d); ++extern int port_rcv_fd(int fd); ++extern void port_remove_dev(void *d); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/port_kern.c um/arch/um/drivers/port_kern.c +--- orig/arch/um/drivers/port_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/port_kern.c Mon Dec 30 20:57:42 2002 +@@ -0,0 +1,302 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/list.h" ++#include "linux/sched.h" ++#include "linux/slab.h" ++#include "linux/irq.h" ++#include "linux/spinlock.h" ++#include "linux/errno.h" ++#include "asm/semaphore.h" ++#include "asm/errno.h" ++#include "kern_util.h" ++#include "kern.h" ++#include "irq_user.h" ++#include "port.h" ++#include "init.h" ++#include "os.h" ++ ++struct port_list { ++ struct list_head list; ++ int has_connection; ++ struct semaphore sem; ++ int port; ++ int fd; ++ spinlock_t lock; ++ struct list_head pending; ++ struct list_head connections; ++}; ++ ++struct port_dev { ++ struct port_list *port; ++ int helper_pid; ++ int telnetd_pid; ++}; ++ ++struct connection { ++ struct list_head list; ++ int fd; ++ int helper_pid; ++ int socket[2]; ++ int telnetd_pid; ++ struct port_list *port; ++}; ++ ++static void pipe_interrupt(int irq, void *data, struct pt_regs *regs) ++{ ++ struct connection *conn = data; ++ int fd; ++ ++ fd = os_rcv_fd(conn->socket[0], &conn->helper_pid); ++ if(fd < 0){ ++ if(fd == -EAGAIN) ++ return; ++ ++ printk(KERN_ERR "pipe_interrupt : os_rcv_fd returned %d\n", ++ -fd); ++ os_close_file(conn->fd); ++ } ++ ++ list_del(&conn->list); ++ ++ conn->fd = fd; ++ list_add(&conn->list, &conn->port->connections); ++ ++ up(&conn->port->sem); ++} ++ ++static int port_accept(struct port_list *port) ++{ ++ struct connection 
*conn; ++ int fd, socket[2], pid, ret = 0; ++ ++ fd = port_connection(port->fd, socket, &pid); ++ if(fd < 0){ ++ if(fd != -EAGAIN) ++ printk(KERN_ERR "port_accept : port_connection " ++ "returned %d\n", -fd); ++ goto out; ++ } ++ ++ conn = kmalloc(sizeof(*conn), GFP_ATOMIC); ++ if(conn == NULL){ ++ printk(KERN_ERR "port_accept : failed to allocate " ++ "connection\n"); ++ goto out_close; ++ } ++ *conn = ((struct connection) ++ { .list = LIST_HEAD_INIT(conn->list), ++ .fd = fd, ++ .socket = { socket[0], socket[1] }, ++ .telnetd_pid = pid, ++ .port = port }); ++ ++ if(um_request_irq(TELNETD_IRQ, socket[0], IRQ_READ, pipe_interrupt, ++ SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM, ++ "telnetd", conn)){ ++ printk(KERN_ERR "port_accept : failed to get IRQ for " ++ "telnetd\n"); ++ goto out_free; ++ } ++ ++ list_add(&conn->list, &port->pending); ++ return(1); ++ ++ out_free: ++ kfree(conn); ++ out_close: ++ os_close_file(fd); ++ if(pid != -1) ++ os_kill_process(pid, 1); ++ out: ++ return(ret); ++} ++ ++DECLARE_MUTEX(ports_sem); ++struct list_head ports = LIST_HEAD_INIT(ports); ++ ++void port_task_proc(void *unused) ++{ ++ struct port_list *port; ++ struct list_head *ele; ++ unsigned long flags; ++ ++ save_flags(flags); ++ list_for_each(ele, &ports){ ++ port = list_entry(ele, struct port_list, list); ++ if(!port->has_connection) ++ continue; ++ reactivate_fd(port->fd, ACCEPT_IRQ); ++ while(port_accept(port)) ; ++ port->has_connection = 0; ++ } ++ restore_flags(flags); ++} ++ ++struct tq_struct port_task = { ++ .routine = port_task_proc, ++ .data = NULL ++}; ++ ++static void port_interrupt(int irq, void *data, struct pt_regs *regs) ++{ ++ struct port_list *port = data; ++ ++ port->has_connection = 1; ++ schedule_task(&port_task); ++} ++ ++void *port_data(int port_num) ++{ ++ struct list_head *ele; ++ struct port_list *port; ++ struct port_dev *dev = NULL; ++ int fd; ++ ++ down(&ports_sem); ++ list_for_each(ele, &ports){ ++ port = list_entry(ele, struct port_list, list); ++ 
if(port->port == port_num) goto found; ++ } ++ port = kmalloc(sizeof(struct port_list), GFP_KERNEL); ++ if(port == NULL){ ++ printk(KERN_ERR "Allocation of port list failed\n"); ++ goto out; ++ } ++ ++ fd = port_listen_fd(port_num); ++ if(fd < 0){ ++ printk(KERN_ERR "binding to port %d failed, errno = %d\n", ++ port_num, -fd); ++ goto out_free; ++ } ++ if(um_request_irq(ACCEPT_IRQ, fd, IRQ_READ, port_interrupt, ++ SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM, "port", ++ port)){ ++ printk(KERN_ERR "Failed to get IRQ for port %d\n", port_num); ++ goto out_close; ++ } ++ ++ *port = ((struct port_list) ++ { .list = LIST_HEAD_INIT(port->list), ++ .has_connection = 0, ++ .sem = __SEMAPHORE_INITIALIZER(port->sem, ++ 0), ++ .lock = SPIN_LOCK_UNLOCKED, ++ .port = port_num, ++ .fd = fd, ++ .pending = LIST_HEAD_INIT(port->pending), ++ .connections = LIST_HEAD_INIT(port->connections) }); ++ list_add(&port->list, &ports); ++ ++ found: ++ dev = kmalloc(sizeof(struct port_dev), GFP_KERNEL); ++ if(dev == NULL){ ++ printk(KERN_ERR "Allocation of port device entry failed\n"); ++ goto out; ++ } ++ ++ *dev = ((struct port_dev) { .port = port, ++ .helper_pid = -1, ++ .telnetd_pid = -1 }); ++ goto out; ++ ++ out_free: ++ kfree(port); ++ out_close: ++ os_close_file(fd); ++ out: ++ up(&ports_sem); ++ return(dev); ++} ++ ++int port_wait(void *data) ++{ ++ struct port_dev *dev = data; ++ struct connection *conn; ++ struct port_list *port = dev->port; ++ int fd; ++ ++ while(1){ ++ if(down_interruptible(&port->sem)) ++ return(-ERESTARTSYS); ++ ++ spin_lock(&port->lock); ++ ++ conn = list_entry(port->connections.next, struct connection, ++ list); ++ list_del(&conn->list); ++ spin_unlock(&port->lock); ++ ++ os_shutdown_socket(conn->socket[0], 1, 1); ++ os_close_file(conn->socket[0]); ++ os_shutdown_socket(conn->socket[1], 1, 1); ++ os_close_file(conn->socket[1]); ++ ++ /* This is done here because freeing an IRQ can't be done ++ * within the IRQ handler. 
So, pipe_interrupt always ups ++ * the semaphore regardless of whether it got a successful ++ * connection. Then we loop here throwing out failed ++ * connections until a good one is found. ++ */ ++ free_irq(TELNETD_IRQ, conn); ++ ++ if(conn->fd >= 0) break; ++ os_close_file(conn->fd); ++ kfree(conn); ++ } ++ ++ fd = conn->fd; ++ dev->helper_pid = conn->helper_pid; ++ dev->telnetd_pid = conn->telnetd_pid; ++ kfree(conn); ++ ++ return(fd); ++} ++ ++void port_remove_dev(void *d) ++{ ++ struct port_dev *dev = d; ++ ++ if(dev->helper_pid != -1) ++ os_kill_process(dev->helper_pid, 0); ++ if(dev->telnetd_pid != -1) ++ os_kill_process(dev->telnetd_pid, 1); ++ dev->helper_pid = -1; ++ dev->telnetd_pid = -1; ++} ++ ++void port_kern_free(void *d) ++{ ++ struct port_dev *dev = d; ++ ++ port_remove_dev(dev); ++ kfree(dev); ++} ++ ++static void free_port(void) ++{ ++ struct list_head *ele; ++ struct port_list *port; ++ ++ list_for_each(ele, &ports){ ++ port = list_entry(ele, struct port_list, list); ++ free_irq_by_fd(port->fd); ++ os_close_file(port->fd); ++ } ++} ++ ++__uml_exitcall(free_port); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/port_user.c um/arch/um/drivers/port_user.c +--- orig/arch/um/drivers/port_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/port_user.c Mon Dec 16 22:46:20 2002 +@@ -0,0 +1,206 @@ ++/* ++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "user_util.h" ++#include "kern_util.h" ++#include "user.h" ++#include "chan_user.h" ++#include "port.h" ++#include "helper.h" ++#include "os.h" ++ ++struct port_chan { ++ int raw; ++ struct termios tt; ++ void *kernel_data; ++ char dev[sizeof("32768\0")]; ++}; ++ ++void *port_init(char *str, int device, struct chan_opts *opts) ++{ ++ struct port_chan *data; ++ void *kern_data; ++ char *end; ++ int port; ++ ++ if(*str != ':'){ ++ printk("port_init : channel type 'port' must specify a " ++ "port number\n"); ++ return(NULL); ++ } ++ str++; ++ port = strtoul(str, &end, 0); ++ if((*end != '\0') || (end == str)){ ++ printk("port_init : couldn't parse port '%s'\n", str); ++ return(NULL); ++ } ++ ++ if((kern_data = port_data(port)) == NULL) ++ return(NULL); ++ ++ if((data = um_kmalloc(sizeof(*data))) == NULL) ++ goto err; ++ ++ *data = ((struct port_chan) { .raw = opts->raw, ++ .kernel_data = kern_data }); ++ sprintf(data->dev, "%d", port); ++ ++ return(data); ++ err: ++ port_kern_free(kern_data); ++ return(NULL); ++} ++ ++void port_free(void *d) ++{ ++ struct port_chan *data = d; ++ ++ port_kern_free(data->kernel_data); ++ kfree(data); ++} ++ ++int port_open(int input, int output, int primary, void *d, char **dev_out) ++{ ++ struct port_chan *data = d; ++ int fd; ++ ++ fd = port_wait(data->kernel_data); ++ if((fd >= 0) && data->raw){ ++ tcgetattr(fd, &data->tt); ++ raw(fd, 0); ++ } 
++ *dev_out = data->dev; ++ return(fd); ++} ++ ++void port_close(int fd, void *d) ++{ ++ struct port_chan *data = d; ++ ++ port_remove_dev(data->kernel_data); ++ close(fd); ++} ++ ++int port_console_write(int fd, const char *buf, int n, void *d) ++{ ++ struct port_chan *data = d; ++ ++ return(generic_console_write(fd, buf, n, &data->tt)); ++} ++ ++struct chan_ops port_ops = { ++ .type = "port", ++ .init = port_init, ++ .open = port_open, ++ .close = port_close, ++ .read = generic_read, ++ .write = generic_write, ++ .console_write = port_console_write, ++ .window_size = generic_window_size, ++ .free = port_free, ++ .winch = 1, ++}; ++ ++int port_listen_fd(int port) ++{ ++ struct sockaddr_in addr; ++ int fd, err; ++ ++ fd = socket(PF_INET, SOCK_STREAM, 0); ++ if(fd == -1) ++ return(-errno); ++ ++ addr.sin_family = AF_INET; ++ addr.sin_port = htons(port); ++ addr.sin_addr.s_addr = htonl(INADDR_ANY); ++ if(bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0){ ++ err = -errno; ++ goto out; ++ } ++ ++ if((listen(fd, 1) < 0) || (os_set_fd_block(fd, 0))){ ++ err = -errno; ++ goto out; ++ } ++ ++ return(fd); ++ out: ++ os_close_file(fd); ++ return(err); ++} ++ ++struct port_pre_exec_data { ++ int sock_fd; ++ int pipe_fd; ++}; ++ ++void port_pre_exec(void *arg) ++{ ++ struct port_pre_exec_data *data = arg; ++ ++ dup2(data->sock_fd, 0); ++ dup2(data->sock_fd, 1); ++ dup2(data->sock_fd, 2); ++ close(data->sock_fd); ++ dup2(data->pipe_fd, 3); ++ os_shutdown_socket(3, 1, 0); ++ close(data->pipe_fd); ++} ++ ++int port_connection(int fd, int *socket, int *pid_out) ++{ ++ int new, err; ++ char *argv[] = { "/usr/sbin/in.telnetd", "-L", ++ "/usr/lib/uml/port-helper", NULL }; ++ struct port_pre_exec_data data; ++ ++ if((new = os_accept_connection(fd)) < 0) ++ return(-errno); ++ ++ err = os_pipe(socket, 0, 0); ++ if(err) ++ goto out_close; ++ ++ data = ((struct port_pre_exec_data) ++ { .sock_fd = new, ++ .pipe_fd = socket[1] }); ++ ++ err = run_helper(port_pre_exec, &data, argv, 
NULL); ++ if(err < 0) ++ goto out_shutdown; ++ ++ *pid_out = err; ++ return(new); ++ ++ out_shutdown: ++ os_shutdown_socket(socket[0], 1, 1); ++ close(socket[0]); ++ os_shutdown_socket(socket[1], 1, 1); ++ close(socket[1]); ++ out_close: ++ close(new); ++ return(err); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/pty.c um/arch/um/drivers/pty.c +--- orig/arch/um/drivers/pty.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/pty.c Sun Dec 15 21:06:01 2002 +@@ -0,0 +1,148 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "chan_user.h" ++#include "user.h" ++#include "user_util.h" ++#include "kern_util.h" ++ ++struct pty_chan { ++ void (*announce)(char *dev_name, int dev); ++ int dev; ++ int raw; ++ struct termios tt; ++ char dev_name[sizeof("/dev/pts/0123456\0")]; ++}; ++ ++void *pty_chan_init(char *str, int device, struct chan_opts *opts) ++{ ++ struct pty_chan *data; ++ ++ if((data = um_kmalloc(sizeof(*data))) == NULL) return(NULL); ++ *data = ((struct pty_chan) { .announce = opts->announce, ++ .dev = device, ++ .raw = opts->raw }); ++ return(data); ++} ++ ++int pts_open(int input, int output, int primary, void *d, char **dev_out) ++{ ++ struct pty_chan *data = d; ++ char *dev; ++ int fd; ++ ++ if((fd = get_pty()) < 0){ ++ printk("open_pts : Failed to open pts\n"); ++ return(-errno); ++ } ++ if(data->raw){ ++ tcgetattr(fd, &data->tt); ++ raw(fd, 0); ++ } ++ ++ dev = ptsname(fd); ++ sprintf(data->dev_name, "%s", dev); ++ *dev_out = data->dev_name; 
++ if(data->announce) (*data->announce)(dev, data->dev); ++ return(fd); ++} ++ ++int getmaster(char *line) ++{ ++ struct stat stb; ++ char *pty, *bank, *cp; ++ int master; ++ ++ pty = &line[strlen("/dev/ptyp")]; ++ for (bank = "pqrs"; *bank; bank++) { ++ line[strlen("/dev/pty")] = *bank; ++ *pty = '0'; ++ if (stat(line, &stb) < 0) ++ break; ++ for (cp = "0123456789abcdef"; *cp; cp++) { ++ *pty = *cp; ++ master = open(line, O_RDWR); ++ if (master >= 0) { ++ char *tp = &line[strlen("/dev/")]; ++ int ok; ++ ++ /* verify slave side is usable */ ++ *tp = 't'; ++ ok = access(line, R_OK|W_OK) == 0; ++ *tp = 'p'; ++ if (ok) return(master); ++ (void) close(master); ++ } ++ } ++ } ++ return(-1); ++} ++ ++int pty_open(int input, int output, int primary, void *d, char **dev_out) ++{ ++ struct pty_chan *data = d; ++ int fd; ++ char dev[sizeof("/dev/ptyxx\0")] = "/dev/ptyxx"; ++ ++ fd = getmaster(dev); ++ if(fd < 0) return(-errno); ++ ++ if(data->raw) raw(fd, 0); ++ if(data->announce) (*data->announce)(dev, data->dev); ++ ++ sprintf(data->dev_name, "%s", dev); ++ *dev_out = data->dev_name; ++ return(fd); ++} ++ ++int pty_console_write(int fd, const char *buf, int n, void *d) ++{ ++ struct pty_chan *data = d; ++ ++ return(generic_console_write(fd, buf, n, &data->tt)); ++} ++ ++struct chan_ops pty_ops = { ++ .type = "pty", ++ .init = pty_chan_init, ++ .open = pty_open, ++ .close = generic_close, ++ .read = generic_read, ++ .write = generic_write, ++ .console_write = pty_console_write, ++ .window_size = generic_window_size, ++ .free = generic_free, ++ .winch = 0, ++}; ++ ++struct chan_ops pts_ops = { ++ .type = "pts", ++ .init = pty_chan_init, ++ .open = pts_open, ++ .close = generic_close, ++ .read = generic_read, ++ .write = generic_write, ++ .console_write = pty_console_write, ++ .window_size = generic_window_size, ++ .free = generic_free, ++ .winch = 0, ++}; ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/slip.h um/arch/um/drivers/slip.h +--- orig/arch/um/drivers/slip.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/slip.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,39 @@ ++#ifndef __UM_SLIP_H ++#define __UM_SLIP_H ++ ++#define BUF_SIZE 1500 ++ /* two bytes each for a (pathological) max packet of escaped chars + * ++ * terminating END char + initial END char */ ++#define ENC_BUF_SIZE (2 * BUF_SIZE + 2) ++ ++struct slip_data { ++ void *dev; ++ char name[sizeof("slnnnnn\0")]; ++ char *addr; ++ char *gate_addr; ++ int slave; ++ char ibuf[ENC_BUF_SIZE]; ++ char obuf[ENC_BUF_SIZE]; ++ int more; /* more data: do not read fd until ibuf has been drained */ ++ int pos; ++ int esc; ++}; ++ ++extern struct net_user_info slip_user_info; ++ ++extern int set_umn_addr(int fd, char *addr, char *ptp_addr); ++extern int slip_user_read(int fd, void *buf, int len, struct slip_data *pri); ++extern int slip_user_write(int fd, void *buf, int len, struct slip_data *pri); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/slip_kern.c um/arch/um/drivers/slip_kern.c +--- orig/arch/um/drivers/slip_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/slip_kern.c Sun Dec 15 21:06:24 2002 +@@ -0,0 +1,109 @@ ++#include "linux/config.h" ++#include "linux/kernel.h" ++#include "linux/stddef.h" ++#include "linux/init.h" ++#include "linux/netdevice.h" ++#include "linux/if_arp.h" ++#include "net_kern.h" ++#include "net_user.h" ++#include "kern.h" ++#include "slip.h" ++ ++struct slip_init { ++ char *gate_addr; ++}; ++ ++void slip_init(struct net_device *dev, void *data) ++{ ++ struct uml_net_private *private; ++ struct slip_data *spri; ++ struct slip_init *init = data; ++ ++ private = dev->priv; ++ spri = (struct slip_data *) private->user; ++ *spri = ((struct slip_data) ++ { .name = { '\0' }, ++ .addr = NULL, ++ .gate_addr = init->gate_addr, ++ .slave = -1, ++ .ibuf = { '\0' }, ++ .obuf = { '\0' }, ++ .pos = 0, ++ .esc = 0, ++ .dev = dev }); ++ ++ dev->init = NULL; ++ dev->hard_header_len = 0; ++ dev->addr_len = 4; ++ dev->type = ARPHRD_ETHER; ++ dev->tx_queue_len = 256; ++ dev->flags = IFF_NOARP; ++ printk("SLIP backend - SLIP IP = %s\n", spri->gate_addr); ++} ++ ++static unsigned short slip_protocol(struct sk_buff *skbuff) ++{ ++ return(htons(ETH_P_IP)); ++} ++ ++static int slip_read(int fd, struct sk_buff **skb, ++ struct uml_net_private *lp) ++{ ++ return(slip_user_read(fd, (*skb)->mac.raw, (*skb)->dev->mtu, ++ (struct slip_data *) &lp->user)); ++} ++ ++static int slip_write(int fd, struct sk_buff **skb, ++ struct uml_net_private *lp) ++{ ++ return(slip_user_write(fd, (*skb)->data, (*skb)->len, ++ (struct slip_data *) &lp->user)); ++} ++ ++struct net_kern_info slip_kern_info = { ++ .init = slip_init, ++ .protocol = slip_protocol, ++ .read = slip_read, ++ .write = slip_write, ++}; ++ ++static 
int slip_setup(char *str, char **mac_out, void *data) ++{ ++ struct slip_init *init = data; ++ ++ *init = ((struct slip_init) ++ { .gate_addr = NULL }); ++ ++ if(str[0] != '\0') ++ init->gate_addr = str; ++ return(1); ++} ++ ++static struct transport slip_transport = { ++ .list = LIST_HEAD_INIT(slip_transport.list), ++ .name = "slip", ++ .setup = slip_setup, ++ .user = &slip_user_info, ++ .kern = &slip_kern_info, ++ .private_size = sizeof(struct slip_data), ++ .setup_size = sizeof(struct slip_init), ++}; ++ ++static int register_slip(void) ++{ ++ register_transport(&slip_transport); ++ return(1); ++} ++ ++__initcall(register_slip); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/slip_proto.h um/arch/um/drivers/slip_proto.h +--- orig/arch/um/drivers/slip_proto.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/slip_proto.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,93 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_SLIP_PROTO_H__ ++#define __UM_SLIP_PROTO_H__ ++ ++/* SLIP protocol characters. 
*/ ++#define SLIP_END 0300 /* indicates end of frame */ ++#define SLIP_ESC 0333 /* indicates byte stuffing */ ++#define SLIP_ESC_END 0334 /* ESC ESC_END means END 'data' */ ++#define SLIP_ESC_ESC 0335 /* ESC ESC_ESC means ESC 'data' */ ++ ++static inline int slip_unesc(unsigned char c,char *buf,int *pos, int *esc) ++{ ++ int ret; ++ ++ switch(c){ ++ case SLIP_END: ++ *esc = 0; ++ ret=*pos; ++ *pos=0; ++ return(ret); ++ case SLIP_ESC: ++ *esc = 1; ++ return(0); ++ case SLIP_ESC_ESC: ++ if(*esc){ ++ *esc = 0; ++ c = SLIP_ESC; ++ } ++ break; ++ case SLIP_ESC_END: ++ if(*esc){ ++ *esc = 0; ++ c = SLIP_END; ++ } ++ break; ++ } ++ buf[(*pos)++] = c; ++ return(0); ++} ++ ++static inline int slip_esc(unsigned char *s, unsigned char *d, int len) ++{ ++ unsigned char *ptr = d; ++ unsigned char c; ++ ++ /* ++ * Send an initial END character to flush out any ++ * data that may have accumulated in the receiver ++ * due to line noise. ++ */ ++ ++ *ptr++ = SLIP_END; ++ ++ /* ++ * For each byte in the packet, send the appropriate ++ * character sequence, according to the SLIP protocol. ++ */ ++ ++ while (len-- > 0) { ++ switch(c = *s++) { ++ case SLIP_END: ++ *ptr++ = SLIP_ESC; ++ *ptr++ = SLIP_ESC_END; ++ break; ++ case SLIP_ESC: ++ *ptr++ = SLIP_ESC; ++ *ptr++ = SLIP_ESC_ESC; ++ break; ++ default: ++ *ptr++ = c; ++ break; ++ } ++ } ++ *ptr++ = SLIP_END; ++ return (ptr - d); ++} ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/slip_user.c um/arch/um/drivers/slip_user.c +--- orig/arch/um/drivers/slip_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/slip_user.c Sun Dec 15 21:06:35 2002 +@@ -0,0 +1,279 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "user_util.h" ++#include "kern_util.h" ++#include "user.h" ++#include "net_user.h" ++#include "slip.h" ++#include "slip_proto.h" ++#include "helper.h" ++#include "os.h" ++ ++void slip_user_init(void *data, void *dev) ++{ ++ struct slip_data *pri = data; ++ ++ pri->dev = dev; ++} ++ ++static int set_up_tty(int fd) ++{ ++ int i; ++ struct termios tios; ++ ++ if (tcgetattr(fd, &tios) < 0) { ++ printk("could not get initial terminal attributes\n"); ++ return(-1); ++ } ++ ++ tios.c_cflag = CS8 | CREAD | HUPCL | CLOCAL; ++ tios.c_iflag = IGNBRK | IGNPAR; ++ tios.c_oflag = 0; ++ tios.c_lflag = 0; ++ for (i = 0; i < NCCS; i++) ++ tios.c_cc[i] = 0; ++ tios.c_cc[VMIN] = 1; ++ tios.c_cc[VTIME] = 0; ++ ++ cfsetospeed(&tios, B38400); ++ cfsetispeed(&tios, B38400); ++ ++ if (tcsetattr(fd, TCSAFLUSH, &tios) < 0) { ++ printk("failed to set terminal attributes\n"); ++ return(-1); ++ } ++ return(0); ++} ++ ++struct slip_pre_exec_data { ++ int stdin; ++ int stdout; ++ int close_me; ++}; ++ ++static void slip_pre_exec(void *arg) ++{ ++ struct slip_pre_exec_data *data = arg; ++ ++ if(data->stdin != -1) dup2(data->stdin, 0); ++ dup2(data->stdout, 1); ++ if(data->close_me != -1) close(data->close_me); ++} ++ ++static int slip_tramp(char **argv, int fd) ++{ ++ struct slip_pre_exec_data pe_data; ++ char *output; ++ int status, pid, fds[2], err, output_len; ++ ++ err = os_pipe(fds, 1, 0); ++ if(err){ ++ printk("slip_tramp : pipe failed, errno = %d\n", -err); ++ 
return(err); ++ } ++ ++ err = 0; ++ pe_data.stdin = fd; ++ pe_data.stdout = fds[1]; ++ pe_data.close_me = fds[0]; ++ pid = run_helper(slip_pre_exec, &pe_data, argv, NULL); ++ ++ if(pid < 0) err = pid; ++ else { ++ output_len = page_size(); ++ output = um_kmalloc(output_len); ++ if(output == NULL) ++ printk("slip_tramp : failed to allocate output " ++ "buffer\n"); ++ ++ close(fds[1]); ++ read_output(fds[0], output, output_len); ++ if(output != NULL){ ++ printk("%s", output); ++ kfree(output); ++ } ++ if(waitpid(pid, &status, 0) < 0) err = errno; ++ else if(!WIFEXITED(status) || (WEXITSTATUS(status) != 0)){ ++ printk("'%s' didn't exit with status 0\n", argv[0]); ++ err = EINVAL; ++ } ++ } ++ return(err); ++} ++ ++static int slip_open(void *data) ++{ ++ struct slip_data *pri = data; ++ char version_buf[sizeof("nnnnn\0")]; ++ char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")]; ++ char *argv[] = { "uml_net", version_buf, "slip", "up", gate_buf, ++ NULL }; ++ int sfd, mfd, disc, sencap, err; ++ ++ if((mfd = get_pty()) < 0){ ++ printk("umn : Failed to open pty\n"); ++ return(-1); ++ } ++ if((sfd = os_open_file(ptsname(mfd), of_rdwr(OPENFLAGS()), 0)) < 0){ ++ printk("Couldn't open tty for slip line\n"); ++ return(-1); ++ } ++ if(set_up_tty(sfd)) return(-1); ++ pri->slave = sfd; ++ pri->pos = 0; ++ pri->esc = 0; ++ if(pri->gate_addr != NULL){ ++ sprintf(version_buf, "%d", UML_NET_VERSION); ++ strcpy(gate_buf, pri->gate_addr); ++ ++ err = slip_tramp(argv, sfd); ++ ++ if(err != 0){ ++ printk("slip_tramp failed - errno = %d\n", err); ++ return(-err); ++ } ++ if(ioctl(pri->slave, SIOCGIFNAME, pri->name) < 0){ ++ printk("SIOCGIFNAME failed, errno = %d\n", errno); ++ return(-errno); ++ } ++ iter_addresses(pri->dev, open_addr, pri->name); ++ } ++ else { ++ disc = N_SLIP; ++ if(ioctl(sfd, TIOCSETD, &disc) < 0){ ++ printk("Failed to set slip line discipline - " ++ "errno = %d\n", errno); ++ return(-errno); ++ } ++ sencap = 0; ++ if(ioctl(sfd, SIOCSIFENCAP, &sencap) < 0){ ++ printk("Failed 
to set slip encapsulation - " ++ "errno = %d\n", errno); ++ return(-errno); ++ } ++ } ++ return(mfd); ++} ++ ++static void slip_close(int fd, void *data) ++{ ++ struct slip_data *pri = data; ++ char version_buf[sizeof("nnnnn\0")]; ++ char *argv[] = { "uml_net", version_buf, "slip", "down", pri->name, ++ NULL }; ++ int err; ++ ++ if(pri->gate_addr != NULL) ++ iter_addresses(pri->dev, close_addr, pri->name); ++ ++ sprintf(version_buf, "%d", UML_NET_VERSION); ++ ++ err = slip_tramp(argv, -1); ++ ++ if(err != 0) ++ printk("slip_tramp failed - errno = %d\n", err); ++ close(fd); ++ close(pri->slave); ++ pri->slave = -1; ++} ++ ++int slip_user_read(int fd, void *buf, int len, struct slip_data *pri) ++{ ++ int i, n, size, start; ++ ++ if(pri->more>0) { ++ i = 0; ++ while(i < pri->more) { ++ size = slip_unesc(pri->ibuf[i++], ++ pri->ibuf, &pri->pos, &pri->esc); ++ if(size){ ++ memcpy(buf, pri->ibuf, size); ++ memmove(pri->ibuf, &pri->ibuf[i], pri->more-i); ++ pri->more=pri->more-i; ++ return(size); ++ } ++ } ++ pri->more=0; ++ } ++ ++ n = net_read(fd, &pri->ibuf[pri->pos], sizeof(pri->ibuf) - pri->pos); ++ if(n <= 0) return(n); ++ ++ start = pri->pos; ++ for(i = 0; i < n; i++){ ++ size = slip_unesc(pri->ibuf[start + i], ++ pri->ibuf, &pri->pos, &pri->esc); ++ if(size){ ++ memcpy(buf, pri->ibuf, size); ++ memmove(pri->ibuf, &pri->ibuf[start+i+1], n-(i+1)); ++ pri->more=n-(i+1); ++ return(size); ++ } ++ } ++ return(0); ++} ++ ++int slip_user_write(int fd, void *buf, int len, struct slip_data *pri) ++{ ++ int actual, n; ++ ++ actual = slip_esc(buf, pri->obuf, len); ++ n = net_write(fd, pri->obuf, actual); ++ if(n < 0) return(n); ++ else return(len); ++} ++ ++static int slip_set_mtu(int mtu, void *data) ++{ ++ return(mtu); ++} ++ ++static void slip_add_addr(unsigned char *addr, unsigned char *netmask, ++ void *data) ++{ ++ struct slip_data *pri = data; ++ ++ if(pri->slave == -1) return; ++ open_addr(addr, netmask, pri->name); ++} ++ ++static void slip_del_addr(unsigned char 
*addr, unsigned char *netmask, ++ void *data) ++{ ++ struct slip_data *pri = data; ++ ++ if(pri->slave == -1) return; ++ close_addr(addr, netmask, pri->name); ++} ++ ++struct net_user_info slip_user_info = { ++ .init = slip_user_init, ++ .open = slip_open, ++ .close = slip_close, ++ .remove = NULL, ++ .set_mtu = slip_set_mtu, ++ .add_address = slip_add_addr, ++ .delete_address = slip_del_addr, ++ .max_packet = BUF_SIZE ++}; ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/slirp.h um/arch/um/drivers/slirp.h +--- orig/arch/um/drivers/slirp.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/slirp.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,51 @@ ++#ifndef __UM_SLIRP_H ++#define __UM_SLIRP_H ++ ++#define BUF_SIZE 1500 ++ /* two bytes each for a (pathological) max packet of escaped chars + * ++ * terminating END char + initial END char */ ++#define ENC_BUF_SIZE (2 * BUF_SIZE + 2) ++ ++#define SLIRP_MAX_ARGS 100 ++/* ++ * XXX this next definition is here because I don't understand why this ++ * initializer doesn't work in slirp_kern.c: ++ * ++ * argv : { init->argv[ 0 ... 
SLIRP_MAX_ARGS-1 ] }, ++ * ++ * or why I can't typecast like this: ++ * ++ * argv : (char* [SLIRP_MAX_ARGS])(init->argv), ++ */ ++struct arg_list_dummy_wrapper { char *argv[SLIRP_MAX_ARGS]; }; ++ ++struct slirp_data { ++ void *dev; ++ struct arg_list_dummy_wrapper argw; ++ int pid; ++ int slave; ++ char ibuf[ENC_BUF_SIZE]; ++ char obuf[ENC_BUF_SIZE]; ++ int more; /* more data: do not read fd until ibuf has been drained */ ++ int pos; ++ int esc; ++}; ++ ++extern struct net_user_info slirp_user_info; ++ ++extern int set_umn_addr(int fd, char *addr, char *ptp_addr); ++extern int slirp_user_read(int fd, void *buf, int len, struct slirp_data *pri); ++extern int slirp_user_write(int fd, void *buf, int len, struct slirp_data *pri); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/slirp_kern.c um/arch/um/drivers/slirp_kern.c +--- orig/arch/um/drivers/slirp_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/slirp_kern.c Sun Dec 15 21:06:54 2002 +@@ -0,0 +1,132 @@ ++#include "linux/kernel.h" ++#include "linux/stddef.h" ++#include "linux/init.h" ++#include "linux/netdevice.h" ++#include "linux/if_arp.h" ++#include "net_kern.h" ++#include "net_user.h" ++#include "kern.h" ++#include "slirp.h" ++ ++struct slirp_init { ++ struct arg_list_dummy_wrapper argw; /* XXX should be simpler... 
*/ ++}; ++ ++void slirp_init(struct net_device *dev, void *data) ++{ ++ struct uml_net_private *private; ++ struct slirp_data *spri; ++ struct slirp_init *init = data; ++ int i; ++ ++ private = dev->priv; ++ spri = (struct slirp_data *) private->user; ++ *spri = ((struct slirp_data) ++ { .argw = init->argw, ++ .pid = -1, ++ .slave = -1, ++ .ibuf = { '\0' }, ++ .obuf = { '\0' }, ++ .pos = 0, ++ .esc = 0, ++ .dev = dev }); ++ ++ dev->init = NULL; ++ dev->hard_header_len = 0; ++ dev->addr_len = 4; ++ dev->type = ARPHRD_ETHER; ++ dev->tx_queue_len = 256; ++ dev->flags = IFF_NOARP; ++ printk("SLIRP backend - command line:"); ++ for(i=0;spri->argw.argv[i]!=NULL;i++) { ++ printk(" '%s'",spri->argw.argv[i]); ++ } ++ printk("\n"); ++} ++ ++static unsigned short slirp_protocol(struct sk_buff *skbuff) ++{ ++ return(htons(ETH_P_IP)); ++} ++ ++static int slirp_read(int fd, struct sk_buff **skb, ++ struct uml_net_private *lp) ++{ ++ return(slirp_user_read(fd, (*skb)->mac.raw, (*skb)->dev->mtu, ++ (struct slirp_data *) &lp->user)); ++} ++ ++static int slirp_write(int fd, struct sk_buff **skb, ++ struct uml_net_private *lp) ++{ ++ return(slirp_user_write(fd, (*skb)->data, (*skb)->len, ++ (struct slirp_data *) &lp->user)); ++} ++ ++struct net_kern_info slirp_kern_info = { ++ .init = slirp_init, ++ .protocol = slirp_protocol, ++ .read = slirp_read, ++ .write = slirp_write, ++}; ++ ++static int slirp_setup(char *str, char **mac_out, void *data) ++{ ++ struct slirp_init *init = data; ++ int i=0; ++ ++ *init = ((struct slirp_init) ++ { argw : { { "slirp", NULL } } }); ++ ++ str = split_if_spec(str, mac_out, NULL); ++ ++ if(str == NULL) { /* no command line given after MAC addr */ ++ return(1); ++ } ++ ++ do { ++ if(i>=SLIRP_MAX_ARGS-1) { ++ printk("slirp_setup: truncating slirp arguments\n"); ++ break; ++ } ++ init->argw.argv[i++] = str; ++ while(*str && *str!=',') { ++ if(*str=='_') *str=' '; ++ str++; ++ } ++ if(*str!=',') ++ break; ++ *str++='\0'; ++ } while(1); ++ 
init->argw.argv[i]=NULL; ++ return(1); ++} ++ ++static struct transport slirp_transport = { ++ .list = LIST_HEAD_INIT(slirp_transport.list), ++ .name = "slirp", ++ .setup = slirp_setup, ++ .user = &slirp_user_info, ++ .kern = &slirp_kern_info, ++ .private_size = sizeof(struct slirp_data), ++ .setup_size = sizeof(struct slirp_init), ++}; ++ ++static int register_slirp(void) ++{ ++ register_transport(&slirp_transport); ++ return(1); ++} ++ ++__initcall(register_slirp); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/slirp_user.c um/arch/um/drivers/slirp_user.c +--- orig/arch/um/drivers/slirp_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/slirp_user.c Sun Dec 15 21:07:08 2002 +@@ -0,0 +1,202 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "user_util.h" ++#include "kern_util.h" ++#include "user.h" ++#include "net_user.h" ++#include "slirp.h" ++#include "slip_proto.h" ++#include "helper.h" ++#include "os.h" ++ ++void slirp_user_init(void *data, void *dev) ++{ ++ struct slirp_data *pri = data; ++ ++ pri->dev = dev; ++} ++ ++struct slirp_pre_exec_data { ++ int stdin; ++ int stdout; ++}; ++ ++static void slirp_pre_exec(void *arg) ++{ ++ struct slirp_pre_exec_data *data = arg; ++ ++ if(data->stdin != -1) dup2(data->stdin, 0); ++ if(data->stdout != -1) dup2(data->stdout, 1); ++} ++ ++static int slirp_tramp(char **argv, int fd) ++{ ++ struct slirp_pre_exec_data pe_data; ++ int pid; ++ ++ pe_data.stdin = fd; ++ pe_data.stdout = fd; ++ pid = run_helper(slirp_pre_exec, &pe_data, argv, NULL); ++ ++ return(pid); 
++} ++ ++static int slirp_datachan(int *mfd, int *sfd) ++{ ++ int fds[2], err; ++ ++ err = os_pipe(fds, 1, 1); ++ if(err){ ++ printk("slirp_datachan: Failed to open pipe, errno = %d\n", ++ -err); ++ return(err); ++ } ++ ++ *mfd = fds[0]; ++ *sfd = fds[1]; ++ return(0); ++} ++ ++static int slirp_open(void *data) ++{ ++ struct slirp_data *pri = data; ++ int sfd, mfd, pid, err; ++ ++ err = slirp_datachan(&mfd, &sfd); ++ if(err) ++ return(err); ++ ++ pid = slirp_tramp(pri->argw.argv, sfd); ++ ++ if(pid < 0){ ++ printk("slirp_tramp failed - errno = %d\n", pid); ++ os_close_file(sfd); ++ os_close_file(mfd); ++ return(pid); ++ } ++ ++ pri->slave = sfd; ++ pri->pos = 0; ++ pri->esc = 0; ++ ++ pri->pid = pid; ++ ++ return(mfd); ++} ++ ++static void slirp_close(int fd, void *data) ++{ ++ struct slirp_data *pri = data; ++ int status,err; ++ ++ close(fd); ++ close(pri->slave); ++ ++ pri->slave = -1; ++ ++ if(pri->pid<1) { ++ printk("slirp_close: no child process to shut down\n"); ++ return; ++ } ++ ++#if 0 ++ if(kill(pri->pid, SIGHUP)<0) { ++ printk("slirp_close: sending hangup to %d failed (%d)\n", ++ pri->pid, errno); ++ } ++#endif ++ ++ err = waitpid(pri->pid, &status, WNOHANG); ++ if(err<0) { ++ printk("slirp_close: waitpid returned %d\n", errno); ++ return; ++ } ++ ++ if(err==0) { ++ printk("slirp_close: process %d has not exited\n"); ++ return; ++ } ++ ++ pri->pid = -1; ++} ++ ++int slirp_user_read(int fd, void *buf, int len, struct slirp_data *pri) ++{ ++ int i, n, size, start; ++ ++ if(pri->more>0) { ++ i = 0; ++ while(i < pri->more) { ++ size = slip_unesc(pri->ibuf[i++], ++ pri->ibuf,&pri->pos,&pri->esc); ++ if(size){ ++ memcpy(buf, pri->ibuf, size); ++ memmove(pri->ibuf, &pri->ibuf[i], pri->more-i); ++ pri->more=pri->more-i; ++ return(size); ++ } ++ } ++ pri->more=0; ++ } ++ ++ n = net_read(fd, &pri->ibuf[pri->pos], sizeof(pri->ibuf) - pri->pos); ++ if(n <= 0) return(n); ++ ++ start = pri->pos; ++ for(i = 0; i < n; i++){ ++ size = slip_unesc(pri->ibuf[start + i], ++ 
pri->ibuf,&pri->pos,&pri->esc); ++ if(size){ ++ memcpy(buf, pri->ibuf, size); ++ memmove(pri->ibuf, &pri->ibuf[start+i+1], n-(i+1)); ++ pri->more=n-(i+1); ++ return(size); ++ } ++ } ++ return(0); ++} ++ ++int slirp_user_write(int fd, void *buf, int len, struct slirp_data *pri) ++{ ++ int actual, n; ++ ++ actual = slip_esc(buf, pri->obuf, len); ++ n = net_write(fd, pri->obuf, actual); ++ if(n < 0) return(n); ++ else return(len); ++} ++ ++static int slirp_set_mtu(int mtu, void *data) ++{ ++ return(mtu); ++} ++ ++struct net_user_info slirp_user_info = { ++ .init = slirp_user_init, ++ .open = slirp_open, ++ .close = slirp_close, ++ .remove = NULL, ++ .set_mtu = slirp_set_mtu, ++ .add_address = NULL, ++ .delete_address = NULL, ++ .max_packet = BUF_SIZE ++}; ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/ssl.c um/arch/um/drivers/ssl.c +--- orig/arch/um/drivers/ssl.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/ssl.c Thu Mar 6 18:55:01 2003 +@@ -0,0 +1,265 @@ ++/* ++ * Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/config.h" ++#include "linux/fs.h" ++#include "linux/tty.h" ++#include "linux/tty_driver.h" ++#include "linux/major.h" ++#include "linux/mm.h" ++#include "linux/init.h" ++#include "asm/termbits.h" ++#include "asm/irq.h" ++#include "line.h" ++#include "ssl.h" ++#include "chan_kern.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "kern.h" ++#include "init.h" ++#include "irq_user.h" ++#include "mconsole_kern.h" ++#include "2_5compat.h" ++ ++static int ssl_version = 1; ++ ++/* Referenced only by tty_driver below - presumably it's locked correctly ++ * by the tty driver. 
++ */ ++static int ssl_refcount = 0; ++ ++static struct tty_driver ssl_driver; ++ ++#define NR_PORTS 64 ++ ++void ssl_announce(char *dev_name, int dev) ++{ ++ printk(KERN_INFO "Serial line %d assigned device '%s'\n", dev, ++ dev_name); ++} ++ ++static struct chan_opts opts = { ++ .announce = ssl_announce, ++ .xterm_title = "Serial Line #%d", ++ .raw = 1, ++ .tramp_stack = 0, ++ .in_kernel = 1, ++}; ++ ++static int ssl_config(char *str); ++static int ssl_get_config(char *dev, char *str, int size, char **error_out); ++static int ssl_remove(char *str); ++ ++static struct line_driver driver = { ++ .name = "UML serial line", ++ .devfs_name = "tts/%d", ++ .major = TTY_MAJOR, ++ .minor_start = 64, ++ .type = TTY_DRIVER_TYPE_SERIAL, ++ .subtype = 0, ++ .read_irq = SSL_IRQ, ++ .read_irq_name = "ssl", ++ .write_irq = SSL_WRITE_IRQ, ++ .write_irq_name = "ssl-write", ++ .symlink_from = "serial", ++ .symlink_to = "tts", ++ .mc = { ++ .name = "ssl", ++ .config = ssl_config, ++ .get_config = ssl_get_config, ++ .remove = ssl_remove, ++ }, ++}; ++ ++/* The array is initialized by line_init, which is an initcall. The ++ * individual elements are protected by individual semaphores. ++ */ ++static struct line serial_lines[NR_PORTS] = ++ { [0 ... 
NR_PORTS - 1] = LINE_INIT(CONFIG_SSL_CHAN, &driver) }; ++ ++static struct lines lines = LINES_INIT(NR_PORTS); ++ ++static int ssl_config(char *str) ++{ ++ return(line_config(serial_lines, ++ sizeof(serial_lines)/sizeof(serial_lines[0]), str)); ++} ++ ++static int ssl_get_config(char *dev, char *str, int size, char **error_out) ++{ ++ return(line_get_config(dev, serial_lines, ++ sizeof(serial_lines)/sizeof(serial_lines[0]), ++ str, size, error_out)); ++} ++ ++static int ssl_remove(char *str) ++{ ++ return(line_remove(serial_lines, ++ sizeof(serial_lines)/sizeof(serial_lines[0]), str)); ++} ++ ++int ssl_open(struct tty_struct *tty, struct file *filp) ++{ ++ return(line_open(serial_lines, tty, &opts)); ++} ++ ++static void ssl_close(struct tty_struct *tty, struct file * filp) ++{ ++ line_close(serial_lines, tty); ++} ++ ++static int ssl_write(struct tty_struct * tty, int from_user, ++ const unsigned char *buf, int count) ++{ ++ return(line_write(serial_lines, tty, from_user, buf, count)); ++} ++ ++static void ssl_put_char(struct tty_struct *tty, unsigned char ch) ++{ ++ line_write(serial_lines, tty, 0, &ch, sizeof(ch)); ++} ++ ++static void ssl_flush_chars(struct tty_struct *tty) ++{ ++ return; ++} ++ ++static int ssl_chars_in_buffer(struct tty_struct *tty) ++{ ++ return(0); ++} ++ ++static void ssl_flush_buffer(struct tty_struct *tty) ++{ ++ return; ++} ++ ++static int ssl_ioctl(struct tty_struct *tty, struct file * file, ++ unsigned int cmd, unsigned long arg) ++{ ++ int ret; ++ ++ ret = 0; ++ switch(cmd){ ++ case TCGETS: ++ case TCSETS: ++ case TCFLSH: ++ case TCSETSF: ++ case TCSETSW: ++ case TCGETA: ++ case TIOCMGET: ++ ret = -ENOIOCTLCMD; ++ break; ++ default: ++ printk(KERN_ERR ++ "Unimplemented ioctl in ssl_ioctl : 0x%x\n", cmd); ++ ret = -ENOIOCTLCMD; ++ break; ++ } ++ return(ret); ++} ++ ++static void ssl_throttle(struct tty_struct * tty) ++{ ++ printk(KERN_ERR "Someone should implement ssl_throttle\n"); ++} ++ ++static void ssl_unthrottle(struct tty_struct 
* tty) ++{ ++ printk(KERN_ERR "Someone should implement ssl_unthrottle\n"); ++} ++ ++static void ssl_set_termios(struct tty_struct *tty, ++ struct termios *old_termios) ++{ ++} ++ ++static void ssl_stop(struct tty_struct *tty) ++{ ++ printk(KERN_ERR "Someone should implement ssl_stop\n"); ++} ++ ++static void ssl_start(struct tty_struct *tty) ++{ ++ printk(KERN_ERR "Someone should implement ssl_start\n"); ++} ++ ++void ssl_hangup(struct tty_struct *tty) ++{ ++} ++ ++static struct tty_driver ssl_driver = { ++ .refcount = &ssl_refcount, ++ .open = ssl_open, ++ .close = ssl_close, ++ .write = ssl_write, ++ .put_char = ssl_put_char, ++ .flush_chars = ssl_flush_chars, ++ .chars_in_buffer = ssl_chars_in_buffer, ++ .flush_buffer = ssl_flush_buffer, ++ .ioctl = ssl_ioctl, ++ .throttle = ssl_throttle, ++ .unthrottle = ssl_unthrottle, ++ .set_termios = ssl_set_termios, ++ .stop = ssl_stop, ++ .start = ssl_start, ++ .hangup = ssl_hangup ++}; ++ ++/* Changed by ssl_init and referenced by ssl_exit, which are both serialized ++ * by being an initcall and exitcall, respectively. 
++ */ ++static int ssl_init_done = 0; ++ ++int ssl_init(void) ++{ ++ char *new_title; ++ ++ printk(KERN_INFO "Initializing software serial port version %d\n", ++ ssl_version); ++ ++ line_register_devfs(&lines, &driver, &ssl_driver, serial_lines, ++ sizeof(serial_lines)/sizeof(serial_lines[0])); ++ ++ lines_init(serial_lines, sizeof(serial_lines)/sizeof(serial_lines[0])); ++ ++ new_title = add_xterm_umid(opts.xterm_title); ++ if(new_title != NULL) opts.xterm_title = new_title; ++ ++ ssl_init_done = 1; ++ return(0); ++} ++ ++__initcall(ssl_init); ++ ++static int ssl_chan_setup(char *str) ++{ ++ line_setup(serial_lines, sizeof(serial_lines)/sizeof(serial_lines[0]), ++ str, 1); ++ return(1); ++} ++ ++__setup("ssl", ssl_chan_setup); ++__channel_help(ssl_chan_setup, "ssl"); ++ ++static void ssl_exit(void) ++{ ++ if(!ssl_init_done) return; ++ close_lines(serial_lines, ++ sizeof(serial_lines)/sizeof(serial_lines[0])); ++} ++ ++__uml_exitcall(ssl_exit); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/ssl.h um/arch/um/drivers/ssl.h +--- orig/arch/um/drivers/ssl.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/ssl.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,23 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SSL_H__ ++#define __SSL_H__ ++ ++extern int ssl_read(int fd, int line); ++extern void ssl_receive_char(int line, char ch); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. 
This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/stdio_console.c um/arch/um/drivers/stdio_console.c +--- orig/arch/um/drivers/stdio_console.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/stdio_console.c Sun Dec 15 21:08:20 2002 +@@ -0,0 +1,250 @@ ++/* ++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/config.h" ++#include "linux/posix_types.h" ++#include "linux/tty.h" ++#include "linux/tty_flip.h" ++#include "linux/types.h" ++#include "linux/major.h" ++#include "linux/kdev_t.h" ++#include "linux/console.h" ++#include "linux/string.h" ++#include "linux/sched.h" ++#include "linux/list.h" ++#include "linux/init.h" ++#include "linux/interrupt.h" ++#include "linux/slab.h" ++#include "asm/current.h" ++#include "asm/softirq.h" ++#include "asm/hardirq.h" ++#include "asm/irq.h" ++#include "stdio_console.h" ++#include "line.h" ++#include "chan_kern.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "irq_user.h" ++#include "mconsole_kern.h" ++#include "init.h" ++#include "2_5compat.h" ++ ++#define MAX_TTYS (8) ++ ++/* Referenced only by tty_driver below - presumably it's locked correctly ++ * by the tty driver. 
++ */ ++ ++static struct tty_driver console_driver; ++ ++static int console_refcount = 0; ++ ++static struct chan_ops init_console_ops = { ++ .type = "you shouldn't see this", ++ .init = NULL, ++ .open = NULL, ++ .close = NULL, ++ .read = NULL, ++ .write = NULL, ++ .console_write = generic_write, ++ .window_size = NULL, ++ .free = NULL, ++ .winch = 0, ++}; ++ ++static struct chan init_console_chan = { ++ .list = { }, ++ .primary = 1, ++ .input = 0, ++ .output = 1, ++ .opened = 1, ++ .fd = 1, ++ .pri = INIT_STATIC, ++ .ops = &init_console_ops, ++ .data = NULL ++}; ++ ++void stdio_announce(char *dev_name, int dev) ++{ ++ printk(KERN_INFO "Virtual console %d assigned device '%s'\n", dev, ++ dev_name); ++} ++ ++static struct chan_opts opts = { ++ .announce = stdio_announce, ++ .xterm_title = "Virtual Console #%d", ++ .raw = 1, ++ .tramp_stack = 0, ++ .in_kernel = 1, ++}; ++ ++static int con_config(char *str); ++static int con_get_config(char *dev, char *str, int size, char **error_out); ++static int con_remove(char *str); ++ ++static struct line_driver driver = { ++ .name = "UML console", ++ .devfs_name = "vc/%d", ++ .major = TTY_MAJOR, ++ .minor_start = 0, ++ .type = TTY_DRIVER_TYPE_CONSOLE, ++ .subtype = SYSTEM_TYPE_CONSOLE, ++ .read_irq = CONSOLE_IRQ, ++ .read_irq_name = "console", ++ .write_irq = CONSOLE_WRITE_IRQ, ++ .write_irq_name = "console-write", ++ .symlink_from = "ttys", ++ .symlink_to = "vc", ++ .mc = { ++ .name = "con", ++ .config = con_config, ++ .get_config = con_get_config, ++ .remove = con_remove, ++ }, ++}; ++ ++static struct lines console_lines = LINES_INIT(MAX_TTYS); ++ ++/* The array is initialized by line_init, which is an initcall. The ++ * individual elements are protected by individual semaphores. ++ */ ++struct line vts[MAX_TTYS] = { LINE_INIT(CONFIG_CON_ZERO_CHAN, &driver), ++ [ 1 ... 
MAX_TTYS - 1 ] = ++ LINE_INIT(CONFIG_CON_CHAN, &driver) }; ++ ++static int con_config(char *str) ++{ ++ return(line_config(vts, sizeof(vts)/sizeof(vts[0]), str)); ++} ++ ++static int con_get_config(char *dev, char *str, int size, char **error_out) ++{ ++ return(line_get_config(dev, vts, sizeof(vts)/sizeof(vts[0]), str, ++ size, error_out)); ++} ++ ++static int con_remove(char *str) ++{ ++ return(line_remove(vts, sizeof(vts)/sizeof(vts[0]), str)); ++} ++ ++static int open_console(struct tty_struct *tty) ++{ ++ return(line_open(vts, tty, &opts)); ++} ++ ++static int con_open(struct tty_struct *tty, struct file *filp) ++{ ++ return(open_console(tty)); ++} ++ ++static void con_close(struct tty_struct *tty, struct file *filp) ++{ ++ line_close(vts, tty); ++} ++ ++static int con_write(struct tty_struct *tty, int from_user, ++ const unsigned char *buf, int count) ++{ ++ return(line_write(vts, tty, from_user, buf, count)); ++} ++ ++static void set_termios(struct tty_struct *tty, struct termios * old) ++{ ++} ++ ++static int chars_in_buffer(struct tty_struct *tty) ++{ ++ return(0); ++} ++ ++static int con_init_done = 0; ++ ++int stdio_init(void) ++{ ++ char *new_title; ++ ++ printk(KERN_INFO "Initializing stdio console driver\n"); ++ ++ line_register_devfs(&console_lines, &driver, &console_driver, vts, ++ sizeof(vts)/sizeof(vts[0])); ++ ++ lines_init(vts, sizeof(vts)/sizeof(vts[0])); ++ ++ new_title = add_xterm_umid(opts.xterm_title); ++ if(new_title != NULL) opts.xterm_title = new_title; ++ ++ open_console(NULL); ++ con_init_done = 1; ++ return(0); ++} ++ ++__initcall(stdio_init); ++ ++static void console_write(struct console *console, const char *string, ++ unsigned len) ++{ ++ if(con_init_done) down(&vts[console->index].sem); ++ console_write_chan(&vts[console->index].chan_list, string, len); ++ if(con_init_done) up(&vts[console->index].sem); ++} ++ ++static struct tty_driver console_driver = { ++ .refcount = &console_refcount, ++ .open = con_open, ++ .close = con_close, 
++ .write = con_write, ++ .chars_in_buffer = chars_in_buffer, ++ .set_termios = set_termios ++}; ++ ++static kdev_t console_device(struct console *c) ++{ ++ return mk_kdev(TTY_MAJOR, c->index); ++} ++ ++static int console_setup(struct console *co, char *options) ++{ ++ return(0); ++} ++ ++static struct console stdiocons = INIT_CONSOLE("tty", console_write, ++ console_device, console_setup, ++ CON_PRINTBUFFER); ++ ++void stdio_console_init(void) ++{ ++ INIT_LIST_HEAD(&vts[0].chan_list); ++ list_add(&init_console_chan.list, &vts[0].chan_list); ++ register_console(&stdiocons); ++} ++ ++static int console_chan_setup(char *str) ++{ ++ line_setup(vts, sizeof(vts)/sizeof(vts[0]), str, 1); ++ return(1); ++} ++ ++__setup("con", console_chan_setup); ++__channel_help(console_chan_setup, "con"); ++ ++static void console_exit(void) ++{ ++ if(!con_init_done) return; ++ close_lines(vts, sizeof(vts)/sizeof(vts[0])); ++} ++ ++__uml_exitcall(console_exit); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/stdio_console.h um/arch/um/drivers/stdio_console.h +--- orig/arch/um/drivers/stdio_console.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/stdio_console.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,21 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __STDIO_CONSOLE_H ++#define __STDIO_CONSOLE_H ++ ++extern void save_console_flags(void); ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. 
This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/tty.c um/arch/um/drivers/tty.c +--- orig/arch/um/drivers/tty.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/tty.c Sun Dec 15 21:08:41 2002 +@@ -0,0 +1,86 @@ ++/* ++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include "chan_user.h" ++#include "user_util.h" ++#include "user.h" ++#include "os.h" ++ ++struct tty_chan { ++ char *dev; ++ int raw; ++ struct termios tt; ++}; ++ ++void *tty_chan_init(char *str, int device, struct chan_opts *opts) ++{ ++ struct tty_chan *data; ++ ++ if(*str != ':'){ ++ printk("tty_init : channel type 'tty' must specify " ++ "a device\n"); ++ return(NULL); ++ } ++ str++; ++ ++ if((data = um_kmalloc(sizeof(*data))) == NULL) ++ return(NULL); ++ *data = ((struct tty_chan) { .dev = str, ++ .raw = opts->raw }); ++ ++ return(data); ++} ++ ++int tty_open(int input, int output, int primary, void *d, char **dev_out) ++{ ++ struct tty_chan *data = d; ++ int fd; ++ ++ fd = os_open_file(data->dev, of_set_rw(OPENFLAGS(), input, output), 0); ++ if(fd < 0) return(fd); ++ if(data->raw){ ++ tcgetattr(fd, &data->tt); ++ raw(fd, 0); ++ } ++ ++ *dev_out = data->dev; ++ return(fd); ++} ++ ++int tty_console_write(int fd, const char *buf, int n, void *d) ++{ ++ struct tty_chan *data = d; ++ ++ return(generic_console_write(fd, buf, n, &data->tt)); ++} ++ ++struct chan_ops tty_ops = { ++ .type = "tty", ++ .init = tty_chan_init, ++ .open = tty_open, ++ .close = generic_close, ++ .read = generic_read, ++ .write = generic_write, ++ .console_write = tty_console_write, ++ .window_size = generic_window_size, ++ .free = generic_free, ++ .winch = 0, ++}; ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/ubd_kern.c um/arch/um/drivers/ubd_kern.c +--- orig/arch/um/drivers/ubd_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/ubd_kern.c Tue Mar 11 15:46:36 2003 +@@ -0,0 +1,1067 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++/* 2001-09-28...2002-04-17 ++ * Partition stuff by James_McMechan@hotmail.com ++ * old style ubd by setting UBD_SHIFT to 0 ++ */ ++ ++#define MAJOR_NR UBD_MAJOR ++#define UBD_SHIFT 4 ++ ++#include "linux/config.h" ++#include "linux/blk.h" ++#include "linux/blkdev.h" ++#include "linux/hdreg.h" ++#include "linux/init.h" ++#include "linux/devfs_fs_kernel.h" ++#include "linux/cdrom.h" ++#include "linux/proc_fs.h" ++#include "linux/ctype.h" ++#include "linux/capability.h" ++#include "linux/mm.h" ++#include "linux/vmalloc.h" ++#include "linux/blkpg.h" ++#include "linux/genhd.h" ++#include "linux/spinlock.h" ++#include "asm/segment.h" ++#include "asm/uaccess.h" ++#include "asm/irq.h" ++#include "asm/types.h" ++#include "user_util.h" ++#include "mem_user.h" ++#include "kern_util.h" ++#include "kern.h" ++#include "mconsole_kern.h" ++#include "init.h" ++#include "irq_user.h" ++#include "ubd_user.h" ++#include "2_5compat.h" ++#include "os.h" ++ ++static int ubd_open(struct inode * inode, struct file * filp); ++static int ubd_release(struct inode * inode, struct file * file); ++static int ubd_ioctl(struct inode * inode, struct file * file, ++ unsigned int cmd, unsigned long arg); ++static int ubd_revalidate(kdev_t rdev); ++static int ubd_revalidate1(kdev_t rdev); ++ ++#define MAX_DEV (8) ++#define MAX_MINOR (MAX_DEV << UBD_SHIFT) ++ ++/* Not 
/*
 * Default open flags for ubd devices: .r, .w and .cl set, .c clear.
 * Only .s depends on the configuration; it is set when
 * CONFIG_BLK_DEV_UBD_SYNC is defined.
 */
#ifdef CONFIG_BLK_DEV_UBD_SYNC
#define UBD_OPEN_SYNC 1
#else
#define UBD_OPEN_SYNC 0
#endif

#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, \
					 .s = UBD_OPEN_SYNC, .c = 0, \
					 .cl = 1 })
/*
 * read_proc handler of the fake ide "media" entry; the content is always
 * the string "disk\n".
 *
 * page  - buffer that receives the complete string
 * start - set to page + off when data is returned
 * off   - offset requested by the reader
 * count - maximum number of bytes wanted
 * eof   - set to 1 when the request reaches the end of the string
 * data  - unused
 *
 * Returns the number of bytes available at *start, or 0 at or past the
 * end.  The remaining length is kept in an off_t: storing it in an int
 * truncated the result, so a large offset could wrap to a positive
 * length.
 */
static int proc_ide_read_media(char *page, char **start, off_t off, int count,
			       int *eof, void *data)
{
	off_t len;

	strcpy(page, "disk\n");
	len = (off_t) strlen("disk\n") - off;
	if (len < count){
		*eof = 1;
		if (len <= 0) return 0;
	}
	else len = count;
	*start = page + off;
	/* Here len is at most count, so it fits in an int. */
	return (int) len;
}
/*
 * Parse a ubd unit designator at *ptr: either a number (any base
 * accepted by simple_strtoul with base 0) or a single letter 'a'..'h'
 * standing for 0..7.
 *
 * On success *ptr is advanced past the designator and the unit is
 * returned.  Otherwise -1 is returned and *ptr is left unchanged.
 */
static int parse_unit(char **ptr)
{
	char *cur = *ptr;
	char *after;
	int unit;

	if(isdigit(*cur)) {
		unit = simple_strtoul(cur, &after, 0);
		if(after == cur)
			return(-1);
		*ptr = after;
		return(unit);
	}

	if((*cur < 'a') || (*cur > 'h'))
		return(-1);

	*ptr = cur + 1;
	return(*cur - 'a');
}
++ ++ if(n >= MAX_DEV){ ++ printk(KERN_ERR "ubd_setup : index %d out of range " ++ "(%d devices)\n", n, MAX_DEV); ++ return(1); ++ } ++ ++ err = 1; ++ spin_lock(&ubd_lock); ++ ++ dev = &ubd_dev[n]; ++ if(dev->file != NULL){ ++ printk(KERN_ERR "ubd_setup : device already configured\n"); ++ goto out2; ++ } ++ ++ if(index_out) *index_out = n; ++ ++ if (*str == 'r'){ ++ flags.w = 0; ++ str++; ++ } ++ if (*str == 's'){ ++ flags.s = 1; ++ str++; ++ } ++ if(*str++ != '='){ ++ printk(KERN_ERR "ubd_setup : Expected '='\n"); ++ goto out2; ++ } ++ ++ err = 0; ++ backing_file = strchr(str, ','); ++ if(backing_file){ ++ *backing_file = '\0'; ++ backing_file++; ++ } ++ dev->file = str; ++ dev->cow.file = backing_file; ++ dev->boot_openflags = flags; ++ out2: ++ spin_unlock(&ubd_lock); ++ return(err); ++} ++ ++static int ubd_setup(char *str) ++{ ++ ubd_setup_common(str, NULL); ++ return(1); ++} ++ ++__setup("ubd", ubd_setup); ++__uml_help(ubd_setup, ++"ubd=\n" ++" This is used to associate a device with a file in the underlying\n" ++" filesystem. Usually, there is a filesystem in the file, but \n" ++" that's not required. Swap devices containing swap files can be\n" ++" specified like this. Also, a file which doesn't contain a\n" ++" filesystem can have its contents read in the virtual \n" ++" machine by running dd on the device. n must be in the range\n" ++" 0 to 7. Appending an 'r' to the number will cause that device\n" ++" to be mounted read-only. For example ubd1r=./ext_fs. 
Appending\n" ++" an 's' (has to be _after_ 'r', if there is one) will cause data\n" ++" to be written to disk on the host immediately.\n\n" ++); ++ ++static int fakehd(char *str) ++{ ++ printk(KERN_INFO ++ "fakehd : Changing ubd_gendisk.major_name to \"hd\".\n"); ++ ubd_gendisk.major_name = "hd"; ++ return(1); ++} ++ ++__setup("fakehd", fakehd); ++__uml_help(fakehd, ++"fakehd\n" ++" Change the ubd device name to \"hd\".\n\n" ++); ++ ++static void do_ubd_request(request_queue_t * q); ++ ++/* Only changed by ubd_init, which is an initcall. */ ++int thread_fd = -1; ++ ++/* Changed by ubd_handler, which is serialized because interrupts only ++ * happen on CPU 0. ++ */ ++int intr_count = 0; ++ ++static void ubd_finish(int error) ++{ ++ int nsect; ++ ++ if(error){ ++ end_request(0); ++ return; ++ } ++ nsect = CURRENT->current_nr_sectors; ++ CURRENT->sector += nsect; ++ CURRENT->buffer += nsect << 9; ++ CURRENT->errors = 0; ++ CURRENT->nr_sectors -= nsect; ++ CURRENT->current_nr_sectors = 0; ++ end_request(1); ++} ++ ++static void ubd_handler(void) ++{ ++ struct io_thread_req req; ++ int n; ++ ++ DEVICE_INTR = NULL; ++ intr_count++; ++ n = read_ubd_fs(thread_fd, &req, sizeof(req)); ++ if(n != sizeof(req)){ ++ printk(KERN_ERR "Pid %d - spurious interrupt in ubd_handler, " ++ "errno = %d\n", os_getpid(), -n); ++ spin_lock(&io_request_lock); ++ end_request(0); ++ spin_unlock(&io_request_lock); ++ return; ++ } ++ ++ if((req.offset != ((__u64) (CURRENT->sector)) << 9) || ++ (req.length != (CURRENT->current_nr_sectors) << 9)) ++ panic("I/O op mismatch"); ++ ++ spin_lock(&io_request_lock); ++ ubd_finish(req.error); ++ reactivate_fd(thread_fd, UBD_IRQ); ++ do_ubd_request(ubd_queue); ++ spin_unlock(&io_request_lock); ++} ++ ++static void ubd_intr(int irq, void *dev, struct pt_regs *unused) ++{ ++ ubd_handler(); ++} ++ ++/* Only changed by ubd_init, which is an initcall. 
*/ ++static int io_pid = -1; ++ ++void kill_io_thread(void) ++{ ++ if(io_pid != -1) ++ os_kill_process(io_pid, 1); ++} ++ ++__uml_exitcall(kill_io_thread); ++ ++/* Initialized in an initcall, and unchanged thereafter */ ++devfs_handle_t ubd_dir_handle; ++ ++static int ubd_add(int n) ++{ ++ struct ubd *dev = &ubd_dev[n]; ++ char name[sizeof("nnnnnn\0")], dev_name[sizeof("ubd0x")]; ++ int err = -EISDIR; ++ ++ if(dev->file == NULL) ++ goto out; ++ ++ err = ubd_revalidate1(MKDEV(MAJOR_NR, n << UBD_SHIFT)); ++ if(err) ++ goto out; ++ ++ sprintf(name, "%d", n); ++ dev->devfs = devfs_register(ubd_dir_handle, name, DEVFS_FL_REMOVABLE, ++ MAJOR_NR, n << UBD_SHIFT, S_IFBLK | ++ S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP, ++ &ubd_blops, NULL); ++ ++ if(!strcmp(ubd_gendisk.major_name, "ubd")) ++ sprintf(dev_name, "%s%d", ubd_gendisk.major_name, n); ++ else sprintf(dev_name, "%s%c", ubd_gendisk.major_name, ++ n + 'a'); ++ ++ make_ide_entries(dev_name); ++ return(0); ++ ++ out: ++ return(err); ++} ++ ++static int ubd_config(char *str) ++{ ++ int n, err; ++ ++ str = uml_strdup(str); ++ if(str == NULL){ ++ printk(KERN_ERR "ubd_config failed to strdup string\n"); ++ return(1); ++ } ++ err = ubd_setup_common(str, &n); ++ if(err){ ++ kfree(str); ++ return(-1); ++ } ++ if(n == -1) return(0); ++ ++ spin_lock(&ubd_lock); ++ err = ubd_add(n); ++ if(err) ++ ubd_dev[n].file = NULL; ++ spin_unlock(&ubd_lock); ++ ++ return(err); ++} ++ ++static int ubd_get_config(char *name, char *str, int size, char **error_out) ++{ ++ struct ubd *dev; ++ char *end; ++ int n, len = 0; ++ ++ n = simple_strtoul(name, &end, 0); ++ if((*end != '\0') || (end == name)){ ++ *error_out = "ubd_get_config : didn't parse device number"; ++ return(-1); ++ } ++ ++ if((n >= MAX_DEV) || (n < 0)){ ++ *error_out = "ubd_get_config : device number out of range"; ++ return(-1); ++ } ++ ++ dev = &ubd_dev[n]; ++ spin_lock(&ubd_lock); ++ ++ if(dev->file == NULL){ ++ CONFIG_CHUNK(str, size, len, "", 1); ++ goto out; ++ } ++ ++ 
CONFIG_CHUNK(str, size, len, dev->file, 0); ++ ++ if(dev->cow.file != NULL){ ++ CONFIG_CHUNK(str, size, len, ",", 0); ++ CONFIG_CHUNK(str, size, len, dev->cow.file, 1); ++ } ++ else CONFIG_CHUNK(str, size, len, "", 1); ++ ++ out: ++ spin_unlock(&ubd_lock); ++ return(len); ++} ++ ++static int ubd_remove(char *str) ++{ ++ struct ubd *dev; ++ int n, err = -ENODEV; ++ ++ if(isdigit(*str)){ ++ char *end; ++ n = simple_strtoul(str, &end, 0); ++ if ((*end != '\0') || (end == str)) ++ return(err); ++ } ++ else if (('a' <= *str) && (*str <= 'h')) ++ n = *str - 'a'; ++ else ++ return(err); /* it should be a number 0-7/a-h */ ++ ++ if((n < 0) || (n >= MAX_DEV)) ++ return(err); ++ ++ dev = &ubd_dev[n]; ++ ++ spin_lock(&ubd_lock); ++ err = 0; ++ if(dev->file == NULL) ++ goto out; ++ err = -1; ++ if(dev->count > 0) ++ goto out; ++ if(dev->devfs != NULL) ++ devfs_unregister(dev->devfs); ++ ++ *dev = ((struct ubd) DEFAULT_UBD); ++ err = 0; ++ out: ++ spin_unlock(&ubd_lock); ++ return(err); ++} ++ ++static struct mc_device ubd_mc = { ++ .name = "ubd", ++ .config = ubd_config, ++ .get_config = ubd_get_config, ++ .remove = ubd_remove, ++}; ++ ++static int ubd_mc_init(void) ++{ ++ mconsole_register_dev(&ubd_mc); ++ return(0); ++} ++ ++__initcall(ubd_mc_init); ++ ++static request_queue_t *ubd_get_queue(kdev_t device) ++{ ++ return(ubd_queue); ++} ++ ++int ubd_init(void) ++{ ++ unsigned long stack; ++ int i, err; ++ ++ ubd_dir_handle = devfs_mk_dir (NULL, "ubd", NULL); ++ if (devfs_register_blkdev(MAJOR_NR, "ubd", &ubd_blops)) { ++ printk(KERN_ERR "ubd: unable to get major %d\n", MAJOR_NR); ++ return -1; ++ } ++ read_ahead[MAJOR_NR] = 8; /* 8 sector (4kB) read-ahead */ ++ blksize_size[MAJOR_NR] = blk_sizes; ++ blk_size[MAJOR_NR] = sizes; ++ INIT_HARDSECT(hardsect_size, MAJOR_NR, hardsect_sizes); ++ ++ ubd_queue = BLK_DEFAULT_QUEUE(MAJOR_NR); ++ blk_init_queue(ubd_queue, DEVICE_REQUEST); ++ INIT_ELV(ubd_queue, &ubd_queue->elevator); ++ ++ add_gendisk(&ubd_gendisk); ++ if (fake_major != 
MAJOR_NR){ ++ /* major number 0 is used to auto select */ ++ err = devfs_register_blkdev(fake_major, "fake", &ubd_blops); ++ if(fake_major == 0){ ++ /* auto device number case */ ++ fake_major = err; ++ if(err == 0) ++ return(-ENODEV); ++ } ++ else if (err){ ++ /* not auto so normal error */ ++ printk(KERN_ERR "ubd: error %d getting major %d\n", ++ err, fake_major); ++ return(-ENODEV); ++ } ++ ++ blk_dev[fake_major].queue = ubd_get_queue; ++ read_ahead[fake_major] = 8; /* 8 sector (4kB) read-ahead */ ++ blksize_size[fake_major] = blk_sizes; ++ blk_size[fake_major] = sizes; ++ INIT_HARDSECT(hardsect_size, fake_major, hardsect_sizes); ++ add_gendisk(&fake_gendisk); ++ } ++ ++ for(i=0;ifd); ++ if(dev->cow.file != NULL) { ++ os_close_file(dev->cow.fd); ++ vfree(dev->cow.bitmap); ++ dev->cow.bitmap = NULL; ++ } ++} ++ ++static int ubd_open_dev(struct ubd *dev) ++{ ++ struct openflags flags; ++ int err, create_cow, *create_ptr; ++ ++ dev->openflags = dev->boot_openflags; ++ create_cow = 0; ++ create_ptr = (dev->cow.file != NULL) ? 
&create_cow : NULL; ++ dev->fd = open_ubd_file(dev->file, &dev->openflags, &dev->cow.file, ++ &dev->cow.bitmap_offset, &dev->cow.bitmap_len, ++ &dev->cow.data_offset, create_ptr); ++ ++ if((dev->fd == -ENOENT) && create_cow){ ++ dev->fd = create_cow_file(dev->file, dev->cow.file, ++ dev->openflags, 1 << 9, ++ &dev->cow.bitmap_offset, ++ &dev->cow.bitmap_len, ++ &dev->cow.data_offset); ++ if(dev->fd >= 0){ ++ printk(KERN_INFO "Creating \"%s\" as COW file for " ++ "\"%s\"\n", dev->file, dev->cow.file); ++ } ++ } ++ ++ if(dev->fd < 0) return(dev->fd); ++ ++ if(dev->cow.file != NULL){ ++ err = -ENOMEM; ++ dev->cow.bitmap = (void *) vmalloc(dev->cow.bitmap_len); ++ if(dev->cow.bitmap == NULL) goto error; ++ flush_tlb_kernel_vm(); ++ ++ err = read_cow_bitmap(dev->fd, dev->cow.bitmap, ++ dev->cow.bitmap_offset, ++ dev->cow.bitmap_len); ++ if(err) goto error; ++ ++ flags = dev->openflags; ++ flags.w = 0; ++ err = open_ubd_file(dev->cow.file, &flags, NULL, NULL, NULL, ++ NULL, NULL); ++ if(err < 0) goto error; ++ dev->cow.fd = err; ++ } ++ return(0); ++ error: ++ os_close_file(dev->fd); ++ return(err); ++} ++ ++static int ubd_file_size(struct ubd *dev, __u64 *size_out) ++{ ++ char *file; ++ ++ file = dev->cow.file ? 
dev->cow.file : dev->file; ++ return(os_file_size(file, size_out)); ++} ++ ++static int ubd_open(struct inode *inode, struct file *filp) ++{ ++ struct ubd *dev; ++ int n, offset, err = 0; ++ ++ n = DEVICE_NR(inode->i_rdev); ++ dev = &ubd_dev[n]; ++ if(n >= MAX_DEV) ++ return -ENODEV; ++ ++ spin_lock(&ubd_lock); ++ offset = n << UBD_SHIFT; ++ ++ if(dev->count == 0){ ++ err = ubd_open_dev(dev); ++ if(err){ ++ printk(KERN_ERR "ubd%d: Can't open \"%s\": " ++ "errno = %d\n", n, dev->file, -err); ++ goto out; ++ } ++ err = ubd_file_size(dev, &dev->size); ++ if(err) ++ goto out; ++ sizes[offset] = dev->size / BLOCK_SIZE; ++ ubd_part[offset].nr_sects = dev->size / hardsect_sizes[offset]; ++ } ++ dev->count++; ++ if((filp->f_mode & FMODE_WRITE) && !dev->openflags.w){ ++ if(--dev->count == 0) ubd_close(dev); ++ err = -EROFS; ++ } ++ out: ++ spin_unlock(&ubd_lock); ++ return(err); ++} ++ ++static int ubd_release(struct inode * inode, struct file * file) ++{ ++ int n, offset; ++ ++ n = DEVICE_NR(inode->i_rdev); ++ offset = n << UBD_SHIFT; ++ if(n >= MAX_DEV) ++ return -ENODEV; ++ ++ spin_lock(&ubd_lock); ++ if(--ubd_dev[n].count == 0) ++ ubd_close(&ubd_dev[n]); ++ spin_unlock(&ubd_lock); ++ ++ return(0); ++} ++ ++void cowify_req(struct io_thread_req *req, struct ubd *dev) ++{ ++ int i, update_bitmap, sector = req->offset >> 9; ++ ++ if(req->length > (sizeof(req->sector_mask) * 8) << 9) ++ panic("Operation too long"); ++ if(req->op == UBD_READ) { ++ for(i = 0; i < req->length >> 9; i++){ ++ if(ubd_test_bit(sector + i, (unsigned char *) ++ dev->cow.bitmap)){ ++ ubd_set_bit(i, (unsigned char *) ++ &req->sector_mask); ++ } ++ } ++ } ++ else { ++ update_bitmap = 0; ++ for(i = 0; i < req->length >> 9; i++){ ++ ubd_set_bit(i, (unsigned char *) ++ &req->sector_mask); ++ if(!ubd_test_bit(sector + i, (unsigned char *) ++ dev->cow.bitmap)) ++ update_bitmap = 1; ++ ubd_set_bit(sector + i, (unsigned char *) ++ dev->cow.bitmap); ++ } ++ if(update_bitmap){ ++ req->cow_offset = sector / 
(sizeof(unsigned long) * 8); ++ req->bitmap_words[0] = ++ dev->cow.bitmap[req->cow_offset]; ++ req->bitmap_words[1] = ++ dev->cow.bitmap[req->cow_offset + 1]; ++ req->cow_offset *= sizeof(unsigned long); ++ req->cow_offset += dev->cow.bitmap_offset; ++ } ++ } ++} ++ ++static int prepare_request(struct request *req, struct io_thread_req *io_req) ++{ ++ struct ubd *dev; ++ __u64 block; ++ int nsect, minor, n; ++ ++ if(req->rq_status == RQ_INACTIVE) return(1); ++ ++ minor = MINOR(req->rq_dev); ++ n = minor >> UBD_SHIFT; ++ dev = &ubd_dev[n]; ++ if(IS_WRITE(req) && !dev->openflags.w){ ++ printk("Write attempted on readonly ubd device %d\n", n); ++ end_request(0); ++ return(1); ++ } ++ ++ req->sector += ubd_part[minor].start_sect; ++ block = req->sector; ++ nsect = req->current_nr_sectors; ++ ++ io_req->op = (req->cmd == READ) ? UBD_READ : UBD_WRITE; ++ io_req->fds[0] = (dev->cow.file != NULL) ? dev->cow.fd : dev->fd; ++ io_req->fds[1] = dev->fd; ++ io_req->offsets[0] = 0; ++ io_req->offsets[1] = dev->cow.data_offset; ++ io_req->offset = ((__u64) block) << 9; ++ io_req->length = nsect << 9; ++ io_req->buffer = req->buffer; ++ io_req->sectorsize = 1 << 9; ++ io_req->sector_mask = 0; ++ io_req->cow_offset = -1; ++ io_req->error = 0; ++ ++ if(dev->cow.file != NULL) cowify_req(io_req, dev); ++ return(0); ++} ++ ++static void do_ubd_request(request_queue_t *q) ++{ ++ struct io_thread_req io_req; ++ struct request *req; ++ int err, n; ++ ++ if(thread_fd == -1){ ++ while(!list_empty(&q->queue_head)){ ++ req = blkdev_entry_next_request(&q->queue_head); ++ err = prepare_request(req, &io_req); ++ if(!err){ ++ do_io(&io_req); ++ ubd_finish(io_req.error); ++ } ++ } ++ } ++ else { ++ if(DEVICE_INTR || list_empty(&q->queue_head)) return; ++ req = blkdev_entry_next_request(&q->queue_head); ++ err = prepare_request(req, &io_req); ++ if(!err){ ++ SET_INTR(ubd_handler); ++ n = write_ubd_fs(thread_fd, (char *) &io_req, ++ sizeof(io_req)); ++ if(n != sizeof(io_req)) ++ printk("write to io 
thread failed, " ++ "errno = %d\n", -n); ++ } ++ } ++} ++ ++static int ubd_ioctl(struct inode * inode, struct file * file, ++ unsigned int cmd, unsigned long arg) ++{ ++ struct hd_geometry *loc = (struct hd_geometry *) arg; ++ struct ubd *dev; ++ int n, minor, err; ++ struct hd_driveid ubd_id = { ++ .cyls = 0, ++ .heads = 128, ++ .sectors = 32, ++ }; ++ ++ if(!inode) return(-EINVAL); ++ minor = MINOR(inode->i_rdev); ++ n = minor >> UBD_SHIFT; ++ if(n >= MAX_DEV) ++ return(-EINVAL); ++ dev = &ubd_dev[n]; ++ switch (cmd) { ++ struct hd_geometry g; ++ struct cdrom_volctrl volume; ++ case HDIO_GETGEO: ++ if(!loc) return(-EINVAL); ++ g.heads = 128; ++ g.sectors = 32; ++ g.cylinders = dev->size / (128 * 32 * hardsect_sizes[minor]); ++ g.start = 2; ++ return(copy_to_user(loc, &g, sizeof(g)) ? -EFAULT : 0); ++ case BLKGETSIZE: /* Return device size */ ++ if(!arg) return(-EINVAL); ++ err = verify_area(VERIFY_WRITE, (long *) arg, sizeof(long)); ++ if(err) ++ return(err); ++ put_user(ubd_part[minor].nr_sects, (long *) arg); ++ return(0); ++ case BLKRRPART: /* Re-read partition tables */ ++ return(ubd_revalidate(inode->i_rdev)); ++ ++ case HDIO_SET_UNMASKINTR: ++ if(!capable(CAP_SYS_ADMIN)) return(-EACCES); ++ if((arg > 1) || (minor & 0x3F)) return(-EINVAL); ++ return(0); ++ ++ case HDIO_GET_UNMASKINTR: ++ if(!arg) return(-EINVAL); ++ err = verify_area(VERIFY_WRITE, (long *) arg, sizeof(long)); ++ if(err) ++ return(err); ++ return(0); ++ ++ case HDIO_GET_MULTCOUNT: ++ if(!arg) return(-EINVAL); ++ err = verify_area(VERIFY_WRITE, (long *) arg, sizeof(long)); ++ if(err) ++ return(err); ++ return(0); ++ ++ case HDIO_SET_MULTCOUNT: ++ if(!capable(CAP_SYS_ADMIN)) return(-EACCES); ++ if(MINOR(inode->i_rdev) & 0x3F) return(-EINVAL); ++ return(0); ++ ++ case HDIO_GET_IDENTITY: ++ ubd_id.cyls = dev->size / (128 * 32 * hardsect_sizes[minor]); ++ if(copy_to_user((char *) arg, (char *) &ubd_id, ++ sizeof(ubd_id))) ++ return(-EFAULT); ++ return(0); ++ ++ case CDROMVOLREAD: ++ 
if(copy_from_user(&volume, (char *) arg, sizeof(volume))) ++ return(-EFAULT); ++ volume.channel0 = 255; ++ volume.channel1 = 255; ++ volume.channel2 = 255; ++ volume.channel3 = 255; ++ if(copy_to_user((char *) arg, &volume, sizeof(volume))) ++ return(-EFAULT); ++ return(0); ++ ++ default: ++ return blk_ioctl(inode->i_rdev, cmd, arg); ++ } ++} ++ ++static int ubd_revalidate1(kdev_t rdev) ++{ ++ int i, n, offset, err = 0, pcount = 1 << UBD_SHIFT; ++ struct ubd *dev; ++ struct hd_struct *part; ++ ++ n = DEVICE_NR(rdev); ++ offset = n << UBD_SHIFT; ++ dev = &ubd_dev[n]; ++ ++ part = &ubd_part[offset]; ++ ++ /* clear all old partition counts */ ++ for(i = 1; i < pcount; i++) { ++ part[i].start_sect = 0; ++ part[i].nr_sects = 0; ++ } ++ ++ /* If it already has been opened we can check the partitions ++ * directly ++ */ ++ if(dev->count){ ++ part->start_sect = 0; ++ register_disk(&ubd_gendisk, MKDEV(MAJOR_NR, offset), pcount, ++ &ubd_blops, part->nr_sects); ++ } ++ else if(dev->file){ ++ err = ubd_open_dev(dev); ++ if(err){ ++ printk(KERN_ERR "unable to open %s for validation\n", ++ dev->file); ++ goto out; ++ } ++ ++ /* have to recompute sizes since we opened it */ ++ err = ubd_file_size(dev, &dev->size); ++ if(err) { ++ ubd_close(dev); ++ goto out; ++ } ++ part->start_sect = 0; ++ part->nr_sects = dev->size / hardsect_sizes[offset]; ++ register_disk(&ubd_gendisk, MKDEV(MAJOR_NR, offset), pcount, ++ &ubd_blops, part->nr_sects); ++ ++ /* we are done so close it */ ++ ubd_close(dev); ++ } ++ else err = -ENODEV; ++ out: ++ return(err); ++} ++ ++static int ubd_revalidate(kdev_t rdev) ++{ ++ int err; ++ ++ spin_lock(&ubd_lock); ++ err = ubd_revalidate1(rdev); ++ spin_unlock(&ubd_lock); ++ return(err); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/drivers/ubd_user.c um/arch/um/drivers/ubd_user.c +--- orig/arch/um/drivers/ubd_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/drivers/ubd_user.c Thu Mar 6 18:08:55 2003 +@@ -0,0 +1,626 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Copyright (C) 2001 Ridgerun,Inc (glonnon@ridgerun.com) ++ * Licensed under the GPL ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "asm/types.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "user.h" ++#include "ubd_user.h" ++#include "os.h" ++ ++#include ++#include ++#if __BYTE_ORDER == __BIG_ENDIAN ++# define ntohll(x) (x) ++# define htonll(x) (x) ++#elif __BYTE_ORDER == __LITTLE_ENDIAN ++# define ntohll(x) bswap_64(x) ++# define htonll(x) bswap_64(x) ++#else ++#error "__BYTE_ORDER not defined" ++#endif ++ ++#define PATH_LEN_V1 256 ++ ++struct cow_header_v1 { ++ int magic; ++ int version; ++ char backing_file[PATH_LEN_V1]; ++ time_t mtime; ++ __u64 size; ++ int sectorsize; ++}; ++ ++#define PATH_LEN_V2 MAXPATHLEN ++ ++struct cow_header_v2 { ++ unsigned long magic; ++ unsigned long version; ++ char backing_file[PATH_LEN_V2]; ++ time_t mtime; ++ __u64 size; ++ int sectorsize; ++}; ++ ++union cow_header { ++ struct cow_header_v1 v1; ++ struct cow_header_v2 v2; ++}; ++ ++#define COW_MAGIC 0x4f4f4f4d /* MOOO */ ++#define COW_VERSION 2 ++ ++static void sizes(__u64 size, int sectorsize, int bitmap_offset, ++ unsigned long *bitmap_len_out, int *data_offset_out) ++{ ++ *bitmap_len_out = (size + sectorsize - 1) / (8 * sectorsize); ++ ++ *data_offset_out = bitmap_offset + *bitmap_len_out; ++ *data_offset_out = (*data_offset_out + sectorsize - 1) / sectorsize; ++ *data_offset_out *= 
sectorsize; ++} ++ ++static int read_cow_header(int fd, int *magic_out, char **backing_file_out, ++ time_t *mtime_out, __u64 *size_out, ++ int *sectorsize_out, int *bitmap_offset_out) ++{ ++ union cow_header *header; ++ char *file; ++ int err, n; ++ unsigned long version, magic; ++ ++ header = um_kmalloc(sizeof(*header)); ++ if(header == NULL){ ++ printk("read_cow_header - Failed to allocate header\n"); ++ return(-ENOMEM); ++ } ++ err = -EINVAL; ++ n = read(fd, header, sizeof(*header)); ++ if(n < offsetof(typeof(header->v1), backing_file)){ ++ printk("read_cow_header - short header\n"); ++ goto out; ++ } ++ ++ magic = header->v1.magic; ++ if(magic == COW_MAGIC) { ++ version = header->v1.version; ++ } ++ else if(magic == ntohl(COW_MAGIC)){ ++ version = ntohl(header->v1.version); ++ } ++ else goto out; ++ ++ *magic_out = COW_MAGIC; ++ ++ if(version == 1){ ++ if(n < sizeof(header->v1)){ ++ printk("read_cow_header - failed to read V1 header\n"); ++ goto out; ++ } ++ *mtime_out = header->v1.mtime; ++ *size_out = header->v1.size; ++ *sectorsize_out = header->v1.sectorsize; ++ *bitmap_offset_out = sizeof(header->v1); ++ file = header->v1.backing_file; ++ } ++ else if(version == 2){ ++ if(n < sizeof(header->v2)){ ++ printk("read_cow_header - failed to read V2 header\n"); ++ goto out; ++ } ++ *mtime_out = ntohl(header->v2.mtime); ++ *size_out = ntohll(header->v2.size); ++ *sectorsize_out = ntohl(header->v2.sectorsize); ++ *bitmap_offset_out = sizeof(header->v2); ++ file = header->v2.backing_file; ++ } ++ else { ++ printk("read_cow_header - invalid COW version\n"); ++ goto out; ++ } ++ err = -ENOMEM; ++ *backing_file_out = uml_strdup(file); ++ if(*backing_file_out == NULL){ ++ printk("read_cow_header - failed to allocate backing file\n"); ++ goto out; ++ } ++ err = 0; ++ out: ++ kfree(header); ++ return(err); ++} ++ ++static int same_backing_files(char *from_cmdline, char *from_cow, char *cow) ++{ ++ struct stat64 buf1, buf2; ++ ++ if(from_cmdline == NULL) return(1); ++ 
if(!strcmp(from_cmdline, from_cow)) return(1); ++ ++ if(stat64(from_cmdline, &buf1) < 0){ ++ printk("Couldn't stat '%s', errno = %d\n", from_cmdline, ++ errno); ++ return(1); ++ } ++ if(stat64(from_cow, &buf2) < 0){ ++ printk("Couldn't stat '%s', errno = %d\n", from_cow, errno); ++ return(1); ++ } ++ if((buf1.st_dev == buf2.st_dev) && (buf1.st_ino == buf2.st_ino)) ++ return(1); ++ ++ printk("Backing file mismatch - \"%s\" requested,\n" ++ "\"%s\" specified in COW header of \"%s\"\n", ++ from_cmdline, from_cow, cow); ++ return(0); ++} ++ ++static int backing_file_mismatch(char *file, __u64 size, time_t mtime) ++{ ++ struct stat64 buf; ++ long long actual; ++ int err; ++ ++ if(stat64(file, &buf) < 0){ ++ printk("Failed to stat backing file \"%s\", errno = %d\n", ++ file, errno); ++ return(-errno); ++ } ++ ++ err = os_file_size(file, &actual); ++ if(err){ ++ printk("Failed to get size of backing file \"%s\", " ++ "errno = %d\n", file, -err); ++ return(err); ++ } ++ ++ if(actual != size){ ++ printk("Size mismatch (%ld vs %ld) of COW header vs backing " ++ "file\n", size, actual); ++ return(-EINVAL); ++ } ++ if(buf.st_mtime != mtime){ ++ printk("mtime mismatch (%ld vs %ld) of COW header vs backing " ++ "file\n", mtime, buf.st_mtime); ++ return(-EINVAL); ++ } ++ return(0); ++} ++ ++int read_cow_bitmap(int fd, void *buf, int offset, int len) ++{ ++ int err; ++ ++ err = os_seek_file(fd, offset); ++ if(err != 0) return(-errno); ++ err = read(fd, buf, len); ++ if(err < 0) return(-errno); ++ return(0); ++} ++ ++static int absolutize(char *to, int size, char *from) ++{ ++ char save_cwd[256], *slash; ++ int remaining; ++ ++ if(getcwd(save_cwd, sizeof(save_cwd)) == NULL) { ++ printk("absolutize : unable to get cwd - errno = %d\n", errno); ++ return(-1); ++ } ++ slash = strrchr(from, '/'); ++ if(slash != NULL){ ++ *slash = '\0'; ++ if(chdir(from)){ ++ *slash = '/'; ++ printk("absolutize : Can't cd to '%s' - errno = %d\n", ++ from, errno); ++ return(-1); ++ } ++ *slash = '/'; ++ 
if(getcwd(to, size) == NULL){ ++ printk("absolutize : unable to get cwd of '%s' - " ++ "errno = %d\n", from, errno); ++ return(-1); ++ } ++ remaining = size - strlen(to); ++ if(strlen(slash) + 1 > remaining){ ++ printk("absolutize : unable to fit '%s' into %d " ++ "chars\n", from, size); ++ return(-1); ++ } ++ strcat(to, slash); ++ } ++ else { ++ if(strlen(save_cwd) + 1 + strlen(from) + 1 > size){ ++ printk("absolutize : unable to fit '%s' into %d " ++ "chars\n", from, size); ++ return(-1); ++ } ++ strcpy(to, save_cwd); ++ strcat(to, "/"); ++ strcat(to, from); ++ } ++ chdir(save_cwd); ++ return(0); ++} ++ ++static int write_cow_header(char *cow_file, int fd, char *backing_file, ++ int sectorsize, long long *size) ++{ ++ struct cow_header_v2 *header; ++ struct stat64 buf; ++ int err; ++ ++ err = os_seek_file(fd, 0); ++ if(err != 0){ ++ printk("write_cow_header - lseek failed, errno = %d\n", errno); ++ return(-errno); ++ } ++ ++ err = -ENOMEM; ++ header = um_kmalloc(sizeof(*header)); ++ if(header == NULL){ ++ printk("Failed to allocate COW V2 header\n"); ++ goto out; ++ } ++ header->magic = htonl(COW_MAGIC); ++ header->version = htonl(COW_VERSION); ++ ++ err = -EINVAL; ++ if(strlen(backing_file) > sizeof(header->backing_file) - 1){ ++ printk("Backing file name \"%s\" is too long - names are " ++ "limited to %d characters\n", backing_file, ++ sizeof(header->backing_file) - 1); ++ goto out_free; ++ } ++ ++ if(absolutize(header->backing_file, sizeof(header->backing_file), ++ backing_file)) ++ goto out_free; ++ ++ err = stat64(header->backing_file, &buf); ++ if(err < 0){ ++ printk("Stat of backing file '%s' failed, errno = %d\n", ++ header->backing_file, errno); ++ err = -errno; ++ goto out_free; ++ } ++ ++ err = os_file_size(header->backing_file, size); ++ if(err){ ++ printk("Couldn't get size of backing file '%s', errno = %d\n", ++ header->backing_file, -*size); ++ goto out_free; ++ } ++ ++ header->mtime = htonl(buf.st_mtime); ++ header->size = htonll(*size); ++ 
header->sectorsize = htonl(sectorsize); ++ ++ err = write(fd, header, sizeof(*header)); ++ if(err != sizeof(*header)){ ++ printk("Write of header to new COW file '%s' failed, " ++ "errno = %d\n", cow_file, errno); ++ goto out_free; ++ } ++ err = 0; ++ out_free: ++ kfree(header); ++ out: ++ return(err); ++} ++ ++int open_ubd_file(char *file, struct openflags *openflags, ++ char **backing_file_out, int *bitmap_offset_out, ++ unsigned long *bitmap_len_out, int *data_offset_out, ++ int *create_cow_out) ++{ ++ time_t mtime; ++ __u64 size; ++ char *backing_file; ++ int fd, err, sectorsize, magic, same, mode = 0644; ++ ++ if((fd = os_open_file(file, *openflags, mode)) < 0){ ++ if((fd == -ENOENT) && (create_cow_out != NULL)) ++ *create_cow_out = 1; ++ if(!openflags->w || ++ ((errno != EROFS) && (errno != EACCES))) return(-errno); ++ openflags->w = 0; ++ if((fd = os_open_file(file, *openflags, mode)) < 0) ++ return(fd); ++ } ++ ++ err = os_lock_file(fd, openflags->w); ++ if(err){ ++ printk("Failed to lock '%s', errno = %d\n", file, -err); ++ goto error; ++ } ++ ++ if(backing_file_out == NULL) return(fd); ++ ++ err = read_cow_header(fd, &magic, &backing_file, &mtime, &size, ++ §orsize, bitmap_offset_out); ++ if(err && (*backing_file_out != NULL)){ ++ printk("Failed to read COW header from COW file \"%s\", " ++ "errno = %d\n", file, err); ++ goto error; ++ } ++ if(err) return(fd); ++ ++ if(backing_file_out == NULL) return(fd); ++ ++ same = same_backing_files(*backing_file_out, backing_file, file); ++ ++ if(!same && !backing_file_mismatch(*backing_file_out, size, mtime)){ ++ printk("Switching backing file to '%s'\n", *backing_file_out); ++ err = write_cow_header(file, fd, *backing_file_out, ++ sectorsize, &size); ++ if(err){ ++ printk("Switch failed, errno = %d\n", err); ++ return(err); ++ } ++ } ++ else { ++ *backing_file_out = backing_file; ++ err = backing_file_mismatch(*backing_file_out, size, mtime); ++ if(err) goto error; ++ } ++ ++ sizes(size, sectorsize, 
*bitmap_offset_out, bitmap_len_out, ++ data_offset_out); ++ ++ return(fd); ++ error: ++ os_close_file(fd); ++ return(err); ++} ++ ++int create_cow_file(char *cow_file, char *backing_file, struct openflags flags, ++ int sectorsize, int *bitmap_offset_out, ++ unsigned long *bitmap_len_out, int *data_offset_out) ++{ ++ __u64 offset; ++ int err, fd; ++ long long size; ++ char zero = 0; ++ ++ flags.c = 1; ++ fd = open_ubd_file(cow_file, &flags, NULL, NULL, NULL, NULL, NULL); ++ if(fd < 0){ ++ err = fd; ++ printk("Open of COW file '%s' failed, errno = %d\n", cow_file, ++ -err); ++ goto out; ++ } ++ ++ err = write_cow_header(cow_file, fd, backing_file, sectorsize, &size); ++ if(err) goto out_close; ++ ++ sizes(size, sectorsize, sizeof(struct cow_header_v2), ++ bitmap_len_out, data_offset_out); ++ *bitmap_offset_out = sizeof(struct cow_header_v2); ++ ++ offset = *data_offset_out + size - sizeof(zero); ++ err = os_seek_file(fd, offset); ++ if(err != 0){ ++ printk("cow bitmap lseek failed : errno = %d\n", errno); ++ goto out_close; ++ } ++ ++ /* does not really matter how much we write it is just to set EOF ++ * this also sets the entire COW bitmap ++ * to zero without having to allocate it ++ */ ++ err = os_write_file(fd, &zero, sizeof(zero)); ++ if(err != sizeof(zero)){ ++ printk("Write of bitmap to new COW file '%s' failed, " ++ "errno = %d\n", cow_file, errno); ++ goto out_close; ++ } ++ ++ return(fd); ++ ++ out_close: ++ close(fd); ++ out: ++ return(err); ++} ++ ++int read_ubd_fs(int fd, void *buffer, int len) ++{ ++ int n; ++ ++ n = read(fd, buffer, len); ++ if(n < 0) return(-errno); ++ else return(n); ++} ++ ++int write_ubd_fs(int fd, char *buffer, int len) ++{ ++ int n; ++ ++ n = write(fd, buffer, len); ++ if(n < 0) return(-errno); ++ else return(n); ++} ++ ++void do_io(struct io_thread_req *req) ++{ ++ char *buf; ++ unsigned long len; ++ int n, nsectors, start, end, bit; ++ __u64 off; ++ ++ nsectors = req->length / req->sectorsize; ++ start = 0; ++ do { ++ bit = 
ubd_test_bit(start, (unsigned char *) &req->sector_mask); ++ end = start; ++ while((end < nsectors) && ++ (ubd_test_bit(end, (unsigned char *) ++ &req->sector_mask) == bit)) ++ end++; ++ ++ if(end != nsectors) ++ printk("end != nsectors\n"); ++ off = req->offset + req->offsets[bit] + ++ start * req->sectorsize; ++ len = (end - start) * req->sectorsize; ++ buf = &req->buffer[start * req->sectorsize]; ++ ++ if(os_seek_file(req->fds[bit], off) != 0){ ++ printk("do_io - lseek failed : errno = %d\n", errno); ++ req->error = 1; ++ return; ++ } ++ if(req->op == UBD_READ){ ++ n = 0; ++ do { ++ buf = &buf[n]; ++ len -= n; ++ n = read(req->fds[bit], buf, len); ++ if (n < 0) { ++ printk("do_io - read returned %d : " ++ "errno = %d fd = %d\n", n, ++ errno, req->fds[bit]); ++ req->error = 1; ++ return; ++ } ++ } while((n < len) && (n != 0)); ++ if (n < len) memset(&buf[n], 0, len - n); ++ } ++ else { ++ n = write(req->fds[bit], buf, len); ++ if(n != len){ ++ printk("do_io - write returned %d : " ++ "errno = %d fd = %d\n", n, ++ errno, req->fds[bit]); ++ req->error = 1; ++ return; ++ } ++ } ++ ++ start = end; ++ } while(start < nsectors); ++ ++ if(req->cow_offset != -1){ ++ if(os_seek_file(req->fds[1], req->cow_offset) != 0){ ++ printk("do_io - bitmap lseek failed : errno = %d\n", ++ errno); ++ req->error = 1; ++ return; ++ } ++ n = write(req->fds[1], &req->bitmap_words, ++ sizeof(req->bitmap_words)); ++ if(n != sizeof(req->bitmap_words)){ ++ printk("do_io - bitmap update returned %d : " ++ "errno = %d fd = %d\n", n, errno, req->fds[1]); ++ req->error = 1; ++ return; ++ } ++ } ++ req->error = 0; ++ return; ++} ++ ++/* Changed in start_io_thread, which is serialized by being called only ++ * from ubd_init, which is an initcall. 
++ */ ++int kernel_fd = -1; ++ ++/* Only changed by the io thread */ ++int io_count = 0; ++ ++int io_thread(void *arg) ++{ ++ struct io_thread_req req; ++ int n; ++ ++ signal(SIGWINCH, SIG_IGN); ++ while(1){ ++ n = read(kernel_fd, &req, sizeof(req)); ++ if(n < 0) printk("io_thread - read returned %d, errno = %d\n", ++ n, errno); ++ else if(n < sizeof(req)){ ++ printk("io_thread - short read : length = %d\n", n); ++ continue; ++ } ++ io_count++; ++ do_io(&req); ++ n = write(kernel_fd, &req, sizeof(req)); ++ if(n != sizeof(req)) ++ printk("io_thread - write failed, errno = %d\n", ++ errno); ++ } ++} ++ ++int start_io_thread(unsigned long sp, int *fd_out) ++{ ++ int pid, fds[2], err; ++ ++ err = os_pipe(fds, 1, 1); ++ if(err){ ++ printk("start_io_thread - os_pipe failed, errno = %d\n", -err); ++ return(-1); ++ } ++ kernel_fd = fds[0]; ++ *fd_out = fds[1]; ++ ++ pid = clone(io_thread, (void *) sp, CLONE_FILES | CLONE_VM | SIGCHLD, ++ NULL); ++ if(pid < 0){ ++ printk("start_io_thread - clone failed : errno = %d\n", errno); ++ return(-errno); ++ } ++ return(pid); ++} ++ ++#ifdef notdef ++int start_io_thread(unsigned long sp, int *fd_out) ++{ ++ int pid; ++ ++ if((kernel_fd = get_pty()) < 0) return(-1); ++ raw(kernel_fd, 0); ++ if((*fd_out = open(ptsname(kernel_fd), O_RDWR)) < 0){ ++ printk("Couldn't open tty for IO\n"); ++ return(-1); ++ } ++ ++ pid = clone(io_thread, (void *) sp, CLONE_FILES | CLONE_VM | SIGCHLD, ++ NULL); ++ if(pid < 0){ ++ printk("start_io_thread - clone failed : errno = %d\n", errno); ++ return(-errno); ++ } ++ return(pid); ++} ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/xterm.c um/arch/um/drivers/xterm.c
+--- orig/arch/um/drivers/xterm.c	Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/xterm.c	Mon Dec 30 20:49:22 2002
+@@ -0,0 +1,229 @@
++/*
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <string.h>
++#include <errno.h>
++#include <fcntl.h>
++#include <termios.h>
++#include <signal.h>
++#include <sched.h>
++#include <sys/socket.h>
++#include "kern_util.h"
++#include "chan_user.h"
++#include "helper.h"
++#include "user_util.h"
++#include "user.h"
++#include "os.h"
++#include "xterm.h"
++
++/* Per-channel state for a device attached to the xterm channel */
++struct xterm_chan {
++	int pid;
++	int helper_pid;
++	char *title;
++	int device;
++	int raw;
++	struct termios tt;
++	unsigned long stack;
++	int direct_rcv;
++};
++
++/* Allocates and fills in the channel state for one device from the channel
++ * options.  Returns NULL if the allocation fails.
++ */
++void *xterm_init(char *str, int device, struct chan_opts *opts)
++{
++	struct xterm_chan *data;
++
++	if((data = malloc(sizeof(*data))) == NULL) return(NULL);
++	*data = ((struct xterm_chan) { .pid 		= -1,
++				       .helper_pid 	= -1,
++				       .device 		= device,
++				       .title 		= opts->xterm_title,
++				       .raw  		= opts->raw,
++				       .stack 		= opts->tramp_stack,
++				       .direct_rcv 	= !opts->in_kernel } );
++	return(data);
++}
++
++/* Only changed by xterm_setup, which is a setup */
++static char *terminal_emulator = "xterm";
++static char *title_switch = "-T";
++static char *exec_switch = "-e";
++
++/* Parses "xterm=<emulator>[,<title switch>[,<exec switch>]]".  The string is
++ * split in place and the variables above are pointed into it; an empty
++ * switch field keeps the default for that switch.
++ */
++static int __init xterm_setup(char *line, int *add)
++{
++	*add = 0;
++	terminal_emulator = line;
++
++	line = strchr(line, ',');
++	if(line == NULL) return(0);
++	*line++ = '\0';
++	if(*line) title_switch = line;
++
++	line = strchr(line, ',');
++	if(line == NULL) return(0);
++	*line++ = '\0';
++	if(*line) exec_switch = line;
++
++	return(0);
++}
++
++__uml_setup("xterm=", xterm_setup,
++"xterm=<terminal emulator>,<title switch>,<exec switch>\n"
++"    Specifies an alternate terminal emulator to use for the debugger,\n"
++"    consoles, and serial lines when they are attached to the xterm channel.\n"
++"    The values are the terminal emulator binary, the switch it uses to set\n"
++"    its title, and the switch it uses to execute a subprocess,\n"
++"    respectively.  The title switch must have the form '<switch> title',\n"
++"    not '<switch>=title'.  Similarly, the exec switch must have the form\n"
++"    '<switch> command arg1 arg2 ...'.\n"
++"    The default values are 'xterm=xterm,-T,-e'.  Values for gnome-terminal\n"
++"    are 'xterm=gnome-terminal,-t,-x'.\n\n"
++);
++
++/* Starts the terminal emulator with port-helper as its command and receives
++ * the descriptor that comes back over a temporary unix socket in /tmp.
++ * Returns the new descriptor, or the error from the step that failed.  The
++ * socket file is removed on every path once the socket exists.
++ */
++int xterm_open(int input, int output, int primary, void *d, char **dev_out)
++{
++	struct xterm_chan *data = d;
++	unsigned long stack;
++	int pid, fd, new, err;
++	char title[256], file[] = "/tmp/xterm-pipeXXXXXX";
++	char *argv[] = { terminal_emulator, title_switch, title, exec_switch,
++			 "/usr/lib/uml/port-helper", "-uml-socket",
++			 file, NULL };
++
++	if(access(argv[4], X_OK))
++		argv[4] = "port-helper";
++
++	/* mkstemp only picks a unique name; the socket replaces the file */
++	fd = mkstemp(file);
++	if(fd < 0){
++		err = errno;	/* saved before printk can change it */
++		printk("xterm_open : mkstemp failed, errno = %d\n", err);
++		return(-err);
++	}
++
++	if(unlink(file)){
++		err = errno;
++		printk("xterm_open : unlink failed, errno = %d\n", err);
++		close(fd);
++		return(-err);
++	}
++	close(fd);
++
++	fd = create_unix_socket(file, sizeof(file));
++	if(fd < 0){
++		printk("xterm_open : create_unix_socket failed, errno = %d\n",
++		       -fd);
++		return(fd);	/* already negative; -fd would look like an fd */
++	}
++
++	snprintf(title, sizeof(title), data->title, data->device);
++	stack = data->stack;
++	pid = run_helper(NULL, NULL, argv, &stack);
++	if(pid < 0){
++		printk("xterm_open : run_helper failed, errno = %d\n", -pid);
++		new = pid;
++		goto out_close;
++	}
++
++	if(data->stack == 0) free_stack(stack, 0);
++
++	if(data->direct_rcv)
++		new = os_rcv_fd(fd, &data->helper_pid);
++	else {
++		if((err = os_set_fd_block(fd, 0)) != 0){
++			printk("xterm_open : failed to set descriptor "
++			       "non-blocking, errno = %d\n", err);
++			new = err;
++			goto out_close;
++		}
++		new = xterm_fd(fd, &data->helper_pid);
++	}
++	if(new < 0){
++		printk("xterm_open : os_rcv_fd failed, errno = %d\n", -new);
++		goto out;
++	}
++
++	tcgetattr(new, &data->tt);
++	if(data->raw) raw(new, 0);
++
++	data->pid = pid;
++	*dev_out = NULL;
++ out:
++	unlink(file);
++	return(new);
++ out_close:
++	close(fd);
++	unlink(file);
++	return(new);
++}
++
++/* Kills the process started by run_helper and the helper whose pid came back
++ * with the descriptor, if either was recorded, then closes the descriptor.
++ */
++void xterm_close(int fd, void *d)
++{
++	struct xterm_chan *data = d;
++
++	if(data->pid != -1)
++		os_kill_process(data->pid, 1);
++	data->pid = -1;
++	if(data->helper_pid != -1)
++		os_kill_process(data->helper_pid, 0);
++	data->helper_pid = -1;
++	close(fd);
++}
++
++/* Releases the state allocated by xterm_init */
++void xterm_free(void *d)
++{
++	free(d);
++}
++
++/* Console output goes through generic_console_write with the saved termios */
++int xterm_console_write(int fd, const char *buf, int n, void *d)
++{
++	struct xterm_chan *data = d;
++
++	return(generic_console_write(fd, buf, n, &data->tt));
++}
++
++/* Channel operations for the "xterm" channel type */
++struct chan_ops xterm_ops = {
++	.type		= "xterm",
++	.init		= xterm_init,
++	.open		= xterm_open,
++	.close		= xterm_close,
++	.read		= generic_read,
++	.write		= generic_write,
++	.console_write	= xterm_console_write,
++	.window_size	= generic_window_size,
++	.free		= xterm_free,
++	.winch		= 1,
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/xterm.h um/arch/um/drivers/xterm.h
+--- orig/arch/um/drivers/xterm.h	Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/xterm.h	Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,25 @@
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __XTERM_H__
++#define __XTERM_H__
++
++/* Waits for a descriptor to arrive on socket and returns it (or an error);
++ * the pid reported along with the descriptor is stored in *pid_out.
++ */
++extern int xterm_fd(int socket, int *pid_out);
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.  This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/drivers/xterm_kern.c um/arch/um/drivers/xterm_kern.c
+--- orig/arch/um/drivers/xterm_kern.c	Wed Dec 31 19:00:00 1969
++++ um/arch/um/drivers/xterm_kern.c	Tue Dec 17 17:31:20 2002
+@@ -0,0 +1,90 @@
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/errno.h"
++#include "linux/slab.h"
++#include "asm/semaphore.h"
++#include "asm/irq.h"
++#include "irq_user.h"
++#include "kern_util.h"
++#include "os.h"
++#include "xterm.h"
++
++/* State shared between xterm_fd and the interrupt handler it registers */
++struct xterm_wait {
++	struct semaphore sem;
++	int fd;
++	int pid;
++	int new_fd;
++};
++
++/* IRQ handler: tries to receive the descriptor.  -EAGAIN means nothing has
++ * arrived yet, so keep waiting; any other result is recorded and wakes up
++ * the waiter in xterm_fd.
++ */
++static void xterm_interrupt(int irq, void *data, struct pt_regs *regs)
++{
++	struct xterm_wait *xterm = data;
++	int fd;
++
++	fd = os_rcv_fd(xterm->fd, &xterm->pid);
++	if(fd == -EAGAIN)
++		return;
++
++	xterm->new_fd = fd;
++	up(&xterm->sem);
++}
++
++/* Registers an IRQ on socket and sleeps until xterm_interrupt has received a
++ * descriptor (or an error) from it.  Returns that result and, on that path,
++ * stores the pid reported by os_rcv_fd in *pid_out.  The wait structure is
++ * freed on every path, including um_request_irq failure.
++ */
++int xterm_fd(int socket, int *pid_out)
++{
++	struct xterm_wait *data;
++	int err, ret;
++
++	data = kmalloc(sizeof(*data), GFP_KERNEL);
++	if(data == NULL){
++		printk(KERN_ERR "xterm_fd : failed to allocate xterm_wait\n");
++		return(-ENOMEM);
++	}
++	*data = ((struct xterm_wait)
++		{ .sem 		= __SEMAPHORE_INITIALIZER(data->sem, 0),
++		  .fd 		= socket,
++		  .pid 		= -1,
++		  .new_fd 	= -1 });
++
++	err = um_request_irq(XTERM_IRQ, socket, IRQ_READ, xterm_interrupt,
++			     SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM,
++			     "xterm", data);
++	if(err){
++		printk(KERN_ERR "xterm_fd : failed to get IRQ for xterm, "
++		       "err = %d\n",  err);
++		ret = err;
++		goto out;
++	}
++	down(&data->sem);
++
++	free_irq(XTERM_IRQ, data);
++
++	ret = data->new_fd;
++	*pid_out = data->pid;
++ out:
++	kfree(data);
++
++	return(ret);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/dyn_link.ld.in um/arch/um/dyn_link.ld.in +--- orig/arch/um/dyn_link.ld.in Wed Dec 31 19:00:00 1969 ++++ um/arch/um/dyn_link.ld.in Fri Jan 17 23:37:27 2003 +@@ -0,0 +1,172 @@ ++OUTPUT_FORMAT("ELF_FORMAT") ++OUTPUT_ARCH(ELF_ARCH) ++ENTRY(_start) ++SEARCH_DIR("/usr/local/i686-pc-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); ++/* Do we need any of these for elf? ++ __DYNAMIC = 0; */ ++SECTIONS ++{ ++ . = START() + SIZEOF_HEADERS; ++ .interp : { *(.interp) } ++ . = ALIGN(4096); ++ __binary_start = .; ++ . = ALIGN(4096); /* Init code and data */ ++ _stext = .; ++ __init_begin = .; ++ .text.init : { *(.text.init) } ++ ++ . 
= ALIGN(4096); ++ ++ /* Read-only sections, merged into text segment: */ ++ .hash : { *(.hash) } ++ .dynsym : { *(.dynsym) } ++ .dynstr : { *(.dynstr) } ++ .gnu.version : { *(.gnu.version) } ++ .gnu.version_d : { *(.gnu.version_d) } ++ .gnu.version_r : { *(.gnu.version_r) } ++ .rel.init : { *(.rel.init) } ++ .rela.init : { *(.rela.init) } ++ .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } ++ .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } ++ .rel.fini : { *(.rel.fini) } ++ .rela.fini : { *(.rela.fini) } ++ .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } ++ .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } ++ .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } ++ .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } ++ .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } ++ .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } ++ .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } ++ .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } ++ .rel.ctors : { *(.rel.ctors) } ++ .rela.ctors : { *(.rela.ctors) } ++ .rel.dtors : { *(.rel.dtors) } ++ .rela.dtors : { *(.rela.dtors) } ++ .rel.got : { *(.rel.got) } ++ .rela.got : { *(.rela.got) } ++ .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } ++ .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } ++ .rel.plt : { *(.rel.plt) } ++ .rela.plt : { *(.rela.plt) } ++ .init : { ++ KEEP (*(.init)) ++ } =0x90909090 ++ .plt : { *(.plt) } ++ .text : { ++ *(.text .stub .text.* .gnu.linkonce.t.*) ++ /* .gnu.warning sections are handled specially by elf32.em. */ ++ *(.gnu.warning) ++ } =0x90909090 ++ .fini : { ++ KEEP (*(.fini)) ++ } =0x90909090 ++ ++ PROVIDE (__etext = .); ++ PROVIDE (_etext = .); ++ PROVIDE (etext = .); ++ .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } ++ .rodata1 : { *(.rodata1) } ++ .eh_frame_hdr : { *(.eh_frame_hdr) } ++ ++ ++ . 
= ALIGN(4096); ++ PROVIDE (_sdata = .); ++ ++include(`arch/um/common.ld.in') ++ ++ /* Ensure the __preinit_array_start label is properly aligned. We ++ could instead move the label definition inside the section, but ++ the linker would then create the section even if it turns out to ++ be empty, which isn't pretty. */ ++ . = ALIGN(32 / 8); ++ .preinit_array : { *(.preinit_array) } ++ .init_array : { *(.init_array) } ++ .fini_array : { *(.fini_array) } ++ .data : { ++ . = ALIGN(KERNEL_STACK_SIZE); /* init_task */ ++ *(.data.init_task) ++ *(.data .data.* .gnu.linkonce.d.*) ++ SORT(CONSTRUCTORS) ++ } ++ .data1 : { *(.data1) } ++ .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } ++ .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } ++ .eh_frame : { KEEP (*(.eh_frame)) } ++ .gcc_except_table : { *(.gcc_except_table) } ++ .dynamic : { *(.dynamic) } ++ .ctors : { ++ /* gcc uses crtbegin.o to find the start of ++ the constructors, so we make sure it is ++ first. Because this is a wildcard, it ++ doesn't matter if the user does not ++ actually link against crtbegin.o; the ++ linker won't look for a file to match a ++ wildcard. The wildcard also means that it ++ doesn't matter which directory crtbegin.o ++ is in. */ ++ KEEP (*crtbegin.o(.ctors)) ++ /* We don't want to include the .ctor section from ++ from the crtend.o file until after the sorted ctors. ++ The .ctor section from the crtend file contains the ++ end of ctors marker and it must be last */ ++ KEEP (*(EXCLUDE_FILE (*crtend.o ) .ctors)) ++ KEEP (*(SORT(.ctors.*))) ++ KEEP (*(.ctors)) ++ } ++ .dtors : { ++ KEEP (*crtbegin.o(.dtors)) ++ KEEP (*(EXCLUDE_FILE (*crtend.o ) .dtors)) ++ KEEP (*(SORT(.dtors.*))) ++ KEEP (*(.dtors)) ++ } ++ .jcr : { KEEP (*(.jcr)) } ++ .got : { *(.got.plt) *(.got) } ++ _edata = .; ++ PROVIDE (edata = .); ++ __bss_start = .; ++ .bss : { ++ *(.dynbss) ++ *(.bss .bss.* .gnu.linkonce.b.*) ++ *(COMMON) ++ /* Align here to ensure that the .bss section occupies space up to ++ _end. 
Align after .bss to ensure correct alignment even if the ++ .bss section disappears because there are no input sections. */ ++ . = ALIGN(32 / 8); ++ . = ALIGN(32 / 8); ++ } ++ _end = .; ++ PROVIDE (end = .); ++ /* Stabs debugging sections. */ ++ .stab 0 : { *(.stab) } ++ .stabstr 0 : { *(.stabstr) } ++ .stab.excl 0 : { *(.stab.excl) } ++ .stab.exclstr 0 : { *(.stab.exclstr) } ++ .stab.index 0 : { *(.stab.index) } ++ .stab.indexstr 0 : { *(.stab.indexstr) } ++ .comment 0 : { *(.comment) } ++ /* DWARF debug sections. ++ Symbols in the DWARF debugging sections are relative to the beginning ++ of the section so we begin them at 0. */ ++ /* DWARF 1 */ ++ .debug 0 : { *(.debug) } ++ .line 0 : { *(.line) } ++ /* GNU DWARF 1 extensions */ ++ .debug_srcinfo 0 : { *(.debug_srcinfo) } ++ .debug_sfnames 0 : { *(.debug_sfnames) } ++ /* DWARF 1.1 and DWARF 2 */ ++ .debug_aranges 0 : { *(.debug_aranges) } ++ .debug_pubnames 0 : { *(.debug_pubnames) } ++ /* DWARF 2 */ ++ .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } ++ .debug_abbrev 0 : { *(.debug_abbrev) } ++ .debug_line 0 : { *(.debug_line) } ++ .debug_frame 0 : { *(.debug_frame) } ++ .debug_str 0 : { *(.debug_str) } ++ .debug_loc 0 : { *(.debug_loc) } ++ .debug_macinfo 0 : { *(.debug_macinfo) } ++ /* SGI/MIPS DWARF 2 extensions */ ++ .debug_weaknames 0 : { *(.debug_weaknames) } ++ .debug_funcnames 0 : { *(.debug_funcnames) } ++ .debug_typenames 0 : { *(.debug_typenames) } ++ .debug_varnames 0 : { *(.debug_varnames) } ++} +diff -Naur -X ../exclude-files orig/arch/um/fs/Makefile um/arch/um/fs/Makefile +--- orig/arch/um/fs/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/fs/Makefile Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,23 @@ ++# ++# Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++O_TARGET := built-in.o ++ ++subdir-y = ++subdir-m = ++ ++subdir-$(CONFIG_HOSTFS) += hostfs ++subdir-$(CONFIG_HPPFS) += hppfs ++ ++obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) ++obj-m += $(join 
$(subdir-m),$(subdir-m:%=/%.o)) ++ ++include $(TOPDIR)/Rules.make ++ ++dep: ++ ++clean: ++ ++archmrproper: +diff -Naur -X ../exclude-files orig/arch/um/fs/hostfs/Makefile um/arch/um/fs/hostfs/Makefile +--- orig/arch/um/fs/hostfs/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/fs/hostfs/Makefile Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,24 @@ ++# ++# Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++# struct stat64 changed the inode field name between 2.2 and 2.4 from st_ino ++# to __st_ino. It stayed in the same place, so as long as the correct name ++# is used, hostfs compiled on 2.2 should work on 2.4 and vice versa. ++ ++STAT64_INO_FIELD := $(shell grep -q __st_ino /usr/include/bits/stat.h && \ ++ echo __)st_ino ++ ++USER_CFLAGS := $(USER_CFLAGS) -DSTAT64_INO_FIELD=$(STAT64_INO_FIELD) ++ ++O_TARGET := hostfs.o ++obj-y = hostfs_kern.o hostfs_user.o ++obj-m = $(O_TARGET) ++ ++USER_OBJS = $(filter %_user.o,$(obj-y)) ++ ++include $(TOPDIR)/Rules.make ++ ++$(USER_OBJS) : %.o: %.c ++ $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $< +diff -Naur -X ../exclude-files orig/arch/um/fs/hostfs/hostfs.h um/arch/um/fs/hostfs/hostfs.h +--- orig/arch/um/fs/hostfs/hostfs.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/fs/hostfs/hostfs.h Mon Feb 24 23:00:47 2003 +@@ -0,0 +1,69 @@ ++#ifndef __UM_FS_HOSTFS ++#define __UM_FS_HOSTFS ++ ++#include "os.h" ++ ++/* These are exactly the same definitions as in fs.h, but the names are ++ * changed so that this file can be included in both kernel and user files. 
++ */ ++ ++#define HOSTFS_ATTR_MODE 1 ++#define HOSTFS_ATTR_UID 2 ++#define HOSTFS_ATTR_GID 4 ++#define HOSTFS_ATTR_SIZE 8 ++#define HOSTFS_ATTR_ATIME 16 ++#define HOSTFS_ATTR_MTIME 32 ++#define HOSTFS_ATTR_CTIME 64 ++#define HOSTFS_ATTR_ATIME_SET 128 ++#define HOSTFS_ATTR_MTIME_SET 256 ++#define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */ ++#define HOSTFS_ATTR_ATTR_FLAG 1024 ++ ++struct hostfs_iattr { ++ unsigned int ia_valid; ++ mode_t ia_mode; ++ uid_t ia_uid; ++ gid_t ia_gid; ++ loff_t ia_size; ++ time_t ia_atime; ++ time_t ia_mtime; ++ time_t ia_ctime; ++ unsigned int ia_attr_flags; ++}; ++ ++extern int stat_file(const char *path, int *dev_out, ++ unsigned long long *inode_out, int *mode_out, ++ int *nlink_out, int *uid_out, int *gid_out, ++ unsigned long long *size_out, unsigned long *atime_out, ++ unsigned long *mtime_out, unsigned long *ctime_out, ++ int *blksize_out, unsigned long long *blocks_out); ++extern int access_file(char *path, int r, int w, int x); ++extern int open_file(char *path, int r, int w); ++extern int file_type(const char *path, int *rdev); ++extern void *open_dir(char *path, int *err_out); ++extern char *read_dir(void *stream, unsigned long long *pos, ++ unsigned long long *ino_out, int *len_out); ++extern void close_file(void *stream); ++extern void close_dir(void *stream); ++extern int read_file(int fd, unsigned long long *offset, char *buf, int len); ++extern int write_file(int fd, unsigned long long *offset, const char *buf, ++ int len); ++extern int lseek_file(int fd, long long offset, int whence); ++extern int file_create(char *name, int ur, int uw, int ux, int gr, ++ int gw, int gx, int or, int ow, int ox); ++extern int set_attr(const char *file, struct hostfs_iattr *attrs); ++extern int make_symlink(const char *from, const char *to); ++extern int unlink_file(const char *file); ++extern int do_mkdir(const char *file, int mode); ++extern int do_rmdir(const char *file); ++extern int do_mknod(const char *file, int mode, 
int dev); ++extern int link_file(const char *from, const char *to); ++extern int do_readlink(char *file, char *buf, int size); ++extern int rename_file(char *from, char *to); ++extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, ++ long long *bfree_out, long long *bavail_out, ++ long long *files_out, long long *ffree_out, ++ void *fsid_out, int fsid_size, long *namelen_out, ++ long *spare_out); ++ ++#endif +diff -Naur -X ../exclude-files orig/arch/um/fs/hostfs/hostfs_kern.c um/arch/um/fs/hostfs/hostfs_kern.c +--- orig/arch/um/fs/hostfs/hostfs_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/fs/hostfs/hostfs_kern.c Sun Apr 13 21:29:33 2003 +@@ -0,0 +1,870 @@ ++/* ++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <linux/stddef.h> ++#include <linux/fs.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/slab.h> ++#include <linux/pagemap.h> ++#include <linux/blkdev.h> ++#include <asm/uaccess.h> ++#include "hostfs.h" ++#include "kern_util.h" ++#include "kern.h" ++#include "user_util.h" ++#include "2_5compat.h" ++ ++#define file_hostfs_i(file) (&(file)->f_dentry->d_inode->u.hostfs_i) ++ ++int hostfs_d_delete(struct dentry *dentry) ++{ ++ return(1); ++} ++ ++struct dentry_operations hostfs_dentry_ops = { ++ .d_delete = hostfs_d_delete, ++}; ++ ++/* Not changed */ ++static char *root_ino = "/"; ++ ++#define HOSTFS_SUPER_MAGIC 0x00c0ffee ++ ++static struct inode_operations hostfs_iops; ++static struct inode_operations hostfs_dir_iops; ++static struct address_space_operations hostfs_link_aops; ++ ++static char *dentry_name(struct dentry *dentry, int extra) ++{ ++ struct dentry *parent; ++ char *root, *name; ++ int len; ++ ++ len = 0; ++ parent = dentry; ++ while(parent->d_parent != parent){ ++ len += parent->d_name.len + 1; ++ parent = parent->d_parent; ++ } ++ ++ root = parent->d_inode->u.hostfs_i.host_filename; ++ len += strlen(root); ++ name = 
kmalloc(len + extra + 1, GFP_KERNEL); ++ if(name == NULL) return(NULL); ++ ++ name[len] = '\0'; ++ parent = dentry; ++ while(parent->d_parent != parent){ ++ len -= parent->d_name.len + 1; ++ name[len] = '/'; ++ strncpy(&name[len + 1], parent->d_name.name, ++ parent->d_name.len); ++ parent = parent->d_parent; ++ } ++ strncpy(name, root, strlen(root)); ++ return(name); ++} ++ ++static char *inode_name(struct inode *ino, int extra) ++{ ++ struct dentry *dentry; ++ ++ dentry = list_entry(ino->i_dentry.next, struct dentry, d_alias); ++ return(dentry_name(dentry, extra)); ++} ++ ++static int read_name(struct inode *ino, char *name) ++{ ++ /* The non-int inode fields are copied into ints by stat_file and ++ * then copied into the inode because passing the actual pointers ++ * in and having them treated as int * breaks on big-endian machines ++ */ ++ int err; ++ int i_dev, i_mode, i_nlink, i_blksize; ++ unsigned long long i_size; ++ unsigned long long i_ino; ++ unsigned long long i_blocks; ++ err = stat_file(name, &i_dev, &i_ino, &i_mode, &i_nlink, ++ &ino->i_uid, &ino->i_gid, &i_size, &ino->i_atime, ++ &ino->i_mtime, &ino->i_ctime, &i_blksize, &i_blocks); ++ if(err) return(err); ++ ino->i_ino = i_ino; ++ ino->i_dev = i_dev; ++ ino->i_mode = i_mode; ++ ino->i_nlink = i_nlink; ++ ino->i_size = i_size; ++ ino->i_blksize = i_blksize; ++ ino->i_blocks = i_blocks; ++ if(kdev_same(ino->i_sb->s_dev, ROOT_DEV) && (ino->i_uid == getuid())) ++ ino->i_uid = 0; ++ return(0); ++} ++ ++static char *follow_link(char *link) ++{ ++ int len, n; ++ char *name, *resolved, *end; ++ ++ len = 64; ++ while(1){ ++ n = -ENOMEM; ++ name = kmalloc(len, GFP_KERNEL); ++ if(name == NULL) ++ goto out; ++ ++ n = do_readlink(link, name, len); ++ if(n < len) ++ break; ++ len *= 2; ++ kfree(name); ++ } ++ if(n < 0) ++ goto out_free; ++ ++ if(*name == '/') ++ return(name); ++ ++ end = strrchr(link, '/'); ++ if(end == NULL) ++ return(name); ++ ++ *(end + 1) = '\0'; ++ len = strlen(link) + strlen(name) + 1; ++ 
++ resolved = kmalloc(len, GFP_KERNEL); ++ if(resolved == NULL){ ++ n = -ENOMEM; ++ goto out_free; ++ } ++ ++ sprintf(resolved, "%s%s", link, name); ++ kfree(name); ++ kfree(link); ++ return(resolved); ++ ++ out_free: ++ kfree(name); ++ out: ++ return(ERR_PTR(n)); ++} ++ ++static int read_inode(struct inode *ino) ++{ ++ char *name; ++ int err; ++ ++ err = -ENOMEM; ++ name = inode_name(ino, 0); ++ if(name == NULL) ++ goto out; ++ ++ if(file_type(name, NULL) == OS_TYPE_SYMLINK){ ++ name = follow_link(name); ++ if(IS_ERR(name)){ ++ err = PTR_ERR(name); ++ goto out; ++ } ++ } ++ ++ err = read_name(ino, name); ++ kfree(name); ++ out: ++ return(err); ++} ++ ++void hostfs_delete_inode(struct inode *ino) ++{ ++ if(ino->u.hostfs_i.host_filename) ++ kfree(ino->u.hostfs_i.host_filename); ++ ino->u.hostfs_i.host_filename = NULL; ++ ++ if(ino->u.hostfs_i.fd != -1) ++ close_file(&ino->u.hostfs_i.fd); ++ ++ ino->u.hostfs_i.mode = 0; ++ clear_inode(ino); ++} ++ ++int hostfs_statfs(struct super_block *sb, struct statfs *sf) ++{ ++ /* do_statfs uses struct statfs64 internally, but the linux kernel ++ * struct statfs still has 32-bit versions for most of these fields, ++ * so we convert them here ++ */ ++ int err; ++ long long f_blocks; ++ long long f_bfree; ++ long long f_bavail; ++ long long f_files; ++ long long f_ffree; ++ ++ err = do_statfs(sb->s_root->d_inode->u.hostfs_i.host_filename, ++ &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, ++ &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), ++ &sf->f_namelen, sf->f_spare); ++ if(err) return(err); ++ sf->f_blocks = f_blocks; ++ sf->f_bfree = f_bfree; ++ sf->f_bavail = f_bavail; ++ sf->f_files = f_files; ++ sf->f_ffree = f_ffree; ++ sf->f_type = HOSTFS_SUPER_MAGIC; ++ return(0); ++} ++ ++static struct super_operations hostfs_sbops = { ++ .put_inode = force_delete, ++ .delete_inode = hostfs_delete_inode, ++ .statfs = hostfs_statfs, ++}; ++ ++int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) ++{ ++ void *dir; ++ 
char *name; ++ unsigned long long next, ino; ++ int error, len; ++ ++ name = dentry_name(file->f_dentry, 0); ++ if(name == NULL) return(-ENOMEM); ++ dir = open_dir(name, &error); ++ kfree(name); ++ if(dir == NULL) return(-error); ++ next = file->f_pos; ++ while((name = read_dir(dir, &next, &ino, &len)) != NULL){ ++ error = (*filldir)(ent, name, len, file->f_pos, ++ ino, DT_UNKNOWN); ++ if(error) break; ++ file->f_pos = next; ++ } ++ close_dir(dir); ++ return(0); ++} ++ ++int hostfs_file_open(struct inode *ino, struct file *file) ++{ ++ char *name; ++ int mode = 0, r = 0, w = 0, fd; ++ ++ mode = file->f_mode & (FMODE_READ | FMODE_WRITE); ++ if((mode & ino->u.hostfs_i.mode) == mode) ++ return(0); ++ ++ /* The file may already have been opened, but with the wrong access, ++ * so this resets things and reopens the file with the new access. ++ */ ++ if(ino->u.hostfs_i.fd != -1){ ++ close_file(&ino->u.hostfs_i.fd); ++ ino->u.hostfs_i.fd = -1; ++ } ++ ++ ino->u.hostfs_i.mode |= mode; ++ if(ino->u.hostfs_i.mode & FMODE_READ) ++ r = 1; ++ if(ino->u.hostfs_i.mode & FMODE_WRITE) ++ w = 1; ++ if(w) ++ r = 1; ++ ++ name = dentry_name(file->f_dentry, 0); ++ if(name == NULL) ++ return(-ENOMEM); ++ ++ fd = open_file(name, r, w); ++ kfree(name); ++ if(fd < 0) return(fd); ++ file_hostfs_i(file)->fd = fd; ++ ++ return(0); ++} ++ ++int hostfs_dir_open(struct inode *ino, struct file *file) ++{ ++ return(0); ++} ++ ++int hostfs_dir_release(struct inode *ino, struct file *file) ++{ ++ return(0); ++} ++ ++int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) ++{ ++ return(0); ++} ++ ++static struct file_operations hostfs_file_fops = { ++ .owner = NULL, ++ .read = generic_file_read, ++ .write = generic_file_write, ++ .mmap = generic_file_mmap, ++ .open = hostfs_file_open, ++ .release = NULL, ++ .fsync = hostfs_fsync, ++}; ++ ++static struct file_operations hostfs_dir_fops = { ++ .owner = NULL, ++ .readdir = hostfs_readdir, ++ .open = hostfs_dir_open, ++ .release = 
hostfs_dir_release, ++ .fsync = hostfs_fsync, ++}; ++ ++int hostfs_writepage(struct page *page) ++{ ++ struct address_space *mapping = page->mapping; ++ struct inode *inode = mapping->host; ++ char *buffer; ++ unsigned long long base; ++ int count = PAGE_CACHE_SIZE; ++ int end_index = inode->i_size >> PAGE_CACHE_SHIFT; ++ int err; ++ ++ if (page->index >= end_index) ++ count = inode->i_size & (PAGE_CACHE_SIZE-1); ++ ++ buffer = kmap(page); ++ base = ((unsigned long long) page->index) << PAGE_CACHE_SHIFT; ++ ++ err = write_file(inode->u.hostfs_i.fd, &base, buffer, count); ++ if(err != count){ ++ ClearPageUptodate(page); ++ goto out; ++ } ++ ++ if (base > inode->i_size) ++ inode->i_size = base; ++ ++ if (PageError(page)) ++ ClearPageError(page); ++ err = 0; ++ ++ out: ++ kunmap(page); ++ ++ UnlockPage(page); ++ return err; ++} ++ ++int hostfs_readpage(struct file *file, struct page *page) ++{ ++ char *buffer; ++ long long start; ++ int err = 0; ++ ++ start = (long long) page->index << PAGE_CACHE_SHIFT; ++ buffer = kmap(page); ++ err = read_file(file_hostfs_i(file)->fd, &start, buffer, ++ PAGE_CACHE_SIZE); ++ if(err < 0) goto out; ++ ++ flush_dcache_page(page); ++ SetPageUptodate(page); ++ if (PageError(page)) ClearPageError(page); ++ err = 0; ++ out: ++ kunmap(page); ++ UnlockPage(page); ++ return(err); ++} ++ ++int hostfs_prepare_write(struct file *file, struct page *page, ++ unsigned int from, unsigned int to) ++{ ++ char *buffer; ++ long long start, tmp; ++ int err; ++ ++ start = (long long) page->index << PAGE_CACHE_SHIFT; ++ buffer = kmap(page); ++ if(from != 0){ ++ tmp = start; ++ err = read_file(file_hostfs_i(file)->fd, &tmp, buffer, ++ from); ++ if(err < 0) goto out; ++ } ++ if(to != PAGE_CACHE_SIZE){ ++ start += to; ++ err = read_file(file_hostfs_i(file)->fd, &start, buffer + to, ++ PAGE_CACHE_SIZE - to); ++ if(err < 0) goto out; ++ } ++ err = 0; ++ out: ++ kunmap(page); ++ return(err); ++} ++ ++int hostfs_commit_write(struct file *file, struct page *page, 
unsigned from, ++ unsigned to) ++{ ++ struct address_space *mapping = page->mapping; ++ struct inode *inode = mapping->host; ++ char *buffer; ++ long long start; ++ int err = 0; ++ ++ start = (long long) (page->index << PAGE_CACHE_SHIFT) + from; ++ buffer = kmap(page); ++ err = write_file(file_hostfs_i(file)->fd, &start, buffer + from, ++ to - from); ++ if(err > 0) err = 0; ++ if(!err && (start > inode->i_size)) ++ inode->i_size = start; ++ ++ kunmap(page); ++ return(err); ++} ++ ++static struct address_space_operations hostfs_aops = { ++ .writepage = hostfs_writepage, ++ .readpage = hostfs_readpage, ++ .prepare_write = hostfs_prepare_write, ++ .commit_write = hostfs_commit_write ++}; ++ ++static struct inode *get_inode(struct super_block *sb, struct dentry *dentry, ++ int *error) ++{ ++ struct inode *inode; ++ char *name; ++ int type, err = -ENOMEM, rdev; ++ ++ inode = new_inode(sb); ++ if(inode == NULL) ++ goto out; ++ ++ inode->u.hostfs_i.host_filename = NULL; ++ inode->u.hostfs_i.fd = -1; ++ inode->u.hostfs_i.mode = 0; ++ insert_inode_hash(inode); ++ if(dentry){ ++ name = dentry_name(dentry, 0); ++ if(name == NULL){ ++ err = -ENOMEM; ++ goto out_put; ++ } ++ type = file_type(name, &rdev); ++ kfree(name); ++ } ++ else type = OS_TYPE_DIR; ++ inode->i_sb = sb; ++ ++ err = 0; ++ if(type == OS_TYPE_SYMLINK) ++ inode->i_op = &page_symlink_inode_operations; ++ else if(type == OS_TYPE_DIR) ++ inode->i_op = &hostfs_dir_iops; ++ else inode->i_op = &hostfs_iops; ++ ++ if(type == OS_TYPE_DIR) inode->i_fop = &hostfs_dir_fops; ++ else inode->i_fop = &hostfs_file_fops; ++ ++ if(type == OS_TYPE_SYMLINK) ++ inode->i_mapping->a_ops = &hostfs_link_aops; ++ else inode->i_mapping->a_ops = &hostfs_aops; ++ ++ switch (type) { ++ case OS_TYPE_CHARDEV: ++ init_special_inode(inode, S_IFCHR, rdev); ++ break; ++ case OS_TYPE_BLOCKDEV: ++ init_special_inode(inode, S_IFBLK, rdev); ++ break; ++ case OS_TYPE_FIFO: ++ init_special_inode(inode, S_IFIFO, 0); ++ break; ++ case OS_TYPE_SOCK: ++ 
init_special_inode(inode, S_IFSOCK, 0); ++ break; ++ } ++ ++ if(error) *error = err; ++ return(inode); ++ out_put: ++ iput(inode); ++ out: ++ if(error) *error = err; ++ return(NULL); ++} ++ ++int hostfs_create(struct inode *dir, struct dentry *dentry, int mode) ++{ ++ struct inode *inode; ++ char *name; ++ int error; ++ ++ inode = get_inode(dir->i_sb, dentry, &error); ++ if(error) return(error); ++ name = dentry_name(dentry, 0); ++ if(name == NULL){ ++ iput(inode); ++ return(-ENOMEM); ++ } ++ error = file_create(name, ++ mode & S_IRUSR, mode & S_IWUSR, mode & S_IXUSR, ++ mode & S_IRGRP, mode & S_IWGRP, mode & S_IXGRP, ++ mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH); ++ if(!error) error = read_name(inode, name); ++ kfree(name); ++ if(error){ ++ iput(inode); ++ return(error); ++ } ++ d_instantiate(dentry, inode); ++ return(0); ++} ++ ++struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry) ++{ ++ struct inode *inode; ++ char *name; ++ int error; ++ ++ inode = get_inode(ino->i_sb, dentry, &error); ++ if(error != 0) return(ERR_PTR(error)); ++ name = dentry_name(dentry, 0); ++ if(name == NULL) return(ERR_PTR(-ENOMEM)); ++ error = read_name(inode, name); ++ kfree(name); ++ if(error){ ++ iput(inode); ++ if(error == -ENOENT) inode = NULL; ++ else return(ERR_PTR(error)); ++ } ++ d_add(dentry, inode); ++ dentry->d_op = &hostfs_dentry_ops; ++ return(NULL); ++} ++ ++static char *inode_dentry_name(struct inode *ino, struct dentry *dentry) ++{ ++ char *file; ++ int len; ++ ++ file = inode_name(ino, dentry->d_name.len + 1); ++ if(file == NULL) return(NULL); ++ strcat(file, "/"); ++ len = strlen(file); ++ strncat(file, dentry->d_name.name, dentry->d_name.len); ++ file[len + dentry->d_name.len] = '\0'; ++ return(file); ++} ++ ++int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) ++{ ++ char *from_name, *to_name; ++ int err; ++ ++ if((from_name = inode_dentry_name(ino, from)) == NULL) ++ return(-ENOMEM); ++ to_name = dentry_name(to, 0); ++ 
if(to_name == NULL){ ++ kfree(from_name); ++ return(-ENOMEM); ++ } ++ err = link_file(to_name, from_name); ++ kfree(from_name); ++ kfree(to_name); ++ return(err); ++} ++ ++int hostfs_unlink(struct inode *ino, struct dentry *dentry) ++{ ++ char *file; ++ int err; ++ ++ if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM); ++ err = unlink_file(file); ++ kfree(file); ++ return(err); ++} ++ ++int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) ++{ ++ char *file; ++ int err; ++ ++ if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM); ++ err = make_symlink(file, to); ++ kfree(file); ++ return(err); ++} ++ ++int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode) ++{ ++ char *file; ++ int err; ++ ++ if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM); ++ err = do_mkdir(file, mode); ++ kfree(file); ++ return(err); ++} ++ ++int hostfs_rmdir(struct inode *ino, struct dentry *dentry) ++{ ++ char *file; ++ int err; ++ ++ if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM); ++ err = do_rmdir(file); ++ kfree(file); ++ return(err); ++} ++ ++int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, int dev) ++{ ++ struct inode *inode; ++ char *name; ++ int error; ++ ++ inode = get_inode(dir->i_sb, dentry, &error); ++ if(error) return(error); ++ name = dentry_name(dentry, 0); ++ if(name == NULL){ ++ iput(inode); ++ return(-ENOMEM); ++ } ++ init_special_inode(inode, mode, dev); ++ error = do_mknod(name, mode, dev); ++ if(!error) error = read_name(inode, name); ++ kfree(name); ++ if(error){ ++ iput(inode); ++ return(error); ++ } ++ d_instantiate(dentry, inode); ++ return(0); ++} ++ ++int hostfs_rename(struct inode *from_ino, struct dentry *from, ++ struct inode *to_ino, struct dentry *to) ++{ ++ char *from_name, *to_name; ++ int err; ++ ++ if((from_name = inode_dentry_name(from_ino, from)) == NULL) ++ return(-ENOMEM); ++ if((to_name = inode_dentry_name(to_ino, to)) == NULL){ 
++ kfree(from_name); ++ return(-ENOMEM); ++ } ++ err = rename_file(from_name, to_name); ++ kfree(from_name); ++ kfree(to_name); ++ return(err); ++} ++ ++void hostfs_truncate(struct inode *ino) ++{ ++ not_implemented(); ++} ++ ++int hostfs_permission(struct inode *ino, int desired) ++{ ++ char *name; ++ int r = 0, w = 0, x = 0, err; ++ ++ if(desired & MAY_READ) r = 1; ++ if(desired & MAY_WRITE) w = 1; ++ if(desired & MAY_EXEC) x = 1; ++ name = inode_name(ino, 0); ++ if(name == NULL) return(-ENOMEM); ++ err = access_file(name, r, w, x); ++ kfree(name); ++ if(!err) err = vfs_permission(ino, desired); ++ return(err); ++} ++ ++int hostfs_setattr(struct dentry *dentry, struct iattr *attr) ++{ ++ struct hostfs_iattr attrs; ++ char *name; ++ int err; ++ ++ attrs.ia_valid = 0; ++ if(attr->ia_valid & ATTR_MODE){ ++ attrs.ia_valid |= HOSTFS_ATTR_MODE; ++ attrs.ia_mode = attr->ia_mode; ++ } ++ if(attr->ia_valid & ATTR_UID){ ++ if(kdev_same(dentry->d_inode->i_sb->s_dev, ROOT_DEV) && ++ (attr->ia_uid == 0)) ++ attr->ia_uid = getuid(); ++ attrs.ia_valid |= HOSTFS_ATTR_UID; ++ attrs.ia_uid = attr->ia_uid; ++ } ++ if(attr->ia_valid & ATTR_GID){ ++ if(kdev_same(dentry->d_inode->i_sb->s_dev, ROOT_DEV) && ++ (attr->ia_gid == 0)) ++ attr->ia_gid = getuid(); ++ attrs.ia_valid |= HOSTFS_ATTR_GID; ++ attrs.ia_gid = attr->ia_gid; ++ } ++ if(attr->ia_valid & ATTR_SIZE){ ++ attrs.ia_valid |= HOSTFS_ATTR_SIZE; ++ attrs.ia_size = attr->ia_size; ++ } ++ if(attr->ia_valid & ATTR_ATIME){ ++ attrs.ia_valid |= HOSTFS_ATTR_ATIME; ++ attrs.ia_atime = attr->ia_atime; ++ } ++ if(attr->ia_valid & ATTR_MTIME){ ++ attrs.ia_valid |= HOSTFS_ATTR_MTIME; ++ attrs.ia_mtime = attr->ia_mtime; ++ } ++ if(attr->ia_valid & ATTR_CTIME){ ++ attrs.ia_valid |= HOSTFS_ATTR_CTIME; ++ attrs.ia_ctime = attr->ia_ctime; ++ } ++ if(attr->ia_valid & ATTR_ATIME_SET){ ++ attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET; ++ } ++ if(attr->ia_valid & ATTR_MTIME_SET){ ++ attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET; ++ } ++ name = 
dentry_name(dentry, 0); ++ if(name == NULL) return(-ENOMEM); ++ err = set_attr(name, &attrs); ++ kfree(name); ++ if(err) ++ return(err); ++ ++ return(inode_setattr(dentry->d_inode, attr)); ++} ++ ++int hostfs_getattr(struct dentry *dentry, struct iattr *attr) ++{ ++ not_implemented(); ++ return(-EINVAL); ++} ++ ++static struct inode_operations hostfs_iops = { ++ .create = hostfs_create, ++ .link = hostfs_link, ++ .unlink = hostfs_unlink, ++ .symlink = hostfs_symlink, ++ .mkdir = hostfs_mkdir, ++ .rmdir = hostfs_rmdir, ++ .mknod = hostfs_mknod, ++ .rename = hostfs_rename, ++ .truncate = hostfs_truncate, ++ .permission = hostfs_permission, ++ .setattr = hostfs_setattr, ++ .getattr = hostfs_getattr, ++}; ++ ++static struct inode_operations hostfs_dir_iops = { ++ .create = hostfs_create, ++ .lookup = hostfs_lookup, ++ .link = hostfs_link, ++ .unlink = hostfs_unlink, ++ .symlink = hostfs_symlink, ++ .mkdir = hostfs_mkdir, ++ .rmdir = hostfs_rmdir, ++ .mknod = hostfs_mknod, ++ .rename = hostfs_rename, ++ .truncate = hostfs_truncate, ++ .permission = hostfs_permission, ++ .setattr = hostfs_setattr, ++ .getattr = hostfs_getattr, ++}; ++ ++int hostfs_link_readpage(struct file *file, struct page *page) ++{ /* Read the link target of the page's host inode into the page; returns 0 or a negative error, and unmaps and unlocks the page on every path. */ ++ char *buffer, *name; ++ int err; ++ ++ buffer = kmap(page); ++ name = inode_name(page->mapping->host, 0); ++ err = -ENOMEM; ++ if(name == NULL) goto out; /* still has to kunmap and unlock the page */ ++ err = do_readlink(name, buffer, PAGE_CACHE_SIZE); ++ kfree(name); ++ if(err == PAGE_CACHE_SIZE) /* link text filled the whole page, so it may have been cut short */ ++ err = -E2BIG; ++ else if(err > 0){ ++ flush_dcache_page(page); ++ SetPageUptodate(page); ++ if (PageError(page)) ClearPageError(page); ++ err = 0; ++ } ++ out: ++ kunmap(page); ++ UnlockPage(page); ++ return(err); ++} ++ ++static struct address_space_operations hostfs_link_aops = { ++ .readpage = hostfs_link_readpage, ++}; ++ ++static struct super_block *hostfs_read_super_common(struct super_block *sb, ++ char *data) ++{ ++ struct inode *root_inode; ++ char *name; ++ ++
sb->s_blocksize = 1024; ++ sb->s_blocksize_bits = 10; ++ sb->s_magic = HOSTFS_SUPER_MAGIC; ++ sb->s_op = &hostfs_sbops; ++ if((data == NULL) || (*((char *) data) == '\0')) data = root_ino; ++ name = kmalloc(strlen(data) + 1, GFP_KERNEL); ++ if(name == NULL) return(NULL); ++ strcpy(name, data); ++ root_inode = get_inode(sb, NULL, NULL); ++ if(root_inode == NULL) ++ goto out_free; ++ ++ root_inode->u.hostfs_i.host_filename = name; ++ sb->s_root = d_alloc_root(root_inode); ++ if(read_inode(root_inode)) ++ goto out_put; ++ return(sb); ++ ++ out_free: ++ kfree(name); ++ out_put: ++ iput(root_inode); ++ return(NULL); ++} ++ ++struct super_block *hostfs_read_super(struct super_block *sb, void *data, ++ int silent) ++{ ++ return(hostfs_read_super_common(sb, data)); ++} ++ ++DECLARE_FSTYPE(hostfs_type, "hostfs", hostfs_read_super, 0); ++ ++static int __init init_hostfs(void) ++{ ++ return(register_filesystem(&hostfs_type)); ++} ++ ++static void __exit exit_hostfs(void) ++{ ++ unregister_filesystem(&hostfs_type); ++} ++ ++module_init(init_hostfs) ++module_exit(exit_hostfs) ++MODULE_LICENSE("GPL"); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/fs/hostfs/hostfs_user.c um/arch/um/fs/hostfs/hostfs_user.c +--- orig/arch/um/fs/hostfs/hostfs_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/fs/hostfs/hostfs_user.c Fri Jan 31 21:48:30 2003 +@@ -0,0 +1,341 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <unistd.h> ++#include <stdio.h> ++#include <fcntl.h> ++#include <dirent.h> ++#include <errno.h> ++#include <utime.h> ++#include <string.h> ++#include <sys/stat.h> ++#include <sys/time.h> ++#include <sys/vfs.h> ++#include "hostfs.h" ++#include "kern_util.h" ++#include "user.h" ++ ++int stat_file(const char *path, int *dev_out, unsigned long long *inode_out, ++ int *mode_out, int *nlink_out, int *uid_out, int *gid_out, ++ unsigned long long *size_out, unsigned long *atime_out, ++ unsigned long *mtime_out, unsigned long *ctime_out, ++ int *blksize_out, unsigned long long *blocks_out) ++{ ++ struct stat64 buf; ++ ++ if(lstat64(path, &buf) < 0) ++ return(-errno); ++ if(dev_out != NULL) *dev_out = buf.st_dev; ++ ++ /* See the Makefile for why STAT64_INO_FIELD is passed in ++ * by the build ++ */ ++ if(inode_out != NULL) *inode_out = buf.STAT64_INO_FIELD; ++ if(mode_out != NULL) *mode_out = buf.st_mode; ++ if(nlink_out != NULL) *nlink_out = buf.st_nlink; ++ if(uid_out != NULL) *uid_out = buf.st_uid; ++ if(gid_out != NULL) *gid_out = buf.st_gid; ++ if(size_out != NULL) *size_out = buf.st_size; ++ if(atime_out != NULL) *atime_out = buf.st_atime; ++ if(mtime_out != NULL) *mtime_out = buf.st_mtime; ++ if(ctime_out != NULL) *ctime_out = buf.st_ctime; ++ if(blksize_out != NULL) *blksize_out = buf.st_blksize; ++ if(blocks_out != NULL) *blocks_out = buf.st_blocks; ++ return(0); ++} ++ ++int file_type(const char *path, int *rdev) ++{ ++ struct stat64 buf; ++ ++ if(lstat64(path, &buf) 
< 0) ++ return(-errno); ++ if(rdev != NULL) ++ *rdev = buf.st_rdev; ++ ++ if(S_ISDIR(buf.st_mode)) return(OS_TYPE_DIR); ++ else if(S_ISLNK(buf.st_mode)) return(OS_TYPE_SYMLINK); ++ else if(S_ISCHR(buf.st_mode)) return(OS_TYPE_CHARDEV); ++ else if(S_ISBLK(buf.st_mode)) return(OS_TYPE_BLOCKDEV); ++ else if(S_ISFIFO(buf.st_mode))return(OS_TYPE_FIFO); ++ else if(S_ISSOCK(buf.st_mode))return(OS_TYPE_SOCK); ++ else return(OS_TYPE_FILE); ++} ++ ++int access_file(char *path, int r, int w, int x) ++{ ++ int mode = 0; ++ ++ if(r) mode = R_OK; ++ if(w) mode |= W_OK; ++ if(x) mode |= X_OK; ++ if(access(path, mode) != 0) return(-errno); ++ else return(0); ++} ++ ++int open_file(char *path, int r, int w) ++{ ++ int mode = 0, fd; ++ ++ if(r && !w) mode = O_RDONLY; ++ else if(!r && w) mode = O_WRONLY; ++ else if(r && w) mode = O_RDWR; ++ else panic("Impossible mode in open_file"); ++ fd = open64(path, mode); ++ if(fd < 0) return(-errno); ++ else return(fd); ++} ++ ++void *open_dir(char *path, int *err_out) ++{ ++ DIR *dir; ++ ++ dir = opendir(path); ++ *err_out = errno; ++ if(dir == NULL) return(NULL); ++ return(dir); ++} ++ ++char *read_dir(void *stream, unsigned long long *pos, ++ unsigned long long *ino_out, int *len_out) ++{ ++ DIR *dir = stream; ++ struct dirent *ent; ++ ++ seekdir(dir, *pos); ++ ent = readdir(dir); ++ if(ent == NULL) return(NULL); ++ *len_out = strlen(ent->d_name); ++ *ino_out = ent->d_ino; ++ *pos = telldir(dir); ++ return(ent->d_name); ++} ++ ++int read_file(int fd, unsigned long long *offset, char *buf, int len) ++{ ++ int n; ++ ++ n = pread64(fd, buf, len, *offset); ++ if(n < 0) return(-errno); ++ *offset += n; ++ return(n); ++} ++ ++int write_file(int fd, unsigned long long *offset, const char *buf, int len) ++{ ++ int n; ++ ++ n = pwrite64(fd, buf, len, *offset); ++ if(n < 0) return(-errno); ++ *offset += n; ++ return(n); ++} ++ ++int lseek_file(int fd, long long offset, int whence) ++{ ++ int ret; ++ ++ ret = lseek64(fd, offset, whence); ++ if(ret < 
0) return(-errno); ++ return(0); ++} ++ ++void close_file(void *stream) ++{ ++ close(*((int *) stream)); ++} ++ ++void close_dir(void *stream) ++{ ++ closedir(stream); ++} ++ ++int file_create(char *name, int ur, int uw, int ux, int gr, ++ int gw, int gx, int or, int ow, int ox) ++{ ++ int mode, fd; ++ ++ mode = 0; ++ mode |= ur ? S_IRUSR : 0; ++ mode |= uw ? S_IWUSR : 0; ++ mode |= ux ? S_IXUSR : 0; ++ mode |= gr ? S_IRGRP : 0; ++ mode |= gw ? S_IWGRP : 0; ++ mode |= gx ? S_IXGRP : 0; ++ mode |= or ? S_IROTH : 0; ++ mode |= ow ? S_IWOTH : 0; ++ mode |= ox ? S_IXOTH : 0; ++ fd = open64(name, O_CREAT, mode); ++ if(fd < 0) return(-errno); ++ close(fd); ++ return(0); ++} ++ ++int set_attr(const char *file, struct hostfs_iattr *attrs) ++{ ++ struct utimbuf buf; ++ int err, ma; ++ ++ if(attrs->ia_valid & HOSTFS_ATTR_MODE){ ++ if(chmod(file, attrs->ia_mode) != 0) return(-errno); ++ } ++ if(attrs->ia_valid & HOSTFS_ATTR_UID){ ++ if(chown(file, attrs->ia_uid, -1)) return(-errno); ++ } ++ if(attrs->ia_valid & HOSTFS_ATTR_GID){ ++ if(chown(file, -1, attrs->ia_gid)) return(-errno); ++ } ++ if(attrs->ia_valid & HOSTFS_ATTR_SIZE){ ++ if(truncate(file, attrs->ia_size)) return(-errno); ++ } ++ ma = HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET; ++ if((attrs->ia_valid & ma) == ma){ ++ buf.actime = attrs->ia_atime; ++ buf.modtime = attrs->ia_mtime; ++ if(utime(file, &buf) != 0) return(-errno); ++ } ++ else { ++ if(attrs->ia_valid & HOSTFS_ATTR_ATIME_SET){ ++ err = stat_file(file, NULL, NULL, NULL, NULL, NULL, ++ NULL, NULL, NULL, &buf.modtime, NULL, ++ NULL, NULL); ++ if(err != 0) return(err); ++ buf.actime = attrs->ia_atime; ++ if(utime(file, &buf) != 0) return(-errno); ++ } ++ if(attrs->ia_valid & HOSTFS_ATTR_MTIME_SET){ ++ err = stat_file(file, NULL, NULL, NULL, NULL, NULL, ++ NULL, NULL, &buf.actime, NULL, NULL, ++ NULL, NULL); ++ if(err != 0) return(err); ++ buf.modtime = attrs->ia_mtime; ++ if(utime(file, &buf) != 0) return(-errno); ++ } ++ } ++ if(attrs->ia_valid & 
HOSTFS_ATTR_CTIME) ; ++ if(attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)){ ++ err = stat_file(file, NULL, NULL, NULL, NULL, NULL, ++ NULL, NULL, &attrs->ia_atime, &attrs->ia_mtime, ++ NULL, NULL, NULL); ++ if(err != 0) return(err); ++ } ++ return(0); ++} ++ ++int make_symlink(const char *from, const char *to) ++{ ++ int err; ++ ++ err = symlink(to, from); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int unlink_file(const char *file) ++{ ++ int err; ++ ++ err = unlink(file); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int do_mkdir(const char *file, int mode) ++{ ++ int err; ++ ++ err = mkdir(file, mode); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int do_rmdir(const char *file) ++{ ++ int err; ++ ++ err = rmdir(file); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int do_mknod(const char *file, int mode, int dev) ++{ ++ int err; ++ ++ err = mknod(file, mode, dev); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int link_file(const char *to, const char *from) ++{ ++ int err; ++ ++ err = link(to, from); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int do_readlink(char *file, char *buf, int size) ++{ ++ int n; ++ ++ n = readlink(file, buf, size); ++ if(n < 0) ++ return(-errno); ++ if(n < size) ++ buf[n] = '\0'; ++ return(n); ++} ++ ++int rename_file(char *from, char *to) ++{ ++ int err; ++ ++ err = rename(from, to); ++ if(err < 0) return(-errno); ++ return(0); ++} ++ ++int do_statfs(char *root, long *bsize_out, long long *blocks_out, ++ long long *bfree_out, long long *bavail_out, ++ long long *files_out, long long *ffree_out, ++ void *fsid_out, int fsid_size, long *namelen_out, ++ long *spare_out) ++{ ++ struct statfs64 buf; ++ int err; ++ ++ err = statfs64(root, &buf); ++ if(err < 0) return(-errno); ++ *bsize_out = buf.f_bsize; ++ *blocks_out = buf.f_blocks; ++ *bfree_out = buf.f_bfree; ++ *bavail_out = buf.f_bavail; ++ *files_out = buf.f_files; ++ *ffree_out = buf.f_ffree; ++ memcpy(fsid_out, &buf.f_fsid, ++ sizeof(buf.f_fsid) 
> fsid_size ? fsid_size : ++ sizeof(buf.f_fsid)); ++ *namelen_out = buf.f_namelen; ++ spare_out[0] = buf.f_spare[0]; ++ spare_out[1] = buf.f_spare[1]; ++ spare_out[2] = buf.f_spare[2]; ++ spare_out[3] = buf.f_spare[3]; ++ spare_out[4] = buf.f_spare[4]; ++ spare_out[5] = buf.f_spare[5]; ++ return(0); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/fs/hppfs/Makefile um/arch/um/fs/hppfs/Makefile +--- orig/arch/um/fs/hppfs/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/fs/hppfs/Makefile Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,10 @@ ++O_TARGET := hppfs.o ++obj-y = hppfs_kern.o #hppfs_user.o ++obj-m = $(O_TARGET) ++ ++CFLAGS_hppfs_kern.o := $(CFLAGS) ++#CFLAGS_hppfs_user.o := $(USER_CFLAGS) ++ ++override CFLAGS = ++ ++include $(TOPDIR)/Rules.make +diff -Naur -X ../exclude-files orig/arch/um/fs/hppfs/hppfs_kern.c um/arch/um/fs/hppfs/hppfs_kern.c +--- orig/arch/um/fs/hppfs/hppfs_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/fs/hppfs/hppfs_kern.c Thu Feb 27 13:14:26 2003 +@@ -0,0 +1,725 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <linux/fs.h> ++#include <linux/module.h> ++#include <linux/slab.h> ++#include <linux/list.h> ++#include <linux/kernel.h> ++#include <linux/ctype.h> ++#include <asm/uaccess.h> ++#include "os.h" ++ ++struct hppfs_data { ++ struct list_head list; ++ char contents[PAGE_SIZE - sizeof(struct list_head)]; ++}; ++ ++struct hppfs_private { ++ struct file proc_file; ++ int host_fd; ++ loff_t len; ++ struct hppfs_data *contents; ++}; ++ ++#define HPPFS_SUPER_MAGIC 0xb00000ee ++ ++static struct super_operations 
hppfs_sbops; ++ ++static struct inode *get_inode(struct super_block *sb, struct dentry *dentry, ++ int *error); ++ ++static int is_pid(struct dentry *dentry) ++{ ++ struct super_block *sb; ++ int i; ++ ++ sb = dentry->d_sb; ++ if((sb->s_op != &hppfs_sbops) || (dentry->d_parent != sb->s_root)) ++ return(0); ++ ++ for(i = 0; i < dentry->d_name.len; i++){ ++ if(!isdigit(dentry->d_name.name[i])) ++ return(0); ++ } ++ return(1); ++} ++ ++static char *dentry_name(struct dentry *dentry, int extra) ++{ ++ struct dentry *parent; ++ char *root, *name; ++ const char *seg_name; ++ int len, seg_len; ++ ++ len = 0; ++ parent = dentry; ++ while(parent->d_parent != parent){ ++ if(is_pid(parent)) ++ len += strlen("pid") + 1; ++ else len += parent->d_name.len + 1; ++ parent = parent->d_parent; ++ } ++ ++ root = "proc"; ++ len += strlen(root); ++ name = kmalloc(len + extra + 1, GFP_KERNEL); ++ if(name == NULL) return(NULL); ++ ++ name[len] = '\0'; ++ parent = dentry; ++ while(parent->d_parent != parent){ ++ if(is_pid(parent)){ ++ seg_name = "pid"; ++ seg_len = strlen("pid"); ++ } ++ else { ++ seg_name = parent->d_name.name; ++ seg_len = parent->d_name.len; ++ } ++ ++ len -= seg_len + 1; ++ name[len] = '/'; ++ strncpy(&name[len + 1], seg_name, seg_len); ++ parent = parent->d_parent; ++ } ++ strncpy(name, root, strlen(root)); ++ return(name); ++} ++ ++struct dentry_operations hppfs_dentry_ops = { ++}; ++ ++static int file_removed(struct dentry *dentry, const char *file) ++{ ++ char *host_file; ++ int extra, fd; ++ ++ extra = 0; ++ if(file != NULL) extra += strlen(file) + 1; ++ ++ host_file = dentry_name(dentry, extra + strlen("/remove")); ++ if(host_file == NULL){ ++ printk("file_removed : allocation failed\n"); ++ return(-ENOMEM); ++ } ++ ++ if(file != NULL){ ++ strcat(host_file, "/"); ++ strcat(host_file, file); ++ } ++ strcat(host_file, "/remove"); ++ ++ fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); ++ kfree(host_file); ++ if(fd > 0){ ++ os_close_file(fd); ++ return(1); ++ 
} ++ return(0); ++} ++ ++static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry) ++{ ++ struct dentry *proc_dentry; ++ struct inode *inode; ++ int err, deleted; ++ ++ deleted = file_removed(dentry, NULL); ++ if(deleted < 0) ++ return(ERR_PTR(deleted)); ++ else if(deleted) ++ return(ERR_PTR(-ENOENT)); ++ ++ proc_dentry = lookup_hash(&dentry->d_name, ino->u.hppfs_i.proc_dentry); ++ if(IS_ERR(proc_dentry)) ++ return(proc_dentry); ++ ++ inode = get_inode(ino->i_sb, proc_dentry, &err); ++ if(err != 0) ++ return(ERR_PTR(err)); ++ ++ d_add(dentry, inode); ++ dentry->d_op = &hppfs_dentry_ops; ++ return(NULL); ++} ++ ++static struct inode_operations hppfs_file_iops = { ++}; ++ ++static struct inode_operations hppfs_dir_iops = { ++ .lookup = hppfs_lookup, ++}; ++ ++static ssize_t read_proc(struct file *file, char *buf, ssize_t count, ++ loff_t *ppos, int is_user) ++{ ++ ssize_t (*read)(struct file *, char *, size_t, loff_t *); ++ ssize_t n; ++ ++ read = file->f_dentry->d_inode->i_fop->read; ++ ++ if(!is_user) ++ set_fs(KERNEL_DS); ++ ++ n = (*read)(file, buf, count, &file->f_pos); ++ ++ if(!is_user) ++ set_fs(USER_DS); ++ ++ if(ppos) *ppos = file->f_pos; ++ return(n); ++} ++ ++static ssize_t hppfs_read_file(int fd, char *buf, ssize_t count) ++{ /* Copy up to count bytes from host descriptor fd to the user buffer buf through a PAGE_SIZE bounce buffer; returns the number of bytes copied or a negative error. */ ++ ssize_t n; ++ int cur, err; ++ char *new_buf; ++ ++ n = -ENOMEM; ++ new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); ++ if(new_buf == NULL){ ++ printk("hppfs_read_file : kmalloc failed\n"); ++ goto out; ++ } ++ n = 0; ++ while(count > 0){ ++ cur = min_t(ssize_t, count, PAGE_SIZE); ++ err = os_read_file(fd, new_buf, cur); ++ if(err < 0){ ++ printk("hppfs_read : read failed, errno = %d\n", ++ err); ++ n = err; ++ goto out_free; ++ } ++ else if(err == 0) ++ break; ++ ++ if(copy_to_user(buf + n, new_buf, err)){ /* n bytes are already in buf, so append after them */ ++ n = -EFAULT; ++ goto out_free; ++ } ++ n += err; ++ count -= err; ++ } ++ out_free: ++ kfree(new_buf); ++ out: ++ return(n); ++} ++ ++static ssize_t hppfs_read(struct file *file, char *buf, size_t count, ++ loff_t
*ppos) ++{ ++ struct hppfs_private *hppfs = file->private_data; ++ struct hppfs_data *data; ++ loff_t off; ++ int err; ++ ++ if(hppfs->contents != NULL){ ++ if(*ppos >= hppfs->len) return(0); ++ ++ data = hppfs->contents; ++ off = *ppos; ++ while(off >= sizeof(data->contents)){ ++ data = list_entry(data->list.next, struct hppfs_data, ++ list); ++ off -= sizeof(data->contents); ++ } ++ ++ if(off + count > hppfs->len) ++ count = hppfs->len - off; ++ copy_to_user(buf, &data->contents[off], count); ++ *ppos += count; ++ } ++ else if(hppfs->host_fd != -1){ ++ err = os_seek_file(hppfs->host_fd, *ppos); ++ if(err){ ++ printk("hppfs_read : seek failed, errno = %d\n", err); ++ return(err); ++ } ++ count = hppfs_read_file(hppfs->host_fd, buf, count); ++ if(count > 0) ++ *ppos += count; ++ } ++ else count = read_proc(&hppfs->proc_file, buf, count, ppos, 1); ++ ++ return(count); ++} ++ ++static ssize_t hppfs_write(struct file *file, const char *buf, size_t len, ++ loff_t *ppos) ++{ ++ struct hppfs_private *data = file->private_data; ++ struct file *proc_file = &data->proc_file; ++ ssize_t (*write)(struct file *, const char *, size_t, loff_t *); ++ int err; ++ ++ write = proc_file->f_dentry->d_inode->i_fop->write; ++ ++ proc_file->f_pos = file->f_pos; ++ err = (*write)(proc_file, buf, len, &proc_file->f_pos); ++ file->f_pos = proc_file->f_pos; ++ ++ return(err); ++} ++ ++static int open_host_sock(char *host_file, int *filter_out) ++{ ++ char *end; ++ int fd; ++ ++ end = &host_file[strlen(host_file)]; ++ strcpy(end, "/rw"); ++ *filter_out = 1; ++ fd = os_connect_socket(host_file); ++ if(fd > 0) ++ return(fd); ++ ++ strcpy(end, "/r"); ++ *filter_out = 0; ++ fd = os_connect_socket(host_file); ++ return(fd); ++} ++ ++static void free_contents(struct hppfs_data *head) ++{ ++ struct hppfs_data *data; ++ struct list_head *ele, *next; ++ ++ if(head == NULL) return; ++ ++ list_for_each_safe(ele, next, &head->list){ ++ data = list_entry(ele, struct hppfs_data, list); ++ kfree(data); ++ } 
++ kfree(head); ++} ++ ++static struct hppfs_data *hppfs_get_data(int fd, int filter, ++ struct file *proc_file, ++ struct file *hppfs_file, ++ loff_t *size_out) ++{ ++ struct hppfs_data *data, *new, *head; ++ int n, err; ++ ++ err = -ENOMEM; ++ data = kmalloc(sizeof(*data), GFP_KERNEL); ++ if(data == NULL){ ++ printk("hppfs_get_data : head allocation failed\n"); ++ goto failed; ++ } ++ ++ INIT_LIST_HEAD(&data->list); ++ ++ head = data; ++ *size_out = 0; ++ ++ if(filter){ ++ while((n = read_proc(proc_file, data->contents, ++ sizeof(data->contents), NULL, 0)) > 0) ++ os_write_file(fd, data->contents, n); ++ err = os_shutdown_socket(fd, 0, 1); ++ if(err){ ++ printk("hppfs_get_data : failed to shut down " ++ "socket\n"); ++ goto failed_free; ++ } ++ } ++ while(1){ ++ n = os_read_file(fd, data->contents, sizeof(data->contents)); ++ if(n < 0){ ++ err = n; ++ printk("hppfs_get_data : read failed, errno = %d\n", ++ err); ++ goto failed_free; ++ } ++ else if(n == 0) ++ break; ++ ++ *size_out += n; ++ ++ if(n < sizeof(data->contents)) ++ break; ++ ++ new = kmalloc(sizeof(*data), GFP_KERNEL); ++ if(new == 0){ ++ printk("hppfs_get_data : data allocation failed\n"); ++ err = -ENOMEM; ++ goto failed_free; ++ } ++ ++ INIT_LIST_HEAD(&new->list); ++ list_add(&new->list, &data->list); ++ data = new; ++ } ++ return(head); ++ ++ failed_free: ++ free_contents(head); ++ failed: ++ return(ERR_PTR(err)); ++} ++ ++static struct hppfs_private *hppfs_data(void) ++{ ++ struct hppfs_private *data; ++ ++ data = kmalloc(sizeof(*data), GFP_KERNEL); ++ if(data == NULL) ++ return(data); ++ ++ *data = ((struct hppfs_private ) { .host_fd = -1, ++ .len = -1, ++ .contents = NULL } ); ++ return(data); ++} ++ ++static int hppfs_open(struct inode *inode, struct file *file) ++{ ++ struct hppfs_private *data; ++ struct dentry *proc_dentry; ++ char *host_file; ++ int err, fd, type, filter; ++ ++ err = -ENOMEM; ++ data = hppfs_data(); ++ if(data == NULL) ++ goto out; ++ ++ host_file = 
dentry_name(file->f_dentry, strlen("/rw")); ++ if(host_file == NULL) ++ goto out_free2; ++ ++ proc_dentry = inode->u.hppfs_i.proc_dentry; ++ err = init_private_file(&data->proc_file, proc_dentry, file->f_mode); ++ if(err) ++ goto out_free1; ++ ++ type = os_file_type(host_file); ++ if(type == OS_TYPE_FILE){ ++ fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); ++ if(fd >= 0) ++ data->host_fd = fd; ++ else printk("hppfs_open : failed to open '%s', errno = %d\n", ++ host_file, -fd); ++ ++ data->contents = NULL; ++ } ++ else if(type == OS_TYPE_DIR){ ++ fd = open_host_sock(host_file, &filter); ++ if(fd > 0){ ++ data->contents = hppfs_get_data(fd, filter, ++ &data->proc_file, ++ file, &data->len); ++ if(!IS_ERR(data->contents)) ++ data->host_fd = fd; ++ } ++ else printk("hppfs_open : failed to open a socket in " ++ "'%s', errno = %d\n", host_file, -fd); ++ } ++ kfree(host_file); ++ ++ file->private_data = data; ++ return(0); ++ ++ out_free1: ++ kfree(host_file); ++ out_free2: ++ free_contents(data->contents); ++ kfree(data); ++ out: ++ return(err); ++} ++ ++static int hppfs_dir_open(struct inode *inode, struct file *file) ++{ ++ struct hppfs_private *data; ++ struct dentry *proc_dentry; ++ int err; ++ ++ err = -ENOMEM; ++ data = hppfs_data(); ++ if(data == NULL) ++ goto out; ++ ++ proc_dentry = inode->u.hppfs_i.proc_dentry; ++ err = init_private_file(&data->proc_file, proc_dentry, file->f_mode); ++ if(err) ++ goto out_free; ++ ++ file->private_data = data; ++ return(0); ++ ++ out_free: ++ kfree(data); ++ out: ++ return(err); ++} ++ ++static loff_t hppfs_llseek(struct file *file, loff_t off, int where) ++{ ++ struct hppfs_private *data = file->private_data; ++ struct file *proc_file = &data->proc_file; ++ loff_t (*llseek)(struct file *, loff_t, int); ++ loff_t ret; ++ ++ llseek = proc_file->f_dentry->d_inode->i_fop->llseek; ++ if(llseek != NULL){ ++ ret = (*llseek)(proc_file, off, where); ++ if(ret < 0) ++ return(ret); ++ } ++ ++ return(default_llseek(file, off, 
where)); ++} ++ ++struct hppfs_dirent { ++ void *vfs_dirent; ++ filldir_t filldir; ++ struct dentry *dentry; ++}; ++ ++static int hppfs_filldir(void *d, const char *name, int size, ++ loff_t offset, ino_t inode, unsigned int type) ++{ ++ struct hppfs_dirent *dirent = d; ++ ++ if(file_removed(dirent->dentry, name)) ++ return(0); ++ ++ return((*dirent->filldir)(dirent->vfs_dirent, name, size, offset, ++ inode, type)); ++} ++ ++static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir) ++{ ++ struct hppfs_private *data = file->private_data; ++ struct file *proc_file = &data->proc_file; ++ int (*readdir)(struct file *, void *, filldir_t); ++ struct hppfs_dirent dirent = ((struct hppfs_dirent) ++ { .vfs_dirent = ent, ++ .filldir = filldir, ++ .dentry = file->f_dentry } ); ++ int err; ++ ++ readdir = proc_file->f_dentry->d_inode->i_fop->readdir; ++ ++ proc_file->f_pos = file->f_pos; ++ err = (*readdir)(proc_file, &dirent, hppfs_filldir); ++ file->f_pos = proc_file->f_pos; ++ ++ return(err); ++} ++ ++static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) ++{ ++ return(0); ++} ++ ++static struct file_operations hppfs_file_fops = { ++ .owner = NULL, ++ .llseek = hppfs_llseek, ++ .read = hppfs_read, ++ .write = hppfs_write, ++ .open = hppfs_open, ++}; ++ ++static struct file_operations hppfs_dir_fops = { ++ .owner = NULL, ++ .readdir = hppfs_readdir, ++ .open = hppfs_dir_open, ++ .fsync = hppfs_fsync, ++}; ++ ++static int hppfs_statfs(struct super_block *sb, struct statfs *sf) ++{ ++ sf->f_blocks = 0; ++ sf->f_bfree = 0; ++ sf->f_bavail = 0; ++ sf->f_files = 0; ++ sf->f_ffree = 0; ++ sf->f_type = HPPFS_SUPER_MAGIC; ++ return(0); ++} ++ ++static struct super_operations hppfs_sbops = { ++ .put_inode = force_delete, ++ .delete_inode = NULL, ++ .statfs = hppfs_statfs, ++}; ++ ++static int hppfs_readlink(struct dentry *dentry, char *buffer, int buflen) ++{ ++ struct file proc_file; ++ struct dentry *proc_dentry; ++ int (*readlink)(struct 
dentry *, char *, int); ++ int err, n; ++ ++ proc_dentry = dentry->d_inode->u.hppfs_i.proc_dentry; ++ err = init_private_file(&proc_file, proc_dentry, FMODE_READ); ++ if(err) ++ return(err); ++ ++ readlink = proc_dentry->d_inode->i_op->readlink; ++ n = (*readlink)(proc_dentry, buffer, buflen); ++ ++ if(proc_file.f_op->release) ++ (*proc_file.f_op->release)(proc_dentry->d_inode, &proc_file); ++ ++ return(n); ++} ++ ++static int hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) ++{ ++ struct file proc_file; ++ struct dentry *proc_dentry; ++ int (*follow_link)(struct dentry *, struct nameidata *); ++ int err, n; ++ ++ proc_dentry = dentry->d_inode->u.hppfs_i.proc_dentry; ++ err = init_private_file(&proc_file, proc_dentry, FMODE_READ); ++ if(err) ++ return(err); ++ ++ follow_link = proc_dentry->d_inode->i_op->follow_link; ++ n = (*follow_link)(proc_dentry, nd); ++ ++ if(proc_file.f_op->release) ++ (*proc_file.f_op->release)(proc_dentry->d_inode, &proc_file); ++ ++ return(n); ++} ++ ++static struct inode_operations hppfs_link_iops = { ++ .readlink = hppfs_readlink, ++ .follow_link = hppfs_follow_link, ++}; ++ ++static void read_inode(struct inode *ino) ++{ ++ struct inode *proc_ino; ++ ++ proc_ino = ino->u.hppfs_i.proc_dentry->d_inode; ++ ino->i_uid = proc_ino->i_uid; ++ ino->i_gid = proc_ino->i_gid; ++ ino->i_atime = proc_ino->i_atime; ++ ino->i_mtime = proc_ino->i_mtime; ++ ino->i_ctime = proc_ino->i_ctime; ++ ino->i_ino = proc_ino->i_ino; ++ ino->i_dev = proc_ino->i_dev; ++ ino->i_mode = proc_ino->i_mode; ++ ino->i_nlink = proc_ino->i_nlink; ++ ino->i_size = proc_ino->i_size; ++ ino->i_blksize = proc_ino->i_blksize; ++ ino->i_blocks = proc_ino->i_blocks; ++} ++ ++static struct inode *get_inode(struct super_block *sb, struct dentry *dentry, ++ int *error) ++{ ++ struct inode *inode; ++ int err = -ENOMEM; ++ ++ inode = new_inode(sb); ++ if(inode == NULL) ++ goto out; ++ ++ insert_inode_hash(inode); ++ if(S_ISDIR(dentry->d_inode->i_mode)){ ++ inode->i_op = 
&hppfs_dir_iops; ++ inode->i_fop = &hppfs_dir_fops; ++ } ++ else if(S_ISLNK(dentry->d_inode->i_mode)){ ++ inode->i_op = &hppfs_link_iops; ++ inode->i_fop = &hppfs_file_fops; ++ } ++ else { ++ inode->i_op = &hppfs_file_iops; ++ inode->i_fop = &hppfs_file_fops; ++ } ++ ++ inode->i_sb = sb; ++ inode->u.hppfs_i.proc_dentry = dentry; ++ ++ read_inode(inode); ++ err = 0; ++ ++ if(error) *error = err; ++ return(inode); ++ out: ++ if(error) *error = err; ++ return(NULL); ++} ++ ++static struct super_block *hppfs_read_super(struct super_block *sb, void *d, ++ int silent) ++{ ++ struct inode *root_inode; ++ struct file_system_type *procfs; ++ struct super_block *proc_sb; ++ ++ procfs = get_fs_type("proc"); ++ if(procfs == NULL) ++ goto out; ++ ++ if(list_empty(&procfs->fs_supers)) ++ goto out; ++ ++ proc_sb = list_entry(procfs->fs_supers.next, struct super_block, ++ s_instances); ++ ++ sb->s_blocksize = 1024; ++ sb->s_blocksize_bits = 10; ++ sb->s_magic = HPPFS_SUPER_MAGIC; ++ sb->s_op = &hppfs_sbops; ++ ++ dget(proc_sb->s_root); ++ root_inode = get_inode(sb, proc_sb->s_root, NULL); ++ if(root_inode == NULL) ++ goto out_dput; ++ ++ sb->s_root = d_alloc_root(root_inode); ++ if(sb->s_root == NULL) ++ goto out_put; ++ ++ return(sb); ++ ++ out_put: ++ iput(root_inode); ++ out_dput: ++ dput(proc_sb->s_root); ++ out: ++ return(NULL); ++} ++ ++DECLARE_FSTYPE(hppfs_type, "hppfs", hppfs_read_super, 0); ++ ++static int __init init_hppfs(void) ++{ ++ return(register_filesystem(&hppfs_type)); ++} ++ ++static void __exit exit_hppfs(void) ++{ ++ unregister_filesystem(&hppfs_type); ++} ++ ++module_init(init_hppfs) ++module_exit(exit_hppfs) ++MODULE_LICENSE("GPL"); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/2_5compat.h um/arch/um/include/2_5compat.h +--- orig/arch/um/include/2_5compat.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/2_5compat.h Thu Feb 27 20:15:19 2003 +@@ -0,0 +1,46 @@ ++/* ++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __2_5_COMPAT_H__ ++#define __2_5_COMPAT_H__ ++ ++#include "linux/version.h" ++ ++#define INIT_CONSOLE(dev_name, write_proc, device_proc, setup_proc, f) { \ ++ name : dev_name, \ ++ write : write_proc, \ ++ read : NULL, \ ++ device : device_proc, \ ++ unblank : NULL, \ ++ setup : setup_proc, \ ++ flags : f, \ ++ index : -1, \ ++ cflag : 0, \ ++ next : NULL \ ++} ++ ++#define INIT_ELV(queue, elv) elevator_init(elv, ELV_NOOP) ++ ++#define ELV_NOOP ELEVATOR_NOOP ++ ++#define INIT_HARDSECT(arr, maj, sizes) arr[maj] = sizes ++ ++#define IS_WRITE(req) ((req)->cmd == WRITE) ++ ++#define SET_PRI(task) \ ++ do { (task)->nice = 20; (task)->counter = -100; } while(0) /* no ';' here: the caller's own ';' ends the statement, so the macro is safe in if/else */ ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file.
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/Makefile um/arch/um/include/Makefile +--- orig/arch/um/include/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/Makefile Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,7 @@ ++all : sc.h ++ ++sc.h : ../util/mk_sc ++ ../util/mk_sc > $@ ++ ++../util/mk_sc : ++ $(MAKE) -C ../util mk_sc +diff -Naur -X ../exclude-files orig/arch/um/include/chan_kern.h um/arch/um/include/chan_kern.h +--- orig/arch/um/include/chan_kern.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/chan_kern.h Fri Nov 15 13:32:35 2002 +@@ -0,0 +1,56 @@ ++/* ++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __CHAN_KERN_H__ ++#define __CHAN_KERN_H__ ++ ++#include "linux/tty.h" ++#include "linux/list.h" ++#include "chan_user.h" ++ ++struct chan { ++ struct list_head list; ++ char *dev; ++ unsigned int primary:1; ++ unsigned int input:1; ++ unsigned int output:1; ++ unsigned int opened:1; ++ int fd; ++ enum chan_init_pri pri; ++ struct chan_ops *ops; ++ void *data; ++}; ++ ++extern void chan_interrupt(struct list_head *chans, struct tq_struct *task, ++ struct tty_struct *tty, int irq, void *dev); ++extern int parse_chan_pair(char *str, struct list_head *chans, int pri, ++ int device, struct chan_opts *opts); ++extern int open_chan(struct list_head *chans); ++extern int write_chan(struct list_head *chans, const char *buf, int len, ++ int write_irq); ++extern int console_write_chan(struct list_head *chans, const char *buf, ++ int len); ++extern void close_chan(struct list_head *chans); ++extern void chan_enable_winch(struct list_head *chans, void *line); ++extern void enable_chan(struct list_head *chans, void *data); ++extern int chan_window_size(struct list_head *chans, ++ unsigned short *rows_out, ++ unsigned short *cols_out); ++extern int 
chan_out_fd(struct list_head *chans); ++extern int chan_config_string(struct list_head *chans, char *str, int size, ++ char **error_out); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/chan_user.h um/arch/um/include/chan_user.h +--- orig/arch/um/include/chan_user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/chan_user.h Wed Nov 6 16:44:00 2002 +@@ -0,0 +1,66 @@ ++/* ++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __CHAN_USER_H__ ++#define __CHAN_USER_H__ ++ ++#include "init.h" ++ ++struct chan_opts { ++ void (*announce)(char *dev_name, int dev); ++ char *xterm_title; ++ int raw; ++ unsigned long tramp_stack; ++ int in_kernel; ++}; ++ ++enum chan_init_pri { INIT_STATIC, INIT_ALL, INIT_ONE }; ++ ++struct chan_ops { ++ char *type; ++ void *(*init)(char *, int, struct chan_opts *); ++ int (*open)(int, int, int, void *, char **); ++ void (*close)(int, void *); ++ int (*read)(int, char *, void *); ++ int (*write)(int, const char *, int, void *); ++ int (*console_write)(int, const char *, int, void *); ++ int (*window_size)(int, void *, unsigned short *, unsigned short *); ++ void (*free)(void *); ++ int winch; ++}; ++ ++extern struct chan_ops fd_ops, null_ops, port_ops, pts_ops, pty_ops, tty_ops, ++ xterm_ops; ++ ++extern void generic_close(int fd, void *unused); ++extern int generic_read(int fd, char *c_out, void *unused); ++extern int generic_write(int fd, const char *buf, int n, void *unused); ++extern int generic_console_write(int fd, const char *buf, int n, void *state); ++extern int 
generic_window_size(int fd, void *unused, unsigned short *rows_out, ++ unsigned short *cols_out); ++extern void generic_free(void *data); ++ ++extern void register_winch(int fd, void *device_data); ++extern void register_winch_irq(int fd, int tty_fd, int pid, void *line); ++ ++#define __channel_help(fn, prefix) \ ++__uml_help(fn, prefix "[0-9]*=<channel description>\n" \ ++" Attach a console or serial line to a host channel. See\n" \ ++" http://user-mode-linux.sourceforge.net/input.html for a complete\n" \ ++" description of this switch.\n\n" \ ++); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/choose-mode.h um/arch/um/include/choose-mode.h +--- orig/arch/um/include/choose-mode.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/choose-mode.h Fri Jan 17 13:23:32 2003 +@@ -0,0 +1,35 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __CHOOSE_MODE_H__ ++#define __CHOOSE_MODE_H__ ++ ++#include "uml-config.h" ++ ++#if defined(UML_CONFIG_MODE_TT) && defined(UML_CONFIG_MODE_SKAS) ++#define CHOOSE_MODE(tt, skas) (mode_tt ? (tt) : (skas)) ++ ++#elif defined(UML_CONFIG_MODE_SKAS) ++#define CHOOSE_MODE(tt, skas) (skas) ++ ++#elif defined(UML_CONFIG_MODE_TT) ++#define CHOOSE_MODE(tt, skas) (tt) ++#endif ++ ++#define CHOOSE_MODE_PROC(tt, skas, args...) \ ++ CHOOSE_MODE(tt(args), skas(args)) ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. 
This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/frame.h um/arch/um/include/frame.h +--- orig/arch/um/include/frame.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/frame.h Mon Dec 2 21:43:03 2002 +@@ -0,0 +1,53 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __FRAME_H_ ++#define __FRAME_H_ ++ ++#include "sysdep/frame.h" ++ ++struct frame_common { ++ void *data; ++ int len; ++ int sig_index; ++ int sr_index; ++ int sr_relative; ++ int sp_index; ++ struct arch_frame_data arch; ++}; ++ ++struct sc_frame { ++ struct frame_common common; ++ int sc_index; ++}; ++ ++extern struct sc_frame signal_frame_sc; ++ ++extern struct sc_frame signal_frame_sc_sr; ++ ++struct si_frame { ++ struct frame_common common; ++ int sip_index; ++ int si_index; ++ int ucp_index; ++ int uc_index; ++}; ++ ++extern struct si_frame signal_frame_si; ++ ++extern void capture_signal_stack(void); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/frame_kern.h um/arch/um/include/frame_kern.h +--- orig/arch/um/include/frame_kern.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/frame_kern.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,34 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __FRAME_KERN_H_ ++#define __FRAME_KERN_H_ ++ ++#include "frame.h" ++#include "sysdep/frame_kern.h" ++ ++extern int setup_signal_stack_sc(unsigned long stack_top, int sig, ++ unsigned long handler, ++ void (*restorer)(void), ++ struct pt_regs *regs, ++ sigset_t *mask); ++extern int setup_signal_stack_si(unsigned long stack_top, int sig, ++ unsigned long handler, ++ void (*restorer)(void), ++ struct pt_regs *regs, siginfo_t *info, ++ sigset_t *mask); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/frame_user.h um/arch/um/include/frame_user.h +--- orig/arch/um/include/frame_user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/frame_user.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,23 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __FRAME_USER_H_ ++#define __FRAME_USER_H_ ++ ++#include "sysdep/frame_user.h" ++#include "frame.h" ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. 
This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/helper.h um/arch/um/include/helper.h +--- orig/arch/um/include/helper.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/helper.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,27 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __HELPER_H__ ++#define __HELPER_H__ ++ ++extern int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv, ++ unsigned long *stack_out); ++extern int run_helper_thread(int (*proc)(void *), void *arg, ++ unsigned int flags, unsigned long *stack_out, ++ int stack_order); ++extern int helper_wait(int pid); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/hostaudio.h um/arch/um/include/hostaudio.h +--- orig/arch/um/include/hostaudio.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/hostaudio.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,48 @@ ++/* ++ * Copyright (C) 2002 Steve Schmidtke ++ * Licensed under the GPL ++ */ ++ ++#ifndef HOSTAUDIO_H ++#define HOSTAUDIO_H ++ ++#define HOSTAUDIO_DEV_DSP "/dev/sound/dsp" ++#define HOSTAUDIO_DEV_MIXER "/dev/sound/mixer" ++ ++struct hostaudio_state { ++ int fd; ++}; ++ ++struct hostmixer_state { ++ int fd; ++}; ++ ++/* UML user-side prototypes */ ++extern ssize_t hostaudio_read_user(struct hostaudio_state *state, char *buffer, ++ size_t count, loff_t *ppos); ++extern ssize_t hostaudio_write_user(struct hostaudio_state *state, ++ const char *buffer, size_t count, ++ loff_t *ppos); ++extern int hostaudio_ioctl_user(struct hostaudio_state *state, ++ unsigned int cmd, unsigned long arg); ++extern int hostaudio_open_user(struct hostaudio_state *state, int r, int w, ++ char *dsp); ++extern int hostaudio_release_user(struct hostaudio_state *state); ++extern int hostmixer_ioctl_mixdev_user(struct hostmixer_state *state, ++ unsigned int cmd, unsigned long arg); ++extern int hostmixer_open_mixdev_user(struct hostmixer_state *state, int r, ++ int w, char *mixer); ++extern int hostmixer_release_mixdev_user(struct hostmixer_state *state); ++ ++#endif /* HOSTAUDIO_H */ ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/init.h um/arch/um/include/init.h +--- orig/arch/um/include/init.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/init.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,114 @@ ++#ifndef _LINUX_UML_INIT_H ++#define _LINUX_UML_INIT_H ++ ++/* These macros are used to mark some functions or ++ * initialized data (doesn't apply to uninitialized data) ++ * as `initialization' functions. The kernel can take this ++ * as a hint that the function is used only during the initialization ++ * phase and free up used memory resources afterwards. ++ * ++ * Usage: ++ * For functions: ++ * ++ * You should add __init immediately before the function name, like: ++ * ++ * static void __init initme(int x, int y) ++ * { ++ * extern int z; z = x * y; ++ * } ++ * ++ * If the function has a prototype somewhere, you can also add ++ * __init between the closing parenthesis of the prototype and the semicolon: ++ * ++ * extern int initialize_foobar_device(int, int, int) __init; ++ * ++ * For initialized data: ++ * You should insert __initdata between the variable name and equal ++ * sign followed by value, e.g.: ++ * ++ * static int init_variable __initdata = 0; ++ * static char linux_logo[] __initdata = { 0x32, 0x36, ... }; ++ * ++ * Don't forget to initialize data not at file scope, i.e. within a function, ++ * as gcc otherwise puts the data into the bss section and not into the init ++ * section. ++ * ++ * Also note that this data cannot be "const". 
++ */ ++ ++#ifndef _LINUX_INIT_H ++typedef int (*initcall_t)(void); ++typedef void (*exitcall_t)(void); ++ ++#define __init __attribute__ ((__section__ (".text.init"))) ++#define __exit __attribute__ ((unused, __section__(".text.exit"))) ++#define __initdata __attribute__ ((__section__ (".data.init"))) ++ ++#endif ++ ++#ifndef MODULE ++struct uml_param { ++ const char *str; ++ int (*setup_func)(char *, int *); ++}; ++ ++extern initcall_t __uml_initcall_start, __uml_initcall_end; ++extern initcall_t __uml_postsetup_start, __uml_postsetup_end; ++extern const char *__uml_help_start, *__uml_help_end; ++#endif ++ ++#define __uml_initcall(fn) \ ++ static initcall_t __uml_initcall_##fn __uml_init_call = fn ++ ++#define __uml_exitcall(fn) \ ++ static exitcall_t __uml_exitcall_##fn __uml_exit_call = fn ++ ++extern struct uml_param __uml_setup_start, __uml_setup_end; ++ ++#define __uml_postsetup(fn) \ ++ static initcall_t __uml_postsetup_##fn __uml_postsetup_call = fn ++ ++#define __non_empty_string(dummyname,string) \ ++ struct __uml_non_empty_string_struct_##dummyname \ ++ { \ ++ char _string[sizeof(string)-2]; \ ++ } ++ ++#ifndef MODULE ++#define __uml_setup(str, fn, help...) \ ++ __non_empty_string(fn ##_setup, str); \ ++ __uml_help(fn, help); \ ++ static char __uml_setup_str_##fn[] __initdata = str; \ ++ static struct uml_param __uml_setup_##fn __uml_init_setup = { __uml_setup_str_##fn, fn } ++#else ++#define __uml_setup(str, fn, help...) \ ++ ++#endif ++ ++#define __uml_help(fn, help...) \ ++ __non_empty_string(fn ##__help, help); \ ++ static char __uml_help_str_##fn[] __initdata = help; \ ++ static const char *__uml_help_##fn __uml_setup_help = __uml_help_str_##fn ++ ++/* ++ * Mark functions and data as being only used at initialization ++ * or exit time. 
++ */ ++#define __uml_init_setup __attribute__ ((unused,__section__ (".uml.setup.init"))) ++#define __uml_setup_help __attribute__ ((unused,__section__ (".uml.help.init"))) ++#define __uml_init_call __attribute__ ((unused,__section__ (".uml.initcall.init"))) ++#define __uml_postsetup_call __attribute__ ((unused,__section__ (".uml.postsetup.init"))) ++#define __uml_exit_call __attribute__ ((unused,__section__ (".uml.exitcall.exit"))) ++ ++#endif /* _LINUX_UML_INIT_H */ ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/initrd.h um/arch/um/include/initrd.h +--- orig/arch/um/include/initrd.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/initrd.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,22 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __INITRD_USER_H__ ++#define __INITRD_USER_H__ ++ ++extern int load_initrd(char *filename, void *buf, int size); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/irq_user.h um/arch/um/include/irq_user.h +--- orig/arch/um/include/irq_user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/irq_user.h Sun Dec 8 20:38:42 2002 +@@ -0,0 +1,35 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __IRQ_USER_H__ ++#define __IRQ_USER_H__ ++ ++enum { IRQ_READ, IRQ_WRITE }; ++ ++extern void sigio_handler(int sig, union uml_pt_regs *regs); ++extern int activate_fd(int irq, int fd, int type, void *dev_id); ++extern void free_irq_by_irq_and_dev(int irq, void *dev_id); ++extern void free_irq_by_fd(int fd); ++extern void reactivate_fd(int fd, int irqnum); ++extern void deactivate_fd(int fd, int irqnum); ++extern void forward_interrupts(int pid); ++extern void init_irq_signals(int on_sigstack); ++extern void forward_ipi(int fd, int pid); ++extern void free_irq_later(int irq, void *dev_id); ++extern int activate_ipi(int fd, int pid); ++extern unsigned long irq_lock(void); ++extern void irq_unlock(unsigned long flags); ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/kern.h um/arch/um/include/kern.h +--- orig/arch/um/include/kern.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/kern.h Sat Nov 2 21:38:02 2002 +@@ -0,0 +1,48 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __KERN_H__ ++#define __KERN_H__ ++ ++/* These are all user-mode things which are convenient to call directly ++ * from kernel code and for which writing a wrapper is too much of a pain. ++ * The regular include files can't be included because this file is included ++ * only into kernel code, and user-space includes conflict with kernel ++ * includes. ++ */ ++ ++extern int errno; ++ ++extern int clone(int (*proc)(void *), void *sp, int flags, void *data); ++extern int sleep(int); ++extern int printf(char *fmt, ...); ++extern char *strerror(int errnum); ++extern char *ptsname(int __fd); ++extern int munmap(void *, int); ++extern void *sbrk(int increment); ++extern void *malloc(int size); ++extern void perror(char *err); ++extern int kill(int pid, int sig); ++extern int getuid(void); ++extern int pause(void); ++extern int write(int, const void *, int); ++extern int exit(int); ++extern int close(int); ++extern int read(unsigned int, char *, int); ++extern int pipe(int *); ++extern int sched_yield(void); ++extern int ptrace(int op, int pid, long addr, long data); ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/kern_util.h um/arch/um/include/kern_util.h +--- orig/arch/um/include/kern_util.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/kern_util.h Wed Apr 16 16:00:11 2003 +@@ -0,0 +1,121 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __KERN_UTIL_H__ ++#define __KERN_UTIL_H__ ++ ++#include "sysdep/ptrace.h" ++ ++extern int ncpus; ++extern char *linux_prog; ++extern char *gdb_init; ++extern int kmalloc_ok; ++extern int timer_irq_inited; ++extern int jail; ++extern int nsyscalls; ++ ++#define UML_ROUND_DOWN(addr) ((void *)(((unsigned long) (addr)) & PAGE_MASK)) ++#define UML_ROUND_UP(addr) /* (addr) is cast as a whole expression */ \ ++ UML_ROUND_DOWN(((unsigned long) (addr)) + PAGE_SIZE - 1) ++ ++extern int kernel_fork(unsigned long flags, int (*fn)(void *), void * arg); ++extern unsigned long stack_sp(unsigned long page); ++extern int kernel_thread_proc(void *data); ++extern void syscall_segv(int sig); ++extern int current_pid(void); ++extern unsigned long alloc_stack(int order, int atomic); ++extern int do_signal(int error); ++extern int is_stack_fault(unsigned long sp); ++extern unsigned long segv(unsigned long address, unsigned long ip, ++ int is_write, int is_user, void *sc); ++extern unsigned long handle_page_fault(unsigned long address, unsigned long ip, ++ int is_write, int is_user, ++ int *code_out); ++extern void syscall_ready(void); ++extern int segv_syscall(void); ++extern void kern_finish_exec(void *task, int new_pid, unsigned long stack); ++extern int page_size(void); ++extern int page_mask(void); ++extern int need_finish_fork(void); ++extern void free_stack(unsigned long stack, int order); ++extern void add_input_request(int op, void (*proc)(int), void *arg); ++extern int sys_execve(char *file, char **argv, char **env); ++extern char 
*current_cmd(void); ++extern void timer_handler(int sig, union uml_pt_regs *regs); ++extern int set_signals(int enable); ++extern void force_sigbus(void); ++extern int pid_to_processor_id(int pid); ++extern void block_signals(void); ++extern void unblock_signals(void); ++extern void deliver_signals(void *t); ++extern int next_syscall_index(int max); ++extern int next_trap_index(int max); ++extern void cpu_idle(void); ++extern void finish_fork(void); ++extern void paging_init(void); ++extern void init_flush_vm(void); ++extern void *syscall_sp(void *t); ++extern void syscall_trace(void); ++extern int hz(void); ++extern void idle_timer(void); ++extern unsigned int do_IRQ(int irq, union uml_pt_regs *regs); ++extern int external_pid(void *t); ++extern void boot_timer_handler(int sig); ++extern void interrupt_end(void); ++extern void initial_thread_cb(void (*proc)(void *), void *arg); ++extern int debugger_signal(int status, int pid); ++extern void debugger_parent_signal(int status, int pid); ++extern void child_signal(int pid, int status); ++extern int init_ptrace_proxy(int idle_pid, int startup, int stop); ++extern int init_parent_proxy(int pid); ++extern void check_stack_overflow(void *ptr); ++extern void relay_signal(int sig, union uml_pt_regs *regs); ++extern void not_implemented(void); ++extern int user_context(unsigned long sp); ++extern void timer_irq(union uml_pt_regs *regs); ++extern void unprotect_stack(unsigned long stack); ++extern void do_uml_exitcalls(void); ++extern int attach_debugger(int idle_pid, int pid, int stop); ++extern void bad_segv(unsigned long address, unsigned long ip, int is_write); ++extern int config_gdb(char *str); ++extern int remove_gdb(void); ++extern char *uml_strdup(char *string); ++extern void unprotect_kernel_mem(void); ++extern void protect_kernel_mem(void); ++extern void set_kmem_end(unsigned long); ++extern void uml_cleanup(void); ++extern void set_current(void *t); ++extern void lock_signalled_task(void *t); ++extern void 
IPI_handler(int cpu); ++extern int jail_setup(char *line, int *add); ++extern void *get_init_task(void); ++extern int clear_user_proc(void *buf, int size); ++extern int copy_to_user_proc(void *to, void *from, int size); ++extern int copy_from_user_proc(void *to, void *from, int size); ++extern int strlen_user_proc(char *str); ++extern void bus_handler(int sig, union uml_pt_regs *regs); ++extern void winch(int sig, union uml_pt_regs *regs); ++extern long execute_syscall(void *r); ++extern int smp_sigio_handler(void); ++extern void *get_current(void); ++extern struct task_struct *get_task(int pid, int require); ++extern void machine_halt(void); ++extern int is_syscall(unsigned long addr); ++extern void arch_switch(void); ++extern void free_irq(unsigned int, void *); ++extern int um_in_interrupt(void); ++extern int cpu(void); ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/line.h um/arch/um/include/line.h +--- orig/arch/um/include/line.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/line.h Fri Nov 15 13:44:44 2002 +@@ -0,0 +1,106 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __LINE_H__ ++#define __LINE_H__ ++ ++#include "linux/list.h" ++#include "linux/tqueue.h" ++#include "linux/tty.h" ++#include "asm/semaphore.h" ++#include "chan_user.h" ++#include "mconsole_kern.h" ++ ++struct line_driver { ++ char *name; ++ char *devfs_name; ++ short major; ++ short minor_start; ++ short type; ++ short subtype; ++ int read_irq; ++ char *read_irq_name; ++ int write_irq; ++ char *write_irq_name; ++ char *symlink_from; ++ char *symlink_to; ++ struct mc_device mc; ++}; ++ ++struct line { ++ char *init_str; ++ int init_pri; ++ struct list_head chan_list; ++ int valid; ++ int count; ++ struct tty_struct *tty; ++ struct semaphore sem; ++ char *buffer; ++ char *head; ++ char *tail; ++ int sigio; ++ struct tq_struct task; ++ struct line_driver *driver; ++ int have_irq; ++}; ++ ++#define LINE_INIT(str, d) \ ++ { init_str : str, \ ++ init_pri : INIT_STATIC, \ ++ chan_list : { }, \ ++ valid : 1, \ ++ count : 0, \ ++ tty : NULL, \ ++ sem : { }, \ ++ buffer : NULL, \ ++ head : NULL, \ ++ tail : NULL, \ ++ sigio : 0, \ ++ driver : d, \ ++ have_irq : 0 } ++ ++struct lines { ++ int num; ++}; ++ ++#define LINES_INIT(n) { num : n } ++ ++extern void line_interrupt(int irq, void *data, struct pt_regs *unused); ++extern void line_write_interrupt(int irq, void *data, struct pt_regs *unused); ++extern void line_close(struct line *lines, struct tty_struct *tty); ++extern int line_open(struct line *lines, struct tty_struct *tty, ++ struct chan_opts *opts); ++extern int line_setup(struct line *lines, int 
num, char *init, ++ int all_allowed); ++extern int line_write(struct line *line, struct tty_struct *tty, int from_user, ++ const char *buf, int len); ++extern int line_write_room(struct tty_struct *tty); ++extern char *add_xterm_umid(char *base); ++extern int line_setup_irq(int fd, int input, int output, void *data); ++extern void line_close_chan(struct line *line); ++extern void line_disable(struct line *line, int current_irq); ++extern void line_register_devfs(struct lines *set, ++ struct line_driver *line_driver, ++ struct tty_driver *driver, struct line *lines, ++ int nlines); ++extern void lines_init(struct line *lines, int nlines); ++extern void close_lines(struct line *lines, int nlines); ++extern int line_config(struct line *lines, int num, char *str); ++extern int line_remove(struct line *lines, int num, char *str); ++extern int line_get_config(char *dev, struct line *lines, int num, char *str, ++ int size, char **error_out); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/mconsole.h um/arch/um/include/mconsole.h +--- orig/arch/um/include/mconsole.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/mconsole.h Fri Jan 17 13:48:25 2003 +@@ -0,0 +1,99 @@ ++/* ++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __MCONSOLE_H__ ++#define __MCONSOLE_H__ ++ ++#ifndef __KERNEL__ ++#include <stdint.h> ++#define u32 uint32_t ++#endif ++ ++#define MCONSOLE_MAGIC (0xcafebabe) ++#define MCONSOLE_MAX_DATA (512) ++#define MCONSOLE_VERSION 2 ++ ++struct mconsole_request { ++ u32 magic; ++ u32 version; ++ u32 len; ++ char data[MCONSOLE_MAX_DATA]; ++}; ++ ++struct mconsole_reply { ++ u32 err; ++ u32 more; ++ u32 len; ++ char data[MCONSOLE_MAX_DATA]; ++}; ++ ++struct mconsole_notify { ++ u32 magic; ++ u32 version; ++ enum { MCONSOLE_SOCKET, MCONSOLE_PANIC, MCONSOLE_HANG, ++ MCONSOLE_USER_NOTIFY } type; ++ u32 len; ++ char data[MCONSOLE_MAX_DATA]; ++}; ++ ++struct mc_request; ++ ++struct mconsole_command ++{ ++ char *command; ++ void (*handler)(struct mc_request *req); ++ int as_interrupt; ++}; ++ ++struct mc_request ++{ ++ int len; ++ int as_interrupt; ++ ++ int originating_fd; ++ int originlen; ++ unsigned char origin[128]; /* sockaddr_un */ ++ ++ struct mconsole_request request; ++ struct mconsole_command *cmd; ++}; ++ ++extern char mconsole_socket_name[]; ++ ++extern int mconsole_unlink_socket(void); ++extern int mconsole_reply(struct mc_request *req, char *reply, int err, ++ int more); ++ ++extern void mconsole_version(struct mc_request *req); ++extern void mconsole_help(struct mc_request *req); ++extern void mconsole_halt(struct mc_request *req); ++extern void mconsole_reboot(struct mc_request *req); ++extern void mconsole_config(struct 
mc_request *req); ++extern void mconsole_remove(struct mc_request *req); ++extern void mconsole_sysrq(struct mc_request *req); ++extern void mconsole_cad(struct mc_request *req); ++extern void mconsole_stop(struct mc_request *req); ++extern void mconsole_go(struct mc_request *req); ++ ++extern int mconsole_get_request(int fd, struct mc_request *req); ++extern int mconsole_notify(char *sock_name, int type, const void *data, ++ int len); ++extern char *mconsole_notify_socket(void); ++extern void lock_notify(void); ++extern void unlock_notify(void); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/mconsole_kern.h um/arch/um/include/mconsole_kern.h +--- orig/arch/um/include/mconsole_kern.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/mconsole_kern.h Fri Nov 15 15:21:58 2002 +@@ -0,0 +1,62 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __MCONSOLE_KERN_H__ ++#define __MCONSOLE_KERN_H__ ++ ++#include "linux/config.h" ++#include "linux/list.h" ++#include "mconsole.h" ++ ++struct mconsole_entry { ++ struct list_head list; ++ struct mc_request request; ++}; ++ ++struct mc_device { ++ struct list_head list; ++ char *name; ++ int (*config)(char *); ++ int (*get_config)(char *, char *, int, char **); ++ int (*remove)(char *); ++}; ++ ++#define CONFIG_CHUNK(str, size, current, chunk, end) \ ++do { \ ++ current += strlen(chunk); \ ++ if(current >= size) \ ++ str = NULL; \ ++ if(str != NULL){ \ ++ strcpy(str, chunk); \ ++ str += strlen(chunk); \ ++ } \ ++ if(end) \ ++ current++; \ ++} while(0) ++ ++#ifdef 
CONFIG_MCONSOLE ++ ++extern void mconsole_register_dev(struct mc_device *new); ++ ++#else ++ ++static inline void mconsole_register_dev(struct mc_device *new) ++{ ++} ++ ++#endif ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/mem.h um/arch/um/include/mem.h +--- orig/arch/um/include/mem.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/mem.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,29 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __MEM_H__ ++#define __MEM_H__ ++ ++struct vm_reserved { ++ struct list_head list; ++ unsigned long start; ++ unsigned long end; ++}; ++ ++extern void set_usable_vm(unsigned long start, unsigned long end); ++extern void set_kmem_end(unsigned long new); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/mem_user.h um/arch/um/include/mem_user.h +--- orig/arch/um/include/mem_user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/mem_user.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,87 @@ ++/* ++ * arch/um/include/mem_user.h ++ * ++ * BRIEF MODULE DESCRIPTION ++ * user side memory interface for support IO memory inside user mode linux ++ * ++ * Copyright (C) 2001 RidgeRun, Inc. 
++ * Author: RidgeRun, Inc. ++ * Greg Lonnon glonnon@ridgerun.com or info@ridgerun.com ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN ++ * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, ++ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT ++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF ++ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ++ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF ++ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with this program; if not, write to the Free Software Foundation, Inc., ++ * 675 Mass Ave, Cambridge, MA 02139, USA. 
++ */ ++ ++#ifndef _MEM_USER_H ++#define _MEM_USER_H ++ ++struct mem_region { ++ char *driver; ++ unsigned long start_pfn; ++ unsigned long start; ++ unsigned long len; ++ void *mem_map; ++ int fd; ++}; ++ ++extern struct mem_region *regions[]; ++extern struct mem_region physmem_region; ++ ++#define ROUND_4M(n) ((((unsigned long) (n)) + (1 << 22)) & ~((1 << 22) - 1)) ++ ++extern unsigned long host_task_size; ++extern unsigned long task_size; ++ ++extern int init_mem_user(void); ++extern int create_mem_file(unsigned long len); ++extern void setup_range(int fd, char *driver, unsigned long start, ++ unsigned long pfn, unsigned long total, int need_vm, ++ struct mem_region *region, void *reserved); ++extern void setup_memory(void *entry); ++extern unsigned long find_iomem(char *driver, unsigned long *len_out); ++extern int init_maps(struct mem_region *region); ++extern int nregions(void); ++extern int reserve_vm(unsigned long start, unsigned long end, void *e); ++extern unsigned long get_vm(unsigned long len); ++extern void setup_physmem(unsigned long start, unsigned long usable, ++ unsigned long len); ++extern int setup_region(struct mem_region *region, void *entry); ++extern void add_iomem(char *name, int fd, unsigned long size); ++extern struct mem_region *phys_region(unsigned long phys); ++extern unsigned long phys_offset(unsigned long phys); ++extern void unmap_physmem(void); ++extern int map_memory(unsigned long virt, unsigned long phys, ++ unsigned long len, int r, int w, int x); ++extern int protect_memory(unsigned long addr, unsigned long len, ++ int r, int w, int x, int must_succeed); ++extern unsigned long get_kmem_end(void); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/mode.h um/arch/um/include/mode.h +--- orig/arch/um/include/mode.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/mode.h Fri Jan 17 13:23:32 2003 +@@ -0,0 +1,30 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __MODE_H__ ++#define __MODE_H__ ++ ++#include "uml-config.h" ++ ++#ifdef UML_CONFIG_MODE_TT ++#include "../kernel/tt/include/mode.h" ++#endif ++ ++#ifdef UML_CONFIG_MODE_SKAS ++#include "../kernel/skas/include/mode.h" ++#endif ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/mode_kern.h um/arch/um/include/mode_kern.h +--- orig/arch/um/include/mode_kern.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/mode_kern.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,30 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __MODE_KERN_H__ ++#define __MODE_KERN_H__ ++ ++#include "linux/config.h" ++ ++#ifdef CONFIG_MODE_TT ++#include "../kernel/tt/include/mode_kern.h" ++#endif ++ ++#ifdef CONFIG_MODE_SKAS ++#include "../kernel/skas/include/mode_kern.h" ++#endif ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/net_kern.h um/arch/um/include/net_kern.h +--- orig/arch/um/include/net_kern.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/net_kern.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,81 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_NET_KERN_H ++#define __UM_NET_KERN_H ++ ++#include "linux/netdevice.h" ++#include "linux/skbuff.h" ++#include "linux/socket.h" ++#include "linux/list.h" ++ ++struct uml_net { ++ struct list_head list; ++ struct net_device *dev; ++ int index; ++ unsigned char mac[ETH_ALEN]; ++ int have_mac; ++}; ++ ++struct uml_net_private { ++ struct list_head list; ++ spinlock_t lock; ++ struct net_device *dev; ++ struct timer_list tl; ++ struct net_device_stats stats; ++ int fd; ++ unsigned char mac[ETH_ALEN]; ++ int have_mac; ++ unsigned short (*protocol)(struct sk_buff *); ++ int (*open)(void *); ++ void (*close)(int, void *); ++ void (*remove)(void *); ++ int (*read)(int, struct sk_buff **skb, struct uml_net_private *); ++ int (*write)(int, struct sk_buff **skb, struct uml_net_private *); ++ ++ void (*add_address)(unsigned char *, unsigned char *, void *); ++ void (*delete_address)(unsigned char *, unsigned char *, void *); ++ int (*set_mtu)(int mtu, void *); ++ int user[1]; ++}; ++ ++struct net_kern_info { ++ void (*init)(struct net_device *, void *); ++ unsigned short (*protocol)(struct sk_buff *); ++ int (*read)(int, struct sk_buff **skb, struct uml_net_private *); ++ int (*write)(int, struct sk_buff **skb, struct uml_net_private *); ++}; ++ ++struct transport { ++ struct list_head list; ++ char *name; ++ int (*setup)(char *, char **, void *); ++ struct net_user_info *user; ++ struct net_kern_info *kern; ++ int private_size; ++ int setup_size; ++}; ++ ++extern struct net_device 
*ether_init(int); ++extern unsigned short ether_protocol(struct sk_buff *); ++extern int setup_etheraddr(char *str, unsigned char *addr); ++extern struct sk_buff *ether_adjust_skb(struct sk_buff *skb, int extra); ++extern int tap_setup_common(char *str, char *type, char **dev_name, ++ char **mac_out, char **gate_addr); ++extern void register_transport(struct transport *new); ++extern unsigned short eth_protocol(struct sk_buff *skb); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/net_user.h um/arch/um/include/net_user.h +--- orig/arch/um/include/net_user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/net_user.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,66 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_NET_USER_H__ ++#define __UM_NET_USER_H__ ++ ++#define ETH_ADDR_LEN (6) ++#define ETH_HEADER_ETHERTAP (16) ++#define ETH_HEADER_OTHER (14) ++#define ETH_MAX_PACKET (1500) ++ ++#define UML_NET_VERSION (4) ++ ++struct net_user_info { ++ void (*init)(void *, void *); ++ int (*open)(void *); ++ void (*close)(int, void *); ++ void (*remove)(void *); ++ int (*set_mtu)(int mtu, void *); ++ void (*add_address)(unsigned char *, unsigned char *, void *); ++ void (*delete_address)(unsigned char *, unsigned char *, void *); ++ int max_packet; ++}; ++ ++extern void ether_user_init(void *data, void *dev); ++extern void dev_ip_addr(void *d, char *buf, char *bin_buf); ++extern void set_ether_mac(void *d, unsigned char *addr); ++extern void iter_addresses(void *d, void (*cb)(unsigned char *, ++ unsigned char *, void *), ++ 
void *arg); ++ ++extern void *get_output_buffer(int *len_out); ++extern void free_output_buffer(void *buffer); ++ ++extern int tap_open_common(void *dev, char *gate_addr); ++extern void tap_check_ips(char *gate_addr, char *eth_addr); ++ ++extern void read_output(int fd, char *output_out, int len); ++ ++extern int net_read(int fd, void *buf, int len); ++extern int net_recvfrom(int fd, void *buf, int len); ++extern int net_write(int fd, void *buf, int len); ++extern int net_send(int fd, void *buf, int len); ++extern int net_sendto(int fd, void *buf, int len, void *to, int sock_len); ++ ++extern void open_addr(unsigned char *addr, unsigned char *netmask, void *arg); ++extern void close_addr(unsigned char *addr, unsigned char *netmask, void *arg); ++ ++extern char *split_if_spec(char *str, ...); ++ ++extern int dev_netmask(void *d, void *m); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/os.h um/arch/um/include/os.h +--- orig/arch/um/include/os.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/os.h Tue Feb 4 19:11:32 2003 +@@ -0,0 +1,137 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __OS_H__ ++#define __OS_H__ ++ ++#include "asm/types.h" ++#include "../os/include/file.h" ++ ++#define OS_TYPE_FILE 1 ++#define OS_TYPE_DIR 2 ++#define OS_TYPE_SYMLINK 3 ++#define OS_TYPE_CHARDEV 4 ++#define OS_TYPE_BLOCKDEV 5 ++#define OS_TYPE_FIFO 6 ++#define OS_TYPE_SOCK 7 ++ ++struct openflags { ++ unsigned int r : 1; ++ unsigned int w : 1; ++ unsigned int s : 1; /* O_SYNC */ ++ unsigned int c : 1; /* O_CREAT */ ++ unsigned int t : 1; /* O_TRUNC */ ++ unsigned int a : 1; /* O_APPEND */ ++ unsigned int e : 1; /* O_EXCL */ ++ unsigned int cl : 1; /* FD_CLOEXEC */ ++}; ++ ++#define OPENFLAGS() ((struct openflags) { .r = 0, .w = 0, .s = 0, .c = 0, \ ++ .t = 0, .a = 0, .e = 0, .cl = 0 }) ++ ++static inline struct openflags of_read(struct openflags flags) ++{ ++ flags.r = 1; ++ return(flags); ++} ++ ++static inline struct openflags of_write(struct openflags flags) ++{ ++ flags.w = 1; ++ return(flags); ++} ++ ++static inline struct openflags of_rdwr(struct openflags flags) ++{ ++ return(of_read(of_write(flags))); ++} ++ ++static inline struct openflags of_set_rw(struct openflags flags, int r, int w) ++{ ++ flags.r = r; ++ flags.w = w; ++ return(flags); ++} ++ ++static inline struct openflags of_sync(struct openflags flags) ++{ ++ flags.s = 1; ++ return(flags); ++} ++ ++static inline struct openflags of_create(struct openflags flags) ++{ ++ flags.c = 1; ++ return(flags); ++} ++ ++static inline struct openflags of_trunc(struct openflags flags) ++{ ++ flags.t = 1; ++ return(flags); ++} ++ ++static inline struct 
openflags of_append(struct openflags flags) ++{ ++ flags.a = 1; ++ return(flags); ++} ++ ++static inline struct openflags of_excl(struct openflags flags) ++{ ++ flags.e = 1; ++ return(flags); ++} ++ ++static inline struct openflags of_cloexec(struct openflags flags) ++{ ++ flags.cl = 1; ++ return(flags); ++} ++ ++extern int os_seek_file(int fd, __u64 offset); ++extern int os_open_file(char *file, struct openflags flags, int mode); ++extern int os_read_file(int fd, void *buf, int len); ++extern int os_write_file(int fd, void *buf, int count); ++extern int os_file_size(char *file, long long *size_out); ++extern int os_pipe(int *fd, int stream, int close_on_exec); ++extern int os_set_fd_async(int fd, int owner); ++extern int os_set_fd_block(int fd, int blocking); ++extern int os_accept_connection(int fd); ++extern int os_shutdown_socket(int fd, int r, int w); ++extern void os_close_file(int fd); ++extern int os_rcv_fd(int fd, int *helper_pid_out); ++extern int create_unix_socket(char *file, int len); ++extern int os_connect_socket(char *name); ++extern int os_file_type(char *file); ++extern int os_file_mode(char *file, struct openflags *mode_out); ++extern int os_lock_file(int fd, int excl); ++ ++extern unsigned long os_process_pc(int pid); ++extern int os_process_parent(int pid); ++extern void os_stop_process(int pid); ++extern void os_kill_process(int pid, int reap_child); ++extern void os_usr1_process(int pid); ++extern int os_getpid(void); ++ ++extern int os_map_memory(void *virt, int fd, unsigned long off, ++ unsigned long len, int r, int w, int x); ++extern int os_protect_memory(void *addr, unsigned long len, ++ int r, int w, int x); ++extern int os_unmap_memory(void *addr, int len); ++extern void os_flush_stdout(void); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. 
This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/process.h um/arch/um/include/process.h +--- orig/arch/um/include/process.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/process.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,25 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __PROCESS_H__ ++#define __PROCESS_H__ ++ ++#include <asm/sigcontext.h> ++ ++extern void sig_handler(int sig, struct sigcontext sc); ++extern void alarm_handler(int sig, struct sigcontext sc); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/ptrace_user.h um/arch/um/include/ptrace_user.h +--- orig/arch/um/include/ptrace_user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/ptrace_user.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,18 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __PTRACE_USER_H__ ++#define __PTRACE_USER_H__ ++ ++#include "sysdep/ptrace_user.h" ++ ++extern int ptrace_getregs(long pid, unsigned long *regs_out); ++extern int ptrace_setregs(long pid, unsigned long *regs_in); ++extern int ptrace_getfpregs(long pid, unsigned long *regs_out); ++extern void arch_enter_kernel(void *task, int pid); ++extern void arch_leave_kernel(void *task, int pid); ++extern void ptrace_pokeuser(unsigned long addr, unsigned long data); ++ ++#endif +diff -Naur -X ../exclude-files 
orig/arch/um/include/sigcontext.h um/arch/um/include/sigcontext.h +--- orig/arch/um/include/sigcontext.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sigcontext.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,25 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UML_SIGCONTEXT_H__ ++#define __UML_SIGCONTEXT_H__ ++ ++#include "sysdep/sigcontext.h" ++ ++extern int sc_size(void *data); ++extern void sc_to_sc(void *to_ptr, void *from_ptr); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sigio.h um/arch/um/include/sigio.h +--- orig/arch/um/include/sigio.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sigio.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,28 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SIGIO_H__ ++#define __SIGIO_H__ ++ ++extern int write_sigio_irq(int fd); ++extern int register_sigio_fd(int fd); ++extern int read_sigio_fd(int fd); ++extern int add_sigio_fd(int fd, int read); ++extern int ignore_sigio_fd(int fd); ++extern void sigio_lock(void); ++extern void sigio_unlock(void); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/signal_kern.h um/arch/um/include/signal_kern.h +--- orig/arch/um/include/signal_kern.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/signal_kern.h Thu Dec 5 18:08:47 2002 +@@ -0,0 +1,22 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SIGNAL_KERN_H__ ++#define __SIGNAL_KERN_H__ ++ ++extern int have_signals(void *t); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/signal_user.h um/arch/um/include/signal_user.h +--- orig/arch/um/include/signal_user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/signal_user.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,26 @@ ++/* ++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SIGNAL_USER_H__ ++#define __SIGNAL_USER_H__ ++ ++extern int signal_stack_size; ++ ++extern int change_sig(int signal, int on); ++extern void set_sigstack(void *stack, int size); ++extern void set_handler(int sig, void (*handler)(int), int flags, ...); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/skas_ptrace.h um/arch/um/include/skas_ptrace.h +--- orig/arch/um/include/skas_ptrace.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/skas_ptrace.h Mon Dec 16 11:54:52 2002 +@@ -0,0 +1,36 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SKAS_PTRACE_H ++#define __SKAS_PTRACE_H ++ ++struct ptrace_faultinfo { ++ int is_write; ++ unsigned long addr; ++}; ++ ++struct ptrace_ldt { ++ int func; ++ void *ptr; ++ unsigned long bytecount; ++}; ++ ++#define PTRACE_FAULTINFO 52 ++#define PTRACE_SIGPENDING 53 ++#define PTRACE_LDT 54 ++#define PTRACE_SWITCH_MM 55 ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/syscall_user.h um/arch/um/include/syscall_user.h +--- orig/arch/um/include/syscall_user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/syscall_user.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,23 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SYSCALL_USER_H ++#define __SYSCALL_USER_H ++ ++extern int record_syscall_start(int syscall); ++extern void record_syscall_end(int index, int result); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/checksum.h um/arch/um/include/sysdep-i386/checksum.h +--- orig/arch/um/include/sysdep-i386/checksum.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-i386/checksum.h Tue Oct 29 21:23:02 2002 +@@ -0,0 +1,217 @@ ++/* ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_SYSDEP_CHECKSUM_H ++#define __UM_SYSDEP_CHECKSUM_H ++ ++#include "linux/string.h" ++ ++/* ++ * computes the checksum of a memory block at buff, length len, ++ * and adds in "sum" (32-bit) ++ * ++ * returns a 32-bit number suitable for feeding into itself ++ * or csum_tcpudp_magic ++ * ++ * this function must be called with even lengths, except ++ * for the last fragment, which may be odd ++ * ++ * it's best to have buff aligned on a 32-bit boundary ++ */ ++unsigned int csum_partial(const unsigned char * buff, int len, ++ unsigned int sum); ++ ++/* ++ * the same as csum_partial, but copies from src while it ++ * checksums, and handles user-space pointer exceptions correctly, when needed. ++ * ++ * here even more important to align src and dst on a 32-bit (or even ++ * better 64-bit) boundary ++ */ ++ ++unsigned int csum_partial_copy_to(const char *src, char *dst, int len, ++ int sum, int *err_ptr); ++unsigned int csum_partial_copy_from(const char *src, char *dst, int len, ++ int sum, int *err_ptr); ++ ++/* ++ * Note: when you get a NULL pointer exception here this means someone ++ * passed in an incorrect kernel address to one of these functions. ++ * ++ * If you use these functions directly please don't forget the ++ * verify_area(). 
++ */ ++ ++static __inline__ ++unsigned int csum_partial_copy_nocheck(const char *src, char *dst, ++ int len, int sum) ++{ ++ memcpy(dst, src, len); ++ return(csum_partial(dst, len, sum)); ++} ++ ++static __inline__ ++unsigned int csum_partial_copy_from_user(const char *src, char *dst, ++ int len, int sum, int *err_ptr) ++{ ++ return csum_partial_copy_from(src, dst, len, sum, err_ptr); ++} ++ ++/* ++ * These are the old (and unsafe) way of doing checksums, a warning message ++ * will be printed if they are used and an exception occurs. ++ * ++ * these functions should go away after some time. ++ */ ++ ++#define csum_partial_copy_fromuser csum_partial_copy_from_user ++unsigned int csum_partial_copy( const char *src, char *dst, int len, int sum); ++ ++/* ++ * This is a version of ip_compute_csum() optimized for IP headers, ++ * which always checksum on 4 octet boundaries. ++ * ++ * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by ++ * Arnt Gulbrandsen. ++ */ ++static inline unsigned short ip_fast_csum(unsigned char * iph, ++ unsigned int ihl) ++{ ++ unsigned int sum; ++ ++ __asm__ __volatile__( ++ "movl (%1), %0 ;\n" ++ "subl $4, %2 ;\n" ++ "jbe 2f ;\n" ++ "addl 4(%1), %0 ;\n" ++ "adcl 8(%1), %0 ;\n" ++ "adcl 12(%1), %0 ;\n" ++"1: adcl 16(%1), %0 ;\n" ++ "lea 4(%1), %1 ;\n" ++ "decl %2 ;\n" ++ "jne 1b ;\n" ++ "adcl $0, %0 ;\n" ++ "movl %0, %2 ;\n" ++ "shrl $16, %0 ;\n" ++ "addw %w2, %w0 ;\n" ++ "adcl $0, %0 ;\n" ++ "notl %0 ;\n" ++"2: ;\n" ++ /* Since the input registers which are loaded with iph and ihl ++ are modified, we must also specify them as outputs, or gcc ++ will assume they contain their original values. 
*/ ++ : "=r" (sum), "=r" (iph), "=r" (ihl) ++ : "1" (iph), "2" (ihl)); ++ return(sum); ++} ++ ++/* ++ * Fold a partial checksum ++ */ ++ ++static inline unsigned int csum_fold(unsigned int sum) ++{ ++ __asm__( ++ "addl %1, %0 ;\n" ++ "adcl $0xffff, %0 ;\n" ++ : "=r" (sum) ++ : "r" (sum << 16), "0" (sum & 0xffff0000) ++ ); ++ return (~sum) >> 16; ++} ++ ++static inline unsigned long csum_tcpudp_nofold(unsigned long saddr, ++ unsigned long daddr, ++ unsigned short len, ++ unsigned short proto, ++ unsigned int sum) ++{ ++ __asm__( ++ "addl %1, %0 ;\n" ++ "adcl %2, %0 ;\n" ++ "adcl %3, %0 ;\n" ++ "adcl $0, %0 ;\n" ++ : "=r" (sum) ++ : "g" (daddr), "g"(saddr), "g"((ntohs(len)<<16)+proto*256), "0"(sum)); ++ return sum; ++} ++ ++/* ++ * computes the checksum of the TCP/UDP pseudo-header ++ * returns a 16-bit checksum, already complemented ++ */ ++static inline unsigned short int csum_tcpudp_magic(unsigned long saddr, ++ unsigned long daddr, ++ unsigned short len, ++ unsigned short proto, ++ unsigned int sum) ++{ ++ return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum)); ++} ++ ++/* ++ * this routine is used for miscellaneous IP-like checksums, mainly ++ * in icmp.c ++ */ ++ ++static inline unsigned short ip_compute_csum(unsigned char * buff, int len) ++{ ++ return csum_fold (csum_partial(buff, len, 0)); ++} ++ ++#define _HAVE_ARCH_IPV6_CSUM ++static __inline__ unsigned short int csum_ipv6_magic(struct in6_addr *saddr, ++ struct in6_addr *daddr, ++ __u32 len, ++ unsigned short proto, ++ unsigned int sum) ++{ ++ __asm__( ++ "addl 0(%1), %0 ;\n" ++ "adcl 4(%1), %0 ;\n" ++ "adcl 8(%1), %0 ;\n" ++ "adcl 12(%1), %0 ;\n" ++ "adcl 0(%2), %0 ;\n" ++ "adcl 4(%2), %0 ;\n" ++ "adcl 8(%2), %0 ;\n" ++ "adcl 12(%2), %0 ;\n" ++ "adcl %3, %0 ;\n" ++ "adcl %4, %0 ;\n" ++ "adcl $0, %0 ;\n" ++ : "=&r" (sum) ++ : "r" (saddr), "r" (daddr), ++ "r"(htonl(len)), "r"(htonl(proto)), "0"(sum)); ++ ++ return csum_fold(sum); ++} ++ ++/* ++ * Copy and checksum to user ++ */ ++#define 
HAVE_CSUM_COPY_USER ++static __inline__ unsigned int csum_and_copy_to_user(const char *src, ++ char *dst, int len, ++ int sum, int *err_ptr) ++{ ++ if (access_ok(VERIFY_WRITE, dst, len)) ++ return(csum_partial_copy_to(src, dst, len, sum, err_ptr)); ++ ++ if (len) ++ *err_ptr = -EFAULT; ++ ++ return -1; /* invalid checksum */ ++} ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/frame.h um/arch/um/include/sysdep-i386/frame.h +--- orig/arch/um/include/sysdep-i386/frame.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-i386/frame.h Fri Dec 6 14:07:54 2002 +@@ -0,0 +1,29 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __FRAME_I386_H ++#define __FRAME_I386_H ++ ++struct arch_frame_data_raw { ++ unsigned long fp_start; ++ unsigned long sr; ++}; ++ ++struct arch_frame_data { ++ int fpstate_size; ++}; ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/frame_kern.h um/arch/um/include/sysdep-i386/frame_kern.h +--- orig/arch/um/include/sysdep-i386/frame_kern.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-i386/frame_kern.h Mon Dec 2 21:45:04 2002 +@@ -0,0 +1,69 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __FRAME_KERN_I386_H ++#define __FRAME_KERN_I386_H ++ ++/* This is called from sys_sigreturn. It takes the sp at the point of the ++ * sigreturn system call and returns the address of the sigcontext struct ++ * on the stack. ++ */ ++ ++static inline void *sp_to_sc(unsigned long sp) ++{ ++ return((void *) sp); ++} ++ ++static inline void *sp_to_uc(unsigned long sp) ++{ ++ unsigned long uc; ++ ++ uc = sp + signal_frame_si.uc_index - ++ signal_frame_si.common.sp_index - 4; ++ return((void *) uc); ++} ++ ++static inline void *sp_to_rt_sc(unsigned long sp) ++{ ++ unsigned long sc; ++ ++ sc = sp - signal_frame_si.common.sp_index + ++ signal_frame_si.common.len - 4; ++ return((void *) sc); ++} ++ ++static inline void *sp_to_mask(unsigned long sp) ++{ ++ unsigned long mask; ++ ++ mask = sp - signal_frame_sc.common.sp_index + ++ signal_frame_sc.common.len - 8; ++ return((void *) mask); ++} ++ ++extern int sc_size(void *data); ++ ++static inline void *sp_to_rt_mask(unsigned long sp) ++{ ++ unsigned long mask; ++ ++ mask = sp - signal_frame_si.common.sp_index + ++ signal_frame_si.common.len + ++ sc_size(&signal_frame_si.common.arch) - 4; ++ return((void *) mask); ++} ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/frame_user.h um/arch/um/include/sysdep-i386/frame_user.h +--- orig/arch/um/include/sysdep-i386/frame_user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-i386/frame_user.h Fri Dec 6 14:13:59 2002 +@@ -0,0 +1,91 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __FRAME_USER_I386_H ++#define __FRAME_USER_I386_H ++ ++#include <asm/page.h> ++#include "sysdep/frame.h" ++ ++/* This stuff is to calculate the size of the fp state struct at runtime ++ * because it has changed between 2.2 and 2.4 and it would be good for a ++ * UML compiled on one to work on the other. ++ * So, setup_arch_frame_raw fills in the arch struct with the raw data, which ++ * just contains the address of the end of the sigcontext. This is invoked ++ * from the signal handler. ++ * setup_arch_frame uses that data to figure out what ++ * arch_frame_data.fpstate_size should be. It really has no idea, since it's ++ * not allowed to do sizeof(struct fpstate) but it's safe to consider that it's ++ * everything from the end of the sigcontext up to the top of the stack. So, ++ * it masks off the page number to get the offset within the page and subtracts ++ * that from the page size, and that's how big the fpstate struct will be ++ * considered to be. 
++ */ ++ ++static inline void setup_arch_frame_raw(struct arch_frame_data_raw *data, ++ void *end, unsigned long srp) ++{ ++ unsigned long sr = *((unsigned long *) srp); ++ ++ data->fp_start = (unsigned long) end; ++ if((sr & PAGE_MASK) == ((unsigned long) end & PAGE_MASK)) ++ data->sr = sr; ++ else data->sr = 0; ++} ++ ++static inline void setup_arch_frame(struct arch_frame_data_raw *in, ++ struct arch_frame_data *out) ++{ ++ unsigned long fpstate_start = in->fp_start; ++ ++ if(in->sr == 0){ ++ fpstate_start &= ~PAGE_MASK; ++ out->fpstate_size = PAGE_SIZE - fpstate_start; ++ } ++ else { ++ out->fpstate_size = in->sr - fpstate_start; ++ } ++} ++ ++/* This figures out where on the stack the SA_RESTORER function address ++ * is stored. For i386, it's the signal handler return address, so it's ++ * located next to the frame pointer. ++ * This is inlined, so __builtin_frame_address(0) is correct. Otherwise, ++ * it would have to be __builtin_frame_address(1). ++ */ ++ ++static inline unsigned long frame_restorer(void) ++{ ++ unsigned long *fp; ++ ++ fp = __builtin_frame_address(0); ++ return((unsigned long) (fp + 1)); ++} ++ ++/* Similarly, this returns the value of sp when the handler was first ++ * entered. This is used to calculate the proper sp when delivering ++ * signals. ++ */ ++ ++static inline unsigned long frame_sp(void) ++{ ++ unsigned long *fp; ++ ++ fp = __builtin_frame_address(0); ++ return((unsigned long) (fp + 1)); ++} ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/ptrace.h um/arch/um/include/sysdep-i386/ptrace.h +--- orig/arch/um/include/sysdep-i386/ptrace.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-i386/ptrace.h Fri Jan 17 13:23:31 2003 +@@ -0,0 +1,193 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SYSDEP_I386_PTRACE_H ++#define __SYSDEP_I386_PTRACE_H ++ ++#include "uml-config.h" ++ ++#ifdef UML_CONFIG_MODE_TT ++#include "ptrace-tt.h" ++#endif ++ ++#ifdef UML_CONFIG_MODE_SKAS ++#include "ptrace-skas.h" ++#endif ++ ++#include "choose-mode.h" ++ ++union uml_pt_regs { ++#ifdef UML_CONFIG_MODE_TT ++ struct tt_regs { ++ long syscall; ++ void *sc; ++ } tt; ++#endif ++#ifdef UML_CONFIG_MODE_SKAS ++ struct skas_regs { ++ unsigned long regs[HOST_FRAME_SIZE]; ++ unsigned long fp[HOST_FP_SIZE]; ++ unsigned long xfp[HOST_XFP_SIZE]; ++ unsigned long fault_addr; ++ unsigned long fault_type; ++ unsigned long trap_type; ++ long syscall; ++ int is_user; ++ } skas; ++#endif ++}; ++ ++#define EMPTY_UML_PT_REGS { } ++ ++extern int mode_tt; ++ ++#define UPT_SC(r) ((r)->tt.sc) ++#define UPT_IP(r) \ ++ CHOOSE_MODE(SC_IP(UPT_SC(r)), REGS_IP((r)->skas.regs)) ++#define UPT_SP(r) \ ++ CHOOSE_MODE(SC_SP(UPT_SC(r)), REGS_SP((r)->skas.regs)) ++#define UPT_EFLAGS(r) \ ++ CHOOSE_MODE(SC_EFLAGS(UPT_SC(r)), REGS_EFLAGS((r)->skas.regs)) ++#define UPT_EAX(r) \ ++ CHOOSE_MODE(SC_EAX(UPT_SC(r)), REGS_EAX((r)->skas.regs)) ++#define UPT_EBX(r) \ ++ CHOOSE_MODE(SC_EBX(UPT_SC(r)), REGS_EBX((r)->skas.regs)) ++#define UPT_ECX(r) \ ++ CHOOSE_MODE(SC_ECX(UPT_SC(r)), REGS_ECX((r)->skas.regs)) ++#define UPT_EDX(r) \ ++ CHOOSE_MODE(SC_EDX(UPT_SC(r)), REGS_EDX((r)->skas.regs)) ++#define UPT_ESI(r) \ ++ CHOOSE_MODE(SC_ESI(UPT_SC(r)), REGS_ESI((r)->skas.regs)) ++#define UPT_EDI(r) 
\ ++ CHOOSE_MODE(SC_EDI(UPT_SC(r)), REGS_EDI((r)->skas.regs)) ++#define UPT_EBP(r) \ ++ CHOOSE_MODE(SC_EBP(UPT_SC(r)), REGS_EBP((r)->skas.regs)) ++#define UPT_ORIG_EAX(r) \ ++ CHOOSE_MODE((r)->tt.syscall, (r)->skas.syscall) ++#define UPT_CS(r) \ ++ CHOOSE_MODE(SC_CS(UPT_SC(r)), REGS_CS((r)->skas.regs)) ++#define UPT_SS(r) \ ++ CHOOSE_MODE(SC_SS(UPT_SC(r)), REGS_SS((r)->skas.regs)) ++#define UPT_DS(r) \ ++ CHOOSE_MODE(SC_DS(UPT_SC(r)), REGS_DS((r)->skas.regs)) ++#define UPT_ES(r) \ ++ CHOOSE_MODE(SC_ES(UPT_SC(r)), REGS_ES((r)->skas.regs)) ++#define UPT_FS(r) \ ++ CHOOSE_MODE(SC_FS(UPT_SC(r)), REGS_FS((r)->skas.regs)) ++#define UPT_GS(r) \ ++ CHOOSE_MODE(SC_GS(UPT_SC(r)), REGS_GS((r)->skas.regs)) ++ ++#define UPT_SYSCALL_ARG1(r) UPT_EBX(r) ++#define UPT_SYSCALL_ARG2(r) UPT_ECX(r) ++#define UPT_SYSCALL_ARG3(r) UPT_EDX(r) ++#define UPT_SYSCALL_ARG4(r) UPT_ESI(r) ++#define UPT_SYSCALL_ARG5(r) UPT_EDI(r) ++#define UPT_SYSCALL_ARG6(r) UPT_EBP(r) ++ ++extern int user_context(unsigned long sp); ++ ++#define UPT_IS_USER(r) \ ++ CHOOSE_MODE(user_context(UPT_SP(r)), (r)->skas.is_user) ++ ++struct syscall_args { ++ unsigned long args[6]; ++}; ++ ++#define SYSCALL_ARGS(r) ((struct syscall_args) \ ++ { .args = { UPT_SYSCALL_ARG1(r), \ ++ UPT_SYSCALL_ARG2(r), \ ++ UPT_SYSCALL_ARG3(r), \ ++ UPT_SYSCALL_ARG4(r), \ ++ UPT_SYSCALL_ARG5(r), \ ++ UPT_SYSCALL_ARG6(r) } } ) ++ ++#define UPT_REG(regs, reg) \ ++ ({ unsigned long val; \ ++ switch(reg){ \ ++ case EIP: val = UPT_IP(regs); break; \ ++ case UESP: val = UPT_SP(regs); break; \ ++ case EAX: val = UPT_EAX(regs); break; \ ++ case EBX: val = UPT_EBX(regs); break; \ ++ case ECX: val = UPT_ECX(regs); break; \ ++ case EDX: val = UPT_EDX(regs); break; \ ++ case ESI: val = UPT_ESI(regs); break; \ ++ case EDI: val = UPT_EDI(regs); break; \ ++ case EBP: val = UPT_EBP(regs); break; \ ++ case ORIG_EAX: val = UPT_ORIG_EAX(regs); break; \ ++ case CS: val = UPT_CS(regs); break; \ ++ case SS: val = UPT_SS(regs); break; \ ++ case DS: val = 
UPT_DS(regs); break; \ ++ case ES: val = UPT_ES(regs); break; \ ++ case FS: val = UPT_FS(regs); break; \ ++ case GS: val = UPT_GS(regs); break; \ ++ case EFL: val = UPT_EFLAGS(regs); break; \ ++ default : \ ++ panic("Bad register in UPT_REG : %d\n", reg); \ ++ val = -1; \ ++ } \ ++ val; \ ++ }) ++ ++ ++#define UPT_SET(regs, reg, val) \ ++ do { \ ++ switch(reg){ \ ++ case EIP: UPT_IP(regs) = val; break; \ ++ case UESP: UPT_SP(regs) = val; break; \ ++ case EAX: UPT_EAX(regs) = val; break; \ ++ case EBX: UPT_EBX(regs) = val; break; \ ++ case ECX: UPT_ECX(regs) = val; break; \ ++ case EDX: UPT_EDX(regs) = val; break; \ ++ case ESI: UPT_ESI(regs) = val; break; \ ++ case EDI: UPT_EDI(regs) = val; break; \ ++ case EBP: UPT_EBP(regs) = val; break; \ ++ case ORIG_EAX: UPT_ORIG_EAX(regs) = val; break; \ ++ case CS: UPT_CS(regs) = val; break; \ ++ case SS: UPT_SS(regs) = val; break; \ ++ case DS: UPT_DS(regs) = val; break; \ ++ case ES: UPT_ES(regs) = val; break; \ ++ case FS: UPT_FS(regs) = val; break; \ ++ case GS: UPT_GS(regs) = val; break; \ ++ case EFL: UPT_EFLAGS(regs) = val; break; \ ++ default : \ ++ panic("Bad register in UPT_SET : %d\n", reg); \ ++ break; \ ++ } \ ++ } while (0) ++ ++#define UPT_SET_SYSCALL_RETURN(r, res) \ ++ CHOOSE_MODE(SC_SET_SYSCALL_RETURN(UPT_SC(r), (res)), \ ++ REGS_SET_SYSCALL_RETURN((r)->skas.regs, (res))) ++ ++#define UPT_RESTART_SYSCALL(r) \ ++ CHOOSE_MODE(SC_RESTART_SYSCALL(UPT_SC(r)), \ ++ REGS_RESTART_SYSCALL((r)->skas.regs)) ++ ++#define UPT_ORIG_SYSCALL(r) UPT_EAX(r) ++#define UPT_SYSCALL_NR(r) UPT_ORIG_EAX(r) ++#define UPT_SYSCALL_RET(r) UPT_EAX(r) ++ ++#define UPT_SEGV_IS_FIXABLE(r) \ ++ CHOOSE_MODE(SC_SEGV_IS_FIXABLE(UPT_SC(r)), \ ++ REGS_SEGV_IS_FIXABLE(&r->skas)) ++ ++#define UPT_FAULT_ADDR(r) \ ++ CHOOSE_MODE(SC_FAULT_ADDR(UPT_SC(r)), REGS_FAULT_ADDR(&r->skas)) ++ ++#define UPT_FAULT_WRITE(r) \ ++ CHOOSE_MODE(SC_FAULT_WRITE(UPT_SC(r)), REGS_FAULT_WRITE(&r->skas)) ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow 
Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/ptrace_user.h um/arch/um/include/sysdep-i386/ptrace_user.h +--- orig/arch/um/include/sysdep-i386/ptrace_user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-i386/ptrace_user.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,62 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SYSDEP_I386_PTRACE_USER_H__ ++#define __SYSDEP_I386_PTRACE_USER_H__ ++ ++#include <asm/ptrace.h> ++ ++#define PT_OFFSET(r) ((r) * sizeof(long)) ++ ++#define PT_SYSCALL_NR(regs) ((regs)[ORIG_EAX]) ++#define PT_SYSCALL_NR_OFFSET PT_OFFSET(ORIG_EAX) ++ ++#define PT_SYSCALL_ARG1_OFFSET PT_OFFSET(EBX) ++#define PT_SYSCALL_ARG2_OFFSET PT_OFFSET(ECX) ++#define PT_SYSCALL_ARG3_OFFSET PT_OFFSET(EDX) ++#define PT_SYSCALL_ARG4_OFFSET PT_OFFSET(ESI) ++#define PT_SYSCALL_ARG5_OFFSET PT_OFFSET(EDI) ++ ++#define PT_SYSCALL_RET_OFFSET PT_OFFSET(EAX) ++ ++#define PT_IP_OFFSET PT_OFFSET(EIP) ++#define PT_IP(regs) ((regs)[EIP]) ++#define PT_SP(regs) ((regs)[UESP]) ++ ++#ifndef FRAME_SIZE ++#define FRAME_SIZE (17) ++#endif ++#define FRAME_SIZE_OFFSET (FRAME_SIZE * sizeof(unsigned long)) ++ ++#define FP_FRAME_SIZE (27) ++#define FPX_FRAME_SIZE (128) ++ ++#ifdef PTRACE_GETREGS ++#define UM_HAVE_GETREGS ++#endif ++ ++#ifdef PTRACE_SETREGS ++#define UM_HAVE_SETREGS ++#endif ++ ++#ifdef PTRACE_GETFPREGS ++#define UM_HAVE_GETFPREGS ++#endif ++ ++#ifdef PTRACE_SETFPREGS ++#define UM_HAVE_SETFPREGS ++#endif ++ ++#ifdef PTRACE_GETFPXREGS ++#define UM_HAVE_GETFPXREGS ++#endif ++ ++#ifdef PTRACE_SETFPXREGS ++#define UM_HAVE_SETFPXREGS ++#endif ++ ++extern void 
update_debugregs(int seq); ++ ++#endif +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/sigcontext.h um/arch/um/include/sysdep-i386/sigcontext.h +--- orig/arch/um/include/sysdep-i386/sigcontext.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-i386/sigcontext.h Sun Dec 8 18:21:33 2002 +@@ -0,0 +1,49 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SYS_SIGCONTEXT_I386_H ++#define __SYS_SIGCONTEXT_I386_H ++ ++#include "sc.h" ++ ++#define IP_RESTART_SYSCALL(ip) ((ip) -= 2) ++ ++#define SC_RESTART_SYSCALL(sc) IP_RESTART_SYSCALL(SC_IP(sc)) ++#define SC_SET_SYSCALL_RETURN(sc, result) SC_EAX(sc) = (result) ++ ++#define SC_FAULT_ADDR(sc) SC_CR2(sc) ++#define SC_FAULT_TYPE(sc) SC_ERR(sc) ++ ++#define FAULT_WRITE(err) (err & 2) ++#define TO_SC_ERR(is_write) ((is_write) ? 2 : 0) ++ ++#define SC_FAULT_WRITE(sc) (FAULT_WRITE(SC_ERR(sc))) ++ ++#define SC_TRAP_TYPE(sc) SC_TRAPNO(sc) ++ ++/* ptrace expects that, at the start of a system call, %eax contains ++ * -ENOSYS, so this makes it so. ++ */ ++#define SC_START_SYSCALL(sc) do SC_EAX(sc) = -ENOSYS; while(0) ++ ++/* These are General Protection and Page Fault */ ++#define SEGV_IS_FIXABLE(trap) ((trap == 13) || (trap == 14)) ++ ++#define SC_SEGV_IS_FIXABLE(sc) (SEGV_IS_FIXABLE(SC_TRAPNO(sc))) ++ ++extern unsigned long *sc_sigmask(void *sc_ptr); ++extern int sc_get_fpregs(unsigned long buf, void *sc_ptr); ++ ++#endif ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-i386/syscalls.h um/arch/um/include/sysdep-i386/syscalls.h +--- orig/arch/um/include/sysdep-i386/syscalls.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-i386/syscalls.h Sun Dec 8 18:04:15 2002 +@@ -0,0 +1,61 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "asm/unistd.h" ++#include "sysdep/ptrace.h" ++ ++typedef long syscall_handler_t(struct pt_regs); ++ ++#define EXECUTE_SYSCALL(syscall, regs) \ ++ ((long (*)(struct syscall_args)) (*sys_call_table[syscall]))(SYSCALL_ARGS(&regs->regs)) ++ ++extern syscall_handler_t sys_modify_ldt; ++extern syscall_handler_t old_mmap_i386; ++extern syscall_handler_t old_select; ++extern syscall_handler_t sys_ni_syscall; ++ ++#define ARCH_SYSCALLS \ ++ [ __NR_mmap ] = old_mmap_i386, \ ++ [ __NR_select ] = old_select, \ ++ [ __NR_vm86old ] = sys_ni_syscall, \ ++ [ __NR_modify_ldt ] = sys_modify_ldt, \ ++ [ __NR_lchown32 ] = sys_lchown, \ ++ [ __NR_getuid32 ] = sys_getuid, \ ++ [ __NR_getgid32 ] = sys_getgid, \ ++ [ __NR_geteuid32 ] = sys_geteuid, \ ++ [ __NR_getegid32 ] = sys_getegid, \ ++ [ __NR_setreuid32 ] = sys_setreuid, \ ++ [ __NR_setregid32 ] = sys_setregid, \ ++ [ __NR_getgroups32 ] = sys_getgroups, \ ++ [ __NR_setgroups32 ] = sys_setgroups, \ ++ [ __NR_fchown32 ] = sys_fchown, \ ++ [ __NR_setresuid32 ] = sys_setresuid, \ ++ [ __NR_getresuid32 ] = sys_getresuid, \ ++ [ __NR_setresgid32 ] = sys_setresgid, \ ++ [ __NR_getresgid32 ] = sys_getresgid, \ ++ [ __NR_chown32 ] = sys_chown, \ ++ [ __NR_setuid32 ] = sys_setuid, \ ++ [ __NR_setgid32 ] = sys_setgid, \ ++ [ __NR_setfsuid32 ] = sys_setfsuid, \ ++ [ __NR_setfsgid32 ] = sys_setfsgid, \ ++ [ __NR_pivot_root ] = sys_pivot_root, \ ++ [ __NR_mincore ] = sys_mincore, \ ++ [ __NR_madvise ] = sys_madvise, \ ++ [ 222 
] = sys_ni_syscall, ++ ++/* 222 doesn't yet have a name in include/asm-i386/unistd.h */ ++ ++#define LAST_ARCH_SYSCALL 222 ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-ia64/ptrace.h um/arch/um/include/sysdep-ia64/ptrace.h +--- orig/arch/um/include/sysdep-ia64/ptrace.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-ia64/ptrace.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,26 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SYSDEP_IA64_PTRACE_H ++#define __SYSDEP_IA64_PTRACE_H ++ ++struct sys_pt_regs { ++ int foo; ++}; ++ ++#define EMPTY_REGS { 0 } ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-ia64/sigcontext.h um/arch/um/include/sysdep-ia64/sigcontext.h +--- orig/arch/um/include/sysdep-ia64/sigcontext.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-ia64/sigcontext.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,20 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SYSDEP_IA64_SIGCONTEXT_H ++#define __SYSDEP_IA64_SIGCONTEXT_H ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-ia64/syscalls.h um/arch/um/include/sysdep-ia64/syscalls.h +--- orig/arch/um/include/sysdep-ia64/syscalls.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-ia64/syscalls.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,20 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SYSDEP_IA64_SYSCALLS_H ++#define __SYSDEP_IA64_SYSCALLS_H ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-ppc/ptrace.h um/arch/um/include/sysdep-ppc/ptrace.h +--- orig/arch/um/include/sysdep-ppc/ptrace.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-ppc/ptrace.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,104 @@ ++/* ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SYS_PTRACE_PPC_H ++#define __SYS_PTRACE_PPC_H ++ ++#include "linux/config.h" ++#include "linux/types.h" ++ ++/* the following taken from <asm-ppc/ptrace.h> */ ++ ++#ifdef CONFIG_PPC64 ++#define PPC_REG unsigned long /*long*/ ++#else ++#define PPC_REG unsigned long ++#endif ++struct sys_pt_regs_s { ++ PPC_REG gpr[32]; ++ PPC_REG nip; ++ PPC_REG msr; ++ PPC_REG orig_gpr3; /* Used for restarting system calls */ ++ PPC_REG ctr; ++ PPC_REG link; ++ PPC_REG xer; ++ PPC_REG ccr; ++ PPC_REG mq; /* 601 only (not used at present) */ ++ /* Used on APUS to hold IPL value. */ ++ PPC_REG trap; /* Reason for being here */ ++ PPC_REG dar; /* Fault registers */ ++ PPC_REG dsisr; ++ PPC_REG result; /* Result of a system call */ ++}; ++ ++#define NUM_REGS (sizeof(struct sys_pt_regs_s) / sizeof(PPC_REG)) ++ ++struct sys_pt_regs { ++ PPC_REG regs[sizeof(struct sys_pt_regs_s) / sizeof(PPC_REG)]; ++}; ++ ++#define UM_MAX_REG (PT_FPR0) ++#define UM_MAX_REG_OFFSET (UM_MAX_REG * sizeof(PPC_REG)) ++ ++#define EMPTY_REGS { { [ 0 ... 
NUM_REGS - 1] = 0 } } ++ ++#define UM_REG(r, n) ((r)->regs[n]) ++ ++#define UM_SYSCALL_RET(r) UM_REG(r, PT_R3) ++#define UM_SP(r) UM_REG(r, PT_R1) ++#define UM_IP(r) UM_REG(r, PT_NIP) ++#define UM_ELF_ZERO(r) UM_REG(r, PT_FPSCR) ++#define UM_SYSCALL_NR(r) UM_REG(r, PT_R0) ++#define UM_SYSCALL_ARG1(r) UM_REG(r, PT_ORIG_R3) ++#define UM_SYSCALL_ARG2(r) UM_REG(r, PT_R4) ++#define UM_SYSCALL_ARG3(r) UM_REG(r, PT_R5) ++#define UM_SYSCALL_ARG4(r) UM_REG(r, PT_R6) ++#define UM_SYSCALL_ARG5(r) UM_REG(r, PT_R7) ++#define UM_SYSCALL_ARG6(r) UM_REG(r, PT_R8) ++ ++#define UM_SYSCALL_NR_OFFSET (PT_R0 * sizeof(PPC_REG)) ++#define UM_SYSCALL_RET_OFFSET (PT_R3 * sizeof(PPC_REG)) ++#define UM_SYSCALL_ARG1_OFFSET (PT_R3 * sizeof(PPC_REG)) ++#define UM_SYSCALL_ARG2_OFFSET (PT_R4 * sizeof(PPC_REG)) ++#define UM_SYSCALL_ARG3_OFFSET (PT_R5 * sizeof(PPC_REG)) ++#define UM_SYSCALL_ARG4_OFFSET (PT_R6 * sizeof(PPC_REG)) ++#define UM_SYSCALL_ARG5_OFFSET (PT_R7 * sizeof(PPC_REG)) ++#define UM_SYSCALL_ARG6_OFFSET (PT_R8 * sizeof(PPC_REG)) ++#define UM_SP_OFFSET (PT_R1 * sizeof(PPC_REG)) ++#define UM_IP_OFFSET (PT_NIP * sizeof(PPC_REG)) ++#define UM_ELF_ZERO_OFFSET (PT_R3 * sizeof(PPC_REG)) ++ ++#define UM_SET_SYSCALL_RETURN(_regs, result) \ ++do { \ ++ if (result < 0) { \ ++ (_regs)->regs[PT_CCR] |= 0x10000000; \ ++ UM_SYSCALL_RET((_regs)) = -result; \ ++ } else { \ ++ UM_SYSCALL_RET((_regs)) = result; \ ++ } \ ++} while(0) ++ ++extern void shove_aux_table(unsigned long sp); ++#define UM_FIX_EXEC_STACK(sp) shove_aux_table(sp); ++ ++/* These aren't actually defined. The undefs are just to make sure ++ * everyone's clear on the concept. ++ */ ++#undef UML_HAVE_GETREGS ++#undef UML_HAVE_GETFPREGS ++#undef UML_HAVE_SETREGS ++#undef UML_HAVE_SETFPREGS ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. 
This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-ppc/sigcontext.h um/arch/um/include/sysdep-ppc/sigcontext.h +--- orig/arch/um/include/sysdep-ppc/sigcontext.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-ppc/sigcontext.h Sat Nov 23 22:02:19 2002 +@@ -0,0 +1,62 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SYS_SIGCONTEXT_PPC_H ++#define __SYS_SIGCONTEXT_PPC_H ++ ++#define DSISR_WRITE 0x02000000 ++ ++#define SC_FAULT_ADDR(sc) ({ \ ++ struct sigcontext *_sc = (sc); \ ++ long retval = -1; \ ++ switch (_sc->regs->trap) { \ ++ case 0x300: \ ++ /* data exception */ \ ++ retval = _sc->regs->dar; \ ++ break; \ ++ case 0x400: \ ++ /* instruction exception */ \ ++ retval = _sc->regs->nip; \ ++ break; \ ++ default: \ ++ panic("SC_FAULT_ADDR: unhandled trap type\n"); \ ++ } \ ++ retval; \ ++ }) ++ ++#define SC_FAULT_WRITE(sc) ({ \ ++ struct sigcontext *_sc = (sc); \ ++ long retval = -1; \ ++ switch (_sc->regs->trap) { \ ++ case 0x300: \ ++ /* data exception */ \ ++ retval = !!(_sc->regs->dsisr & DSISR_WRITE); \ ++ break; \ ++ case 0x400: \ ++ /* instruction exception: not a write */ \ ++ retval = 0; \ ++ break; \ ++ default: \ ++ panic("SC_FAULT_ADDR: unhandled trap type\n"); \ ++ } \ ++ retval; \ ++ }) ++ ++#define SC_IP(sc) ((sc)->regs->nip) ++#define SC_SP(sc) ((sc)->regs->gpr[1]) ++#define SEGV_IS_FIXABLE(sc) (1) ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysdep-ppc/syscalls.h um/arch/um/include/sysdep-ppc/syscalls.h +--- orig/arch/um/include/sysdep-ppc/syscalls.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysdep-ppc/syscalls.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,50 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++typedef long syscall_handler_t(unsigned long arg1, unsigned long arg2, ++ unsigned long arg3, unsigned long arg4, ++ unsigned long arg5, unsigned long arg6); ++ ++#define EXECUTE_SYSCALL(syscall, regs) \ ++ (*sys_call_table[syscall])(UM_SYSCALL_ARG1(&regs), \ ++ UM_SYSCALL_ARG2(&regs), \ ++ UM_SYSCALL_ARG3(&regs), \ ++ UM_SYSCALL_ARG4(&regs), \ ++ UM_SYSCALL_ARG5(&regs), \ ++ UM_SYSCALL_ARG6(&regs)) ++ ++extern syscall_handler_t sys_mincore; ++extern syscall_handler_t sys_madvise; ++ ++/* old_mmap needs the correct prototype since syscall_kern.c includes ++ * this file. ++ */ ++int old_mmap(unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flags, ++ unsigned long fd, unsigned long offset); ++ ++#define ARCH_SYSCALLS \ ++ [ __NR_modify_ldt ] = sys_ni_syscall, \ ++ [ __NR_pciconfig_read ] = sys_ni_syscall, \ ++ [ __NR_pciconfig_write ] = sys_ni_syscall, \ ++ [ __NR_pciconfig_iobase ] = sys_ni_syscall, \ ++ [ __NR_pivot_root ] = sys_ni_syscall, \ ++ [ __NR_multiplexer ] = sys_ni_syscall, \ ++ [ __NR_mmap ] = old_mmap, \ ++ [ __NR_madvise ] = sys_madvise, \ ++ [ __NR_mincore ] = sys_mincore, ++ ++#define LAST_ARCH_SYSCALL __NR_mincore ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/sysrq.h um/arch/um/include/sysrq.h +--- orig/arch/um/include/sysrq.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/sysrq.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_SYSRQ_H ++#define __UM_SYSRQ_H ++ ++extern void show_trace(unsigned long *stack); ++ ++#endif +diff -Naur -X ../exclude-files orig/arch/um/include/tempfile.h um/arch/um/include/tempfile.h +--- orig/arch/um/include/tempfile.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/tempfile.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,21 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __TEMPFILE_H__ ++#define __TEMPFILE_H__ ++ ++extern int make_tempfile(const char *template, char **tempname, int do_unlink); ++ ++#endif ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/time_user.h um/arch/um/include/time_user.h +--- orig/arch/um/include/time_user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/time_user.h Wed Jan 8 12:55:47 2003 +@@ -0,0 +1,17 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __TIME_USER_H__ ++#define __TIME_USER_H__ ++ ++extern void timer(void); ++extern void switch_timers(int to_real); ++extern void set_interval(int timer_type); ++extern void idle_sleep(int secs); ++extern void enable_timer(void); ++extern unsigned long time_lock(void); ++extern void time_unlock(unsigned long); ++ ++#endif +diff -Naur -X ../exclude-files orig/arch/um/include/tlb.h um/arch/um/include/tlb.h +--- orig/arch/um/include/tlb.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/tlb.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,23 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __TLB_H__ ++#define __TLB_H__ ++ ++extern void mprotect_kernel_vm(int w); ++extern void force_flush_all(void); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/ubd_user.h um/arch/um/include/ubd_user.h +--- orig/arch/um/include/ubd_user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/ubd_user.h Thu Mar 6 18:09:14 2003 +@@ -0,0 +1,77 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Copyright (C) 2001 RidgeRun, Inc (glonnon@ridgerun.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_UBD_USER_H ++#define __UM_UBD_USER_H ++ ++#include "os.h" ++ ++enum ubd_req { UBD_READ, UBD_WRITE }; ++ ++struct io_thread_req { ++ enum ubd_req op; ++ int fds[2]; ++ unsigned long offsets[2]; ++ unsigned long long offset; ++ unsigned long length; ++ char *buffer; ++ int sectorsize; ++ unsigned long sector_mask; ++ unsigned long cow_offset; ++ unsigned long bitmap_words[2]; ++ int error; ++}; ++ ++extern int open_ubd_file(char *file, struct openflags *openflags, ++ char **backing_file_out, int *bitmap_offset_out, ++ unsigned long *bitmap_len_out, int *data_offset_out, ++ int *create_cow_out); ++extern int create_cow_file(char *cow_file, char *backing_file, ++ struct openflags flags, int sectorsize, ++ int *bitmap_offset_out, ++ unsigned long *bitmap_len_out, ++ int *data_offset_out); ++extern int read_cow_bitmap(int fd, void *buf, int offset, int len); ++extern int read_ubd_fs(int fd, void *buffer, int len); ++extern int write_ubd_fs(int fd, char *buffer, int len); ++extern int start_io_thread(unsigned long sp, int *fds_out); ++extern void do_io(struct io_thread_req *req); ++ ++static inline int ubd_test_bit(__u64 bit, unsigned char *data) ++{ ++ __u64 n; ++ int bits, off; ++ ++ bits = sizeof(data[0]) * 8; ++ n = bit / bits; ++ off = bit % bits; ++ return((data[n] & (1 << off)) != 0); ++} ++ ++static inline void ubd_set_bit(__u64 bit, unsigned char *data) ++{ ++ __u64 n; ++ int bits, off; ++ ++ bits = sizeof(data[0]) * 8; ++ 
n = bit / bits; ++ off = bit % bits; ++ data[n] |= (1 << off); ++} ++ ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/um_mmu.h um/arch/um/include/um_mmu.h +--- orig/arch/um/include/um_mmu.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/um_mmu.h Sat Nov 9 12:51:43 2002 +@@ -0,0 +1,40 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __ARCH_UM_MMU_H ++#define __ARCH_UM_MMU_H ++ ++#include "linux/config.h" ++#include "choose-mode.h" ++ ++#ifdef CONFIG_MODE_TT ++#include "../kernel/tt/include/mmu.h" ++#endif ++ ++#ifdef CONFIG_MODE_SKAS ++#include "../kernel/skas/include/mmu.h" ++#endif ++ ++typedef union { ++#ifdef CONFIG_MODE_TT ++ struct mmu_context_tt tt; ++#endif ++#ifdef CONFIG_MODE_SKAS ++ struct mmu_context_skas skas; ++#endif ++} mm_context_t; ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/um_uaccess.h um/arch/um/include/um_uaccess.h +--- orig/arch/um/include/um_uaccess.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/um_uaccess.h Sat Nov 23 22:03:02 2002 +@@ -0,0 +1,73 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __ARCH_UM_UACCESS_H ++#define __ARCH_UM_UACCESS_H ++ ++#include "linux/config.h" ++#include "choose-mode.h" ++ ++#ifdef CONFIG_MODE_TT ++#include "../kernel/tt/include/uaccess.h" ++#endif ++ ++#ifdef CONFIG_MODE_SKAS ++#include "../kernel/skas/include/uaccess.h" ++#endif ++ ++#define access_ok(type, addr, size) \ ++ CHOOSE_MODE_PROC(access_ok_tt, access_ok_skas, type, addr, size) ++ ++static inline int verify_area(int type, const void * addr, unsigned long size) ++{ ++ return(CHOOSE_MODE_PROC(verify_area_tt, verify_area_skas, type, addr, ++ size)); ++} ++ ++static inline int copy_from_user(void *to, const void *from, int n) ++{ ++ return(CHOOSE_MODE_PROC(copy_from_user_tt, copy_from_user_skas, to, ++ from, n)); ++} ++ ++static inline int copy_to_user(void *to, const void *from, int n) ++{ ++ return(CHOOSE_MODE_PROC(copy_to_user_tt, copy_to_user_skas, to, ++ from, n)); ++} ++ ++static inline int strncpy_from_user(char *dst, const char *src, int count) ++{ ++ return(CHOOSE_MODE_PROC(strncpy_from_user_tt, strncpy_from_user_skas, ++ dst, src, count)); ++} ++ ++static inline int __clear_user(void *mem, int len) ++{ ++ return(CHOOSE_MODE_PROC(__clear_user_tt, __clear_user_skas, mem, len)); ++} ++ ++static inline int clear_user(void *mem, int len) ++{ ++ return(CHOOSE_MODE_PROC(clear_user_tt, clear_user_skas, mem, len)); ++} ++ ++static inline int strnlen_user(const void *str, int len) ++{ ++ return(CHOOSE_MODE_PROC(strnlen_user_tt, strnlen_user_skas, str, len)); ++} ++ ++#endif ++ ++/* 
++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/umid.h um/arch/um/include/umid.h +--- orig/arch/um/include/umid.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/umid.h Mon Dec 16 20:52:19 2002 +@@ -0,0 +1,22 @@ ++/* ++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UMID_H__ ++#define __UMID_H__ ++ ++extern int umid_file_name(char *name, char *buf, int len); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/uml_uaccess.h um/arch/um/include/uml_uaccess.h +--- orig/arch/um/include/uml_uaccess.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/uml_uaccess.h Thu Dec 19 13:15:22 2002 +@@ -0,0 +1,28 @@ ++/* ++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UML_UACCESS_H__ ++#define __UML_UACCESS_H__ ++ ++extern int __do_copy_to_user(void *to, const void *from, int n, ++ void **fault_addr, void **fault_catcher); ++extern unsigned long __do_user_copy(void *to, const void *from, int n, ++ void **fault_addr, void **fault_catcher, ++ void (*op)(void *to, const void *from, ++ int n), int *faulted_out); ++void __do_copy(void *to, const void *from, int n); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/umn.h um/arch/um/include/umn.h +--- orig/arch/um/include/umn.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/umn.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,27 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UMN_H ++#define __UMN_H ++ ++extern int open_umn_tty(int *slave_out, int *slipno_out); ++extern void close_umn_tty(int master, int slave); ++extern int umn_send_packet(int fd, void *data, int len); ++extern int set_umn_addr(int fd, char *addr, char *ptp_addr); ++extern void slip_unesc(unsigned char s); ++extern void umn_read(int fd); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/user.h um/arch/um/include/user.h +--- orig/arch/um/include/user.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/user.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,29 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __USER_H__ ++#define __USER_H__ ++ ++extern void panic(const char *fmt, ...); ++extern int printk(const char *fmt, ...); ++extern void schedule(void); ++extern void *um_kmalloc(int size); ++extern void *um_kmalloc_atomic(int size); ++extern void kfree(void *ptr); ++extern int in_aton(char *str); ++extern int open_gdb_chan(void); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/include/user_util.h um/arch/um/include/user_util.h +--- orig/arch/um/include/user_util.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/include/user_util.h Wed Apr 23 20:42:00 2003 +@@ -0,0 +1,103 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __USER_UTIL_H__ ++#define __USER_UTIL_H__ ++ ++#include "sysdep/ptrace.h" ++ ++extern int mode_tt; ++ ++extern int grantpt(int __fd); ++extern int unlockpt(int __fd); ++extern char *ptsname(int __fd); ++ ++enum { OP_NONE, OP_EXEC, OP_FORK, OP_TRACE_ON, OP_REBOOT, OP_HALT, OP_CB }; ++ ++struct cpu_task { ++ int pid; ++ void *task; ++}; ++ ++extern struct cpu_task cpu_tasks[]; ++ ++struct signal_info { ++ void (*handler)(int, union uml_pt_regs *); ++ int is_irq; ++}; ++ ++extern struct signal_info sig_info[]; ++ ++extern unsigned long low_physmem; ++extern unsigned long high_physmem; ++extern unsigned long uml_physmem; ++extern unsigned long uml_reserved; ++extern unsigned long end_vm; ++extern unsigned long start_vm; ++extern unsigned long highmem; ++ ++extern char host_info[]; ++ ++extern char saved_command_line[]; ++extern char command_line[]; ++ ++extern char *tempdir; ++ ++extern unsigned long _stext, _etext, _sdata, _edata, __bss_start, _end; ++extern unsigned long _unprotected_end; ++extern unsigned long brk_start; ++ ++extern int pty_output_sigio; ++extern int pty_close_sigio; ++ ++extern void stop(void); ++extern void stack_protections(unsigned long address); ++extern void task_protections(unsigned long address); ++extern int wait_for_stop(int pid, int sig, int cont_type, void *relay); ++extern void 
*add_signal_handler(int sig, void (*handler)(int)); ++extern int start_fork_tramp(void *arg, unsigned long temp_stack, ++ int clone_flags, int (*tramp)(void *)); ++extern int linux_main(int argc, char **argv); ++extern void set_cmdline(char *cmd); ++extern void input_cb(void (*proc)(void *), void *arg, int arg_len); ++extern int get_pty(void); ++extern void *um_kmalloc(int size); ++extern int raw(int fd, int complain); ++extern int switcheroo(int fd, int prot, void *from, void *to, int size); ++extern void setup_machinename(char *machine_out); ++extern void setup_hostinfo(void); ++extern void add_arg(char *cmd_line, char *arg); ++extern void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int)); ++extern void init_new_thread_signals(int altstack); ++extern void do_exec(int old_pid, int new_pid); ++extern void tracer_panic(char *msg, ...); ++extern char *get_umid(int only_if_set); ++extern void do_longjmp(void *p, int val); ++extern void suspend_new_thread(int fd); ++extern int detach(int pid, int sig); ++extern int attach(int pid); ++extern void kill_child_dead(int pid); ++extern int cont(int pid); ++extern void check_ptrace(void); ++extern void check_sigio(void); ++extern int run_kernel_thread(int (*fn)(void *), void *arg, void **jmp_ptr); ++extern void write_sigio_workaround(void); ++extern void arch_check_bugs(void); ++extern int arch_handle_signal(int sig, union uml_pt_regs *regs); ++extern int arch_fixup(unsigned long address, void *sc_ptr); ++extern int can_do_skas(void); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/Makefile um/arch/um/kernel/Makefile +--- orig/arch/um/kernel/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/Makefile Thu Apr 10 11:14:55 2003 +@@ -0,0 +1,73 @@ ++# ++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++O_TARGET = built-in.o ++ ++obj-y = config.o checksum.o exec_kern.o exitcode.o frame_kern.o frame.o \ ++ helper.o init_task.o irq.o irq_user.o ksyms.o mem.o mem_user.o \ ++ process.o process_kern.o ptrace.o reboot.o resource.o sigio_user.o \ ++ sigio_kern.o signal_kern.o signal_user.o smp.o syscall_kern.o \ ++ syscall_user.o sysrq.o sys_call_table.o tempfile.o time.o \ ++ time_kern.o tlb.o trap_kern.o trap_user.o uaccess_user.o um_arch.o \ ++ umid.o user_syms.o user_util.o ++ ++obj-$(CONFIG_BLK_DEV_INITRD) += initrd_kern.o initrd_user.o ++obj-$(CONFIG_GPROF) += gprof_syms.o ++obj-$(CONFIG_GCOV) += gmon_syms.o ++obj-$(CONFIG_TTY_LOG) += tty_log.o ++ ++subdir-$(CONFIG_MODE_TT) += tt ++subdir-$(CONFIG_MODE_SKAS) += skas ++ ++user-objs-$(CONFIG_TTY_LOG) += tty_log.o ++ ++obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) ++ ++# user_syms.o not included here because Rules.make has its own ideas about ++# building anything in export-objs ++ ++USER_OBJS = $(filter %_user.o,$(obj-y)) $(user-objs-y) config.o helper.o \ ++ process.o tempfile.o time.o umid.o user_util.o ++ ++DMODULES-$(CONFIG_MODULES) = -D__CONFIG_MODULES__ ++DMODVERSIONS-$(CONFIG_MODVERSIONS) = -D__CONFIG_MODVERSIONS__ ++ ++export-objs-$(CONFIG_GPROF) += gprof_syms.o ++export-objs-$(CONFIG_GCOV) += gmon_syms.o ++ ++export-objs = ksyms.o process_kern.o signal_kern.o user_syms.o $(export-objs-y) ++ ++CFLAGS_user_syms.o = -D__AUTOCONF_INCLUDED__ $(DMODULES-y) $(DMODVERSIONS-y) \ ++ -I/usr/include -I../include ++ ++CFLAGS_frame.o := $(patsubst 
-fomit-frame-pointer,,$(USER_CFLAGS)) ++ ++include $(TOPDIR)/Rules.make ++ ++$(USER_OBJS) : %.o: %.c ++ $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $< ++ ++# This has to be separate because it needs be compiled with frame pointers ++# regardless of how the rest of the kernel is built. ++ ++frame.o: frame.c ++ $(CC) $(CFLAGS_$@) -c -o $@ $< ++ ++QUOTE = 'my $$config=`cat $(TOPDIR)/.config`; $$config =~ s/"/\\"/g ; $$config =~ s/\n/\\n"\n"/g ; while(<STDIN>) { $$_ =~ s/CONFIG/$$config/; print $$_ }' ++ ++config.c : config.c.in $(TOPDIR)/.config ++ $(PERL) -e $(QUOTE) < config.c.in > $@ ++ ++clean: ++ $(RM) config.c ++ for dir in $(subdir-y) ; do $(MAKE) -C $$dir clean; done ++ ++modules: ++ ++fastdep: ++ ++dep: ++ ++archmrproper: clean +diff -Naur -X ../exclude-files orig/arch/um/kernel/checksum.c um/arch/um/kernel/checksum.c +--- orig/arch/um/kernel/checksum.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/checksum.c Thu Oct 31 22:39:58 2002 +@@ -0,0 +1,42 @@ ++#include "asm/uaccess.h" ++#include "linux/errno.h" ++ ++extern unsigned int arch_csum_partial(const char *buff, int len, int sum); ++ ++extern unsigned int csum_partial(char *buff, int len, int sum) ++{ ++ return(arch_csum_partial(buff, len, sum)); ++} ++ ++unsigned int csum_partial_copy_to(const char *src, char *dst, int len, ++ int sum, int *err_ptr) ++{ ++ if(copy_to_user(dst, src, len)){ ++ *err_ptr = -EFAULT; ++ return(-1); ++ } ++ ++ return(arch_csum_partial(src, len, sum)); ++} ++ ++unsigned int csum_partial_copy_from(const char *src, char *dst, int len, ++ int sum, int *err_ptr) ++{ ++ if(copy_from_user(dst, src, len)){ ++ *err_ptr = -EFAULT; ++ return(-1); ++ } ++ ++ return(arch_csum_partial(dst, len, sum)); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/config.c.in um/arch/um/kernel/config.c.in +--- orig/arch/um/kernel/config.c.in Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/config.c.in Thu Apr 10 11:17:55 2003 +@@ -0,0 +1,32 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++#include "init.h" ++ ++static __initdata char *config = "CONFIG"; ++ ++static int __init print_config(char *line, int *add) ++{ ++ printf("%s", config); ++ exit(0); ++} ++ ++__uml_setup("--showconfig", print_config, ++"--showconfig\n" ++" Prints the config file that this UML binary was generated from.\n\n" ++); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/exec_kern.c um/arch/um/kernel/exec_kern.c +--- orig/arch/um/kernel/exec_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/exec_kern.c Wed Apr 16 16:35:05 2003 +@@ -0,0 +1,86 @@ ++/* ++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/slab.h" ++#include "linux/smp_lock.h" ++#include "asm/ptrace.h" ++#include "asm/pgtable.h" ++#include "asm/pgalloc.h" ++#include "asm/uaccess.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "mem_user.h" ++#include "kern.h" ++#include "irq_user.h" ++#include "tlb.h" ++#include "2_5compat.h" ++#include "os.h" ++#include "time_user.h" ++#include "choose-mode.h" ++#include "mode_kern.h" ++ ++void flush_thread(void) ++{ ++ CHOOSE_MODE(flush_thread_tt(), flush_thread_skas()); ++} ++ ++void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp) ++{ ++ CHOOSE_MODE_PROC(start_thread_tt, start_thread_skas, regs, eip, esp); ++} ++ ++extern void log_exec(char **argv, void *tty); ++ ++static int execve1(char *file, char **argv, char **env) ++{ ++ int error; ++ ++#ifdef CONFIG_TTY_LOG ++ log_exec(argv, current->tty); ++#endif ++ error = do_execve(file, argv, env, &current->thread.regs); ++ if (error == 0){ ++ current->ptrace &= ~PT_DTRACE; ++ set_cmdline(current_cmd()); ++ } ++ return(error); ++} ++ ++int um_execve(char *file, char **argv, char **env) ++{ ++ int err; ++ ++ err = execve1(file, argv, env); ++ if(!err) ++ do_longjmp(current->thread.exec_buf, 1); ++ return(err); ++} ++ ++int sys_execve(char *file, char **argv, char **env) ++{ ++ int error; ++ char *filename; ++ ++ lock_kernel(); ++ filename = getname((char *) file); ++ error = PTR_ERR(filename); ++ if (IS_ERR(filename)) goto out; ++ error = execve1(filename, argv, env); ++ putname(filename); ++ out: 
++ unlock_kernel(); ++ return(error); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/exitcode.c um/arch/um/kernel/exitcode.c +--- orig/arch/um/kernel/exitcode.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/exitcode.c Thu Nov 7 18:22:04 2002 +@@ -0,0 +1,73 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/init.h" ++#include "linux/ctype.h" ++#include "linux/proc_fs.h" ++#include "asm/uaccess.h" ++ ++/* If read and write race, the read will still atomically read a valid ++ * value. ++ */ ++int uml_exitcode = 0; ++ ++static int read_proc_exitcode(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ len = sprintf(page, "%d\n", uml_exitcode); ++ len -= off; ++ if(len <= off+count) *eof = 1; ++ *start = page + off; ++ if(len > count) len = count; ++ if(len < 0) len = 0; ++ return(len); ++} ++ ++static int write_proc_exitcode(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ char *end, buf[sizeof("nnnnn\0")]; ++ int tmp; ++ ++ if(copy_from_user(buf, buffer, count)) ++ return(-EFAULT); ++ tmp = simple_strtol(buf, &end, 0); ++ if((*end != '\0') && !isspace(*end)) ++ return(-EINVAL); ++ uml_exitcode = tmp; ++ return(count); ++} ++ ++static int make_proc_exitcode(void) ++{ ++ struct proc_dir_entry *ent; ++ ++ ent = create_proc_entry("exitcode", 0600, &proc_root); ++ if(ent == NULL){ ++ printk("make_proc_exitcode : Failed to register " ++ "/proc/exitcode\n"); ++ return(0); ++ } ++ ++ ent->read_proc = read_proc_exitcode; ++ ent->write_proc = 
write_proc_exitcode; ++ ++ return(0); ++} ++ ++__initcall(make_proc_exitcode); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/frame.c um/arch/um/kernel/frame.c +--- orig/arch/um/kernel/frame.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/frame.c Wed Dec 11 11:12:41 2002 +@@ -0,0 +1,342 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++#include <unistd.h> ++#include <string.h> ++#include <signal.h> ++#include <wait.h> ++#include <sched.h> ++#include <errno.h> ++#include <sys/ptrace.h> ++#include <sys/syscall.h> ++#include <sys/mman.h> ++#include <asm/page.h> ++#include <asm/ptrace.h> ++#include <asm/sigcontext.h> ++#include "sysdep/ptrace.h" ++#include "sysdep/sigcontext.h" ++#include "frame_user.h" ++#include "kern_util.h" ++#include "ptrace_user.h" ++#include "os.h" ++ ++static int capture_stack(int (*child)(void *arg), void *arg, void *sp, ++ unsigned long top, void **data_out) ++{ ++ unsigned long regs[FRAME_SIZE]; ++ int pid, status, n, len; ++ ++ /* Start the child as a thread */ ++ pid = clone(child, sp, CLONE_VM | SIGCHLD, arg); ++ if(pid < 0){ ++ printf("capture_stack : clone failed - errno = %d\n", errno); ++ exit(1); ++ } ++ ++ /* Wait for it to stop itself and continue it with a SIGUSR1 to force ++ * it into the signal handler. 
++ */ ++ n = waitpid(pid, &status, WUNTRACED); ++ if(n < 0){ ++ printf("capture_stack : waitpid failed - errno = %d\n", errno); ++ exit(1); ++ } ++ if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)){ ++ fprintf(stderr, "capture_stack : Expected SIGSTOP, " ++ "got status = 0x%x\n", status); ++ exit(1); ++ } ++ if(ptrace(PTRACE_CONT, pid, 0, SIGUSR1) < 0){ ++ printf("capture_stack : PTRACE_CONT failed - errno = %d\n", ++ errno); ++ exit(1); ++ } ++ ++ /* Wait for it to stop itself again and grab its registers again. ++ * At this point, the handler has stuffed the addresses of ++ * sig, sc, and SA_RESTORER in raw. ++ */ ++ n = waitpid(pid, &status, WUNTRACED); ++ if(n < 0){ ++ printf("capture_stack : waitpid failed - errno = %d\n", errno); ++ exit(1); ++ } ++ if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)){ ++ fprintf(stderr, "capture_stack : Expected SIGSTOP, " ++ "got status = 0x%x\n", status); ++ exit(1); ++ } ++ if(ptrace(PTRACE_GETREGS, pid, 0, regs) < 0){ ++ printf("capture_stack : PTRACE_GETREGS failed - errno = %d\n", ++ errno); ++ exit(1); ++ } ++ ++ /* It has outlived its usefulness, so continue it so it can exit */ ++ if(ptrace(PTRACE_CONT, pid, 0, 0) < 0){ ++ printf("capture_stack : PTRACE_CONT failed - errno = %d\n", ++ errno); ++ exit(1); ++ } ++ if(waitpid(pid, &status, 0) < 0){ ++ printf("capture_stack : waitpid failed - errno = %d\n", errno); ++ exit(1); ++ } ++ if(!WIFSIGNALED(status) || (WTERMSIG(status) != 9)){ ++ printf("capture_stack : Expected exit signal 9, " ++ "got status = 0x%x\n", status); ++ exit(1); ++ } ++ ++ /* The frame that we want is the top of the signal stack */ ++ ++ len = top - PT_SP(regs); ++ *data_out = malloc(len); ++ if(*data_out == NULL){ ++ printf("capture_stack : malloc failed - errno = %d\n", errno); ++ exit(1); ++ } ++ memcpy(*data_out, (void *) PT_SP(regs), len); ++ ++ return(len); ++} ++ ++struct common_raw { ++ void *stack; ++ int size; ++ unsigned long sig; ++ unsigned long sr; ++ unsigned long sp; ++ 
struct arch_frame_data_raw arch; ++}; ++ ++#define SA_RESTORER (0x04000000) ++ ++typedef unsigned long old_sigset_t; ++ ++struct old_sigaction { ++ __sighandler_t handler; ++ old_sigset_t sa_mask; ++ unsigned long sa_flags; ++ void (*sa_restorer)(void); ++}; ++ ++static void child_common(struct common_raw *common, sighandler_t handler, ++ int restorer, int flags) ++{ ++ stack_t ss = ((stack_t) { .ss_sp = common->stack, ++ .ss_flags = 0, ++ .ss_size = common->size }); ++ int err; ++ ++ if(ptrace(PTRACE_TRACEME, 0, 0, 0) < 0){ ++ printf("PTRACE_TRACEME failed, errno = %d\n", errno); ++ } ++ if(sigaltstack(&ss, NULL) < 0){ ++ printf("sigaltstack failed - errno = %d\n", errno); ++ kill(getpid(), SIGKILL); ++ } ++ ++ if(restorer){ ++ struct sigaction sa; ++ ++ sa.sa_handler = handler; ++ sigemptyset(&sa.sa_mask); ++ sa.sa_flags = SA_ONSTACK | flags; ++ err = sigaction(SIGUSR1, &sa, NULL); ++ } ++ else { ++ struct old_sigaction sa; ++ ++ sa.handler = handler; ++ sa.sa_mask = 0; ++ sa.sa_flags = (SA_ONSTACK | flags) & ~SA_RESTORER; ++ err = syscall(__NR_sigaction, SIGUSR1, &sa, NULL); ++ } ++ ++ if(err < 0){ ++ printf("sigaction failed - errno = %d\n", errno); ++ kill(getpid(), SIGKILL); ++ } ++ ++ os_stop_process(os_getpid()); ++} ++ ++/* Changed only during early boot */ ++struct sc_frame signal_frame_sc; ++ ++struct sc_frame signal_frame_sc_sr; ++ ++struct sc_frame_raw { ++ struct common_raw common; ++ unsigned long sc; ++ int restorer; ++}; ++ ++/* Changed only during early boot */ ++static struct sc_frame_raw *raw_sc = NULL; ++ ++static void sc_handler(int sig, struct sigcontext sc) ++{ ++ raw_sc->common.sig = (unsigned long) &sig; ++ raw_sc->common.sr = frame_restorer(); ++ raw_sc->common.sp = frame_sp(); ++ raw_sc->sc = (unsigned long) &sc; ++ setup_arch_frame_raw(&raw_sc->common.arch, &sc + 1, raw_sc->common.sr); ++ ++ os_stop_process(os_getpid()); ++ kill(getpid(), SIGKILL); ++} ++ ++static int sc_child(void *arg) ++{ ++ raw_sc = arg; ++ 
child_common(&raw_sc->common, (sighandler_t) sc_handler, ++ raw_sc->restorer, 0); ++ return(-1); ++} ++ ++/* Changed only during early boot */ ++struct si_frame signal_frame_si; ++ ++struct si_frame_raw { ++ struct common_raw common; ++ unsigned long sip; ++ unsigned long si; ++ unsigned long ucp; ++ unsigned long uc; ++}; ++ ++/* Changed only during early boot */ ++static struct si_frame_raw *raw_si = NULL; ++ ++static void si_handler(int sig, siginfo_t *si, struct ucontext *ucontext) ++{ ++ raw_si->common.sig = (unsigned long) &sig; ++ raw_si->common.sr = frame_restorer(); ++ raw_si->common.sp = frame_sp(); ++ raw_si->sip = (unsigned long) &si; ++ raw_si->si = (unsigned long) si; ++ raw_si->ucp = (unsigned long) &ucontext; ++ raw_si->uc = (unsigned long) ucontext; ++ setup_arch_frame_raw(&raw_si->common.arch, ++ ucontext->uc_mcontext.fpregs, raw_si->common.sr); ++ ++ os_stop_process(os_getpid()); ++ kill(getpid(), SIGKILL); ++} ++ ++static int si_child(void *arg) ++{ ++ raw_si = arg; ++ child_common(&raw_si->common, (sighandler_t) si_handler, 1, ++ SA_SIGINFO); ++ return(-1); ++} ++ ++static int relative_sr(unsigned long sr, int sr_index, void *stack, ++ void *framep) ++{ ++ unsigned long *srp = (unsigned long *) sr; ++ unsigned long frame = (unsigned long) framep; ++ ++ if((*srp & PAGE_MASK) == (unsigned long) stack){ ++ *srp -= sr; ++ *((unsigned long *) (frame + sr_index)) = *srp; ++ return(1); ++ } ++ else return(0); ++} ++ ++static unsigned long capture_stack_common(int (*proc)(void *), void *arg, ++ struct common_raw *common_in, ++ void *top, void *sigstack, ++ int stack_len, ++ struct frame_common *common_out) ++{ ++ unsigned long sig_top = (unsigned long) sigstack + stack_len, base; ++ ++ common_in->stack = (void *) sigstack; ++ common_in->size = stack_len; ++ common_out->len = capture_stack(proc, arg, top, sig_top, ++ &common_out->data); ++ base = sig_top - common_out->len; ++ common_out->sig_index = common_in->sig - base; ++ common_out->sp_index = 
common_in->sp - base; ++ common_out->sr_index = common_in->sr - base; ++ common_out->sr_relative = relative_sr(common_in->sr, ++ common_out->sr_index, sigstack, ++ common_out->data); ++ return(base); ++} ++ ++void capture_signal_stack(void) ++{ ++ struct sc_frame_raw raw_sc; ++ struct si_frame_raw raw_si; ++ void *stack, *sigstack; ++ unsigned long top, sig_top, base; ++ ++ stack = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, ++ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ++ sigstack = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, ++ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ++ if((stack == MAP_FAILED) || (sigstack == MAP_FAILED)){ ++ printf("capture_signal_stack : mmap failed - errno = %d\n", ++ errno); ++ exit(1); ++ } ++ ++ top = (unsigned long) stack + PAGE_SIZE - sizeof(void *); ++ sig_top = (unsigned long) sigstack + PAGE_SIZE; ++ ++ /* Get the sigcontext, no sigrestorer layout */ ++ raw_sc.restorer = 0; ++ base = capture_stack_common(sc_child, &raw_sc, &raw_sc.common, ++ (void *) top, sigstack, PAGE_SIZE, ++ &signal_frame_sc.common); ++ ++ signal_frame_sc.sc_index = raw_sc.sc - base; ++ setup_arch_frame(&raw_sc.common.arch, &signal_frame_sc.common.arch); ++ ++ /* Ditto for the sigcontext, sigrestorer layout */ ++ raw_sc.restorer = 1; ++ base = capture_stack_common(sc_child, &raw_sc, &raw_sc.common, ++ (void *) top, sigstack, PAGE_SIZE, ++ &signal_frame_sc_sr.common); ++ signal_frame_sc_sr.sc_index = raw_sc.sc - base; ++ setup_arch_frame(&raw_sc.common.arch, &signal_frame_sc_sr.common.arch); ++ ++ /* And the siginfo layout */ ++ ++ base = capture_stack_common(si_child, &raw_si, &raw_si.common, ++ (void *) top, sigstack, PAGE_SIZE, ++ &signal_frame_si.common); ++ signal_frame_si.sip_index = raw_si.sip - base; ++ signal_frame_si.si_index = raw_si.si - base; ++ signal_frame_si.ucp_index = raw_si.ucp - base; ++ signal_frame_si.uc_index = raw_si.uc - base; ++ setup_arch_frame(&raw_si.common.arch, &signal_frame_si.common.arch); ++ ++ if((munmap(stack, 
PAGE_SIZE) < 0) || ++ (munmap(sigstack, PAGE_SIZE) < 0)){ ++ printf("capture_signal_stack : munmap failed - errno = %d\n", ++ errno); ++ exit(1); ++ } ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/frame_kern.c um/arch/um/kernel/frame_kern.c +--- orig/arch/um/kernel/frame_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/frame_kern.c Sun Dec 8 19:44:13 2002 +@@ -0,0 +1,171 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "asm/ptrace.h" ++#include "asm/uaccess.h" ++#include "asm/signal.h" ++#include "asm/uaccess.h" ++#include "asm/ucontext.h" ++#include "frame_kern.h" ++#include "sigcontext.h" ++#include "sysdep/ptrace.h" ++#include "choose-mode.h" ++#include "mode.h" ++ ++int copy_siginfo_to_user(siginfo_t *to, siginfo_t *from) ++{ ++ if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t))) ++ return -EFAULT; ++ if (from->si_code < 0) ++ return __copy_to_user(to, from, sizeof(siginfo_t)); ++ else { ++ int err; ++ ++ /* If you change siginfo_t structure, please be sure ++ this code is fixed accordingly. ++ It should never copy any pad contained in the structure ++ to avoid security leaks, but must copy the generic ++ 3 ints plus the relevant union member. */ ++ err = __put_user(from->si_signo, &to->si_signo); ++ err |= __put_user(from->si_errno, &to->si_errno); ++ err |= __put_user((short)from->si_code, &to->si_code); ++ /* First 32bits of unions are always present. 
*/ ++ err |= __put_user(from->si_pid, &to->si_pid); ++ switch (from->si_code >> 16) { ++ case __SI_FAULT >> 16: ++ break; ++ case __SI_CHLD >> 16: ++ err |= __put_user(from->si_utime, &to->si_utime); ++ err |= __put_user(from->si_stime, &to->si_stime); ++ err |= __put_user(from->si_status, &to->si_status); ++ default: ++ err |= __put_user(from->si_uid, &to->si_uid); ++ break; ++ } ++ return err; ++ } ++} ++ ++static int copy_restorer(void (*restorer)(void), unsigned long start, ++ unsigned long sr_index, int sr_relative) ++{ ++ unsigned long sr; ++ ++ if(sr_relative){ ++ sr = (unsigned long) restorer; ++ sr += start + sr_index; ++ restorer = (void (*)(void)) sr; ++ } ++ ++ return(copy_to_user((void *) (start + sr_index), &restorer, ++ sizeof(restorer))); ++} ++ ++static int copy_sc_to_user(void *to, void *fp, struct pt_regs *from, ++ struct arch_frame_data *arch) ++{ ++ return(CHOOSE_MODE(copy_sc_to_user_tt(to, fp, UPT_SC(&from->regs), ++ arch), ++ copy_sc_to_user_skas(to, fp, &from->regs, ++ current->thread.cr2, ++ current->thread.err))); ++} ++ ++static int copy_ucontext_to_user(struct ucontext *uc, void *fp, sigset_t *set, ++ unsigned long sp) ++{ ++ int err = 0; ++ ++ err |= put_user(current->sas_ss_sp, &uc->uc_stack.ss_sp); ++ err |= put_user(sas_ss_flags(sp), &uc->uc_stack.ss_flags); ++ err |= put_user(current->sas_ss_size, &uc->uc_stack.ss_size); ++ err |= copy_sc_to_user(&uc->uc_mcontext, fp, &current->thread.regs, ++ &signal_frame_si.common.arch); ++ err |= copy_to_user(&uc->uc_sigmask, set, sizeof(*set)); ++ return(err); ++} ++ ++int setup_signal_stack_si(unsigned long stack_top, int sig, ++ unsigned long handler, void (*restorer)(void), ++ struct pt_regs *regs, siginfo_t *info, ++ sigset_t *mask) ++{ ++ unsigned long start; ++ void *sip, *ucp, *fp; ++ ++ start = stack_top - signal_frame_si.common.len; ++ sip = (void *) (start + signal_frame_si.si_index); ++ ucp = (void *) (start + signal_frame_si.uc_index); ++ fp = (void *) (((unsigned long) ucp) + 
sizeof(struct ucontext)); ++ ++ if(restorer == NULL) ++ panic("setup_signal_stack_si - no restorer"); ++ ++ if(copy_to_user((void *) start, signal_frame_si.common.data, ++ signal_frame_si.common.len) || ++ copy_to_user((void *) (start + signal_frame_si.common.sig_index), ++ &sig, sizeof(sig)) || ++ copy_siginfo_to_user(sip, info) || ++ copy_to_user((void *) (start + signal_frame_si.sip_index), &sip, ++ sizeof(sip)) || ++ copy_ucontext_to_user(ucp, fp, mask, PT_REGS_SP(regs)) || ++ copy_to_user((void *) (start + signal_frame_si.ucp_index), &ucp, ++ sizeof(ucp)) || ++ copy_restorer(restorer, start, signal_frame_si.common.sr_index, ++ signal_frame_si.common.sr_relative)) ++ return(1); ++ ++ PT_REGS_IP(regs) = handler; ++ PT_REGS_SP(regs) = start + signal_frame_si.common.sp_index; ++ return(0); ++} ++ ++int setup_signal_stack_sc(unsigned long stack_top, int sig, ++ unsigned long handler, void (*restorer)(void), ++ struct pt_regs *regs, sigset_t *mask) ++{ ++ struct frame_common *frame = &signal_frame_sc_sr.common; ++ void *user_sc; ++ int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); ++ unsigned long sigs, sr; ++ unsigned long start = stack_top - frame->len - sig_size; ++ ++ user_sc = (void *) (start + signal_frame_sc_sr.sc_index); ++ if(restorer == NULL){ ++ frame = &signal_frame_sc.common; ++ user_sc = (void *) (start + signal_frame_sc.sc_index); ++ sr = (unsigned long) frame->data; ++ sr += frame->sr_index; ++ sr = *((unsigned long *) sr); ++ restorer = ((void (*)(void)) sr); ++ } ++ ++ sigs = start + frame->len; ++ if(copy_to_user((void *) start, frame->data, frame->len) || ++ copy_to_user((void *) (start + frame->sig_index), &sig, ++ sizeof(sig)) || ++ copy_sc_to_user(user_sc, NULL, regs, ++ &signal_frame_sc.common.arch) || ++ copy_to_user(sc_sigmask(user_sc), mask, sizeof(mask->sig[0])) || ++ copy_to_user((void *) sigs, &mask->sig[1], sig_size) || ++ copy_restorer(restorer, start, frame->sr_index, frame->sr_relative)) ++ return(1); ++ ++ PT_REGS_IP(regs) 
= handler; ++ PT_REGS_SP(regs) = start + frame->sp_index; ++ ++ return(0); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/gmon_syms.c um/arch/um/kernel/gmon_syms.c +--- orig/arch/um/kernel/gmon_syms.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/gmon_syms.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,20 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/module.h" ++ ++extern void __bb_init_func(void *); ++EXPORT_SYMBOL(__bb_init_func); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/gprof_syms.c um/arch/um/kernel/gprof_syms.c +--- orig/arch/um/kernel/gprof_syms.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/gprof_syms.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,20 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/module.h" ++ ++extern void mcount(void); ++EXPORT_SYMBOL(mcount); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/helper.c um/arch/um/kernel/helper.c +--- orig/arch/um/kernel/helper.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/helper.c Thu Oct 31 10:34:23 2002 +@@ -0,0 +1,153 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++#include <unistd.h> ++#include <errno.h> ++#include <fcntl.h> ++#include <sched.h> ++#include <sys/signal.h> ++#include <sys/wait.h> ++#include "user.h" ++#include "kern_util.h" ++#include "os.h" ++ ++struct helper_data { ++ void (*pre_exec)(void*); ++ void *pre_data; ++ char **argv; ++ int fd; ++}; ++ ++/* Debugging aid, changed only from gdb */ ++int helper_pause = 0; ++ ++static void helper_hup(int sig) ++{ ++} ++ ++static int helper_child(void *arg) ++{ ++ struct helper_data *data = arg; ++ char **argv = data->argv; ++ ++ if(helper_pause){ ++ signal(SIGHUP, helper_hup); ++ pause(); ++ } ++ if(data->pre_exec != NULL) ++ (*data->pre_exec)(data->pre_data); ++ execvp(argv[0], argv); ++ printk("execvp of '%s' failed - errno = %d\n", argv[0], errno); ++ write(data->fd, &errno, sizeof(errno)); ++ os_kill_process(os_getpid(), 0); ++ return(0); ++} ++ ++/* XXX The alloc_stack here breaks if this is called in the tracing thread */ ++ ++int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv, ++ unsigned long *stack_out) ++{ ++ struct helper_data data; ++ unsigned long stack, sp; ++ int pid, fds[2], err, n; ++ ++ if((stack_out != NULL) && (*stack_out != 0)) ++ stack = *stack_out; ++ else stack = alloc_stack(0, um_in_interrupt()); ++ if(stack == 0) return(-ENOMEM); ++ ++ err = os_pipe(fds, 1, 0); ++ if(err){ ++ printk("run_helper : pipe failed, errno = %d\n", -err); ++ return(err); ++ } ++ if(fcntl(fds[1], F_SETFD, 1) != 0){ ++ printk("run_helper : setting 
FD_CLOEXEC failed, errno = %d\n", ++ errno); ++ return(-errno); ++ } ++ ++ sp = stack + page_size() - sizeof(void *); ++ data.pre_exec = pre_exec; ++ data.pre_data = pre_data; ++ data.argv = argv; ++ data.fd = fds[1]; ++ pid = clone(helper_child, (void *) sp, CLONE_VM | SIGCHLD, &data); ++ if(pid < 0){ ++ printk("run_helper : clone failed, errno = %d\n", errno); ++ return(-errno); ++ } ++ close(fds[1]); ++ n = read(fds[0], &err, sizeof(err)); ++ if(n < 0){ ++ printk("run_helper : read on pipe failed, errno = %d\n", ++ errno); ++ return(-errno); ++ } ++ else if(n != 0){ ++ waitpid(pid, NULL, 0); ++ pid = -err; ++ } ++ ++ if(stack_out == NULL) free_stack(stack, 0); ++ else *stack_out = stack; ++ return(pid); ++} ++ ++int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, ++ unsigned long *stack_out, int stack_order) ++{ ++ unsigned long stack, sp; ++ int pid, status; ++ ++ stack = alloc_stack(stack_order, um_in_interrupt()); ++ if(stack == 0) return(-ENOMEM); ++ ++ sp = stack + (page_size() << stack_order) - sizeof(void *); ++ pid = clone(proc, (void *) sp, flags | SIGCHLD, arg); ++ if(pid < 0){ ++ printk("run_helper_thread : clone failed, errno = %d\n", ++ errno); ++ return(-errno); ++ } ++ if(stack_out == NULL){ ++ pid = waitpid(pid, &status, 0); ++ if(pid < 0) ++ printk("run_helper_thread - wait failed, errno = %d\n", ++ pid); ++ if(!WIFEXITED(status) || (WEXITSTATUS(status) != 0)) ++ printk("run_helper_thread - thread returned status " ++ "0x%x\n", status); ++ free_stack(stack, stack_order); ++ } ++ else *stack_out = stack; ++ return(pid); ++} ++ ++int helper_wait(int pid, int block) ++{ ++ int ret; ++ ++ ret = waitpid(pid, NULL, WNOHANG); ++ if(ret < 0){ ++ printk("helper_wait : waitpid failed, errno = %d\n", errno); ++ return(-errno); ++ } ++ return(ret); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/init_task.c um/arch/um/kernel/init_task.c +--- orig/arch/um/kernel/init_task.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/init_task.c Sat Dec 28 19:58:44 2002 +@@ -0,0 +1,61 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/config.h" ++#include "linux/mm.h" ++#include "linux/sched.h" ++#include "linux/version.h" ++#include "asm/uaccess.h" ++#include "asm/pgtable.h" ++#include "user_util.h" ++#include "mem_user.h" ++ ++static struct fs_struct init_fs = INIT_FS; ++static struct files_struct init_files = INIT_FILES; ++static struct signal_struct init_signals = INIT_SIGNALS; ++struct mm_struct init_mm = INIT_MM(init_mm); ++ ++/* ++ * Initial task structure. ++ * ++ * We need to make sure that this is 16384-byte aligned due to the ++ * way process stacks are handled. This is done by having a special ++ * "init_task" linker map entry.. ++ */ ++ ++union task_union init_task_union ++__attribute__((__section__(".data.init_task"))) = ++{ INIT_TASK(init_task_union.task) }; ++ ++struct task_struct *alloc_task_struct(void) ++{ ++ return((struct task_struct *) ++ __get_free_pages(GFP_KERNEL, CONFIG_KERNEL_STACK_ORDER)); ++} ++ ++void unprotect_stack(unsigned long stack) ++{ ++ protect_memory(stack, (1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE, ++ 1, 1, 0, 1); ++} ++ ++void free_task_struct(struct task_struct *task) ++{ ++ /* free_pages decrements the page counter and only actually frees ++ * the pages if they are now not accessed by anything. 
++ */ ++ free_pages((unsigned long) task, CONFIG_KERNEL_STACK_ORDER); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/initrd_kern.c um/arch/um/kernel/initrd_kern.c +--- orig/arch/um/kernel/initrd_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/initrd_kern.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,59 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/init.h" ++#include "linux/bootmem.h" ++#include "linux/blk.h" ++#include "asm/types.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "initrd.h" ++#include "init.h" ++#include "os.h" ++ ++/* Changed by uml_initrd_setup, which is a setup */ ++static char *initrd __initdata = NULL; ++ ++static int __init read_initrd(void) ++{ ++ void *area; ++ long long size; ++ int err; ++ ++ if(initrd == NULL) return 0; ++ err = os_file_size(initrd, &size); ++ if(err) return 0; ++ area = alloc_bootmem(size); ++ if(area == NULL) return 0; ++ if(load_initrd(initrd, area, size) == -1) return 0; ++ initrd_start = (unsigned long) area; ++ initrd_end = initrd_start + size; ++ return 0; ++} ++ ++__uml_postsetup(read_initrd); ++ ++static int __init uml_initrd_setup(char *line, int *add) ++{ ++ initrd = line; ++ return 0; ++} ++ ++__uml_setup("initrd=", uml_initrd_setup, ++"initrd=<initrd image>\n" ++" This is used to boot UML from an initrd image. The argument is the\n" ++" name of the file containing the image.\n\n" ++); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/initrd_user.c um/arch/um/kernel/initrd_user.c +--- orig/arch/um/kernel/initrd_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/initrd_user.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,43 @@ ++/* ++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <unistd.h> ++#include <sys/types.h> ++#include <sys/stat.h> ++#include <fcntl.h> ++#include <errno.h> ++ ++#include "user_util.h" ++#include "kern_util.h" ++#include "user.h" ++#include "initrd.h" ++#include "os.h" ++ ++int load_initrd(char *filename, void *buf, int size) ++{ ++ int fd, n; ++ ++ if((fd = os_open_file(filename, of_read(OPENFLAGS()), 0)) < 0){ ++ printk("Opening '%s' failed - errno = %d\n", filename, errno); ++ return(-1); ++ } ++ if((n = read(fd, buf, size)) != size){ ++ printk("Read of %d bytes from '%s' returned %d, errno = %d\n", ++ size, filename, n, errno); ++ return(-1); ++ } ++ return(0); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/irq.c um/arch/um/kernel/irq.c +--- orig/arch/um/kernel/irq.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/irq.c Wed Mar 26 14:45:29 2003 +@@ -0,0 +1,842 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ * Derived (i.e. 
mostly copied) from arch/i386/kernel/irq.c: ++ * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar ++ */ ++ ++#include "linux/config.h" ++#include "linux/kernel.h" ++#include "linux/smp.h" ++#include "linux/irq.h" ++#include "linux/kernel_stat.h" ++#include "linux/interrupt.h" ++#include "linux/random.h" ++#include "linux/slab.h" ++#include "linux/file.h" ++#include "linux/proc_fs.h" ++#include "linux/init.h" ++#include "linux/seq_file.h" ++#include "asm/irq.h" ++#include "asm/hw_irq.h" ++#include "asm/hardirq.h" ++#include "asm/atomic.h" ++#include "asm/signal.h" ++#include "asm/system.h" ++#include "asm/errno.h" ++#include "asm/uaccess.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "irq_user.h" ++ ++static void register_irq_proc (unsigned int irq); ++ ++irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = ++ { [0 ... NR_IRQS-1] = { 0, &no_irq_type, NULL, 0, SPIN_LOCK_UNLOCKED}}; ++ ++/* ++ * Generic no controller code ++ */ ++ ++static void enable_none(unsigned int irq) { } ++static unsigned int startup_none(unsigned int irq) { return 0; } ++static void disable_none(unsigned int irq) { } ++static void ack_none(unsigned int irq) ++{ ++/* ++ * 'what should we do if we get a hw irq event on an illegal vector'. ++ * each architecture has to answer this themselves, it doesnt deserve ++ * a generic callback i think. ++ */ ++#if CONFIG_X86 ++ printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); ++#ifdef CONFIG_X86_LOCAL_APIC ++ /* ++ * Currently unexpected vectors happen only on SMP and APIC. ++ * We _must_ ack these because every local APIC has only N ++ * irq slots per priority level, and a 'hanging, unacked' IRQ ++ * holds up an irq slot - in excessive cases (when multiple ++ * unexpected vectors occur) that might lock up the APIC ++ * completely. 
++ */ ++ ack_APIC_irq(); ++#endif ++#endif ++} ++ ++/* startup is the same as "enable", shutdown is same as "disable" */ ++#define shutdown_none disable_none ++#define end_none enable_none ++ ++struct hw_interrupt_type no_irq_type = { ++ "none", ++ startup_none, ++ shutdown_none, ++ enable_none, ++ disable_none, ++ ack_none, ++ end_none ++}; ++ ++/* Not changed */ ++volatile unsigned long irq_err_count; ++ ++/* ++ * Generic, controller-independent functions: ++ */ ++ ++int get_irq_list(char *buf) ++{ ++ int i, j; ++ unsigned long flags; ++ struct irqaction * action; ++ char *p = buf; ++ ++ p += sprintf(p, " "); ++ for (j=0; j<smp_num_cpus; j++) ++ p += sprintf(p, "CPU%d ",j); ++ *p++ = '\n'; ++ ++ for (i = 0 ; i < NR_IRQS ; i++) { ++ spin_lock_irqsave(&irq_desc[i].lock, flags); ++ action = irq_desc[i].action; ++ if (!action) ++ goto end; ++ p += sprintf(p, "%3d: ",i); ++#ifndef CONFIG_SMP ++ p += sprintf(p, "%10u ", kstat_irqs(i)); ++#else ++ for (j = 0; j < smp_num_cpus; j++) ++ p += sprintf(p, "%10u ", ++ kstat.irqs[cpu_logical_map(j)][i]); ++#endif ++ p += sprintf(p, " %14s", irq_desc[i].handler->typename); ++ p += sprintf(p, " %s", action->name); ++ ++ for (action=action->next; action; action = action->next) ++ p += sprintf(p, ", %s", action->name); ++ *p++ = '\n'; ++ end: ++ spin_unlock_irqrestore(&irq_desc[i].lock, flags); ++ } ++ p += sprintf(p, "\n"); ++#ifdef notdef ++#if CONFIG_SMP ++ p += sprintf(p, "LOC: "); ++ for (j = 0; j < smp_num_cpus; j++) ++ p += sprintf(p, "%10u ", ++ apic_timer_irqs[cpu_logical_map(j)]); ++ p += sprintf(p, "\n"); ++#endif ++#endif ++ p += sprintf(p, "ERR: %10lu\n", irq_err_count); ++ return p - buf; ++} ++ ++ ++/* ++ * This should really return information about whether ++ * we should do bottom half handling etc. Right now we ++ * end up _always_ checking the bottom half, which is a ++ * waste of time and is not what some drivers would ++ * prefer. 
++ */ ++int handle_IRQ_event(unsigned int irq, struct pt_regs * regs, ++ struct irqaction * action) ++{ ++ int status; ++ int cpu = smp_processor_id(); ++ ++ irq_enter(cpu, irq); ++ ++ status = 1; /* Force the "do bottom halves" bit */ ++ ++ if (!(action->flags & SA_INTERRUPT)) ++ __sti(); ++ ++ do { ++ status |= action->flags; ++ action->handler(irq, action->dev_id, regs); ++ action = action->next; ++ } while (action); ++ if (status & SA_SAMPLE_RANDOM) ++ add_interrupt_randomness(irq); ++ __cli(); ++ ++ irq_exit(cpu, irq); ++ ++ return status; ++} ++ ++/* ++ * Generic enable/disable code: this just calls ++ * down into the PIC-specific version for the actual ++ * hardware disable after having gotten the irq ++ * controller lock. ++ */ ++ ++/** ++ * disable_irq_nosync - disable an irq without waiting ++ * @irq: Interrupt to disable ++ * ++ * Disable the selected interrupt line. Disables of an interrupt ++ * stack. Unlike disable_irq(), this function does not ensure existing ++ * instances of the IRQ handler have completed before returning. ++ * ++ * This function may be called from IRQ context. ++ */ ++ ++void inline disable_irq_nosync(unsigned int irq) ++{ ++ irq_desc_t *desc = irq_desc + irq; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&desc->lock, flags); ++ if (!desc->depth++) { ++ desc->status |= IRQ_DISABLED; ++ desc->handler->disable(irq); ++ } ++ spin_unlock_irqrestore(&desc->lock, flags); ++} ++ ++/** ++ * disable_irq - disable an irq and wait for completion ++ * @irq: Interrupt to disable ++ * ++ * Disable the selected interrupt line. Disables of an interrupt ++ * stack. That is for two disables you need two enables. This ++ * function waits for any pending IRQ handlers for this interrupt ++ * to complete before returning. If you use this function while ++ * holding a resource the IRQ handler may need you will deadlock. ++ * ++ * This function may be called - with care - from IRQ context. 
++ */ ++ ++void disable_irq(unsigned int irq) ++{ ++ disable_irq_nosync(irq); ++ ++ if (!local_irq_count(smp_processor_id())) { ++ do { ++ barrier(); ++ } while (irq_desc[irq].status & IRQ_INPROGRESS); ++ } ++} ++ ++/** ++ * enable_irq - enable interrupt handling on an irq ++ * @irq: Interrupt to enable ++ * ++ * Re-enables the processing of interrupts on this IRQ line ++ * providing no disable_irq calls are now in effect. ++ * ++ * This function may be called from IRQ context. ++ */ ++ ++void enable_irq(unsigned int irq) ++{ ++ irq_desc_t *desc = irq_desc + irq; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&desc->lock, flags); ++ switch (desc->depth) { ++ case 1: { ++ unsigned int status = desc->status & ~IRQ_DISABLED; ++ desc->status = status; ++ if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { ++ desc->status = status | IRQ_REPLAY; ++ hw_resend_irq(desc->handler,irq); ++ } ++ desc->handler->enable(irq); ++ /* fall-through */ ++ } ++ default: ++ desc->depth--; ++ break; ++ case 0: ++ printk(KERN_ERR "enable_irq() unbalanced from %p\n", ++ __builtin_return_address(0)); ++ } ++ spin_unlock_irqrestore(&desc->lock, flags); ++} ++ ++/* ++ * do_IRQ handles all normal device IRQ's (the special ++ * SMP cross-CPU interrupts have their own specific ++ * handlers). ++ */ ++unsigned int do_IRQ(int irq, union uml_pt_regs *regs) ++{ ++ /* ++ * 0 return value means that this irq is already being ++ * handled by some other CPU. 
(or is disabled) ++ */ ++ int cpu = smp_processor_id(); ++ irq_desc_t *desc = irq_desc + irq; ++ struct irqaction * action; ++ unsigned int status; ++ ++ kstat.irqs[cpu][irq]++; ++ spin_lock(&desc->lock); ++ desc->handler->ack(irq); ++ /* ++ REPLAY is when Linux resends an IRQ that was dropped earlier ++ WAITING is used by probe to mark irqs that are being tested ++ */ ++ status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); ++ status |= IRQ_PENDING; /* we _want_ to handle it */ ++ ++ /* ++ * If the IRQ is disabled for whatever reason, we cannot ++ * use the action we have. ++ */ ++ action = NULL; ++ if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) { ++ action = desc->action; ++ status &= ~IRQ_PENDING; /* we commit to handling */ ++ status |= IRQ_INPROGRESS; /* we are handling it */ ++ } ++ desc->status = status; ++ ++ /* ++ * If there is no IRQ handler or it was disabled, exit early. ++ Since we set PENDING, if another processor is handling ++ a different instance of this same irq, the other processor ++ will take care of it. ++ */ ++ if (!action) ++ goto out; ++ ++ /* ++ * Edge triggered interrupts need to remember ++ * pending events. ++ * This applies to any hw interrupts that allow a second ++ * instance of the same irq to arrive while we are in do_IRQ ++ * or in the handler. But the code here only handles the _second_ ++ * instance of the irq, not the third or fourth. So it is mostly ++ * useful for irq hardware that does not mask cleanly in an ++ * SMP environment. ++ */ ++ for (;;) { ++ spin_unlock(&desc->lock); ++ handle_IRQ_event(irq, (struct pt_regs *) regs, action); ++ spin_lock(&desc->lock); ++ ++ if (!(desc->status & IRQ_PENDING)) ++ break; ++ desc->status &= ~IRQ_PENDING; ++ } ++ desc->status &= ~IRQ_INPROGRESS; ++out: ++ /* ++ * The ->end() handler has to deal with interrupts which got ++ * disabled while the handler was running. 
++ */ ++ desc->handler->end(irq); ++ spin_unlock(&desc->lock); ++ ++ if (softirq_pending(cpu)) ++ do_softirq(); ++ return 1; ++} ++ ++/** ++ * request_irq - allocate an interrupt line ++ * @irq: Interrupt line to allocate ++ * @handler: Function to be called when the IRQ occurs ++ * @irqflags: Interrupt type flags ++ * @devname: An ascii name for the claiming device ++ * @dev_id: A cookie passed back to the handler function ++ * ++ * This call allocates interrupt resources and enables the ++ * interrupt line and IRQ handling. From the point this ++ * call is made your handler function may be invoked. Since ++ * your handler function must clear any interrupt the board ++ * raises, you must take care both to initialise your hardware ++ * and to set up the interrupt handler in the right order. ++ * ++ * Dev_id must be globally unique. Normally the address of the ++ * device data structure is used as the cookie. Since the handler ++ * receives this value it makes sense to use it. ++ * ++ * If your interrupt is shared you must pass a non NULL dev_id ++ * as this is required when freeing the interrupt. ++ * ++ * Flags: ++ * ++ * SA_SHIRQ Interrupt is shared ++ * ++ * SA_INTERRUPT Disable local interrupts while processing ++ * ++ * SA_SAMPLE_RANDOM The interrupt can be used for entropy ++ * ++ */ ++ ++int request_irq(unsigned int irq, ++ void (*handler)(int, void *, struct pt_regs *), ++ unsigned long irqflags, ++ const char * devname, ++ void *dev_id) ++{ ++ int retval; ++ struct irqaction * action; ++ ++#if 1 ++ /* ++ * Sanity-check: shared interrupts should REALLY pass in ++ * a real dev-ID, otherwise we'll have trouble later trying ++ * to figure out which interrupt is which (messes up the ++ * interrupt freeing logic etc). 
++ */ ++ if (irqflags & SA_SHIRQ) { ++ if (!dev_id) ++ printk(KERN_ERR "Bad boy: %s (at 0x%x) called us " ++ "without a dev_id!\n", devname, (&irq)[-1]); ++ } ++#endif ++ ++ if (irq >= NR_IRQS) ++ return -EINVAL; ++ if (!handler) ++ return -EINVAL; ++ ++ action = (struct irqaction *) ++ kmalloc(sizeof(struct irqaction), GFP_KERNEL); ++ if (!action) ++ return -ENOMEM; ++ ++ action->handler = handler; ++ action->flags = irqflags; ++ action->mask = 0; ++ action->name = devname; ++ action->next = NULL; ++ action->dev_id = dev_id; ++ ++ retval = setup_irq(irq, action); ++ if (retval) ++ kfree(action); ++ return retval; ++} ++ ++int um_request_irq(unsigned int irq, int fd, int type, ++ void (*handler)(int, void *, struct pt_regs *), ++ unsigned long irqflags, const char * devname, ++ void *dev_id) ++{ ++ int err; ++ ++ err = request_irq(irq, handler, irqflags, devname, dev_id); ++ if(err) ++ return(err); ++ ++ if(fd != -1) ++ err = activate_fd(irq, fd, type, dev_id); ++ return(err); ++} ++ ++/* this was setup_x86_irq but it seems pretty generic */ ++int setup_irq(unsigned int irq, struct irqaction * new) ++{ ++ int shared = 0; ++ unsigned long flags; ++ struct irqaction *old, **p; ++ irq_desc_t *desc = irq_desc + irq; ++ ++ /* ++ * Some drivers like serial.c use request_irq() heavily, ++ * so we have to be careful not to interfere with a ++ * running system. ++ */ ++ if (new->flags & SA_SAMPLE_RANDOM) { ++ /* ++ * This function might sleep, we want to call it first, ++ * outside of the atomic block. ++ * Yes, this might clear the entropy pool if the wrong ++ * driver is attempted to be loaded, without actually ++ * installing a new handler, but is this really a problem, ++ * only the sysadmin is able to do this. 
++ */ ++ rand_initialize_irq(irq); ++ } ++ ++ /* ++ * The following block of code has to be executed atomically ++ */ ++ spin_lock_irqsave(&desc->lock,flags); ++ p = &desc->action; ++ if ((old = *p) != NULL) { ++ /* Can't share interrupts unless both agree to */ ++ if (!(old->flags & new->flags & SA_SHIRQ)) { ++ spin_unlock_irqrestore(&desc->lock,flags); ++ return -EBUSY; ++ } ++ ++ /* add new interrupt at end of irq queue */ ++ do { ++ p = &old->next; ++ old = *p; ++ } while (old); ++ shared = 1; ++ } ++ ++ *p = new; ++ ++ if (!shared) { ++ desc->depth = 0; ++ desc->status &= ~IRQ_DISABLED; ++ desc->handler->startup(irq); ++ } ++ spin_unlock_irqrestore(&desc->lock,flags); ++ ++ register_irq_proc(irq); ++ return 0; ++} ++ ++/** ++ * free_irq - free an interrupt ++ * @irq: Interrupt line to free ++ * @dev_id: Device identity to free ++ * ++ * Remove an interrupt handler. The handler is removed and if the ++ * interrupt line is no longer in use by any driver it is disabled. ++ * On a shared IRQ the caller must ensure the interrupt is disabled ++ * on the card it drives before calling this function. The function ++ * does not return until any executing interrupts for this IRQ ++ * have completed. ++ * ++ * This function may be called from interrupt context. ++ * ++ * Bugs: Attempting to free an irq in a handler for the same irq hangs ++ * the machine. 
++ */ ++ ++void free_irq(unsigned int irq, void *dev_id) ++{ ++ irq_desc_t *desc; ++ struct irqaction **p; ++ unsigned long flags; ++ ++ if (irq >= NR_IRQS) ++ return; ++ ++ desc = irq_desc + irq; ++ spin_lock_irqsave(&desc->lock,flags); ++ p = &desc->action; ++ for (;;) { ++ struct irqaction * action = *p; ++ if (action) { ++ struct irqaction **pp = p; ++ p = &action->next; ++ if (action->dev_id != dev_id) ++ continue; ++ ++ /* Found it - now remove it from the list of entries */ ++ *pp = action->next; ++ if (!desc->action) { ++ desc->status |= IRQ_DISABLED; ++ desc->handler->shutdown(irq); ++ } ++ free_irq_by_irq_and_dev(irq, dev_id); ++ spin_unlock_irqrestore(&desc->lock,flags); ++ ++#ifdef CONFIG_SMP ++ /* Wait to make sure it's not being used on another CPU */ ++ while (desc->status & IRQ_INPROGRESS) ++ barrier(); ++#endif ++ kfree(action); ++ return; ++ } ++ printk(KERN_ERR "Trying to free free IRQ%d\n",irq); ++ spin_unlock_irqrestore(&desc->lock,flags); ++ return; ++ } ++} ++ ++/* These are initialized by sysctl_init, which is called from init/main.c */ ++static struct proc_dir_entry * root_irq_dir; ++static struct proc_dir_entry * irq_dir [NR_IRQS]; ++static struct proc_dir_entry * smp_affinity_entry [NR_IRQS]; ++ ++/* These are read and written as longs, so a read won't see a partial write ++ * even during a race. ++ */ ++static unsigned long irq_affinity [NR_IRQS] = { [0 ... 
NR_IRQS-1] = ~0UL }; ++ ++#define HEX_DIGITS 8 ++ ++static int irq_affinity_read_proc (char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ if (count < HEX_DIGITS+1) ++ return -EINVAL; ++ return sprintf (page, "%08lx\n", irq_affinity[(long)data]); ++} ++ ++static unsigned int parse_hex_value (const char *buffer, ++ unsigned long count, unsigned long *ret) ++{ ++ unsigned char hexnum [HEX_DIGITS]; ++ unsigned long value; ++ int i; ++ ++ if (!count) ++ return -EINVAL; ++ if (count > HEX_DIGITS) ++ count = HEX_DIGITS; ++ if (copy_from_user(hexnum, buffer, count)) ++ return -EFAULT; ++ ++ /* ++ * Parse the first 8 characters as a hex string, any non-hex char ++ * is end-of-string. '00e1', 'e1', '00E1', 'E1' are all the same. ++ */ ++ value = 0; ++ ++ for (i = 0; i < count; i++) { ++ unsigned int c = hexnum[i]; ++ ++ switch (c) { ++ case '0' ... '9': c -= '0'; break; ++ case 'a' ... 'f': c -= 'a'-10; break; ++ case 'A' ... 'F': c -= 'A'-10; break; ++ default: ++ goto out; ++ } ++ value = (value << 4) | c; ++ } ++out: ++ *ret = value; ++ return 0; ++} ++ ++static int irq_affinity_write_proc (struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ int irq = (long) data, full_count = count, err; ++ unsigned long new_value; ++ ++ if (!irq_desc[irq].handler->set_affinity) ++ return -EIO; ++ ++ err = parse_hex_value(buffer, count, &new_value); ++ ++#if CONFIG_SMP ++ /* ++ * Do not allow disabling IRQs completely - it's a too easy ++ * way to make the system unusable accidentally :-) At least ++ * one online CPU still has to be targeted. 
++ */ ++ if (!(new_value & cpu_online_map)) ++ return -EINVAL; ++#endif ++ ++ irq_affinity[irq] = new_value; ++ irq_desc[irq].handler->set_affinity(irq, new_value); ++ ++ return full_count; ++} ++ ++static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ unsigned long *mask = (unsigned long *) data; ++ if (count < HEX_DIGITS+1) ++ return -EINVAL; ++ return sprintf (page, "%08lx\n", *mask); ++} ++ ++static int prof_cpu_mask_write_proc (struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ unsigned long *mask = (unsigned long *) data, full_count = count, err; ++ unsigned long new_value; ++ ++ err = parse_hex_value(buffer, count, &new_value); ++ if (err) ++ return err; ++ ++ *mask = new_value; ++ return full_count; ++} ++ ++#define MAX_NAMELEN 10 ++ ++static void register_irq_proc (unsigned int irq) ++{ ++ struct proc_dir_entry *entry; ++ char name [MAX_NAMELEN]; ++ ++ if (!root_irq_dir || (irq_desc[irq].handler == &no_irq_type) || ++ irq_dir[irq]) ++ return; ++ ++ memset(name, 0, MAX_NAMELEN); ++ sprintf(name, "%d", irq); ++ ++ /* create /proc/irq/1234 */ ++ irq_dir[irq] = proc_mkdir(name, root_irq_dir); ++ ++ /* create /proc/irq/1234/smp_affinity */ ++ entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); ++ ++ entry->nlink = 1; ++ entry->data = (void *)(long)irq; ++ entry->read_proc = irq_affinity_read_proc; ++ entry->write_proc = irq_affinity_write_proc; ++ ++ smp_affinity_entry[irq] = entry; ++} ++ ++/* Read and written as a long */ ++unsigned long prof_cpu_mask = -1; ++ ++void __init init_irq_proc (void) ++{ ++ struct proc_dir_entry *entry; ++ int i; ++ ++ /* create /proc/irq */ ++ root_irq_dir = proc_mkdir("irq", 0); ++ ++ /* create /proc/irq/prof_cpu_mask */ ++ entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); ++ ++ entry->nlink = 1; ++ entry->data = (void *)&prof_cpu_mask; ++ entry->read_proc = prof_cpu_mask_read_proc; ++ entry->write_proc = 
prof_cpu_mask_write_proc; ++ ++ /* ++ * Create entries for all existing IRQs. ++ */ ++ for (i = 0; i < NR_IRQS; i++) ++ register_irq_proc(i); ++} ++ ++static spinlock_t irq_spinlock = SPIN_LOCK_UNLOCKED; ++ ++unsigned long irq_lock(void) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&irq_spinlock, flags); ++ return(flags); ++} ++ ++void irq_unlock(unsigned long flags) ++{ ++ spin_unlock_irqrestore(&irq_spinlock, flags); ++} ++ ++unsigned long probe_irq_on(void) ++{ ++ return(0); ++} ++ ++int probe_irq_off(unsigned long val) ++{ ++ return(0); ++} ++ ++static unsigned int startup_SIGIO_irq(unsigned int irq) ++{ ++ return(0); ++} ++ ++static void shutdown_SIGIO_irq(unsigned int irq) ++{ ++} ++ ++static void enable_SIGIO_irq(unsigned int irq) ++{ ++} ++ ++static void disable_SIGIO_irq(unsigned int irq) ++{ ++} ++ ++static void mask_and_ack_SIGIO(unsigned int irq) ++{ ++} ++ ++static void end_SIGIO_irq(unsigned int irq) ++{ ++} ++ ++static unsigned int startup_SIGVTALRM_irq(unsigned int irq) ++{ ++ return(0); ++} ++ ++static void shutdown_SIGVTALRM_irq(unsigned int irq) ++{ ++} ++ ++static void enable_SIGVTALRM_irq(unsigned int irq) ++{ ++} ++ ++static void disable_SIGVTALRM_irq(unsigned int irq) ++{ ++} ++ ++static void mask_and_ack_SIGVTALRM(unsigned int irq) ++{ ++} ++ ++static void end_SIGVTALRM_irq(unsigned int irq) ++{ ++} ++ ++static struct hw_interrupt_type SIGIO_irq_type = { ++ "SIGIO", ++ startup_SIGIO_irq, ++ shutdown_SIGIO_irq, ++ enable_SIGIO_irq, ++ disable_SIGIO_irq, ++ mask_and_ack_SIGIO, ++ end_SIGIO_irq, ++ NULL ++}; ++ ++static struct hw_interrupt_type SIGVTALRM_irq_type = { ++ "SIGVTALRM", ++ startup_SIGVTALRM_irq, ++ shutdown_SIGVTALRM_irq, ++ enable_SIGVTALRM_irq, ++ disable_SIGVTALRM_irq, ++ mask_and_ack_SIGVTALRM, ++ end_SIGVTALRM_irq, ++ NULL ++}; ++ ++void __init init_IRQ(void) ++{ ++ int i; ++ ++ irq_desc[TIMER_IRQ].status = IRQ_DISABLED; ++ irq_desc[TIMER_IRQ].action = 0; ++ irq_desc[TIMER_IRQ].depth = 1; ++ irq_desc[TIMER_IRQ].handler 
= &SIGVTALRM_irq_type; ++ enable_irq(TIMER_IRQ); ++ for(i=1;i<NR_IRQS;i++){ ++ irq_desc[i].status = IRQ_DISABLED; ++ irq_desc[i].action = 0; ++ irq_desc[i].depth = 1; ++ irq_desc[i].handler = &SIGIO_irq_type; ++ enable_irq(i); ++ } ++ init_irq_signals(0); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/irq_user.c um/arch/um/kernel/irq_user.c +--- orig/arch/um/kernel/irq_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/irq_user.c Sun Dec 22 15:49:46 2002 +@@ -0,0 +1,427 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdlib.h> ++#include <unistd.h> ++#include <errno.h> ++#include <fcntl.h> ++#include <signal.h> ++#include <string.h> ++#include <sys/poll.h> ++#include <sys/types.h> ++#include <sys/time.h> ++#include "user_util.h" ++#include "kern_util.h" ++#include "user.h" ++#include "process.h" ++#include "signal_user.h" ++#include "sigio.h" ++#include "irq_user.h" ++#include "os.h" ++ ++struct irq_fd { ++ struct irq_fd *next; ++ void *id; ++ int fd; ++ int type; ++ int irq; ++ int pid; ++ int events; ++ int current_events; ++ int freed; ++}; ++ ++static struct irq_fd *active_fds = NULL; ++static struct irq_fd **last_irq_ptr = &active_fds; ++ ++static struct pollfd *pollfds = NULL; ++static int pollfds_num = 0; ++static int pollfds_size = 0; ++ ++extern int io_count, intr_count; ++ ++void sigio_handler(int sig, union uml_pt_regs *regs) ++{ ++ struct irq_fd *irq_fd, *next; ++ int i, n; ++ ++ if(smp_sigio_handler()) return; ++ while(1){ ++ if((n = poll(pollfds, pollfds_num, 0)) < 0){ ++ if(errno == EINTR) 
continue; ++ printk("sigio_handler : poll returned %d, " ++ "errno = %d\n", n, errno); ++ break; ++ } ++ if(n == 0) break; ++ ++ irq_fd = active_fds; ++ for(i = 0; i < pollfds_num; i++){ ++ if(pollfds[i].revents != 0){ ++ irq_fd->current_events = pollfds[i].revents; ++ pollfds[i].fd = -1; ++ } ++ irq_fd = irq_fd->next; ++ } ++ ++ for(irq_fd = active_fds; irq_fd != NULL; irq_fd = next){ ++ next = irq_fd->next; ++ if(irq_fd->current_events != 0){ ++ irq_fd->current_events = 0; ++ do_IRQ(irq_fd->irq, regs); ++ ++ /* This is here because the next irq may be ++ * freed in the handler. If a console goes ++ * away, both the read and write irqs will be ++ * freed. After do_IRQ, ->next will point to ++ * a good IRQ. ++ * Irqs can't be freed inside their handlers, ++ * so the next best thing is to have them ++ * marked as needing freeing, so that they ++ * can be freed here. ++ */ ++ next = irq_fd->next; ++ if(irq_fd->freed) ++ free_irq(irq_fd->irq, irq_fd->id); ++ } ++ } ++ } ++} ++ ++int activate_ipi(int fd, int pid) ++{ ++ return(os_set_fd_async(fd, pid)); ++} ++ ++static void maybe_sigio_broken(int fd, int type) ++{ ++ if(isatty(fd)){ ++ if((type == IRQ_WRITE) && !pty_output_sigio){ ++ write_sigio_workaround(); ++ add_sigio_fd(fd, 0); ++ } ++ else if((type == IRQ_READ) && !pty_close_sigio){ ++ write_sigio_workaround(); ++ add_sigio_fd(fd, 1); ++ } ++ } ++} ++ ++int activate_fd(int irq, int fd, int type, void *dev_id) ++{ ++ struct pollfd *tmp_pfd; ++ struct irq_fd *new_fd, *irq_fd; ++ unsigned long flags; ++ int pid, events, err, n, size; ++ ++ pid = os_getpid(); ++ err = os_set_fd_async(fd, pid); ++ if(err < 0) ++ goto out; ++ ++ new_fd = um_kmalloc(sizeof(*new_fd)); ++ err = -ENOMEM; ++ if(new_fd == NULL) ++ goto out; ++ ++ if(type == IRQ_READ) events = POLLIN | POLLPRI; ++ else events = POLLOUT; ++ *new_fd = ((struct irq_fd) { .next = NULL, ++ .id = dev_id, ++ .fd = fd, ++ .type = type, ++ .irq = irq, ++ .pid = pid, ++ .events = events, ++ .current_events = 0, ++ 
.freed = 0 } ); ++ ++ /* Critical section - locked by a spinlock because this stuff can ++ * be changed from interrupt handlers. The stuff above is done ++ * outside the lock because it allocates memory. ++ */ ++ ++ /* Actually, it only looks like it can be called from interrupt ++ * context. The culprit is reactivate_fd, which calls ++ * maybe_sigio_broken, which calls write_sigio_workaround, ++ * which calls activate_fd. However, write_sigio_workaround should ++ * only be called once, at boot time. That would make it clear that ++ * this is called only from process context, and can be locked with ++ * a semaphore. ++ */ ++ flags = irq_lock(); ++ for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){ ++ if((irq_fd->fd == fd) && (irq_fd->type == type)){ ++ printk("Registering fd %d twice\n", fd); ++ printk("Irqs : %d, %d\n", irq_fd->irq, irq); ++ printk("Ids : 0x%x, 0x%x\n", irq_fd->id, dev_id); ++ goto out_unlock; ++ } ++ } ++ ++ n = pollfds_num; ++ if(n == pollfds_size){ ++ while(1){ ++ /* Here we have to drop the lock in order to call ++ * kmalloc, which might sleep. If something else ++ * came in and changed the pollfds array, we free ++ * the buffer and try again. ++ */ ++ irq_unlock(flags); ++ size = (pollfds_num + 1) * sizeof(pollfds[0]); ++ tmp_pfd = um_kmalloc(size); ++ flags = irq_lock(); ++ if(tmp_pfd == NULL) ++ goto out_unlock; ++ if(n == pollfds_size) ++ break; ++ kfree(tmp_pfd); ++ } ++ if(pollfds != NULL){ ++ memcpy(tmp_pfd, pollfds, ++ sizeof(pollfds[0]) * pollfds_size); ++ kfree(pollfds); ++ } ++ pollfds = tmp_pfd; ++ pollfds_size++; ++ } ++ ++ if(type == IRQ_WRITE) ++ fd = -1; ++ ++ pollfds[pollfds_num] = ((struct pollfd) { .fd = fd, ++ .events = events, ++ .revents = 0 }); ++ pollfds_num++; ++ ++ *last_irq_ptr = new_fd; ++ last_irq_ptr = &new_fd->next; ++ ++ irq_unlock(flags); ++ ++ /* This calls activate_fd, so it has to be outside the critical ++ * section. 
++ */ ++ maybe_sigio_broken(fd, type); ++ ++ return(0); ++ ++ out_unlock: ++ irq_unlock(flags); ++ kfree(new_fd); ++ out: ++ return(err); ++} ++ ++static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg) ++{ ++ struct irq_fd **prev; ++ unsigned long flags; ++ int i = 0; ++ ++ flags = irq_lock(); ++ prev = &active_fds; ++ while(*prev != NULL){ ++ if((*test)(*prev, arg)){ ++ struct irq_fd *old_fd = *prev; ++ if((pollfds[i].fd != -1) && ++ (pollfds[i].fd != (*prev)->fd)){ ++ printk("free_irq_by_cb - mismatch between " ++ "active_fds and pollfds, fd %d vs %d\n", ++ (*prev)->fd, pollfds[i].fd); ++ goto out; ++ } ++ memcpy(&pollfds[i], &pollfds[i + 1], ++ (pollfds_num - i - 1) * sizeof(pollfds[0])); ++ pollfds_num--; ++ if(last_irq_ptr == &old_fd->next) ++ last_irq_ptr = prev; ++ *prev = (*prev)->next; ++ if(old_fd->type == IRQ_WRITE) ++ ignore_sigio_fd(old_fd->fd); ++ kfree(old_fd); ++ continue; ++ } ++ prev = &(*prev)->next; ++ i++; ++ } ++ out: ++ irq_unlock(flags); ++} ++ ++struct irq_and_dev { ++ int irq; ++ void *dev; ++}; ++ ++static int same_irq_and_dev(struct irq_fd *irq, void *d) ++{ ++ struct irq_and_dev *data = d; ++ ++ return((irq->irq == data->irq) && (irq->id == data->dev)); ++} ++ ++void free_irq_by_irq_and_dev(int irq, void *dev) ++{ ++ struct irq_and_dev data = ((struct irq_and_dev) { .irq = irq, ++ .dev = dev }); ++ ++ free_irq_by_cb(same_irq_and_dev, &data); ++} ++ ++static int same_fd(struct irq_fd *irq, void *fd) ++{ ++ return(irq->fd == *((int *) fd)); ++} ++ ++void free_irq_by_fd(int fd) ++{ ++ free_irq_by_cb(same_fd, &fd); ++} ++ ++static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out) ++{ ++ struct irq_fd *irq; ++ int i = 0; ++ ++ for(irq=active_fds; irq != NULL; irq = irq->next){ ++ if((irq->fd == fd) && (irq->irq == irqnum)) break; ++ i++; ++ } ++ if(irq == NULL){ ++ printk("find_irq_by_fd doesn't have descriptor %d\n", fd); ++ goto out; ++ } ++ if((pollfds[i].fd != -1) && (pollfds[i].fd != fd)){ ++ 
printk("find_irq_by_fd - mismatch between active_fds and " ++ "pollfds, fd %d vs %d, need %d\n", irq->fd, ++ pollfds[i].fd, fd); ++ irq = NULL; ++ goto out; ++ } ++ *index_out = i; ++ out: ++ return(irq); ++} ++ ++void free_irq_later(int irq, void *dev_id) ++{ ++ struct irq_fd *irq_fd; ++ unsigned long flags; ++ ++ flags = irq_lock(); ++ for(irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next){ ++ if((irq_fd->irq == irq) && (irq_fd->id == dev_id)) ++ break; ++ } ++ if(irq_fd == NULL){ ++ printk("free_irq_later found no irq, irq = %d, " ++ "dev_id = 0x%p\n", irq, dev_id); ++ goto out; ++ } ++ irq_fd->freed = 1; ++ out: ++ irq_unlock(flags); ++} ++ ++void reactivate_fd(int fd, int irqnum) ++{ ++ struct irq_fd *irq; ++ unsigned long flags; ++ int i; ++ ++ flags = irq_lock(); ++ irq = find_irq_by_fd(fd, irqnum, &i); ++ if(irq == NULL){ ++ irq_unlock(flags); ++ return; ++ } ++ ++ pollfds[i].fd = irq->fd; ++ ++ irq_unlock(flags); ++ ++ /* This calls activate_fd, so it has to be outside the critical ++ * section. 
++ */ ++ maybe_sigio_broken(fd, irq->type); ++} ++ ++void deactivate_fd(int fd, int irqnum) ++{ ++ struct irq_fd *irq; ++ unsigned long flags; ++ int i; ++ ++ flags = irq_lock(); ++ irq = find_irq_by_fd(fd, irqnum, &i); ++ if(irq == NULL) ++ goto out; ++ pollfds[i].fd = -1; ++ out: ++ irq_unlock(flags); ++} ++ ++void forward_ipi(int fd, int pid) ++{ ++ if(fcntl(fd, F_SETOWN, pid) < 0){ ++ int save_errno = errno; ++ if(fcntl(fd, F_GETOWN, 0) != pid){ ++ printk("forward_ipi: F_SETOWN failed, fd = %d, " ++ "me = %d, target = %d, errno = %d\n", fd, ++ os_getpid(), pid, save_errno); ++ } ++ } ++} ++ ++void forward_interrupts(int pid) ++{ ++ struct irq_fd *irq; ++ unsigned long flags; ++ ++ flags = irq_lock(); ++ for(irq=active_fds;irq != NULL;irq = irq->next){ ++ if(fcntl(irq->fd, F_SETOWN, pid) < 0){ ++ int save_errno = errno; ++ if(fcntl(irq->fd, F_GETOWN, 0) != pid){ ++ /* XXX Just remove the irq rather than ++ * print out an infinite stream of these ++ */ ++ printk("Failed to forward %d to pid %d, " ++ "errno = %d\n", irq->fd, pid, ++ save_errno); ++ } ++ } ++ irq->pid = pid; ++ } ++ irq_unlock(flags); ++} ++ ++void init_irq_signals(int on_sigstack) ++{ ++ __sighandler_t h; ++ int flags; ++ ++ flags = on_sigstack ? SA_ONSTACK : 0; ++ if(timer_irq_inited) h = (__sighandler_t) alarm_handler; ++ else h = boot_timer_handler; ++ ++ set_handler(SIGVTALRM, h, flags | SA_RESTART, ++ SIGUSR1, SIGIO, SIGWINCH, SIGALRM, -1); ++ set_handler(SIGIO, (__sighandler_t) sig_handler, flags | SA_RESTART, ++ SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); ++ signal(SIGWINCH, SIG_IGN); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/ksyms.c um/arch/um/kernel/ksyms.c +--- orig/arch/um/kernel/ksyms.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/ksyms.c Tue Dec 17 13:29:43 2002 +@@ -0,0 +1,94 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/config.h" ++#include "linux/module.h" ++#include "linux/string.h" ++#include "linux/smp_lock.h" ++#include "linux/spinlock.h" ++#include "asm/current.h" ++#include "asm/delay.h" ++#include "asm/processor.h" ++#include "asm/unistd.h" ++#include "asm/pgalloc.h" ++#include "asm/pgtable.h" ++#include "asm/page.h" ++#include "kern_util.h" ++#include "user_util.h" ++#include "os.h" ++#include "helper.h" ++ ++EXPORT_SYMBOL(stop); ++EXPORT_SYMBOL(strtok); ++EXPORT_SYMBOL(uml_physmem); ++EXPORT_SYMBOL(set_signals); ++EXPORT_SYMBOL(get_signals); ++EXPORT_SYMBOL(kernel_thread); ++EXPORT_SYMBOL(__const_udelay); ++EXPORT_SYMBOL(__udelay); ++EXPORT_SYMBOL(sys_waitpid); ++EXPORT_SYMBOL(task_size); ++EXPORT_SYMBOL(flush_tlb_range); ++EXPORT_SYMBOL(host_task_size); ++EXPORT_SYMBOL(arch_validate); ++ ++EXPORT_SYMBOL(region_pa); ++EXPORT_SYMBOL(region_va); ++EXPORT_SYMBOL(phys_mem_map); ++EXPORT_SYMBOL(page_mem_map); ++EXPORT_SYMBOL(high_physmem); ++EXPORT_SYMBOL(empty_zero_page); ++EXPORT_SYMBOL(um_virt_to_phys); ++EXPORT_SYMBOL(mode_tt); ++EXPORT_SYMBOL(handle_page_fault); ++ ++EXPORT_SYMBOL(os_getpid); ++EXPORT_SYMBOL(os_open_file); ++EXPORT_SYMBOL(os_read_file); ++EXPORT_SYMBOL(os_write_file); ++EXPORT_SYMBOL(os_seek_file); ++EXPORT_SYMBOL(os_pipe); ++EXPORT_SYMBOL(os_file_type); ++EXPORT_SYMBOL(os_close_file); ++EXPORT_SYMBOL(helper_wait); ++EXPORT_SYMBOL(os_shutdown_socket); ++EXPORT_SYMBOL(os_connect_socket); ++EXPORT_SYMBOL(run_helper); ++EXPORT_SYMBOL(start_thread); ++EXPORT_SYMBOL(dump_thread); ++ 
++/* This is here because UML expands open to sys_open, not to a system ++ * call instruction. ++ */ ++EXPORT_SYMBOL(sys_open); ++EXPORT_SYMBOL(sys_lseek); ++EXPORT_SYMBOL(sys_read); ++EXPORT_SYMBOL(sys_wait4); ++ ++#ifdef CONFIG_SMP ++ ++/* required for SMP */ ++ ++extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); ++EXPORT_SYMBOL_NOVERS(__write_lock_failed); ++ ++extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); ++EXPORT_SYMBOL_NOVERS(__read_lock_failed); ++ ++EXPORT_SYMBOL(kernel_flag_cacheline); ++EXPORT_SYMBOL(smp_num_cpus); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/mem.c um/arch/um/kernel/mem.c +--- orig/arch/um/kernel/mem.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/mem.c Sun Mar 30 14:30:26 2003 +@@ -0,0 +1,852 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/config.h" ++#include "linux/types.h" ++#include "linux/mm.h" ++#include "linux/fs.h" ++#include "linux/init.h" ++#include "linux/bootmem.h" ++#include "linux/swap.h" ++#include "linux/slab.h" ++#include "linux/vmalloc.h" ++#include "linux/highmem.h" ++#include "asm/page.h" ++#include "asm/pgtable.h" ++#include "asm/pgalloc.h" ++#include "asm/bitops.h" ++#include "asm/uaccess.h" ++#include "asm/tlb.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "mem_user.h" ++#include "mem.h" ++#include "kern.h" ++#include "init.h" ++#include "os.h" ++#include "mode_kern.h" ++#include "uml_uaccess.h" ++ ++/* Changed during early boot */ ++pgd_t swapper_pg_dir[1024]; ++unsigned long high_physmem; 
++unsigned long vm_start; ++unsigned long vm_end; ++unsigned long highmem; ++unsigned long *empty_zero_page = NULL; ++unsigned long *empty_bad_page = NULL; ++ ++/* Not modified */ ++const char bad_pmd_string[] = "Bad pmd in pte_alloc: %08lx\n"; ++ ++/* Changed during early boot */ ++static unsigned long totalram_pages = 0; ++ ++extern char __init_begin, __init_end; ++extern long physmem_size; ++ ++#ifdef CONFIG_SMP ++/* Not changed by UML */ ++mmu_gather_t mmu_gathers[NR_CPUS]; ++#endif ++ ++/* Changed during early boot */ ++int kmalloc_ok = 0; ++ ++#define NREGIONS (phys_region_index(0xffffffff) - phys_region_index(0x0) + 1) ++struct mem_region *regions[NREGIONS] = { [ 0 ... NREGIONS - 1 ] = NULL }; ++#define REGION_SIZE ((0xffffffff & ~REGION_MASK) + 1) ++ ++/* Changed during early boot */ ++static unsigned long brk_end; ++ ++static void map_cb(void *unused) ++{ ++ map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); ++} ++ ++void unmap_physmem(void) ++{ ++ os_unmap_memory((void *) brk_end, uml_reserved - brk_end); ++} ++ ++extern char __binary_start; ++ ++void mem_init(void) ++{ ++ unsigned long start; ++ ++#ifdef CONFIG_HIGHMEM ++ highmem_start_page = phys_page(__pa(high_physmem)); ++#endif ++ ++ /* clear the zero-page */ ++ memset((void *) empty_zero_page, 0, PAGE_SIZE); ++ ++ /* Map in the area just after the brk now that kmalloc is about ++ * to be turned on. 
++ */ ++ brk_end = (unsigned long) UML_ROUND_UP(sbrk(0)); ++ map_cb(NULL); ++ initial_thread_cb(map_cb, NULL); ++ free_bootmem(__pa(brk_end), uml_reserved - brk_end); ++ uml_reserved = brk_end; ++ ++ /* Fill in any hole at the start of the binary */ ++ start = (unsigned long) &__binary_start; ++ if(uml_physmem != start){ ++ map_memory(uml_physmem, __pa(uml_physmem), start - uml_physmem, ++ 1, 1, 0); ++ } ++ ++ /* this will put all low memory onto the freelists */ ++ totalram_pages = free_all_bootmem(); ++ totalram_pages += highmem >> PAGE_SHIFT; ++ max_mapnr = totalram_pages; ++ num_physpages = totalram_pages; ++ printk(KERN_INFO "Memory: %luk available\n", ++ (unsigned long) nr_free_pages() << (PAGE_SHIFT-10)); ++ kmalloc_ok = 1; ++} ++ ++/* Changed during early boot */ ++static unsigned long kmem_top = 0; ++ ++unsigned long get_kmem_end(void) ++{ ++ if(kmem_top == 0) ++ kmem_top = CHOOSE_MODE(kmem_end_tt, kmem_end_skas); ++ return(kmem_top); ++} ++ ++void set_kmem_end(unsigned long new) ++{ ++ kmem_top = new; ++} ++ ++#if CONFIG_HIGHMEM ++/* Changed during early boot */ ++pte_t *kmap_pte; ++pgprot_t kmap_prot; ++ ++#define kmap_get_fixmap_pte(vaddr) \ ++ pte_offset(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) ++ ++void __init kmap_init(void) ++{ ++ unsigned long kmap_vstart; ++ ++ /* cache the first kmap pte */ ++ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); ++ kmap_pte = kmap_get_fixmap_pte(kmap_vstart); ++ ++ kmap_prot = PAGE_KERNEL; ++} ++#endif /* CONFIG_HIGHMEM */ ++ ++static void __init fixrange_init(unsigned long start, unsigned long end, ++ pgd_t *pgd_base) ++{ ++ pgd_t *pgd; ++ pmd_t *pmd; ++ pte_t *pte; ++ int i, j; ++ unsigned long vaddr; ++ ++ vaddr = start; ++ i = __pgd_offset(vaddr); ++ j = __pmd_offset(vaddr); ++ pgd = pgd_base + i; ++ ++ for ( ; (i < PTRS_PER_PGD) && (vaddr < end); pgd++, i++) { ++ pmd = (pmd_t *)pgd; ++ for (; (j < PTRS_PER_PMD) && (vaddr != end); pmd++, j++) { ++ if (pmd_none(*pmd)) { ++ pte = (pte_t *) 
alloc_bootmem_low_pages(PAGE_SIZE); ++ set_pmd(pmd, __pmd(_KERNPG_TABLE + ++ (unsigned long) __pa(pte))); ++ if (pte != pte_offset(pmd, 0)) ++ BUG(); ++ } ++ vaddr += PMD_SIZE; ++ } ++ j = 0; ++ } ++} ++ ++int init_maps(struct mem_region *region) ++{ ++ struct page *p, *map; ++ int i, n, len; ++ ++ if(region == &physmem_region){ ++ region->mem_map = mem_map; ++ return(0); ++ } ++ else if(region->mem_map != NULL) return(0); ++ ++ n = region->len >> PAGE_SHIFT; ++ len = n * sizeof(struct page); ++ if(kmalloc_ok){ ++ map = kmalloc(len, GFP_KERNEL); ++ if(map == NULL) map = vmalloc(len); ++ } ++ else map = alloc_bootmem_low_pages(len); ++ ++ if(map == NULL) ++ return(-ENOMEM); ++ for(i = 0; i < n; i++){ ++ p = &map[i]; ++ set_page_count(p, 0); ++ SetPageReserved(p); ++ INIT_LIST_HEAD(&p->list); ++ } ++ region->mem_map = map; ++ return(0); ++} ++ ++DECLARE_MUTEX(regions_sem); ++ ++static int setup_one_range(int fd, char *driver, unsigned long start, ++ unsigned long pfn, int len, ++ struct mem_region *region) ++{ ++ int i; ++ ++ down(&regions_sem); ++ for(i = 0; i < NREGIONS; i++){ ++ if(regions[i] == NULL) break; ++ } ++ if(i == NREGIONS){ ++ printk("setup_one_range : no free regions\n"); ++ i = -1; ++ goto out; ++ } ++ ++ if(fd == -1) ++ fd = create_mem_file(len); ++ ++ if(region == NULL){ ++ if(kmalloc_ok) ++ region = kmalloc(sizeof(*region), GFP_KERNEL); ++ else region = alloc_bootmem_low_pages(sizeof(*region)); ++ if(region == NULL) ++ panic("Failed to allocating mem_region"); ++ } ++ ++ *region = ((struct mem_region) { .driver = driver, ++ .start_pfn = pfn, ++ .start = start, ++ .len = len, ++ .fd = fd } ); ++ regions[i] = region; ++ out: ++ up(&regions_sem); ++ return(i); ++} ++ ++#ifdef CONFIG_HIGHMEM ++static void init_highmem(void) ++{ ++ pgd_t *pgd; ++ pmd_t *pmd; ++ pte_t *pte; ++ unsigned long vaddr; ++ ++ /* ++ * Permanent kmaps: ++ */ ++ vaddr = PKMAP_BASE; ++ fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, swapper_pg_dir); ++ ++ pgd = swapper_pg_dir +
__pgd_offset(vaddr); ++ pmd = pmd_offset(pgd, vaddr); ++ pte = pte_offset(pmd, vaddr); ++ pkmap_page_table = pte; ++ ++ kmap_init(); ++} ++ ++void setup_highmem(unsigned long len) ++{ ++ struct mem_region *region; ++ struct page *page, *map; ++ unsigned long phys; ++ int i, cur, index; ++ ++ phys = physmem_size; ++ do { ++ cur = min(len, (unsigned long) REGION_SIZE); ++ i = setup_one_range(-1, NULL, -1, phys >> PAGE_SHIFT, cur, ++ NULL); ++ if(i == -1){ ++ printk("setup_highmem - setup_one_range failed\n"); ++ return; ++ } ++ region = regions[i]; ++ index = phys / PAGE_SIZE; ++ region->mem_map = &mem_map[index]; ++ ++ map = region->mem_map; ++ for(i = 0; i < (cur >> PAGE_SHIFT); i++){ ++ page = &map[i]; ++ ClearPageReserved(page); ++ set_bit(PG_highmem, &page->flags); ++ atomic_set(&page->count, 1); ++ __free_page(page); ++ } ++ phys += cur; ++ len -= cur; ++ } while(len > 0); ++} ++#endif ++ ++void paging_init(void) ++{ ++ struct mem_region *region; ++ unsigned long zones_size[MAX_NR_ZONES], start, end, vaddr; ++ int i, index; ++ ++ empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); ++ empty_bad_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); ++ for(i=0;i<sizeof(zones_size)/sizeof(zones_size[0]);i++) ++ zones_size[i] = 0; ++ zones_size[0] = (high_physmem >> PAGE_SHIFT) - ++ (uml_physmem >> PAGE_SHIFT); ++ zones_size[2] = highmem >> PAGE_SHIFT; ++ free_area_init(zones_size); ++ start = phys_region_index(__pa(uml_physmem)); ++ end = phys_region_index(__pa(high_physmem - 1)); ++ for(i = start; i <= end; i++){ ++ region = regions[i]; ++ index = (region->start - uml_physmem) / PAGE_SIZE; ++ region->mem_map = &mem_map[index]; ++ if(i > start) free_bootmem(__pa(region->start), region->len); ++ } ++ ++ /* ++ * Fixed mappings, only the page table structure has to be ++ * created - mappings will be set by set_fixmap(): ++ */ ++ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; ++ fixrange_init(vaddr, FIXADDR_TOP, 
swapper_pg_dir); ++ ++#if CONFIG_HIGHMEM ++ init_highmem(); ++ setup_highmem(highmem); ++#endif ++} ++ ++/* Changed by meminfo_compat, which is a setup */ ++static int meminfo_22 = 0; ++ ++static int meminfo_compat(char *str) ++{ ++ meminfo_22 = 1; ++ return(1); ++} ++ ++__setup("22_meminfo", meminfo_compat); ++ ++void si_meminfo(struct sysinfo *val) ++{ ++ val->totalram = totalram_pages; ++ val->sharedram = 0; ++ val->freeram = nr_free_pages(); ++ val->bufferram = atomic_read(&buffermem_pages); ++ val->totalhigh = highmem >> PAGE_SHIFT; ++ val->freehigh = nr_free_highpages(); ++ val->mem_unit = PAGE_SIZE; ++ if(meminfo_22){ ++ val->freeram <<= PAGE_SHIFT; ++ val->bufferram <<= PAGE_SHIFT; ++ val->totalram <<= PAGE_SHIFT; ++ val->sharedram <<= PAGE_SHIFT; ++ } ++} ++ ++pte_t __bad_page(void) ++{ ++ clear_page(empty_bad_page); ++ return pte_mkdirty(mk_pte((struct page *) empty_bad_page, ++ PAGE_SHARED)); ++} ++ ++/* This can't do anything because nothing in the kernel image can be freed ++ * since it's not in kernel physical memory. 
++ */ ++ ++void free_initmem(void) ++{ ++} ++ ++#ifdef CONFIG_BLK_DEV_INITRD ++ ++void free_initrd_mem(unsigned long start, unsigned long end) ++{ ++ if (start < end) ++ printk ("Freeing initrd memory: %ldk freed\n", ++ (end - start) >> 10); ++ for (; start < end; start += PAGE_SIZE) { ++ ClearPageReserved(virt_to_page(start)); ++ set_page_count(virt_to_page(start), 1); ++ free_page(start); ++ totalram_pages++; ++ } ++} ++ ++#endif ++ ++int do_check_pgt_cache(int low, int high) ++{ ++ int freed = 0; ++ if(pgtable_cache_size > high) { ++ do { ++ if (pgd_quicklist) { ++ free_pgd_slow(get_pgd_fast()); ++ freed++; ++ } ++ if (pmd_quicklist) { ++ pmd_free_slow(pmd_alloc_one_fast(NULL, 0)); ++ freed++; ++ } ++ if (pte_quicklist) { ++ pte_free_slow(pte_alloc_one_fast(NULL, 0)); ++ freed++; ++ } ++ } while(pgtable_cache_size > low); ++ } ++ return freed; ++} ++ ++void show_mem(void) ++{ ++ int i, total = 0, reserved = 0; ++ int shared = 0, cached = 0; ++ int highmem = 0; ++ ++ printk("Mem-info:\n"); ++ show_free_areas(); ++ printk("Free swap: %6dkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); ++ i = max_mapnr; ++ while(i-- > 0) { ++ total++; ++ if(PageHighMem(mem_map + i)) ++ highmem++; ++ if(PageReserved(mem_map + i)) ++ reserved++; ++ else if(PageSwapCache(mem_map + i)) ++ cached++; ++ else if(page_count(mem_map + i)) ++ shared += page_count(mem_map + i) - 1; ++ } ++ printk("%d pages of RAM\n", total); ++ printk("%d pages of HIGHMEM\n", highmem); ++ printk("%d reserved pages\n", reserved); ++ printk("%d pages shared\n", shared); ++ printk("%d pages swap cached\n", cached); ++ printk("%ld pages in page table cache\n", pgtable_cache_size); ++ show_buffers(); ++} ++ ++static int __init uml_mem_setup(char *line, int *add) ++{ ++ char *retptr; ++ physmem_size = memparse(line,&retptr); ++ return 0; ++} ++__uml_setup("mem=", uml_mem_setup, ++"mem=<Amount of desired ram>\n" ++" This controls how much \"physical\" memory the kernel allocates\n" ++" for the system. 
The size is specified as a number followed by\n" ++" one of 'k', 'K', 'm', 'M', which have the obvious meanings.\n" ++" This is not related to the amount of memory in the physical\n" ++" machine. It can be more, and the excess, if it's ever used, will\n" ++" just be swapped out.\n Example: mem=64M\n\n" ++); ++ ++struct page *arch_validate(struct page *page, int mask, int order) ++{ ++ unsigned long addr, zero = 0; ++ int i; ++ ++ again: ++ if(page == NULL) return(page); ++ if(PageHighMem(page)) return(page); ++ ++ addr = (unsigned long) page_address(page); ++ for(i = 0; i < (1 << order); i++){ ++ current->thread.fault_addr = (void *) addr; ++ if(__do_copy_to_user((void *) addr, &zero, ++ sizeof(zero), ++ &current->thread.fault_addr, ++ &current->thread.fault_catcher)){ ++ if(!(mask & __GFP_WAIT)) return(NULL); ++ else break; ++ } ++ addr += PAGE_SIZE; ++ } ++ if(i == (1 << order)) return(page); ++ page = _alloc_pages(mask, order); ++ goto again; ++} ++ ++DECLARE_MUTEX(vm_reserved_sem); ++static struct list_head vm_reserved = LIST_HEAD_INIT(vm_reserved); ++ ++/* Static structures, linked in to the list in early boot */ ++static struct vm_reserved head = { ++ .list = LIST_HEAD_INIT(head.list), ++ .start = 0, ++ .end = 0xffffffff ++}; ++ ++static struct vm_reserved tail = { ++ .list = LIST_HEAD_INIT(tail.list), ++ .start = 0, ++ .end = 0xffffffff ++}; ++ ++void set_usable_vm(unsigned long start, unsigned long end) ++{ ++ list_add(&head.list, &vm_reserved); ++ list_add(&tail.list, &head.list); ++ head.end = start; ++ tail.start = end; ++} ++ ++int reserve_vm(unsigned long start, unsigned long end, void *e) ++ ++{ ++ struct vm_reserved *entry = e, *reserved, *prev; ++ struct list_head *ele; ++ int err; ++ ++ down(&vm_reserved_sem); ++ list_for_each(ele, &vm_reserved){ ++ reserved = list_entry(ele, struct vm_reserved, list); ++ if(reserved->start >= end) goto found; ++ } ++ panic("Reserved vm out of range"); ++ found: ++ prev = list_entry(ele->prev, struct vm_reserved, list); ++
if(prev->end > start) ++ panic("Can't reserve vm"); ++ if(entry == NULL) ++ entry = kmalloc(sizeof(*entry), GFP_KERNEL); ++ if(entry == NULL){ ++ printk("reserve_vm : Failed to allocate entry\n"); ++ err = -ENOMEM; ++ goto out; ++ } ++ *entry = ((struct vm_reserved) ++ { .list = LIST_HEAD_INIT(entry->list), ++ .start = start, ++ .end = end }); ++ list_add(&entry->list, &prev->list); ++ err = 0; ++ out: ++ up(&vm_reserved_sem); ++ return(0); ++} ++ ++unsigned long get_vm(unsigned long len) ++{ ++ struct vm_reserved *this, *next; ++ struct list_head *ele; ++ unsigned long start; ++ int err; ++ ++ down(&vm_reserved_sem); ++ list_for_each(ele, &vm_reserved){ ++ this = list_entry(ele, struct vm_reserved, list); ++ next = list_entry(ele->next, struct vm_reserved, list); ++ if((this->start < next->start) && ++ (this->end + len + PAGE_SIZE <= next->start)) ++ goto found; ++ } ++ up(&vm_reserved_sem); ++ return(0); ++ found: ++ up(&vm_reserved_sem); ++ start = (unsigned long) UML_ROUND_UP(this->end) + PAGE_SIZE; ++ err = reserve_vm(start, start + len, NULL); ++ if(err) return(0); ++ return(start); ++} ++ ++int nregions(void) ++{ ++ return(NREGIONS); ++} ++ ++void setup_range(int fd, char *driver, unsigned long start, unsigned long pfn, ++ unsigned long len, int need_vm, struct mem_region *region, ++ void *reserved) ++{ ++ int i, cur; ++ ++ do { ++ cur = min(len, (unsigned long) REGION_SIZE); ++ i = setup_one_range(fd, driver, start, pfn, cur, region); ++ region = regions[i]; ++ if(need_vm && setup_region(region, reserved)){ ++ kfree(region); ++ regions[i] = NULL; ++ return; ++ } ++ start += cur; ++ if(pfn != -1) pfn += cur; ++ len -= cur; ++ } while(len > 0); ++} ++ ++struct iomem { ++ char *name; ++ int fd; ++ unsigned long size; ++}; ++ ++/* iomem regions can only be added on the command line at the moment. ++ * Locking will be needed when they can be added via mconsole. ++ */ ++ ++struct iomem iomem_regions[NREGIONS] = { [ 0 ... 
NREGIONS - 1 ] = ++ { .name = NULL, ++ .fd = -1, ++ .size = 0 } }; ++ ++int num_iomem_regions = 0; ++ ++void add_iomem(char *name, int fd, unsigned long size) ++{ ++ if(num_iomem_regions == sizeof(iomem_regions)/sizeof(iomem_regions[0])) ++ return; ++ size = (size + PAGE_SIZE - 1) & PAGE_MASK; ++ iomem_regions[num_iomem_regions++] = ++ ((struct iomem) { .name = name, ++ .fd = fd, ++ .size = size } ); ++} ++ ++int setup_iomem(void) ++{ ++ struct iomem *iomem; ++ int i; ++ ++ for(i = 0; i < num_iomem_regions; i++){ ++ iomem = &iomem_regions[i]; ++ setup_range(iomem->fd, iomem->name, -1, -1, iomem->size, 1, ++ NULL, NULL); ++ } ++ return(0); ++} ++ ++__initcall(setup_iomem); ++ ++#define PFN_UP(x) (((x) + PAGE_SIZE-1) >> PAGE_SHIFT) ++#define PFN_DOWN(x) ((x) >> PAGE_SHIFT) ++ ++/* Changed during early boot */ ++static struct mem_region physmem_region; ++static struct vm_reserved physmem_reserved; ++ ++void setup_physmem(unsigned long start, unsigned long reserve_end, ++ unsigned long len) ++{ ++ struct mem_region *region = &physmem_region; ++ struct vm_reserved *reserved = &physmem_reserved; ++ unsigned long cur, pfn = 0; ++ int do_free = 1, bootmap_size; ++ ++ do { ++ cur = min(len, (unsigned long) REGION_SIZE); ++ if(region == NULL) ++ region = alloc_bootmem_low_pages(sizeof(*region)); ++ if(reserved == NULL) ++ reserved = alloc_bootmem_low_pages(sizeof(*reserved)); ++ if((region == NULL) || (reserved == NULL)) ++ panic("Couldn't allocate physmem region or vm " ++ "reservation\n"); ++ setup_range(-1, NULL, start, pfn, cur, 1, region, reserved); ++ ++ if(do_free){ ++ unsigned long reserve = reserve_end - start; ++ int pfn = PFN_UP(__pa(reserve_end)); ++ int delta = (len - reserve) >> PAGE_SHIFT; ++ ++ bootmap_size = init_bootmem(pfn, pfn + delta); ++ free_bootmem(__pa(reserve_end) + bootmap_size, ++ cur - bootmap_size - reserve); ++ do_free = 0; ++ } ++ start += cur; ++ pfn += cur >> PAGE_SHIFT; ++ len -= cur; ++ region = NULL; ++ reserved = NULL; ++ } while(len > 
0); ++} ++ ++struct mem_region *phys_region(unsigned long phys) ++{ ++ unsigned int n = phys_region_index(phys); ++ ++ if(regions[n] == NULL) ++ panic("Physical address in uninitialized region"); ++ return(regions[n]); ++} ++ ++unsigned long phys_offset(unsigned long phys) ++{ ++ return(phys_addr(phys)); ++} ++ ++struct page *phys_mem_map(unsigned long phys) ++{ ++ return((struct page *) phys_region(phys)->mem_map); ++} ++ ++struct page *pte_mem_map(pte_t pte) ++{ ++ return(phys_mem_map(pte_val(pte))); ++} ++ ++struct mem_region *page_region(struct page *page, int *index_out) ++{ ++ int i; ++ struct mem_region *region; ++ struct page *map; ++ ++ for(i = 0; i < NREGIONS; i++){ ++ region = regions[i]; ++ if(region == NULL) continue; ++ map = region->mem_map; ++ if((page >= map) && (page < &map[region->len >> PAGE_SHIFT])){ ++ if(index_out != NULL) *index_out = i; ++ return(region); ++ } ++ } ++ panic("No region found for page"); ++ return(NULL); ++} ++ ++unsigned long page_to_pfn(struct page *page) ++{ ++ struct mem_region *region = page_region(page, NULL); ++ ++ return(region->start_pfn + (page - (struct page *) region->mem_map)); ++} ++ ++struct mem_region *pfn_to_region(unsigned long pfn, int *index_out) ++{ ++ struct mem_region *region; ++ int i; ++ ++ for(i = 0; i < NREGIONS; i++){ ++ region = regions[i]; ++ if(region == NULL) ++ continue; ++ ++ if((region->start_pfn <= pfn) && ++ (region->start_pfn + (region->len >> PAGE_SHIFT) > pfn)){ ++ if(index_out != NULL) ++ *index_out = i; ++ return(region); ++ } ++ } ++ return(NULL); ++} ++ ++struct page *pfn_to_page(unsigned long pfn) ++{ ++ struct mem_region *region = pfn_to_region(pfn, NULL); ++ struct page *mem_map = (struct page *) region->mem_map; ++ ++ return(&mem_map[pfn - region->start_pfn]); ++} ++ ++unsigned long phys_to_pfn(unsigned long p) ++{ ++ struct mem_region *region = regions[phys_region_index(p)]; ++ ++ return(region->start_pfn + (phys_addr(p) >> PAGE_SHIFT)); ++} ++ ++unsigned long 
pfn_to_phys(unsigned long pfn) ++{ ++ int n; ++ struct mem_region *region = pfn_to_region(pfn, &n); ++ ++ return(mk_phys((pfn - region->start_pfn) << PAGE_SHIFT, n)); ++} ++ ++struct page *page_mem_map(struct page *page) ++{ ++ return((struct page *) page_region(page, NULL)->mem_map); ++} ++ ++extern unsigned long region_pa(void *virt) ++{ ++ struct mem_region *region; ++ unsigned long addr = (unsigned long) virt; ++ int i; ++ ++ for(i = 0; i < NREGIONS; i++){ ++ region = regions[i]; ++ if(region == NULL) continue; ++ if((region->start <= addr) && ++ (addr <= region->start + region->len)) ++ return(mk_phys(addr - region->start, i)); ++ } ++ panic("region_pa : no region for virtual address"); ++ return(0); ++} ++ ++extern void *region_va(unsigned long phys) ++{ ++ return((void *) (phys_region(phys)->start + phys_addr(phys))); ++} ++ ++unsigned long page_to_phys(struct page *page) ++{ ++ int n; ++ struct mem_region *region = page_region(page, &n); ++ struct page *map = region->mem_map; ++ return(mk_phys((page - map) << PAGE_SHIFT, n)); ++} ++ ++struct page *phys_to_page(unsigned long phys) ++{ ++ struct page *mem_map; ++ ++ mem_map = phys_mem_map(phys); ++ return(mem_map + (phys_offset(phys) >> PAGE_SHIFT)); ++} ++ ++static int setup_mem_maps(void) ++{ ++ struct mem_region *region; ++ int i; ++ ++ for(i = 0; i < NREGIONS; i++){ ++ region = regions[i]; ++ if((region != NULL) && (region->fd > 0)) init_maps(region); ++ } ++ return(0); ++} ++ ++__initcall(setup_mem_maps); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/mem_user.c um/arch/um/kernel/mem_user.c +--- orig/arch/um/kernel/mem_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/mem_user.c Thu Mar 6 16:05:21 2003 +@@ -0,0 +1,232 @@ ++/* ++ * arch/um/kernel/mem_user.c ++ * ++ * BRIEF MODULE DESCRIPTION ++ * user side memory routines for supporting IO memory inside user mode linux ++ * ++ * Copyright (C) 2001 RidgeRun, Inc. ++ * Author: RidgeRun, Inc. ++ * Greg Lonnon glonnon@ridgerun.com or info@ridgerun.com ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ * ++ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED ++ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN ++ * NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, ++ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT ++ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF ++ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ++ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF ++ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with this program; if not, write to the Free Software Foundation, Inc., ++ * 675 Mass Ave, Cambridge, MA 02139, USA. 
++ */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++#include <stddef.h> ++#include <stdarg.h> ++#include <unistd.h> ++#include <fcntl.h> ++#include <errno.h> ++#include <string.h> ++#include <sys/stat.h> ++#include <sys/types.h> ++#include <sys/mman.h> ++#include "kern_util.h" ++#include "user.h" ++#include "user_util.h" ++#include "mem_user.h" ++#include "init.h" ++#include "os.h" ++#include "tempfile.h" ++ ++extern struct mem_region physmem_region; ++ ++#define TEMPNAME_TEMPLATE "vm_file-XXXXXX" ++ ++int create_mem_file(unsigned long len) ++{ ++ int fd; ++ char zero; ++ ++ fd = make_tempfile(TEMPNAME_TEMPLATE, NULL, 1); ++ if (fchmod(fd, 0777) < 0){ ++ perror("fchmod"); ++ exit(1); ++ } ++ if(os_seek_file(fd, len) < 0){ ++ perror("lseek"); ++ exit(1); ++ } ++ zero = 0; ++ if(write(fd, &zero, 1) != 1){ ++ perror("write"); ++ exit(1); ++ } ++ if(fcntl(fd, F_SETFD, 1) != 0) ++ perror("Setting FD_CLOEXEC failed"); ++ return(fd); ++} ++ ++int setup_region(struct mem_region *region, void *entry) ++{ ++ void *loc, *start; ++ char *driver; ++ int err, offset; ++ ++ if(region->start != -1){ ++ err = reserve_vm(region->start, ++ region->start + region->len, entry); ++ if(err){ ++ printk("setup_region : failed to reserve " ++ "0x%x - 0x%x for driver '%s'\n", ++ region->start, ++ region->start + region->len, ++ region->driver); ++ return(-1); ++ } ++ } ++ else region->start = get_vm(region->len); ++ if(region->start == 0){ ++ if(region->driver == NULL) driver = "physmem"; ++ else driver = region->driver; ++ printk("setup_region : failed to find vm for " ++ "driver '%s' (length %d)\n", driver, region->len); ++ return(-1); ++ } ++ if(region->start == uml_physmem){ ++ start = (void *) uml_reserved; ++ offset = uml_reserved - uml_physmem; ++ } ++ else { ++ start = (void *) region->start; ++ offset = 0; ++ } ++ ++ loc = mmap(start, region->len - offset, PROT_READ | PROT_WRITE, ++ MAP_SHARED | MAP_FIXED, region->fd, offset); ++ if(loc != start){ ++ perror("Mapping memory"); ++ 
exit(1); ++ } ++ return(0); ++} ++ ++static int __init parse_iomem(char *str, int *add) ++{ ++ struct stat64 buf; ++ char *file, *driver; ++ int fd; ++ ++ driver = str; ++ file = strchr(str,','); ++ if(file == NULL){ ++ printf("parse_iomem : failed to parse iomem\n"); ++ return(1); ++ } ++ *file = '\0'; ++ file++; ++ fd = os_open_file(file, of_rdwr(OPENFLAGS()), 0); ++ if(fd < 0){ ++ printf("parse_iomem - Couldn't open io file, errno = %d\n", ++ errno); ++ return(1); ++ } ++ if(fstat64(fd, &buf) < 0) { ++ printf("parse_iomem - cannot fstat file, errno = %d\n", errno); ++ return(1); ++ } ++ add_iomem(driver, fd, buf.st_size); ++ return(0); ++} ++ ++__uml_setup("iomem=", parse_iomem, ++"iomem=<name>,<file>\n" ++" Configure <file> as an IO memory region named <name>.\n\n" ++); ++ ++#ifdef notdef ++int logging = 0; ++int logging_fd = -1; ++ ++int logging_line = 0; ++char logging_buf[256]; ++ ++void log(char *fmt, ...) ++{ ++ va_list ap; ++ struct timeval tv; ++ struct openflags flags; ++ ++ if(logging == 0) return; ++ if(logging_fd < 0){ ++ flags = of_create(of_trunc(of_rdrw(OPENFLAGS()))); ++ logging_fd = os_open_file("log", flags, 0644); ++ } ++ gettimeofday(&tv, NULL); ++ sprintf(logging_buf, "%d\t %u.%u ", logging_line++, tv.tv_sec, ++ tv.tv_usec); ++ va_start(ap, fmt); ++ vsprintf(&logging_buf[strlen(logging_buf)], fmt, ap); ++ va_end(ap); ++ write(logging_fd, logging_buf, strlen(logging_buf)); ++} ++#endif ++ ++int map_memory(unsigned long virt, unsigned long phys, unsigned long len, ++ int r, int w, int x) ++{ ++ struct mem_region *region = phys_region(phys); ++ ++ return(os_map_memory((void *) virt, region->fd, phys_offset(phys), len, ++ r, w, x)); ++} ++ ++int protect_memory(unsigned long addr, unsigned long len, int r, int w, int x, ++ int must_succeed) ++{ ++ if(os_protect_memory((void *) addr, len, r, w, x) < 0){ ++ if(must_succeed) ++ panic("protect failed, errno = %d", errno); ++ else return(-errno); ++ } ++ return(0); ++} ++ ++unsigned long 
find_iomem(char *driver, unsigned long *len_out) ++{ ++ struct mem_region *region; ++ int i, n; ++ ++ n = nregions(); ++ for(i = 0; i < n; i++){ ++ region = regions[i]; ++ if(region == NULL) continue; ++ if((region->driver != NULL) && ++ !strcmp(region->driver, driver)){ ++ *len_out = region->len; ++ return(region->start); ++ } ++ } ++ *len_out = 0; ++ return 0; ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/mprot.h um/arch/um/kernel/mprot.h +--- orig/arch/um/kernel/mprot.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/mprot.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,6 @@ ++#ifndef __MPROT_H__ ++#define __MPROT_H__ ++ ++extern void no_access(unsigned long addr, unsigned int len); ++ ++#endif +diff -Naur -X ../exclude-files orig/arch/um/kernel/process.c um/arch/um/kernel/process.c +--- orig/arch/um/kernel/process.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/process.c Wed Apr 23 20:36:15 2003 +@@ -0,0 +1,286 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdio.h> ++#include <unistd.h> ++#include <signal.h> ++#include <sched.h> ++#include <errno.h> ++#include <stdarg.h> ++#include <fcntl.h> ++#include <stdlib.h> ++#include <setjmp.h> ++#include <sys/time.h> ++#include <sys/ptrace.h> ++#include <sys/ioctl.h> ++#include <sys/wait.h> ++#include <sys/mman.h> ++#include <asm/ptrace.h> ++#include <asm/sigcontext.h> ++#include <asm/unistd.h> ++#include <asm/page.h> ++#include "user_util.h" ++#include "kern_util.h" ++#include "user.h" ++#include "process.h" ++#include "signal_kern.h" ++#include "signal_user.h" 
++#include "sysdep/ptrace.h" ++#include "sysdep/sigcontext.h" ++#include "irq_user.h" ++#include "ptrace_user.h" ++#include "time_user.h" ++#include "init.h" ++#include "os.h" ++#include "uml-config.h" ++#include "choose-mode.h" ++#include "mode.h" ++#ifdef UML_CONFIG_MODE_SKAS ++#include "skas.h" ++#include "skas_ptrace.h" ++#endif ++ ++void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int)) ++{ ++ int flags = 0, pages; ++ ++ if(sig_stack != NULL){ ++ pages = (1 << UML_CONFIG_KERNEL_STACK_ORDER) - 2; ++ set_sigstack(sig_stack, pages * page_size()); ++ flags = SA_ONSTACK; ++ } ++ if(usr1_handler) set_handler(SIGUSR1, usr1_handler, flags, -1); ++} ++ ++void init_new_thread_signals(int altstack) ++{ ++ int flags = altstack ? SA_ONSTACK : 0; ++ ++ set_handler(SIGSEGV, (__sighandler_t) sig_handler, flags, ++ SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); ++ set_handler(SIGTRAP, (__sighandler_t) sig_handler, flags, ++ SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); ++ set_handler(SIGFPE, (__sighandler_t) sig_handler, flags, ++ SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); ++ set_handler(SIGILL, (__sighandler_t) sig_handler, flags, ++ SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); ++ set_handler(SIGBUS, (__sighandler_t) sig_handler, flags, ++ SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); ++ set_handler(SIGWINCH, (__sighandler_t) sig_handler, flags, ++ SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); ++ set_handler(SIGUSR2, (__sighandler_t) sig_handler, ++ SA_NOMASK | flags, -1); ++ signal(SIGHUP, SIG_IGN); ++ ++ init_irq_signals(altstack); ++} ++ ++struct tramp { ++ int (*tramp)(void *); ++ void *tramp_data; ++ unsigned long temp_stack; ++ int flags; ++ int pid; ++}; ++ ++/* See above for why sigkill is here */ ++ ++int sigkill = SIGKILL; ++ ++int outer_tramp(void *arg) ++{ ++ struct tramp *t; ++ int sig = sigkill; ++ ++ t = arg; ++ t->pid = clone(t->tramp, (void *) t->temp_stack + page_size()/2, ++ t->flags, t->tramp_data); ++ 
if(t->pid > 0) wait_for_stop(t->pid, SIGSTOP, PTRACE_CONT, NULL); ++ kill(os_getpid(), sig); ++ _exit(0); ++} ++ ++int start_fork_tramp(void *thread_arg, unsigned long temp_stack, ++ int clone_flags, int (*tramp)(void *)) ++{ ++ struct tramp arg; ++ unsigned long sp; ++ int new_pid, status, err; ++ ++ /* The trampoline will run on the temporary stack */ ++ sp = stack_sp(temp_stack); ++ ++ clone_flags |= CLONE_FILES | SIGCHLD; ++ ++ arg.tramp = tramp; ++ arg.tramp_data = thread_arg; ++ arg.temp_stack = temp_stack; ++ arg.flags = clone_flags; ++ ++ /* Start the process and wait for it to kill itself */ ++ new_pid = clone(outer_tramp, (void *) sp, clone_flags, &arg); ++ if(new_pid < 0) return(-errno); ++ while((err = waitpid(new_pid, &status, 0) < 0) && (errno == EINTR)) ; ++ if(err < 0) panic("Waiting for outer trampoline failed - errno = %d", ++ errno); ++ if(!WIFSIGNALED(status) || (WTERMSIG(status) != SIGKILL)) ++ panic("outer trampoline didn't exit with SIGKILL"); ++ ++ return(arg.pid); ++} ++ ++void suspend_new_thread(int fd) ++{ ++ char c; ++ ++ os_stop_process(os_getpid()); ++ ++ if(read(fd, &c, sizeof(c)) != sizeof(c)) ++ panic("read failed in suspend_new_thread"); ++} ++ ++static int ptrace_child(void *arg) ++{ ++ int pid = os_getpid(); ++ ++ if(ptrace(PTRACE_TRACEME, 0, 0, 0) < 0){ ++ perror("ptrace"); ++ os_kill_process(pid, 0); ++ } ++ os_stop_process(pid); ++ _exit(os_getpid() == pid); ++} ++ ++static int start_ptraced_child(void **stack_out) ++{ ++ void *stack; ++ unsigned long sp; ++ int pid, n, status; ++ ++ stack = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, ++ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ++ if(stack == MAP_FAILED) ++ panic("check_ptrace : mmap failed, errno = %d", errno); ++ sp = (unsigned long) stack + PAGE_SIZE - sizeof(void *); ++ pid = clone(ptrace_child, (void *) sp, SIGCHLD, NULL); ++ if(pid < 0) ++ panic("check_ptrace : clone failed, errno = %d", errno); ++ n = waitpid(pid, &status, WUNTRACED); ++ if(n < 0) ++ 
panic("check_ptrace : wait failed, errno = %d", errno); ++ if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) ++ panic("check_ptrace : expected SIGSTOP, got status = %d", ++ status); ++ ++ *stack_out = stack; ++ return(pid); ++} ++ ++static void stop_ptraced_child(int pid, void *stack, int exitcode) ++{ ++ int status, n; ++ ++ if(ptrace(PTRACE_CONT, pid, 0, 0) < 0) ++ panic("check_ptrace : ptrace failed, errno = %d", errno); ++ n = waitpid(pid, &status, 0); ++ if(!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) ++ panic("check_ptrace : child exited with status 0x%x", status); ++ ++ if(munmap(stack, PAGE_SIZE) < 0) ++ panic("check_ptrace : munmap failed, errno = %d", errno); ++} ++ ++void __init check_ptrace(void) ++{ ++ void *stack; ++ int pid, syscall, n, status; ++ ++ printk("Checking that ptrace can change system call numbers..."); ++ pid = start_ptraced_child(&stack); ++ ++ while(1){ ++ if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) ++ panic("check_ptrace : ptrace failed, errno = %d", ++ errno); ++ n = waitpid(pid, &status, WUNTRACED); ++ if(n < 0) ++ panic("check_ptrace : wait failed, errno = %d", errno); ++ if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP)) ++ panic("check_ptrace : expected SIGTRAP, " ++ "got status = %d", status); ++ ++ syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET, ++ 0); ++ if(syscall == __NR_getpid){ ++ n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, ++ __NR_getppid); ++ if(n < 0) ++ panic("check_ptrace : failed to modify system " ++ "call, errno = %d", errno); ++ break; ++ } ++ } ++ stop_ptraced_child(pid, stack, 0); ++ printk("OK\n"); ++} ++ ++int run_kernel_thread(int (*fn)(void *), void *arg, void **jmp_ptr) ++{ ++ jmp_buf buf; ++ int n; ++ ++ *jmp_ptr = &buf; ++ n = setjmp(buf); ++ if(n != 0) ++ return(n); ++ (*fn)(arg); ++ return(0); ++} ++ ++int can_do_skas(void) ++{ ++#ifdef UML_CONFIG_MODE_SKAS ++ struct ptrace_faultinfo fi; ++ void *stack; ++ int pid, n, ret = 1; ++ ++ printf("Checking for the 
skas3 patch in the host..."); ++ pid = start_ptraced_child(&stack); ++ ++ n = ptrace(PTRACE_FAULTINFO, pid, 0, &fi); ++ if(n < 0){ ++ if(errno == EIO) ++ printf("not found\n"); ++ else printf("No (unexpected errno - %d)\n", errno); ++ ret = 0; ++ } ++ else printf("found\n"); ++ ++ init_registers(pid); ++ stop_ptraced_child(pid, stack, 1); ++ ++ printf("Checking for /proc/mm..."); ++ if(access("/proc/mm", W_OK)){ ++ printf("not found\n"); ++ ret = 0; ++ } ++ else printf("found\n"); ++ ++ return(ret); ++#else ++ return(0); ++#endif ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/process_kern.c um/arch/um/kernel/process_kern.c +--- orig/arch/um/kernel/process_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/process_kern.c Wed Apr 16 16:02:09 2003 +@@ -0,0 +1,391 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/config.h" ++#include "linux/kernel.h" ++#include "linux/sched.h" ++#include "linux/interrupt.h" ++#include "linux/mm.h" ++#include "linux/slab.h" ++#include "linux/utsname.h" ++#include "linux/fs.h" ++#include "linux/utime.h" ++#include "linux/smp_lock.h" ++#include "linux/module.h" ++#include "linux/init.h" ++#include "linux/capability.h" ++#include "asm/unistd.h" ++#include "asm/mman.h" ++#include "asm/segment.h" ++#include "asm/stat.h" ++#include "asm/pgtable.h" ++#include "asm/processor.h" ++#include "asm/pgalloc.h" ++#include "asm/spinlock.h" ++#include "asm/uaccess.h" ++#include "asm/user.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "kern.h" ++#include 
"signal_kern.h" ++#include "signal_user.h" ++#include "init.h" ++#include "irq_user.h" ++#include "mem_user.h" ++#include "time_user.h" ++#include "tlb.h" ++#include "frame_kern.h" ++#include "sigcontext.h" ++#include "2_5compat.h" ++#include "os.h" ++#include "mode.h" ++#include "mode_kern.h" ++#include "choose-mode.h" ++ ++/* This is a per-cpu array. A processor only modifies its entry and it only ++ * cares about its entry, so it's OK if another processor is modifying its ++ * entry. ++ */ ++struct cpu_task cpu_tasks[NR_CPUS] = { [0 ... NR_CPUS - 1] = { -1, NULL } }; ++ ++struct task_struct *get_task(int pid, int require) ++{ ++ struct task_struct *ret; ++ ++ read_lock(&tasklist_lock); ++ ret = find_task_by_pid(pid); ++ read_unlock(&tasklist_lock); ++ ++ if(require && (ret == NULL)) panic("get_task couldn't find a task\n"); ++ return(ret); ++} ++ ++int external_pid(void *t) ++{ ++ struct task_struct *task = t ? t : current; ++ ++ return(CHOOSE_MODE_PROC(external_pid_tt, external_pid_skas, task)); ++} ++ ++int pid_to_processor_id(int pid) ++{ ++ int i; ++ ++ for(i = 0; i < smp_num_cpus; i++){ ++ if(cpu_tasks[i].pid == pid) return(i); ++ } ++ return(-1); ++} ++ ++void free_stack(unsigned long stack, int order) ++{ ++ free_pages(stack, order); ++} ++ ++unsigned long alloc_stack(int order, int atomic) ++{ ++ unsigned long page; ++ int flags = GFP_KERNEL; ++ ++ if(atomic) flags |= GFP_ATOMIC; ++ if((page = __get_free_pages(flags, order)) == 0) ++ return(0); ++ stack_protections(page); ++ return(page); ++} ++ ++int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) ++{ ++ int pid; ++ ++ current->thread.request.u.thread.proc = fn; ++ current->thread.request.u.thread.arg = arg; ++ pid = do_fork(CLONE_VM | flags, 0, NULL, 0); ++ if(pid < 0) panic("do_fork failed in kernel_thread"); ++ return(pid); ++} ++ ++void switch_mm(struct mm_struct *prev, struct mm_struct *next, ++ struct task_struct *tsk, unsigned cpu) ++{ ++ if (prev != next) ++ clear_bit(cpu, 
&prev->cpu_vm_mask); ++ set_bit(cpu, &next->cpu_vm_mask); ++} ++ ++void set_current(void *t) ++{ ++ struct task_struct *task = t; ++ ++ cpu_tasks[task->processor] = ((struct cpu_task) ++ { external_pid(task), task }); ++} ++ ++void *_switch_to(void *prev, void *next) ++{ ++ return(CHOOSE_MODE(_switch_to_tt(prev, next), ++ _switch_to_skas(prev, next))); ++} ++ ++void interrupt_end(void) ++{ ++ if(current->need_resched) schedule(); ++ if(current->sigpending != 0) do_signal(0); ++} ++ ++void release_thread(struct task_struct *task) ++{ ++ CHOOSE_MODE(release_thread_tt(task), release_thread_skas(task)); ++} ++ ++void exit_thread(void) ++{ ++ CHOOSE_MODE(exit_thread_tt(), exit_thread_skas()); ++ unprotect_stack((unsigned long) current); ++} ++ ++void *get_current(void) ++{ ++ return(current); ++} ++ ++int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, ++ unsigned long stack_top, struct task_struct * p, ++ struct pt_regs *regs) ++{ ++ p->thread = (struct thread_struct) INIT_THREAD; ++ p->thread.kernel_stack = (unsigned long) p + 2 * PAGE_SIZE; ++ ++ return(CHOOSE_MODE_PROC(copy_thread_tt, copy_thread_skas, nr, ++ clone_flags, sp, stack_top, p, regs)); ++} ++ ++void initial_thread_cb(void (*proc)(void *), void *arg) ++{ ++ int save_kmalloc_ok = kmalloc_ok; ++ ++ kmalloc_ok = 0; ++ CHOOSE_MODE_PROC(initial_thread_cb_tt, initial_thread_cb_skas, proc, ++ arg); ++ kmalloc_ok = save_kmalloc_ok; ++} ++ ++unsigned long stack_sp(unsigned long page) ++{ ++ return(page + PAGE_SIZE - sizeof(void *)); ++} ++ ++int current_pid(void) ++{ ++ return(current->pid); ++} ++ ++void cpu_idle(void) ++{ ++ CHOOSE_MODE(init_idle_tt(), init_idle_skas()); ++ ++ atomic_inc(&init_mm.mm_count); ++ current->mm = &init_mm; ++ current->active_mm = &init_mm; ++ ++ while(1){ ++ /* endless idle loop with no priority at all */ ++ SET_PRI(current); ++ ++ /* ++ * although we are an idle CPU, we do not want to ++ * get into the scheduler unnecessarily. 
++ */ ++ if (current->need_resched) { ++ schedule(); ++ check_pgt_cache(); ++ } ++ idle_sleep(10); ++ } ++} ++ ++int page_size(void) ++{ ++ return(PAGE_SIZE); ++} ++ ++int page_mask(void) ++{ ++ return(PAGE_MASK); ++} ++ ++void *um_virt_to_phys(struct task_struct *task, unsigned long addr, ++ pte_t *pte_out) ++{ ++ pgd_t *pgd; ++ pmd_t *pmd; ++ pte_t *pte; ++ ++ if(task->mm == NULL) ++ return(ERR_PTR(-EINVAL)); ++ pgd = pgd_offset(task->mm, addr); ++ pmd = pmd_offset(pgd, addr); ++ if(!pmd_present(*pmd)) ++ return(ERR_PTR(-EINVAL)); ++ pte = pte_offset(pmd, addr); ++ if(!pte_present(*pte)) ++ return(ERR_PTR(-EINVAL)); ++ if(pte_out != NULL) ++ *pte_out = *pte; ++ return((void *) (pte_val(*pte) & PAGE_MASK) + (addr & ~PAGE_MASK)); ++} ++ ++char *current_cmd(void) ++{ ++#if defined(CONFIG_SMP) || defined(CONFIG_HIGHMEM) ++ return("(Unknown)"); ++#else ++ void *addr = um_virt_to_phys(current, current->mm->arg_start, NULL); ++ return IS_ERR(addr) ? "(Unknown)": __va((unsigned long) addr); ++#endif ++} ++ ++void force_sigbus(void) ++{ ++ printk(KERN_ERR "Killing pid %d because of a lack of memory\n", ++ current->pid); ++ lock_kernel(); ++ sigaddset(&current->pending.signal, SIGBUS); ++ recalc_sigpending(current); ++ current->flags |= PF_SIGNALED; ++ do_exit(SIGBUS | 0x80); ++} ++ ++void dump_thread(struct pt_regs *regs, struct user *u) ++{ ++} ++ ++void enable_hlt(void) ++{ ++ panic("enable_hlt"); ++} ++ ++void disable_hlt(void) ++{ ++ panic("disable_hlt"); ++} ++ ++extern int signal_frame_size; ++ ++void *um_kmalloc(int size) ++{ ++ return(kmalloc(size, GFP_KERNEL)); ++} ++ ++void *um_kmalloc_atomic(int size) ++{ ++ return(kmalloc(size, GFP_ATOMIC)); ++} ++ ++unsigned long get_fault_addr(void) ++{ ++ return((unsigned long) current->thread.fault_addr); ++} ++ ++EXPORT_SYMBOL(get_fault_addr); ++ ++void not_implemented(void) ++{ ++ printk(KERN_DEBUG "Something isn't implemented in here\n"); ++} ++ ++EXPORT_SYMBOL(not_implemented); ++ ++int user_context(unsigned long sp) ++{ ++ 
unsigned long stack; ++ ++ stack = sp & (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER); ++ stack += 2 * PAGE_SIZE; ++ return(stack != current->thread.kernel_stack); ++} ++ ++extern void remove_umid_dir(void); ++ ++__uml_exitcall(remove_umid_dir); ++ ++extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end; ++ ++void do_uml_exitcalls(void) ++{ ++ exitcall_t *call; ++ ++ call = &__uml_exitcall_end; ++ while (--call >= &__uml_exitcall_begin) ++ (*call)(); ++} ++ ++char *uml_strdup(char *string) ++{ ++ char *new; ++ ++ new = kmalloc(strlen(string) + 1, GFP_KERNEL); ++ if(new == NULL) return(NULL); ++ strcpy(new, string); ++ return(new); ++} ++ ++void *get_init_task(void) ++{ ++ return(&init_task_union.task); ++} ++ ++int copy_to_user_proc(void *to, void *from, int size) ++{ ++ return(copy_to_user(to, from, size)); ++} ++ ++int copy_from_user_proc(void *to, void *from, int size) ++{ ++ return(copy_from_user(to, from, size)); ++} ++ ++int clear_user_proc(void *buf, int size) ++{ ++ return(clear_user(buf, size)); ++} ++ ++int strlen_user_proc(char *str) ++{ ++ return(strlen_user(str)); ++} ++ ++int smp_sigio_handler(void) ++{ ++#ifdef CONFIG_SMP ++ int cpu = current->processor; ++ ++ IPI_handler(cpu); ++ if(cpu != 0) ++ return(1); ++#endif ++ return(0); ++} ++ ++int um_in_interrupt(void) ++{ ++ return(in_interrupt()); ++} ++ ++int cpu(void) ++{ ++ return(current->processor); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/ptrace.c um/arch/um/kernel/ptrace.c +--- orig/arch/um/kernel/ptrace.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/ptrace.c Sat Dec 28 22:50:21 2002 +@@ -0,0 +1,325 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/sched.h" ++#include "linux/mm.h" ++#include "linux/errno.h" ++#include "linux/smp_lock.h" ++#ifdef CONFIG_PROC_MM ++#include "linux/proc_mm.h" ++#endif ++#include "asm/ptrace.h" ++#include "asm/uaccess.h" ++#include "kern_util.h" ++#include "ptrace_user.h" ++ ++/* ++ * Called by kernel/ptrace.c when detaching.. ++ */ ++void ptrace_disable(struct task_struct *child) ++{ ++} ++ ++extern long do_mmap2(struct task_struct *task, unsigned long addr, ++ unsigned long len, unsigned long prot, ++ unsigned long flags, unsigned long fd, ++ unsigned long pgoff); ++ ++int sys_ptrace(long request, long pid, long addr, long data) ++{ ++ struct task_struct *child; ++ int i, ret; ++ ++ lock_kernel(); ++ ret = -EPERM; ++ if (request == PTRACE_TRACEME) { ++ /* are we already being traced? */ ++ if (current->ptrace & PT_PTRACED) ++ goto out; ++ /* set the ptrace bit in the process flags. */ ++ current->ptrace |= PT_PTRACED; ++ ret = 0; ++ goto out; ++ } ++ ret = -ESRCH; ++ read_lock(&tasklist_lock); ++ child = find_task_by_pid(pid); ++ if (child) ++ get_task_struct(child); ++ read_unlock(&tasklist_lock); ++ if (!child) ++ goto out; ++ ++ ret = -EPERM; ++ if (pid == 1) /* you may not mess with init */ ++ goto out_tsk; ++ ++ if (request == PTRACE_ATTACH) { ++ ret = ptrace_attach(child); ++ goto out_tsk; ++ } ++ ++ ret = ptrace_check_attach(child, request == PTRACE_KILL); ++ if (ret < 0) ++ goto out_tsk; ++ ++ switch (request) { ++ /* when I and D space are separate, these will need to be fixed. 
*/ ++ case PTRACE_PEEKTEXT: /* read word at location addr. */ ++ case PTRACE_PEEKDATA: { ++ unsigned long tmp; ++ int copied; ++ ++ ret = -EIO; ++ copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); ++ if (copied != sizeof(tmp)) ++ break; ++ ret = put_user(tmp,(unsigned long *) data); ++ break; ++ } ++ ++ /* read the word at location addr in the USER area. */ ++ case PTRACE_PEEKUSR: { ++ unsigned long tmp; ++ ++ ret = -EIO; ++ if ((addr & 3) || addr < 0) ++ break; ++ ++ tmp = 0; /* Default return condition */ ++ if(addr < FRAME_SIZE_OFFSET){ ++ tmp = getreg(child, addr); ++ } ++ else if((addr >= offsetof(struct user, u_debugreg[0])) && ++ (addr <= offsetof(struct user, u_debugreg[7]))){ ++ addr -= offsetof(struct user, u_debugreg[0]); ++ addr = addr >> 2; ++ tmp = child->thread.arch.debugregs[addr]; ++ } ++ ret = put_user(tmp, (unsigned long *) data); ++ break; ++ } ++ ++ /* when I and D space are separate, this will have to be fixed. */ ++ case PTRACE_POKETEXT: /* write the word at location addr. */ ++ case PTRACE_POKEDATA: ++ ret = -EIO; ++ if (access_process_vm(child, addr, &data, sizeof(data), ++ 1) != sizeof(data)) ++ break; ++ ret = 0; ++ break; ++ ++ case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ ++ ret = -EIO; ++ if ((addr & 3) || addr < 0) ++ break; ++ ++ if (addr < FRAME_SIZE_OFFSET) { ++ ret = putreg(child, addr, data); ++ break; ++ } ++ else if((addr >= offsetof(struct user, u_debugreg[0])) && ++ (addr <= offsetof(struct user, u_debugreg[7]))){ ++ addr -= offsetof(struct user, u_debugreg[0]); ++ addr = addr >> 2; ++ if((addr == 4) || (addr == 5)) break; ++ child->thread.arch.debugregs[addr] = data; ++ ret = 0; ++ } ++ ++ break; ++ ++ case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ ++ case PTRACE_CONT: { /* restart after signal. 
*/ ++ ret = -EIO; ++ if ((unsigned long) data > _NSIG) ++ break; ++ if (request == PTRACE_SYSCALL) ++ child->ptrace |= PT_TRACESYS; ++ else ++ child->ptrace &= ~PT_TRACESYS; ++ child->exit_code = data; ++ wake_up_process(child); ++ ret = 0; ++ break; ++ } ++ ++/* ++ * make the child exit. Best I can do is send it a sigkill. ++ * perhaps it should be put in the status that it wants to ++ * exit. ++ */ ++ case PTRACE_KILL: { ++ ret = 0; ++ if (child->state == TASK_ZOMBIE) /* already dead */ ++ break; ++ child->exit_code = SIGKILL; ++ wake_up_process(child); ++ break; ++ } ++ ++ case PTRACE_SINGLESTEP: { /* set the trap flag. */ ++ ret = -EIO; ++ if ((unsigned long) data > _NSIG) ++ break; ++ child->ptrace &= ~PT_TRACESYS; ++ child->ptrace |= PT_DTRACE; ++ child->exit_code = data; ++ /* give it a chance to run. */ ++ wake_up_process(child); ++ ret = 0; ++ break; ++ } ++ ++ case PTRACE_DETACH: ++ /* detach a process that was attached. */ ++ ret = ptrace_detach(child, data); ++ break; ++ ++#ifdef PTRACE_GETREGS ++ case PTRACE_GETREGS: { /* Get all gp regs from the child. */ ++ if (!access_ok(VERIFY_WRITE, (unsigned long *)data, ++ FRAME_SIZE_OFFSET)) { ++ ret = -EIO; ++ break; ++ } ++ for ( i = 0; i < FRAME_SIZE_OFFSET; i += sizeof(long) ) { ++ __put_user(getreg(child, i), (unsigned long *) data); ++ data += sizeof(long); ++ } ++ ret = 0; ++ break; ++ } ++#endif ++#ifdef PTRACE_SETREGS ++ case PTRACE_SETREGS: { /* Set all gp regs in the child. */ ++ unsigned long tmp = 0; ++ if (!access_ok(VERIFY_READ, (unsigned *)data, ++ FRAME_SIZE_OFFSET)) { ++ ret = -EIO; ++ break; ++ } ++ for ( i = 0; i < FRAME_SIZE_OFFSET; i += sizeof(long) ) { ++ __get_user(tmp, (unsigned long *) data); ++ putreg(child, i, tmp); ++ data += sizeof(long); ++ } ++ ret = 0; ++ break; ++ } ++#endif ++#ifdef PTRACE_GETFPREGS ++ case PTRACE_GETFPREGS: /* Get the child FPU state. 
*/ ++ ret = get_fpregs(data, child); ++ break; ++#endif ++#ifdef PTRACE_SETFPREGS ++ case PTRACE_SETFPREGS: /* Set the child FPU state. */ ++ ret = set_fpregs(data, child); ++ break; ++#endif ++#ifdef PTRACE_GETFPXREGS ++ case PTRACE_GETFPXREGS: /* Get the child FPU state. */ ++ ret = get_fpxregs(data, child); ++ break; ++#endif ++#ifdef PTRACE_SETFPXREGS ++ case PTRACE_SETFPXREGS: /* Set the child FPU state. */ ++ ret = set_fpxregs(data, child); ++ break; ++#endif ++ case PTRACE_FAULTINFO: { ++ struct ptrace_faultinfo fault; ++ ++ fault = ((struct ptrace_faultinfo) ++ { .is_write = child->thread.err, ++ .addr = child->thread.cr2 }); ++ ret = copy_to_user((unsigned long *) data, &fault, ++ sizeof(fault)); ++ if(ret) ++ break; ++ break; ++ } ++ case PTRACE_SIGPENDING: ++ ret = copy_to_user((unsigned long *) data, ++ &child->pending.signal, ++ sizeof(child->pending.signal)); ++ break; ++ ++ case PTRACE_LDT: { ++ struct ptrace_ldt ldt; ++ ++ if(copy_from_user(&ldt, (unsigned long *) data, ++ sizeof(ldt))){ ++ ret = -EIO; ++ break; ++ } ++ ++ /* This one is confusing, so just punt and return -EIO for ++ * now ++ */ ++ ret = -EIO; ++ break; ++ } ++#ifdef CONFIG_PROC_MM ++ case PTRACE_SWITCH_MM: { ++ struct mm_struct *old = child->mm; ++ struct mm_struct *new = proc_mm_get_mm(data); ++ ++ if(IS_ERR(new)){ ++ ret = PTR_ERR(new); ++ break; ++ } ++ ++ atomic_inc(&new->mm_users); ++ child->mm = new; ++ child->active_mm = new; ++ mmput(old); ++ ret = 0; ++ break; ++ } ++#endif ++ default: ++ ret = -EIO; ++ break; ++ } ++ out_tsk: ++ free_task_struct(child); ++ out: ++ unlock_kernel(); ++ return ret; ++} ++ ++void syscall_trace(void) ++{ ++ if ((current->ptrace & (PT_PTRACED|PT_TRACESYS)) ++ != (PT_PTRACED|PT_TRACESYS)) ++ return; ++ current->exit_code = SIGTRAP; ++ current->state = TASK_STOPPED; ++ notify_parent(current, SIGCHLD); ++ schedule(); ++ /* ++ * this isn't the same as continuing with a signal, but it will do ++ * for normal use. 
strace only continues with a signal if the ++ * stopping signal is not SIGTRAP. -brl ++ */ ++ if (current->exit_code) { ++ send_sig(current->exit_code, current, 1); ++ current->exit_code = 0; ++ } ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/reboot.c um/arch/um/kernel/reboot.c +--- orig/arch/um/kernel/reboot.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/reboot.c Mon Dec 30 20:57:42 2002 +@@ -0,0 +1,71 @@ ++/* ++ * Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/sched.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "kern.h" ++#include "os.h" ++#include "mode.h" ++#include "choose-mode.h" ++ ++#ifdef CONFIG_SMP ++static void kill_idlers(int me) ++{ ++ struct task_struct *p; ++ int i; ++ ++ for(i = 0; i < sizeof(init_tasks)/sizeof(init_tasks[0]); i++){ ++ p = init_tasks[i]; ++ if((p != NULL) && (p->thread.mode.tt.extern_pid != me) && ++ (p->thread.mode.tt.extern_pid != -1)) ++ os_kill_process(p->thread.mode.tt.extern_pid, 0); ++ } ++} ++#endif ++ ++static void kill_off_processes(void) ++{ ++ CHOOSE_MODE(kill_off_processes_tt(), kill_off_processes_skas()); ++#ifdef CONFIG_SMP ++ kill_idlers(os_getpid()); ++#endif ++} ++ ++void uml_cleanup(void) ++{ ++ kill_off_processes(); ++ do_uml_exitcalls(); ++} ++ ++void machine_restart(char * __unused) ++{ ++ do_uml_exitcalls(); ++ kill_off_processes(); ++ CHOOSE_MODE(reboot_tt(), reboot_skas()); ++} ++ ++void machine_power_off(void) ++{ ++ do_uml_exitcalls(); ++ kill_off_processes(); ++ CHOOSE_MODE(halt_tt(), halt_skas()); ++} ++ ++void machine_halt(void) 
++{ ++ machine_power_off(); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/resource.c um/arch/um/kernel/resource.c +--- orig/arch/um/kernel/resource.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/resource.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,23 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/pci.h" ++ ++unsigned long resource_fixup(struct pci_dev * dev, struct resource * res, ++ unsigned long start, unsigned long size) ++{ ++ return start; ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/sigio_kern.c um/arch/um/kernel/sigio_kern.c +--- orig/arch/um/kernel/sigio_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/sigio_kern.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,56 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/kernel.h" ++#include "linux/list.h" ++#include "linux/slab.h" ++#include "asm/irq.h" ++#include "init.h" ++#include "sigio.h" ++#include "irq_user.h" ++ ++/* Protected by sigio_lock() called from write_sigio_workaround */ ++static int sigio_irq_fd = -1; ++ ++void sigio_interrupt(int irq, void *data, struct pt_regs *unused) ++{ ++ read_sigio_fd(sigio_irq_fd); ++ reactivate_fd(sigio_irq_fd, SIGIO_WRITE_IRQ); ++} ++ ++int write_sigio_irq(int fd) ++{ ++ if(um_request_irq(SIGIO_WRITE_IRQ, fd, IRQ_READ, sigio_interrupt, ++ SA_INTERRUPT | SA_SAMPLE_RANDOM, "write sigio", ++ NULL)){ ++ printk("write_sigio_irq : um_request_irq failed\n"); ++ return(-1); ++ } ++ sigio_irq_fd = fd; ++ return(0); ++} ++ ++static spinlock_t sigio_spinlock = SPIN_LOCK_UNLOCKED; ++ ++void sigio_lock(void) ++{ ++ spin_lock(&sigio_spinlock); ++} ++ ++void sigio_unlock(void) ++{ ++ spin_unlock(&sigio_spinlock); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/sigio_user.c um/arch/um/kernel/sigio_user.c +--- orig/arch/um/kernel/sigio_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/sigio_user.c Sun Dec 29 23:36:35 2002 +@@ -0,0 +1,440 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <unistd.h> ++#include <stdlib.h> ++#include <termios.h> ++#include <pty.h> ++#include <fcntl.h> ++#include <signal.h> ++#include <errno.h> ++#include <string.h> ++#include <sched.h> ++#include <sys/socket.h> ++#include <sys/poll.h> ++#include "init.h" ++#include "user.h" ++#include "kern_util.h" ++#include "sigio.h" ++#include "helper.h" ++#include "os.h" ++ ++/* Changed during early boot */ ++int pty_output_sigio = 0; ++int pty_close_sigio = 0; ++ ++/* Used as a flag during SIGIO testing early in boot */ ++static int got_sigio = 0; ++ ++void __init handler(int sig) ++{ ++ got_sigio = 1; ++} ++ ++struct openpty_arg { ++ int master; ++ int slave; ++ int err; ++}; ++ ++static void openpty_cb(void *arg) ++{ ++ struct openpty_arg *info = arg; ++ ++ info->err = 0; ++ if(openpty(&info->master, &info->slave, NULL, NULL, NULL)) ++ info->err = errno; ++} ++ ++void __init check_one_sigio(void (*proc)(int, int)) ++{ ++ struct sigaction old, new; ++ struct termios tt; ++ struct openpty_arg pty = { .master = -1, .slave = -1 }; ++ int master, slave, flags; ++ ++ initial_thread_cb(openpty_cb, &pty); ++ if(pty.err){ ++ printk("openpty failed, errno = %d\n", pty.err); ++ return; ++ } ++ ++ master = pty.master; ++ slave = pty.slave; ++ ++ if((master == -1) || (slave == -1)){ ++ printk("openpty failed to allocate a pty\n"); ++ return; ++ } ++ ++ if(tcgetattr(master, &tt) < 0) ++ panic("check_sigio : tcgetattr failed, errno = %d\n", errno); ++ cfmakeraw(&tt); ++ if(tcsetattr(master, TCSADRAIN, &tt) < 0) 
++ panic("check_sigio : tcsetattr failed, errno = %d\n", errno); ++ ++ if((flags = fcntl(master, F_GETFL)) < 0) ++ panic("tty_fds : fcntl F_GETFL failed, errno = %d\n", errno); ++ ++ if((fcntl(master, F_SETFL, flags | O_NONBLOCK | O_ASYNC) < 0) || ++ (fcntl(master, F_SETOWN, os_getpid()) < 0)) ++ panic("check_sigio : fcntl F_SETFL or F_SETOWN failed, " ++ "errno = %d\n", errno); ++ ++ if((fcntl(slave, F_SETFL, flags | O_NONBLOCK) < 0)) ++ panic("check_sigio : fcntl F_SETFL failed, errno = %d\n", ++ errno); ++ ++ if(sigaction(SIGIO, NULL, &old) < 0) ++ panic("check_sigio : sigaction 1 failed, errno = %d\n", errno); ++ new = old; ++ new.sa_handler = handler; ++ if(sigaction(SIGIO, &new, NULL) < 0) ++ panic("check_sigio : sigaction 2 failed, errno = %d\n", errno); ++ ++ got_sigio = 0; ++ (*proc)(master, slave); ++ ++ close(master); ++ close(slave); ++ ++ if(sigaction(SIGIO, &old, NULL) < 0) ++ panic("check_sigio : sigaction 3 failed, errno = %d\n", errno); ++} ++ ++static void tty_output(int master, int slave) ++{ ++ int n; ++ char buf[512]; ++ ++ printk("Checking that host ptys support output SIGIO..."); ++ ++ memset(buf, 0, sizeof(buf)); ++ while(write(master, buf, sizeof(buf)) > 0) ; ++ if(errno != EAGAIN) ++ panic("check_sigio : write failed, errno = %d\n", errno); ++ ++ while(((n = read(slave, buf, sizeof(buf))) > 0) && !got_sigio) ; ++ ++ if(got_sigio){ ++ printk("Yes\n"); ++ pty_output_sigio = 1; ++ } ++ else if(errno == EAGAIN) printk("No, enabling workaround\n"); ++ else panic("check_sigio : read failed, errno = %d\n", errno); ++} ++ ++static void tty_close(int master, int slave) ++{ ++ printk("Checking that host ptys support SIGIO on close..."); ++ ++ close(slave); ++ if(got_sigio){ ++ printk("Yes\n"); ++ pty_close_sigio = 1; ++ } ++ else printk("No, enabling workaround\n"); ++} ++ ++void __init check_sigio(void) ++{ ++ if(access("/dev/ptmx", R_OK) && access("/dev/ptyp0", R_OK)){ ++ printk("No pseudo-terminals available - skipping pty SIGIO " ++ "check\n"); 
++ return; ++ } ++ check_one_sigio(tty_output); ++ check_one_sigio(tty_close); ++} ++ ++/* Protected by sigio_lock(), also used by sigio_cleanup, which is an ++ * exitcall. ++ */ ++static int write_sigio_pid = -1; ++ ++/* These arrays are initialized before the sigio thread is started, and ++ * the descriptors closed after it is killed. So, it can't see them change. ++ * On the UML side, they are changed under the sigio_lock. ++ */ ++static int write_sigio_fds[2] = { -1, -1 }; ++static int sigio_private[2] = { -1, -1 }; ++ ++struct pollfds { ++ struct pollfd *poll; ++ int size; ++ int used; ++}; ++ ++/* Protected by sigio_lock(). Used by the sigio thread, but the UML thread ++ * synchronizes with it. ++ */ ++struct pollfds current_poll = { ++ .poll = NULL, ++ .size = 0, ++ .used = 0 ++}; ++ ++struct pollfds next_poll = { ++ .poll = NULL, ++ .size = 0, ++ .used = 0 ++}; ++ ++static int write_sigio_thread(void *unused) ++{ ++ struct pollfds *fds, tmp; ++ struct pollfd *p; ++ int i, n, respond_fd; ++ char c; ++ ++ fds = &current_poll; ++ while(1){ ++ n = poll(fds->poll, fds->used, -1); ++ if(n < 0){ ++ if(errno == EINTR) continue; ++ printk("write_sigio_thread : poll returned %d, " ++ "errno = %d\n", n, errno); ++ } ++ for(i = 0; i < fds->used; i++){ ++ p = &fds->poll[i]; ++ if(p->revents == 0) continue; ++ if(p->fd == sigio_private[1]){ ++ n = read(sigio_private[1], &c, sizeof(c)); ++ if(n != sizeof(c)) ++ printk("write_sigio_thread : " ++ "read failed, errno = %d\n", ++ errno); ++ tmp = current_poll; ++ current_poll = next_poll; ++ next_poll = tmp; ++ respond_fd = sigio_private[1]; ++ } ++ else { ++ respond_fd = write_sigio_fds[1]; ++ fds->used--; ++ memmove(&fds->poll[i], &fds->poll[i + 1], ++ (fds->used - i) * sizeof(*fds->poll)); ++ } ++ ++ n = write(respond_fd, &c, sizeof(c)); ++ if(n != sizeof(c)) ++ printk("write_sigio_thread : write failed, " ++ "errno = %d\n", errno); ++ } ++ } ++} ++ ++static int need_poll(int n) ++{ ++ if(n <= next_poll.size){ ++ next_poll.used
= n; ++ return(0); ++ } ++ if(next_poll.poll != NULL) kfree(next_poll.poll); ++ next_poll.poll = um_kmalloc_atomic(n * sizeof(struct pollfd)); ++ if(next_poll.poll == NULL){ ++ printk("need_poll : failed to allocate new pollfds\n"); ++ next_poll.size = 0; ++ next_poll.used = 0; ++ return(-1); ++ } ++ next_poll.size = n; ++ next_poll.used = n; ++ return(0); ++} ++ ++static void update_thread(void) ++{ ++ unsigned long flags; ++ int n; ++ char c; ++ ++ flags = set_signals(0); ++ n = write(sigio_private[0], &c, sizeof(c)); ++ if(n != sizeof(c)){ ++ printk("update_thread : write failed, errno = %d\n", errno); ++ goto fail; ++ } ++ ++ n = read(sigio_private[0], &c, sizeof(c)); ++ if(n != sizeof(c)){ ++ printk("update_thread : read failed, errno = %d\n", errno); ++ goto fail; ++ } ++ ++ set_signals(flags); ++ return; ++ fail: ++ sigio_lock(); ++ if(write_sigio_pid != -1) ++ os_kill_process(write_sigio_pid, 1); ++ write_sigio_pid = -1; ++ close(sigio_private[0]); ++ close(sigio_private[1]); ++ close(write_sigio_fds[0]); ++ close(write_sigio_fds[1]); ++ sigio_unlock(); ++ set_signals(flags); ++} ++ ++int add_sigio_fd(int fd, int read) ++{ ++ int err = 0, i, n, events; ++ ++ sigio_lock(); ++ for(i = 0; i < current_poll.used; i++){ ++ if(current_poll.poll[i].fd == fd) ++ goto out; ++ } ++ ++ n = current_poll.used + 1; ++ err = need_poll(n); ++ if(err) ++ goto out; ++ ++ for(i = 0; i < current_poll.used; i++) ++ next_poll.poll[i] = current_poll.poll[i]; ++ ++ if(read) events = POLLIN; ++ else events = POLLOUT; ++ ++ next_poll.poll[n - 1] = ((struct pollfd) { .fd = fd, ++ .events = events, ++ .revents = 0 }); ++ update_thread(); ++ out: ++ sigio_unlock(); ++ return(err); ++} ++ ++int ignore_sigio_fd(int fd) ++{ ++ struct pollfd *p; ++ int err = 0, i, n = 0; ++ ++ sigio_lock(); ++ for(i = 0; i < current_poll.used; i++){ ++ if(current_poll.poll[i].fd == fd) break; ++ } ++ if(i == current_poll.used) ++ goto out; ++ ++ err = need_poll(current_poll.used - 1); ++ if(err) ++ goto 
out; ++ ++ for(i = 0; i < current_poll.used; i++){ ++ p = &current_poll.poll[i]; ++ if(p->fd != fd) next_poll.poll[n++] = current_poll.poll[i]; ++ } ++ if(n == i){ ++ printk("ignore_sigio_fd : fd %d not found\n", fd); ++ err = -1; ++ goto out; ++ } ++ ++ update_thread(); ++ out: ++ sigio_unlock(); ++ return(err); ++} ++ ++static int setup_initial_poll(int fd) ++{ ++ struct pollfd *p; ++ ++ p = um_kmalloc(sizeof(struct pollfd)); ++ if(p == NULL){ ++ printk("setup_initial_poll : failed to allocate poll\n"); ++ return(-1); ++ } ++ *p = ((struct pollfd) { .fd = fd, ++ .events = POLLIN, ++ .revents = 0 }); ++ current_poll = ((struct pollfds) { .poll = p, ++ .used = 1, ++ .size = 1 }); ++ return(0); ++} ++ ++void write_sigio_workaround(void) ++{ ++ unsigned long stack; ++ int err; ++ ++ sigio_lock(); ++ if(write_sigio_pid != -1) ++ goto out; ++ ++ err = os_pipe(write_sigio_fds, 1, 1); ++ if(err){ ++ printk("write_sigio_workaround - os_pipe 1 failed, " ++ "errno = %d\n", -err); ++ goto out; ++ } ++ err = os_pipe(sigio_private, 1, 1); ++ if(err){ ++ printk("write_sigio_workaround - os_pipe 2 failed, " ++ "errno = %d\n", -err); ++ goto out_close1; ++ } ++ if(setup_initial_poll(sigio_private[1])) ++ goto out_close2; ++ ++ write_sigio_pid = run_helper_thread(write_sigio_thread, NULL, ++ CLONE_FILES | CLONE_VM, &stack, 0); ++ ++ if(write_sigio_pid < 0) goto out_close2; ++ ++ if(write_sigio_irq(write_sigio_fds[0])) ++ goto out_kill; ++ ++ out: ++ sigio_unlock(); ++ return; ++ ++ out_kill: ++ os_kill_process(write_sigio_pid, 1); ++ write_sigio_pid = -1; ++ out_close2: ++ close(sigio_private[0]); ++ close(sigio_private[1]); ++ out_close1: ++ close(write_sigio_fds[0]); ++ close(write_sigio_fds[1]); ++ sigio_unlock(); ++} ++ ++int read_sigio_fd(int fd) ++{ ++ int n; ++ char c; ++ ++ n = read(fd, &c, sizeof(c)); ++ if(n != sizeof(c)){ ++ printk("read_sigio_fd - read failed, errno = %d\n", errno); ++ return(-errno); ++ } ++ return(n); ++} ++ ++static void sigio_cleanup(void) ++{ ++
if(write_sigio_pid != -1) ++ os_kill_process(write_sigio_pid, 1); ++} ++ ++__uml_exitcall(sigio_cleanup); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/signal_kern.c um/arch/um/kernel/signal_kern.c +--- orig/arch/um/kernel/signal_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/signal_kern.c Sun Dec 8 19:44:13 2002 +@@ -0,0 +1,367 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/config.h" ++#include "linux/stddef.h" ++#include "linux/sys.h" ++#include "linux/sched.h" ++#include "linux/wait.h" ++#include "linux/kernel.h" ++#include "linux/smp_lock.h" ++#include "linux/module.h" ++#include "linux/slab.h" ++#include "asm/signal.h" ++#include "asm/uaccess.h" ++#include "asm/ucontext.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "signal_kern.h" ++#include "signal_user.h" ++#include "kern.h" ++#include "frame_kern.h" ++#include "sigcontext.h" ++#include "mode.h" ++ ++EXPORT_SYMBOL(block_signals); ++EXPORT_SYMBOL(unblock_signals); ++ ++static void force_segv(int sig) ++{ ++ if(sig == SIGSEGV){ ++ struct k_sigaction *ka; ++ ++ ka = &current->sig->action[SIGSEGV - 1]; ++ ka->sa.sa_handler = SIG_DFL; ++ } ++ force_sig(SIGSEGV, current); ++} ++ ++#define _S(nr) (1<<((nr)-1)) ++ ++#define _BLOCKABLE (~(_S(SIGKILL) | _S(SIGSTOP))) ++ ++/* ++ * OK, we're invoking a handler ++ */ ++static int handle_signal(struct pt_regs *regs, unsigned long signr, ++ struct k_sigaction *ka, siginfo_t *info, ++ sigset_t *oldset, int error) ++{ ++ __sighandler_t handler; ++ void (*restorer)(void); ++ unsigned long
sp; ++ sigset_t save; ++ int err, ret; ++ ++ ret = 0; ++ switch(error){ ++ case -ERESTARTNOHAND: ++ ret = -EINTR; ++ break; ++ ++ case -ERESTARTSYS: ++ if (!(ka->sa.sa_flags & SA_RESTART)) { ++ ret = -EINTR; ++ break; ++ } ++ /* fallthrough */ ++ case -ERESTARTNOINTR: ++ PT_REGS_RESTART_SYSCALL(regs); ++ PT_REGS_ORIG_SYSCALL(regs) = PT_REGS_SYSCALL_NR(regs); ++ ++ /* This is because of the UM_SET_SYSCALL_RETURN and the fact ++ * that on i386 the system call number and return value are ++ * in the same register. When the system call restarts, %eax ++ * had better have the system call number in it. Since the ++ * return value doesn't matter (except that it shouldn't be ++ * -ERESTART*), we'll stick the system call number there. ++ */ ++ ret = PT_REGS_SYSCALL_NR(regs); ++ break; ++ } ++ ++ handler = ka->sa.sa_handler; ++ save = *oldset; ++ ++ if (ka->sa.sa_flags & SA_ONESHOT) ++ ka->sa.sa_handler = SIG_DFL; ++ ++ if (!(ka->sa.sa_flags & SA_NODEFER)) { ++ spin_lock_irq(&current->sigmask_lock); ++ sigorsets(&current->blocked, &current->blocked, ++ &ka->sa.sa_mask); ++ sigaddset(&current->blocked, signr); ++ recalc_sigpending(current); ++ spin_unlock_irq(&current->sigmask_lock); ++ } ++ ++ sp = PT_REGS_SP(regs); ++ ++ if((ka->sa.sa_flags & SA_ONSTACK) && (sas_ss_flags(sp) == 0)) ++ sp = current->sas_ss_sp + current->sas_ss_size; ++ ++ if(error != 0) PT_REGS_SET_SYSCALL_RETURN(regs, ret); ++ ++ if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; ++ else restorer = NULL; ++ ++ if(ka->sa.sa_flags & SA_SIGINFO) ++ err = setup_signal_stack_si(sp, signr, (unsigned long) handler, ++ restorer, regs, info, &save); ++ else ++ err = setup_signal_stack_sc(sp, signr, (unsigned long) handler, ++ restorer, regs, &save); ++ if(err) goto segv; ++ ++ return(0); ++ segv: ++ force_segv(signr); ++ return(1); ++} ++ ++/* ++ * Note that 'init' is a special process: it doesn't get signals it doesn't ++ * want to handle. Thus you cannot kill init even with a SIGKILL even by ++ * mistake.
++ */ ++ ++static int kern_do_signal(struct pt_regs *regs, sigset_t *oldset, int error) ++{ ++ siginfo_t info; ++ struct k_sigaction *ka; ++ int err; ++ ++ if (!oldset) ++ oldset = &current->blocked; ++ ++ for (;;) { ++ unsigned long signr; ++ ++ spin_lock_irq(&current->sigmask_lock); ++ signr = dequeue_signal(&current->blocked, &info); ++ spin_unlock_irq(&current->sigmask_lock); ++ ++ if (!signr) ++ break; ++ ++ if ((current->ptrace & PT_PTRACED) && signr != SIGKILL) { ++ /* Let the debugger run. */ ++ current->exit_code = signr; ++ current->state = TASK_STOPPED; ++ notify_parent(current, SIGCHLD); ++ schedule(); ++ ++ /* We're back. Did the debugger cancel the sig? */ ++ if (!(signr = current->exit_code)) ++ continue; ++ current->exit_code = 0; ++ ++ /* The debugger continued. Ignore SIGSTOP. */ ++ if (signr == SIGSTOP) ++ continue; ++ ++ /* Update the siginfo structure. Is this good? */ ++ if (signr != info.si_signo) { ++ info.si_signo = signr; ++ info.si_errno = 0; ++ info.si_code = SI_USER; ++ info.si_pid = current->p_pptr->pid; ++ info.si_uid = current->p_pptr->uid; ++ } ++ ++ /* If the (new) signal is now blocked, requeue it. */ ++ if (sigismember(&current->blocked, signr)) { ++ send_sig_info(signr, &info, current); ++ continue; ++ } ++ } ++ ++ ka = &current->sig->action[signr-1]; ++ if (ka->sa.sa_handler == SIG_IGN) { ++ if (signr != SIGCHLD) ++ continue; ++ /* Check for SIGCHLD: it's special. */ ++ while (sys_wait4(-1, NULL, WNOHANG, NULL) > 0) ++ /* nothing */; ++ continue; ++ } ++ ++ if (ka->sa.sa_handler == SIG_DFL) { ++ int exit_code = signr; ++ ++ /* Init gets no signals it doesn't want.
*/ ++ if (current->pid == 1) ++ continue; ++ ++ switch (signr) { ++ case SIGCONT: case SIGCHLD: case SIGWINCH: case SIGURG: ++ continue; ++ ++ case SIGTSTP: case SIGTTIN: case SIGTTOU: ++ if (is_orphaned_pgrp(current->pgrp)) ++ continue; ++ /* FALLTHRU */ ++ ++ case SIGSTOP: { ++ struct signal_struct *sig; ++ current->state = TASK_STOPPED; ++ current->exit_code = signr; ++ sig = current->p_pptr->sig; ++ if (sig && !(sig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP)) ++ notify_parent(current, SIGCHLD); ++ schedule(); ++ continue; ++ } ++ case SIGQUIT: case SIGILL: case SIGTRAP: ++ case SIGABRT: case SIGFPE: case SIGSEGV: ++ case SIGBUS: case SIGSYS: case SIGXCPU: case SIGXFSZ: ++ if (do_coredump(signr, &current->thread.regs)) ++ exit_code |= 0x80; ++ /* FALLTHRU */ ++ ++ default: ++ sig_exit(signr, exit_code, &info); ++ /* NOTREACHED */ ++ } ++ } ++ ++ /* Whee! Actually deliver the signal. */ ++ err = handle_signal(regs, signr, ka, &info, oldset, error); ++ if(!err) return(1); ++ } ++ ++ /* Did we come from a system call? */ ++ if(PT_REGS_SYSCALL_NR(regs) >= 0){ ++ /* Restart the system call - no handlers present */ ++ if(PT_REGS_SYSCALL_RET(regs) == -ERESTARTNOHAND || ++ PT_REGS_SYSCALL_RET(regs) == -ERESTARTSYS || ++ PT_REGS_SYSCALL_RET(regs) == -ERESTARTNOINTR){ ++ PT_REGS_ORIG_SYSCALL(regs) = PT_REGS_SYSCALL_NR(regs); ++ PT_REGS_RESTART_SYSCALL(regs); ++ } ++ } ++ ++ /* This closes a way to execute a system call on the host. If ++ * you set a breakpoint on a system call instruction and singlestep ++ * from it, the tracing thread used to PTRACE_SINGLESTEP the process ++ * rather than PTRACE_SYSCALL it, allowing the system call to execute ++ * on the host. The tracing thread will check this flag and ++ * PTRACE_SYSCALL if necessary.
++ */ ++ if((current->ptrace & PT_DTRACE) && ++ is_syscall(PT_REGS_IP(&current->thread.regs))) ++ (void) CHOOSE_MODE(current->thread.mode.tt.singlestep_syscall = 1, 0); ++ ++ return(0); ++} ++ ++int do_signal(int error) ++{ ++ return(kern_do_signal(&current->thread.regs, NULL, error)); ++} ++ ++/* ++ * Atomically swap in the new signal mask, and wait for a signal. ++ */ ++int sys_sigsuspend(int history0, int history1, old_sigset_t mask) ++{ ++ sigset_t saveset; ++ ++ mask &= _BLOCKABLE; ++ spin_lock_irq(&current->sigmask_lock); ++ saveset = current->blocked; ++ siginitset(&current->blocked, mask); ++ recalc_sigpending(current); ++ spin_unlock_irq(&current->sigmask_lock); ++ ++ while (1) { ++ current->state = TASK_INTERRUPTIBLE; ++ schedule(); ++ if(kern_do_signal(&current->thread.regs, &saveset, -EINTR)) ++ return(-EINTR); ++ } ++} ++ ++int sys_rt_sigsuspend(sigset_t *unewset, size_t sigsetsize) ++{ ++ sigset_t saveset, newset; ++ ++ /* XXX: Don't preclude handling different sized sigset_t's. */ ++ if (sigsetsize != sizeof(sigset_t)) ++ return -EINVAL; ++ ++ if (copy_from_user(&newset, unewset, sizeof(newset))) ++ return -EFAULT; ++ sigdelsetmask(&newset, ~_BLOCKABLE); ++ ++ spin_lock_irq(&current->sigmask_lock); ++ saveset = current->blocked; ++ current->blocked = newset; ++ recalc_sigpending(current); ++ spin_unlock_irq(&current->sigmask_lock); ++ ++ while (1) { ++ current->state = TASK_INTERRUPTIBLE; ++ schedule(); ++ if (kern_do_signal(&current->thread.regs, &saveset, -EINTR)) ++ return(-EINTR); ++ } ++} ++ ++static int copy_sc_from_user(struct pt_regs *to, void *from, ++ struct arch_frame_data *arch) ++{ ++ int ret; ++ ++ ret = CHOOSE_MODE(copy_sc_from_user_tt(UPT_SC(&to->regs), from, arch), ++ copy_sc_from_user_skas(&to->regs, from)); ++ return(ret); ++} ++ ++int sys_sigreturn(struct pt_regs regs) ++{ ++ void *sc = sp_to_sc(PT_REGS_SP(&current->thread.regs)); ++ void *mask = sp_to_mask(PT_REGS_SP(&current->thread.regs)); ++ int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); ++ ++ spin_lock_irq(&current->sigmask_lock); ++
copy_from_user(&current->blocked.sig[0], sc_sigmask(sc), ++ sizeof(current->blocked.sig[0])); ++ copy_from_user(&current->blocked.sig[1], mask, sig_size); ++ sigdelsetmask(&current->blocked, ~_BLOCKABLE); ++ recalc_sigpending(current); ++ spin_unlock_irq(&current->sigmask_lock); ++ copy_sc_from_user(&current->thread.regs, sc, ++ &signal_frame_sc.common.arch); ++ return(PT_REGS_SYSCALL_RET(&current->thread.regs)); ++} ++ ++int sys_rt_sigreturn(struct pt_regs regs) ++{ ++ struct ucontext *uc = sp_to_uc(PT_REGS_SP(&current->thread.regs)); ++ void *fp; ++ int sig_size = _NSIG_WORDS * sizeof(unsigned long); ++ ++ spin_lock_irq(&current->sigmask_lock); ++ copy_from_user(&current->blocked, &uc->uc_sigmask, sig_size); ++ sigdelsetmask(&current->blocked, ~_BLOCKABLE); ++ recalc_sigpending(current); ++ spin_unlock_irq(&current->sigmask_lock); ++ fp = (void *) (((unsigned long) uc) + sizeof(struct ucontext)); ++ copy_sc_from_user(&current->thread.regs, &uc->uc_mcontext, ++ &signal_frame_si.common.arch); ++ return(PT_REGS_SYSCALL_RET(&current->thread.regs)); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file.
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/signal_user.c um/arch/um/kernel/signal_user.c +--- orig/arch/um/kernel/signal_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/signal_user.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,142 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdio.h> ++#include <unistd.h> ++#include <stdlib.h> ++#include <signal.h> ++#include <errno.h> ++#include <stdarg.h> ++#include <string.h> ++#include <sys/mman.h> ++#include "user_util.h" ++#include "kern_util.h" ++#include "user.h" ++#include "signal_user.h" ++#include "signal_kern.h" ++#include "sysdep/sigcontext.h" ++#include "sigcontext.h" ++ ++void set_sigstack(void *sig_stack, int size) ++{ ++ stack_t stack = ((stack_t) { .ss_flags = 0, ++ .ss_sp = (__ptr_t) sig_stack, ++ .ss_size = size - sizeof(void *) }); ++ ++ if(sigaltstack(&stack, NULL) != 0) ++ panic("enabling signal stack failed, errno = %d\n", errno); ++} ++ ++void set_handler(int sig, void (*handler)(int), int flags, ...) ++{ ++ struct sigaction action; ++ va_list ap; ++ int mask; ++ ++ va_start(ap, flags); ++ action.sa_handler = handler; ++ sigemptyset(&action.sa_mask); ++ while((mask = va_arg(ap, int)) != -1){ ++ sigaddset(&action.sa_mask, mask); ++ } ++ action.sa_flags = flags; ++ action.sa_restorer = NULL; ++ if(sigaction(sig, &action, NULL) < 0) ++ panic("sigaction failed"); ++} ++ ++int change_sig(int signal, int on) ++{ ++ sigset_t sigset, old; ++ ++ sigemptyset(&sigset); ++ sigaddset(&sigset, signal); ++ sigprocmask(on ? 
SIG_UNBLOCK : SIG_BLOCK, &sigset, &old); ++ return(!sigismember(&old, signal)); ++} ++ ++static void change_signals(int type) ++{ ++ sigset_t mask; ++ ++ sigemptyset(&mask); ++ sigaddset(&mask, SIGVTALRM); ++ sigaddset(&mask, SIGALRM); ++ sigaddset(&mask, SIGIO); ++ sigaddset(&mask, SIGPROF); ++ if(sigprocmask(type, &mask, NULL) < 0) ++ panic("Failed to change signal mask - errno = %d", errno); ++} ++ ++void block_signals(void) ++{ ++ change_signals(SIG_BLOCK); ++} ++ ++void unblock_signals(void) ++{ ++ change_signals(SIG_UNBLOCK); ++} ++ ++#define SIGIO_BIT 0 ++#define SIGVTALRM_BIT 1 ++ ++static int enable_mask(sigset_t *mask) ++{ ++ int sigs; ++ ++ sigs = sigismember(mask, SIGIO) ? 0 : 1 << SIGIO_BIT; ++ sigs |= sigismember(mask, SIGVTALRM) ? 0 : 1 << SIGVTALRM_BIT; ++ sigs |= sigismember(mask, SIGALRM) ? 0 : 1 << SIGVTALRM_BIT; ++ return(sigs); ++} ++ ++int get_signals(void) ++{ ++ sigset_t mask; ++ ++ if(sigprocmask(SIG_SETMASK, NULL, &mask) < 0) ++ panic("Failed to get signal mask"); ++ return(enable_mask(&mask)); ++} ++ ++int set_signals(int enable) ++{ ++ sigset_t mask; ++ int ret; ++ ++ sigemptyset(&mask); ++ if(enable & (1 << SIGIO_BIT)) ++ sigaddset(&mask, SIGIO); ++ if(enable & (1 << SIGVTALRM_BIT)){ ++ sigaddset(&mask, SIGVTALRM); ++ sigaddset(&mask, SIGALRM); ++ } ++ if(sigprocmask(SIG_UNBLOCK, &mask, &mask) < 0) ++ panic("Failed to enable signals"); ++ ret = enable_mask(&mask); ++ sigemptyset(&mask); ++ if((enable & (1 << SIGIO_BIT)) == 0) ++ sigaddset(&mask, SIGIO); ++ if((enable & (1 << SIGVTALRM_BIT)) == 0){ ++ sigaddset(&mask, SIGVTALRM); ++ sigaddset(&mask, SIGALRM); ++ } ++ if(sigprocmask(SIG_BLOCK, &mask, NULL) < 0) ++ panic("Failed to block signals"); ++ ++ return(ret); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/Makefile um/arch/um/kernel/skas/Makefile +--- orig/arch/um/kernel/skas/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/Makefile Fri Nov 1 16:05:44 2002 +@@ -0,0 +1,30 @@ ++# ++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++O_TARGET = skas.o ++ ++obj-y = exec_kern.o exec_user.o mem.o mem_user.o mmu.o process.o \ ++ process_kern.o syscall_kern.o syscall_user.o time.o tlb.o trap_user.o ++ ++subdir-y = sys-$(SUBARCH) ++ ++obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) ++ ++USER_OBJS = $(filter %_user.o,$(obj-y)) process.o time.o ++ ++include $(TOPDIR)/Rules.make ++ ++include/skas_ptregs.h : util/mk_ptregs ++ util/mk_ptregs > $@ ++ ++util/mk_ptregs : ++ $(MAKE) -C util ++ ++$(USER_OBJS) : %.o: %.c ++ $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $< ++ ++clean : ++ $(MAKE) -C util clean ++ $(RM) -f include/skas_ptregs.h +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/exec_kern.c um/arch/um/kernel/skas/exec_kern.c +--- orig/arch/um/kernel/skas/exec_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/exec_kern.c Mon Nov 11 18:57:19 2002 +@@ -0,0 +1,41 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/kernel.h" ++#include "asm/current.h" ++#include "asm/page.h" ++#include "asm/signal.h" ++#include "asm/ptrace.h" ++#include "asm/uaccess.h" ++#include "asm/mmu_context.h" ++#include "tlb.h" ++#include "skas.h" ++#include "mmu.h" ++#include "os.h" ++ ++void flush_thread_skas(void) ++{ ++ force_flush_all(); ++ switch_mm_skas(current->mm->context.skas.mm_fd); ++} ++ ++void start_thread_skas(struct pt_regs *regs, unsigned long eip, ++ unsigned long esp) ++{ ++ set_fs(USER_DS); ++ PT_REGS_IP(regs) = eip; ++ PT_REGS_SP(regs) = esp; ++} ++ ++/* ++ * 
Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/exec_user.c um/arch/um/kernel/skas/exec_user.c +--- orig/arch/um/kernel/skas/exec_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/exec_user.c Sun Nov 3 19:23:01 2002 +@@ -0,0 +1,61 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdlib.h> ++#include <errno.h> ++#include <signal.h> ++#include <sched.h> ++#include <sys/wait.h> ++#include <sys/ptrace.h> ++#include "user.h" ++#include "kern_util.h" ++#include "os.h" ++#include "time_user.h" ++ ++static int user_thread_tramp(void *arg) ++{ ++ if(ptrace(PTRACE_TRACEME, 0, 0, 0) < 0) ++ panic("user_thread_tramp - PTRACE_TRACEME failed, " ++ "errno = %d\n", errno); ++ enable_timer(); ++ os_stop_process(os_getpid()); ++ return(0); ++} ++ ++int user_thread(unsigned long stack, int flags) ++{ ++ int pid, status; ++ ++ pid = clone(user_thread_tramp, (void *) stack_sp(stack), ++ flags | CLONE_FILES | SIGCHLD, NULL); ++ if(pid < 0){ ++ printk("user_thread - clone failed, errno = %d\n", errno); ++ return(pid); ++ } ++ ++ if(waitpid(pid, &status, WUNTRACED) < 0){ ++ printk("user_thread - waitpid failed, errno = %d\n", errno); ++ return(-errno); ++ } ++ ++ if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)){ ++ printk("user_thread - trampoline didn't stop, status = %d\n", ++ status); ++ return(-EINVAL); ++ } ++ ++ return(pid); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. 
This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/mmu.h um/arch/um/kernel/skas/include/mmu.h +--- orig/arch/um/kernel/skas/include/mmu.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/include/mmu.h Sun Nov 10 21:21:50 2002 +@@ -0,0 +1,27 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SKAS_MMU_H ++#define __SKAS_MMU_H ++ ++#include "linux/list.h" ++#include "linux/spinlock.h" ++ ++struct mmu_context_skas { ++ int mm_fd; ++}; ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/mode.h um/arch/um/kernel/skas/include/mode.h +--- orig/arch/um/kernel/skas/include/mode.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/include/mode.h Wed Mar 26 13:27:46 2003 +@@ -0,0 +1,36 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __MODE_SKAS_H__ ++#define __MODE_SKAS_H__ ++ ++extern unsigned long exec_regs[]; ++extern unsigned long exec_fp_regs[]; ++extern unsigned long exec_fpx_regs[]; ++extern int have_fpx_regs; ++ ++extern void user_time_init_skas(void); ++extern int copy_sc_from_user_skas(union uml_pt_regs *regs, void *from_ptr); ++extern int copy_sc_to_user_skas(void *to_ptr, void *fp, ++ union uml_pt_regs *regs, ++ unsigned long fault_addr, int fault_type); ++extern void sig_handler_common_skas(int sig, void *sc_ptr); ++extern void 
halt_skas(void); ++extern void reboot_skas(void); ++extern void kill_off_processes_skas(void); ++extern int is_skas_winch(int pid, int fd, void *data); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/mode_kern.h um/arch/um/kernel/skas/include/mode_kern.h +--- orig/arch/um/kernel/skas/include/mode_kern.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/include/mode_kern.h Mon Dec 16 21:49:11 2002 +@@ -0,0 +1,51 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SKAS_MODE_KERN_H__ ++#define __SKAS_MODE_KERN_H__ ++ ++#include "linux/sched.h" ++#include "asm/page.h" ++#include "asm/ptrace.h" ++ ++extern void flush_thread_skas(void); ++extern void *_switch_to_skas(void *prev, void *next); ++extern void start_thread_skas(struct pt_regs *regs, unsigned long eip, ++ unsigned long esp); ++extern int copy_thread_skas(int nr, unsigned long clone_flags, ++ unsigned long sp, unsigned long stack_top, ++ struct task_struct *p, struct pt_regs *regs); ++extern void release_thread_skas(struct task_struct *task); ++extern void exit_thread_skas(void); ++extern void initial_thread_cb_skas(void (*proc)(void *), void *arg); ++extern void init_idle_skas(void); ++extern void flush_tlb_kernel_vm_skas(void); ++extern void __flush_tlb_one_skas(unsigned long addr); ++extern void flush_tlb_range_skas(struct mm_struct *mm, unsigned long start, ++ unsigned long end); ++extern void flush_tlb_mm_skas(struct mm_struct *mm); ++extern void force_flush_all_skas(void); ++extern long execute_syscall_skas(void *r); ++extern void 
before_mem_skas(unsigned long unused); ++extern unsigned long set_task_sizes_skas(int arg, unsigned long *host_size_out, ++ unsigned long *task_size_out); ++extern int start_uml_skas(void); ++extern int external_pid_skas(struct task_struct *task); ++extern int thread_pid_skas(struct thread_struct *thread); ++ ++#define kmem_end_skas (host_task_size - 1024 * 1024) ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/proc_mm.h um/arch/um/kernel/skas/include/proc_mm.h +--- orig/arch/um/kernel/skas/include/proc_mm.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/include/proc_mm.h Wed Nov 13 11:57:23 2002 +@@ -0,0 +1,55 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SKAS_PROC_MM_H ++#define __SKAS_PROC_MM_H ++ ++#define MM_MMAP 54 ++#define MM_MUNMAP 55 ++#define MM_MPROTECT 56 ++#define MM_COPY_SEGMENTS 57 ++ ++struct mm_mmap { ++ unsigned long addr; ++ unsigned long len; ++ unsigned long prot; ++ unsigned long flags; ++ unsigned long fd; ++ unsigned long offset; ++}; ++ ++struct mm_munmap { ++ unsigned long addr; ++ unsigned long len; ++}; ++ ++struct mm_mprotect { ++ unsigned long addr; ++ unsigned long len; ++ unsigned int prot; ++}; ++ ++struct proc_mm_op { ++ int op; ++ union { ++ struct mm_mmap mmap; ++ struct mm_munmap munmap; ++ struct mm_mprotect mprotect; ++ int copy_segments; ++ } u; ++}; ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/ptrace-skas.h um/arch/um/kernel/skas/include/ptrace-skas.h +--- orig/arch/um/kernel/skas/include/ptrace-skas.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/include/ptrace-skas.h Fri Jan 17 13:22:09 2003 +@@ -0,0 +1,57 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __PTRACE_SKAS_H ++#define __PTRACE_SKAS_H ++ ++#include "uml-config.h" ++ ++#ifdef UML_CONFIG_MODE_SKAS ++ ++#include "skas_ptregs.h" ++ ++#define HOST_FRAME_SIZE 17 ++ ++#define REGS_IP(r) ((r)[HOST_IP]) ++#define REGS_SP(r) ((r)[HOST_SP]) ++#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS]) ++#define REGS_EAX(r) ((r)[HOST_EAX]) ++#define REGS_EBX(r) ((r)[HOST_EBX]) ++#define REGS_ECX(r) ((r)[HOST_ECX]) ++#define REGS_EDX(r) ((r)[HOST_EDX]) ++#define REGS_ESI(r) ((r)[HOST_ESI]) ++#define REGS_EDI(r) ((r)[HOST_EDI]) ++#define REGS_EBP(r) ((r)[HOST_EBP]) ++#define REGS_CS(r) ((r)[HOST_CS]) ++#define REGS_SS(r) ((r)[HOST_SS]) ++#define REGS_DS(r) ((r)[HOST_DS]) ++#define REGS_ES(r) ((r)[HOST_ES]) ++#define REGS_FS(r) ((r)[HOST_FS]) ++#define REGS_GS(r) ((r)[HOST_GS]) ++ ++#define REGS_SET_SYSCALL_RETURN(r, res) REGS_EAX(r) = (res) ++ ++#define REGS_RESTART_SYSCALL(r) IP_RESTART_SYSCALL(REGS_IP(r)) ++ ++#define REGS_SEGV_IS_FIXABLE(r) SEGV_IS_FIXABLE((r)->trap_type) ++ ++#define REGS_FAULT_ADDR(r) ((r)->fault_addr) ++ ++#define REGS_FAULT_WRITE(r) FAULT_WRITE((r)->fault_type) ++ ++#endif ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/skas.h um/arch/um/kernel/skas/include/skas.h +--- orig/arch/um/kernel/skas/include/skas.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/include/skas.h Sun Dec 8 21:00:12 2002 +@@ -0,0 +1,49 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SKAS_H ++#define __SKAS_H ++ ++#include "sysdep/ptrace.h" ++ ++extern int userspace_pid; ++ ++extern void switch_threads(void *me, void *next); ++extern void thread_wait(void *sw, void *fb); ++extern void new_thread(void *stack, void **switch_buf_ptr, void **fork_buf_ptr, ++ void (*handler)(int)); ++extern int start_idle_thread(void *stack, void *switch_buf_ptr, ++ void **fork_buf_ptr); ++extern int user_thread(unsigned long stack, int flags); ++extern void userspace(union uml_pt_regs *regs); ++extern void new_thread_proc(void *stack, void (*handler)(int sig)); ++extern void remove_sigstack(void); ++extern void new_thread_handler(int sig); ++extern void handle_syscall(union uml_pt_regs *regs); ++extern void map(int fd, unsigned long virt, unsigned long phys, ++ unsigned long len, int r, int w, int x); ++extern int unmap(int fd, void *addr, int len); ++extern int protect(int fd, unsigned long addr, unsigned long len, ++ int r, int w, int x, int must_succeed); ++extern void user_signal(int sig, union uml_pt_regs *regs); ++extern int singlestepping_skas(void); ++extern int new_mm(int from); ++extern void save_registers(union uml_pt_regs *regs); ++extern void restore_registers(union uml_pt_regs *regs); ++extern void start_userspace(void); ++extern void init_registers(int pid); ++ ++#endif ++ ++/* 
++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/include/uaccess.h um/arch/um/kernel/skas/include/uaccess.h +--- orig/arch/um/kernel/skas/include/uaccess.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/include/uaccess.h Fri Jan 31 23:05:56 2003 +@@ -0,0 +1,232 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __SKAS_UACCESS_H ++#define __SKAS_UACCESS_H ++ ++#include "linux/string.h" ++#include "linux/sched.h" ++#include "asm/processor.h" ++#include "asm/pgtable.h" ++#include "asm/errno.h" ++#include "asm/current.h" ++#include "asm/a.out.h" ++#include "kern_util.h" ++ ++#define access_ok_skas(type, addr, size) \ ++ ((segment_eq(get_fs(), KERNEL_DS)) || \ ++ (((unsigned long) (addr) < TASK_SIZE) && \ ++ ((unsigned long) (addr) + (size) <= TASK_SIZE))) ++ ++static inline int verify_area_skas(int type, const void * addr, ++ unsigned long size) ++{ ++ return(access_ok_skas(type, addr, size) ? 
0 : -EFAULT); ++} ++ ++static inline unsigned long maybe_map(unsigned long virt, int is_write) ++{ ++ pte_t pte; ++ ++ void *phys = um_virt_to_phys(current, virt, &pte); ++ int dummy_code; ++ ++ if(IS_ERR(phys) || (is_write && !pte_write(pte))){ ++ if(!handle_page_fault(virt, 0, is_write, 0, &dummy_code)) ++ return(0); ++ phys = um_virt_to_phys(current, virt, NULL); ++ } ++ return((unsigned long) __va((unsigned long) phys)); ++} ++ ++static inline int buffer_op(unsigned long addr, int len, ++ int (*op)(unsigned long addr, int len, void *arg), ++ void *arg) ++{ ++ int size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len); ++ int remain = len, n; ++ ++ n = (*op)(addr, size, arg); ++ if(n != 0) ++ return(n < 0 ? remain : 0); ++ ++ addr += size; ++ remain -= size; ++ if(remain == 0) ++ return(0); ++ ++ while(addr < ((addr + remain) & PAGE_MASK)){ ++ n = (*op)(addr, PAGE_SIZE, arg); ++ if(n != 0) ++ return(n < 0 ? remain : 0); ++ ++ addr += PAGE_SIZE; ++ remain -= PAGE_SIZE; ++ } ++ if(remain == 0) ++ return(0); ++ ++ n = (*op)(addr, remain, arg); ++ if(n != 0) ++ return(n < 0 ? remain : 0); ++ return(0); ++} ++ ++static inline int copy_chunk_from_user(unsigned long from, int len, void *arg) ++{ ++ unsigned long *to_ptr = arg, to = *to_ptr; ++ ++ from = maybe_map(from, 0); ++ if(from == 0) ++ return(-1); ++ ++ memcpy((void *) to, (void *) from, len); ++ *to_ptr += len; ++ return(0); ++} ++ ++static inline int copy_from_user_skas(void *to, const void *from, int n) ++{ ++ if(segment_eq(get_fs(), KERNEL_DS)){ ++ memcpy(to, from, n); ++ return(0); ++ } ++ ++ return(access_ok_skas(VERIFY_READ, from, n) ? 
++ buffer_op((unsigned long) from, n, copy_chunk_from_user, &to) : ++ n); ++} ++ ++static inline int copy_chunk_to_user(unsigned long to, int len, void *arg) ++{ ++ unsigned long *from_ptr = arg, from = *from_ptr; ++ ++ to = maybe_map(to, 1); ++ if(to == 0) ++ return(-1); ++ ++ memcpy((void *) to, (void *) from, len); ++ *from_ptr += len; ++ return(0); ++} ++ ++static inline int copy_to_user_skas(void *to, const void *from, int n) ++{ ++ if(segment_eq(get_fs(), KERNEL_DS)){ ++ memcpy(to, from, n); ++ return(0); ++ } ++ ++ return(access_ok_skas(VERIFY_WRITE, to, n) ? ++ buffer_op((unsigned long) to, n, copy_chunk_to_user, &from) : ++ n); ++} ++ ++static inline int strncpy_chunk_from_user(unsigned long from, int len, ++ void *arg) ++{ ++ char **to_ptr = arg, *to = *to_ptr; ++ int n; ++ ++ from = maybe_map(from, 0); ++ if(from == 0) ++ return(-1); ++ ++ strncpy(to, (void *) from, len); ++ n = strnlen(to, len); ++ *to_ptr += n; ++ ++ if(n < len) ++ return(1); ++ return(0); ++} ++ ++static inline int strncpy_from_user_skas(char *dst, const char *src, int count) ++{ ++ int n; ++ char *ptr = dst; ++ ++ if(segment_eq(get_fs(), KERNEL_DS)){ ++ strncpy(dst, src, count); ++ return(strnlen(dst, count)); ++ } ++ ++ if(!access_ok_skas(VERIFY_READ, src, 1)) ++ return(-EFAULT); ++ ++ n = buffer_op((unsigned long) src, count, strncpy_chunk_from_user, ++ &ptr); ++ if(n != 0) ++ return(-EFAULT); ++ return(strnlen(dst, count)); ++} ++ ++static inline int clear_chunk(unsigned long addr, int len, void *unused) ++{ ++ addr = maybe_map(addr, 1); ++ if(addr == 0) ++ return(-1); ++ ++ memset((void *) addr, 0, len); ++ return(0); ++} ++ ++static inline int __clear_user_skas(void *mem, int len) ++{ ++ return(buffer_op((unsigned long) mem, len, clear_chunk, NULL)); ++} ++ ++static inline int clear_user_skas(void *mem, int len) ++{ ++ if(segment_eq(get_fs(), KERNEL_DS)){ ++ memset(mem, 0, len); ++ return(0); ++ } ++ ++ return(access_ok_skas(VERIFY_WRITE, mem, len) ? 
++ buffer_op((unsigned long) mem, len, clear_chunk, NULL) : len); ++} ++ ++static inline int strnlen_chunk(unsigned long str, int len, void *arg) ++{ ++ int *len_ptr = arg, n; ++ ++ str = maybe_map(str, 0); ++ if(str == 0) ++ return(-1); ++ ++ n = strnlen((void *) str, len); ++ *len_ptr += n; ++ ++ if(n < len) ++ return(1); ++ return(0); ++} ++ ++static inline int strnlen_user_skas(const void *str, int len) ++{ ++ int count = 0, n; ++ ++ if(segment_eq(get_fs(), KERNEL_DS)) ++ return(strnlen(str, len) + 1); ++ ++ n = buffer_op((unsigned long) str, len, strnlen_chunk, &count); ++ if(n == 0) ++ return(count + 1); ++ return(-EFAULT); ++} ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/mem.c um/arch/um/kernel/skas/mem.c +--- orig/arch/um/kernel/skas/mem.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/mem.c Mon Dec 16 21:49:39 2002 +@@ -0,0 +1,30 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/config.h" ++#include "linux/mm.h" ++#include "mem_user.h" ++ ++unsigned long set_task_sizes_skas(int arg, unsigned long *host_size_out, ++ unsigned long *task_size_out) ++{ ++ /* Round up to the nearest 4M */ ++ unsigned long top = ROUND_4M((unsigned long) &arg); ++ ++ *host_size_out = top; ++ *task_size_out = top; ++ return(((unsigned long) set_task_sizes_skas) & ~0xffffff); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. 
This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/mem_user.c um/arch/um/kernel/skas/mem_user.c +--- orig/arch/um/kernel/skas/mem_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/mem_user.c Tue Dec 31 00:13:18 2002 +@@ -0,0 +1,95 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <errno.h> ++#include <sys/mman.h> ++#include <sys/ptrace.h> ++#include "mem_user.h" ++#include "user.h" ++#include "os.h" ++#include "proc_mm.h" ++ ++void map(int fd, unsigned long virt, unsigned long phys, unsigned long len, ++ int r, int w, int x) ++{ ++ struct proc_mm_op map; ++ struct mem_region *region; ++ int prot, n; ++ ++ prot = (r ? PROT_READ : 0) | (w ? PROT_WRITE : 0) | ++ (x ? PROT_EXEC : 0); ++ region = phys_region(phys); ++ ++ map = ((struct proc_mm_op) { .op = MM_MMAP, ++ .u = ++ { .mmap = ++ { .addr = virt, ++ .len = len, ++ .prot = prot, ++ .flags = MAP_SHARED | ++ MAP_FIXED, ++ .fd = region->fd, ++ .offset = phys_offset(phys) ++ } } } ); ++ n = os_write_file(fd, &map, sizeof(map)); ++ if(n != sizeof(map)) ++ printk("map : /proc/mm map failed, errno = %d\n", errno); ++} ++ ++int unmap(int fd, void *addr, int len) ++{ ++ struct proc_mm_op unmap; ++ int n; ++ ++ unmap = ((struct proc_mm_op) { .op = MM_MUNMAP, ++ .u = ++ { .munmap = ++ { .addr = (unsigned long) addr, ++ .len = len } } } ); ++ n = os_write_file(fd, &unmap, sizeof(unmap)); ++ if((n != 0) && (n != sizeof(unmap))) ++ return(-errno); ++ return(0); ++} ++ ++int protect(int fd, unsigned long addr, unsigned long len, int r, int w, ++ int x, int must_succeed) ++{ ++ struct proc_mm_op protect; ++ int prot, n; ++ ++ prot = (r ? PROT_READ : 0) | (w ? PROT_WRITE : 0) | ++ (x ? 
PROT_EXEC : 0); ++ ++ protect = ((struct proc_mm_op) { .op = MM_MPROTECT, ++ .u = ++ { .mprotect = ++ { .addr = (unsigned long) addr, ++ .len = len, ++ .prot = prot } } } ); ++ ++ n = os_write_file(fd, &protect, sizeof(protect)); ++ if((n != 0) && (n != sizeof(protect))){ ++ if(must_succeed) ++ panic("protect failed, errno = %d", errno); ++ return(-errno); ++ } ++ return(0); ++} ++ ++void before_mem_skas(unsigned long unused) ++{ ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/mmu.c um/arch/um/kernel/skas/mmu.c +--- orig/arch/um/kernel/skas/mmu.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/mmu.c Wed Nov 13 13:09:57 2002 +@@ -0,0 +1,44 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/list.h" ++#include "linux/spinlock.h" ++#include "linux/slab.h" ++#include "asm/segment.h" ++#include "asm/mmu.h" ++#include "os.h" ++#include "skas.h" ++ ++int init_new_context_skas(struct task_struct *task, struct mm_struct *mm) ++{ ++ int from; ++ ++ if((current->mm != NULL) && (current->mm != &init_mm)) ++ from = current->mm->context.skas.mm_fd; ++ else from = -1; ++ ++ mm->context.skas.mm_fd = new_mm(from); ++ if(mm->context.skas.mm_fd < 0) ++ panic("init_new_context_skas - new_mm failed, errno = %d\n", ++ mm->context.skas.mm_fd); ++ ++ return(0); ++} ++ ++void destroy_context_skas(struct mm_struct *mm) ++{ ++ os_close_file(mm->context.skas.mm_fd); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/process.c um/arch/um/kernel/skas/process.c +--- orig/arch/um/kernel/skas/process.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/process.c Wed Mar 26 14:43:19 2003 +@@ -0,0 +1,407 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdlib.h> ++#include <unistd.h> ++#include <errno.h> ++#include <signal.h> ++#include <setjmp.h> ++#include <sched.h> ++#include <sys/wait.h> ++#include <sys/ptrace.h> ++#include <sys/mman.h> ++#include <sys/user.h> ++#include <asm/unistd.h> ++#include "user.h" ++#include "ptrace_user.h" ++#include "time_user.h" ++#include "sysdep/ptrace.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "skas.h" ++#include "sysdep/sigcontext.h" ++#include "os.h" ++#include "proc_mm.h" ++#include "skas_ptrace.h" ++#include "chan_user.h" ++ ++int is_skas_winch(int pid, int fd, void *data) ++{ ++ if(pid != getpid()) ++ return(0); ++ ++ register_winch_irq(-1, fd, -1, data); ++ return(1); ++} ++ ++unsigned long exec_regs[FRAME_SIZE]; ++unsigned long exec_fp_regs[HOST_FP_SIZE]; ++unsigned long exec_fpx_regs[HOST_XFP_SIZE]; ++int have_fpx_regs = 1; ++ ++static void handle_segv(int pid) ++{ ++ struct ptrace_faultinfo fault; ++ int err; ++ ++ err = ptrace(PTRACE_FAULTINFO, pid, 0, &fault); ++ if(err) ++ panic("handle_segv - PTRACE_FAULTINFO failed, errno = %d\n", ++ errno); ++ ++ segv(fault.addr, 0, FAULT_WRITE(fault.is_write), 1, NULL); ++} ++ ++static void handle_trap(int pid, union uml_pt_regs *regs) ++{ ++ int err, syscall_nr, status; ++ ++ syscall_nr = PT_SYSCALL_NR(regs->skas.regs); ++ if(syscall_nr < 1){ ++ 
relay_signal(SIGTRAP, regs); ++ return; ++ } ++ UPT_SYSCALL_NR(regs) = syscall_nr; ++ ++ err = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, __NR_getpid); ++ if(err < 0) ++ panic("handle_trap - nullifying syscall failed errno = %d\n", ++ errno); ++ ++ err = ptrace(PTRACE_SYSCALL, pid, 0, 0); ++ if(err < 0) ++ panic("handle_trap - continuing to end of syscall failed, " ++ "errno = %d\n", errno); ++ ++ err = waitpid(pid, &status, WUNTRACED); ++ if((err < 0) || !WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP)) ++ panic("handle_trap - failed to wait at end of syscall, " ++ "errno = %d, status = %d\n", errno, status); ++ ++ handle_syscall(regs); ++} ++ ++static int userspace_tramp(void *arg) ++{ ++ init_new_thread_signals(0); ++ enable_timer(); ++ ptrace(PTRACE_TRACEME, 0, 0, 0); ++ os_stop_process(os_getpid()); ++ return(0); ++} ++ ++int userspace_pid; ++ ++void start_userspace(void) ++{ ++ void *stack; ++ unsigned long sp; ++ int pid, status, n; ++ ++ stack = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE | PROT_EXEC, ++ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); ++ if(stack == MAP_FAILED) ++ panic("start_userspace : mmap failed, errno = %d", errno); ++ sp = (unsigned long) stack + PAGE_SIZE - sizeof(void *); ++ ++ pid = clone(userspace_tramp, (void *) sp, ++ CLONE_FILES | CLONE_VM | SIGCHLD, NULL); ++ if(pid < 0) ++ panic("start_userspace : clone failed, errno = %d", errno); ++ ++ do { ++ n = waitpid(pid, &status, WUNTRACED); ++ if(n < 0) ++ panic("start_userspace : wait failed, errno = %d", ++ errno); ++ } while(WIFSTOPPED(status) && (WSTOPSIG(status) == SIGVTALRM)); ++ ++ if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) ++ panic("start_userspace : expected SIGSTOP, got status = %d", ++ status); ++ ++ if(munmap(stack, PAGE_SIZE) < 0) ++ panic("start_userspace : munmap failed, errno = %d\n", errno); ++ ++ userspace_pid = pid; ++} ++ ++void userspace(union uml_pt_regs *regs) ++{ ++ int err, status, op; ++ ++ restore_registers(regs); ++ ++ err = 
ptrace(PTRACE_SYSCALL, userspace_pid, 0, 0); ++ if(err) ++ panic("userspace - PTRACE_SYSCALL failed, errno = %d\n", ++ errno); ++ while(1){ ++ err = waitpid(userspace_pid, &status, WUNTRACED); ++ if(err < 0) ++ panic("userspace - waitpid failed, errno = %d\n", ++ errno); ++ ++ regs->skas.is_user = 1; ++ save_registers(regs); ++ ++ if(WIFSTOPPED(status)){ ++ switch(WSTOPSIG(status)){ ++ case SIGSEGV: ++ handle_segv(userspace_pid); ++ break; ++ case SIGTRAP: ++ handle_trap(userspace_pid, regs); ++ break; ++ case SIGIO: ++ case SIGVTALRM: ++ case SIGILL: ++ case SIGBUS: ++ case SIGFPE: ++ case SIGWINCH: ++ user_signal(WSTOPSIG(status), regs); ++ break; ++ default: ++ printk("userspace - child stopped with signal " ++ "%d\n", WSTOPSIG(status)); ++ } ++ interrupt_end(); ++ } ++ ++ restore_registers(regs); ++ ++ op = singlestepping_skas() ? PTRACE_SINGLESTEP : ++ PTRACE_SYSCALL; ++ err = ptrace(op, userspace_pid, 0, 0); ++ if(err) ++ panic("userspace - PTRACE_SYSCALL failed, " ++ "errno = %d\n", errno); ++ } ++} ++ ++void new_thread(void *stack, void **switch_buf_ptr, void **fork_buf_ptr, ++ void (*handler)(int)) ++{ ++ jmp_buf switch_buf, fork_buf; ++ ++ *switch_buf_ptr = &switch_buf; ++ *fork_buf_ptr = &fork_buf; ++ ++ if(setjmp(fork_buf) == 0) ++ new_thread_proc(stack, handler); ++ ++ remove_sigstack(); ++} ++ ++void thread_wait(void *sw, void *fb) ++{ ++ jmp_buf buf, **switch_buf = sw, *fork_buf; ++ ++ *switch_buf = &buf; ++ fork_buf = fb; ++ if(setjmp(buf) == 0) ++ longjmp(*fork_buf, 1); ++} ++ ++static int move_registers(int int_op, int fp_op, union uml_pt_regs *regs, ++ unsigned long *fp_regs) ++{ ++ if(ptrace(int_op, userspace_pid, 0, regs->skas.regs) < 0) ++ return(-errno); ++ if(ptrace(fp_op, userspace_pid, 0, fp_regs) < 0) ++ return(-errno); ++ return(0); ++} ++ ++void save_registers(union uml_pt_regs *regs) ++{ ++ unsigned long *fp_regs; ++ int err, fp_op; ++ ++ if(have_fpx_regs){ ++ fp_op = PTRACE_GETFPXREGS; ++ fp_regs = regs->skas.xfp; ++ } ++ else { ++ 
fp_op = PTRACE_GETFPREGS; ++ fp_regs = regs->skas.fp; ++ } ++ ++ err = move_registers(PTRACE_GETREGS, fp_op, regs, fp_regs); ++ if(err) ++ panic("save_registers - saving registers failed, errno = %d\n", ++ err); ++} ++ ++void restore_registers(union uml_pt_regs *regs) ++{ ++ unsigned long *fp_regs; ++ int err, fp_op; ++ ++ if(have_fpx_regs){ ++ fp_op = PTRACE_SETFPXREGS; ++ fp_regs = regs->skas.xfp; ++ } ++ else { ++ fp_op = PTRACE_SETFPREGS; ++ fp_regs = regs->skas.fp; ++ } ++ ++ err = move_registers(PTRACE_SETREGS, fp_op, regs, fp_regs); ++ if(err) ++ panic("restore_registers - saving registers failed, " ++ "errno = %d\n", err); ++} ++ ++void switch_threads(void *me, void *next) ++{ ++ jmp_buf my_buf, **me_ptr = me, *next_buf = next; ++ ++ *me_ptr = &my_buf; ++ if(setjmp(my_buf) == 0) ++ longjmp(*next_buf, 1); ++} ++ ++static jmp_buf initial_jmpbuf; ++ ++/* XXX Make these percpu */ ++static void (*cb_proc)(void *arg); ++static void *cb_arg; ++static jmp_buf *cb_back; ++ ++int start_idle_thread(void *stack, void *switch_buf_ptr, void **fork_buf_ptr) ++{ ++ jmp_buf **switch_buf = switch_buf_ptr; ++ int n; ++ ++ *fork_buf_ptr = &initial_jmpbuf; ++ n = setjmp(initial_jmpbuf); ++ if(n == 0) ++ new_thread_proc((void *) stack, new_thread_handler); ++ else if(n == 1) ++ remove_sigstack(); ++ else if(n == 2){ ++ (*cb_proc)(cb_arg); ++ longjmp(*cb_back, 1); ++ } ++ else if(n == 3){ ++ kmalloc_ok = 0; ++ return(0); ++ } ++ else if(n == 4){ ++ kmalloc_ok = 0; ++ return(1); ++ } ++ longjmp(**switch_buf, 1); ++} ++ ++void remove_sigstack(void) ++{ ++ stack_t stack = ((stack_t) { .ss_flags = SS_DISABLE, ++ .ss_sp = NULL, ++ .ss_size = 0 }); ++ ++ if(sigaltstack(&stack, NULL) != 0) ++ panic("disabling signal stack failed, errno = %d\n", errno); ++} ++ ++void initial_thread_cb_skas(void (*proc)(void *), void *arg) ++{ ++ jmp_buf here; ++ ++ cb_proc = proc; ++ cb_arg = arg; ++ cb_back = &here; ++ ++ block_signals(); ++ if(setjmp(here) == 0) ++ longjmp(initial_jmpbuf, 2); ++ 
unblock_signals(); ++ ++ cb_proc = NULL; ++ cb_arg = NULL; ++ cb_back = NULL; ++} ++ ++void halt_skas(void) ++{ ++ block_signals(); ++ longjmp(initial_jmpbuf, 3); ++} ++ ++void reboot_skas(void) ++{ ++ block_signals(); ++ longjmp(initial_jmpbuf, 4); ++} ++ ++int new_mm(int from) ++{ ++ struct proc_mm_op copy; ++ int n, fd = os_open_file("/proc/mm", of_write(OPENFLAGS()), 0); ++ ++ if(fd < 0) ++ return(-errno); ++ ++ if(from != -1){ ++ copy = ((struct proc_mm_op) { .op = MM_COPY_SEGMENTS, ++ .u = ++ { .copy_segments = from } } ); ++ n = os_write_file(fd, &copy, sizeof(copy)); ++ if(n != sizeof(copy)) ++ printk("new_mm : /proc/mm copy_segments failed, " ++ "errno = %d\n", errno); ++ } ++ return(fd); ++} ++ ++void switch_mm_skas(int mm_fd) ++{ ++ int err; ++ ++ err = ptrace(PTRACE_SWITCH_MM, userspace_pid, 0, mm_fd); ++ if(err) ++ panic("switch_mm_skas - PTRACE_SWITCH_MM failed, errno = %d\n", ++ errno); ++} ++ ++void kill_off_processes_skas(void) ++{ ++ os_kill_process(userspace_pid, 1); ++} ++ ++void init_registers(int pid) ++{ ++ int err; ++ ++ if(ptrace(PTRACE_GETREGS, pid, 0, exec_regs) < 0) ++ panic("check_ptrace : PTRACE_GETREGS failed, errno = %d", ++ errno); ++ ++ err = ptrace(PTRACE_GETFPXREGS, pid, 0, exec_fpx_regs); ++ if(!err) ++ return; ++ ++ have_fpx_regs = 0; ++ if(errno != EIO) ++ panic("check_ptrace : PTRACE_GETFPXREGS failed, errno = %d", ++ errno); ++ ++ err = ptrace(PTRACE_GETFPREGS, pid, 0, exec_fp_regs); ++ if(err) ++ panic("check_ptrace : PTRACE_GETFPREGS failed, errno = %d", ++ errno); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/process_kern.c um/arch/um/kernel/skas/process_kern.c +--- orig/arch/um/kernel/skas/process_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/process_kern.c Sun Dec 22 20:37:39 2002 +@@ -0,0 +1,191 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/sched.h" ++#include "linux/slab.h" ++#include "kern_util.h" ++#include "time_user.h" ++#include "signal_user.h" ++#include "skas.h" ++#include "os.h" ++#include "user_util.h" ++#include "tlb.h" ++#include "frame.h" ++#include "kern.h" ++#include "mode.h" ++ ++int singlestepping_skas(void) ++{ ++ int ret = current->ptrace & PT_DTRACE; ++ ++ current->ptrace &= ~PT_DTRACE; ++ return(ret); ++} ++ ++void *_switch_to_skas(void *prev, void *next) ++{ ++ struct task_struct *from, *to; ++ ++ from = prev; ++ to = next; ++ ++ /* XXX need to check runqueues[cpu].idle */ ++ if(current->pid == 0) ++ switch_timers(0); ++ ++ to->thread.prev_sched = from; ++ set_current(to); ++ ++ switch_threads(&from->thread.mode.skas.switch_buf, ++ to->thread.mode.skas.switch_buf); ++ ++ if(current->pid == 0) ++ switch_timers(1); ++ ++ return(current->thread.prev_sched); ++} ++ ++extern void schedule_tail(struct task_struct *prev); ++ ++void new_thread_handler(int sig) ++{ ++ int (*fn)(void *), n; ++ void *arg; ++ ++ fn = current->thread.request.u.thread.proc; ++ arg = current->thread.request.u.thread.arg; ++ change_sig(SIGUSR1, 1); ++ thread_wait(&current->thread.mode.skas.switch_buf, ++ current->thread.mode.skas.fork_buf); ++ ++ if(current->thread.prev_sched != NULL) ++ schedule_tail(current->thread.prev_sched); ++ current->thread.prev_sched = NULL; ++ ++ n = run_kernel_thread(fn, arg, &current->thread.exec_buf); ++ if(n == 1) ++ userspace(&current->thread.regs.regs); ++ else do_exit(0); ++} ++ 
++void new_thread_proc(void *stack, void (*handler)(int sig)) ++{ ++ init_new_thread_stack(stack, handler); ++ os_usr1_process(os_getpid()); ++} ++ ++void release_thread_skas(struct task_struct *task) ++{ ++} ++ ++void exit_thread_skas(void) ++{ ++} ++ ++void fork_handler(int sig) ++{ ++ change_sig(SIGUSR1, 1); ++ thread_wait(&current->thread.mode.skas.switch_buf, ++ current->thread.mode.skas.fork_buf); ++ ++ force_flush_all(); ++ if(current->thread.prev_sched != NULL) ++ schedule_tail(current->thread.prev_sched); ++ current->thread.prev_sched = NULL; ++ unblock_signals(); ++ ++ userspace(&current->thread.regs.regs); ++} ++ ++int copy_thread_skas(int nr, unsigned long clone_flags, unsigned long sp, ++ unsigned long stack_top, struct task_struct * p, ++ struct pt_regs *regs) ++{ ++ void (*handler)(int); ++ ++ if(current->thread.forking){ ++ memcpy(&p->thread.regs.regs.skas, ++ &current->thread.regs.regs.skas, ++ sizeof(p->thread.regs.regs.skas)); ++ REGS_SET_SYSCALL_RETURN(p->thread.regs.regs.skas.regs, 0); ++ if(sp != 0) REGS_SP(p->thread.regs.regs.skas.regs) = sp; ++ ++ handler = fork_handler; ++ } ++ else { ++ memcpy(p->thread.regs.regs.skas.regs, exec_regs, ++ sizeof(p->thread.regs.regs.skas.regs)); ++ memcpy(p->thread.regs.regs.skas.fp, exec_fp_regs, ++ sizeof(p->thread.regs.regs.skas.fp)); ++ memcpy(p->thread.regs.regs.skas.xfp, exec_fpx_regs, ++ sizeof(p->thread.regs.regs.skas.xfp)); ++ p->thread.request.u.thread = current->thread.request.u.thread; ++ handler = new_thread_handler; ++ } ++ ++ new_thread((void *) p->thread.kernel_stack, ++ &p->thread.mode.skas.switch_buf, ++ &p->thread.mode.skas.fork_buf, handler); ++ return(0); ++} ++ ++void init_idle_skas(void) ++{ ++ cpu_tasks[current->processor].pid = os_getpid(); ++} ++ ++extern void start_kernel(void); ++ ++static int start_kernel_proc(void *unused) ++{ ++ int pid; ++ ++ block_signals(); ++ pid = os_getpid(); ++ ++ cpu_tasks[0].pid = pid; ++ cpu_tasks[0].task = current; ++#ifdef CONFIG_SMP ++ cpu_online_map = 1; ++#endif 
++ start_kernel(); ++ return(0); ++} ++ ++int start_uml_skas(void) ++{ ++ start_userspace(); ++ capture_signal_stack(); ++ ++ init_new_thread_signals(1); ++ idle_timer(); ++ ++ init_task.thread.request.u.thread.proc = start_kernel_proc; ++ init_task.thread.request.u.thread.arg = NULL; ++ return(start_idle_thread((void *) init_task.thread.kernel_stack, ++ &init_task.thread.mode.skas.switch_buf, ++ &init_task.thread.mode.skas.fork_buf)); ++} ++ ++int external_pid_skas(struct task_struct *task) ++{ ++ return(userspace_pid); ++} ++ ++int thread_pid_skas(struct thread_struct *thread) ++{ ++ return(userspace_pid); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/sys-i386/Makefile um/arch/um/kernel/skas/sys-i386/Makefile +--- orig/arch/um/kernel/skas/sys-i386/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/sys-i386/Makefile Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,17 @@ ++# ++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++O_TARGET = sys-i386.o ++ ++obj-y = sigcontext.o ++ ++USER_OBJS = sigcontext.o ++ ++include $(TOPDIR)/Rules.make ++ ++$(USER_OBJS) : %.o: %.c ++ $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $< ++ ++clean : +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/sys-i386/sigcontext.c um/arch/um/kernel/skas/sys-i386/sigcontext.c +--- orig/arch/um/kernel/skas/sys-i386/sigcontext.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/sys-i386/sigcontext.c Sun Dec 8 20:38:46 2002 +@@ -0,0 +1,115 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <errno.h> 
++#include <asm/sigcontext.h> ++#include <sys/ptrace.h> ++#include <linux/ptrace.h> ++#include "sysdep/ptrace.h" ++#include "sysdep/ptrace_user.h" ++#include "kern_util.h" ++#include "user.h" ++#include "sigcontext.h" ++ ++extern int userspace_pid; ++ ++int copy_sc_from_user_skas(union uml_pt_regs *regs, void *from_ptr) ++{ ++ struct sigcontext sc, *from = from_ptr; ++ unsigned long fpregs[FP_FRAME_SIZE]; ++ int err; ++ ++ err = copy_from_user_proc(&sc, from, sizeof(sc)); ++ err |= copy_from_user_proc(fpregs, sc.fpstate, sizeof(fpregs)); ++ if(err) ++ return(err); ++ ++ regs->skas.regs[GS] = sc.gs; ++ regs->skas.regs[FS] = sc.fs; ++ regs->skas.regs[ES] = sc.es; ++ regs->skas.regs[DS] = sc.ds; ++ regs->skas.regs[EDI] = sc.edi; ++ regs->skas.regs[ESI] = sc.esi; ++ regs->skas.regs[EBP] = sc.ebp; ++ regs->skas.regs[UESP] = sc.esp; ++ regs->skas.regs[EBX] = sc.ebx; ++ regs->skas.regs[EDX] = sc.edx; ++ regs->skas.regs[ECX] = sc.ecx; ++ regs->skas.regs[EAX] = sc.eax; ++ regs->skas.regs[EIP] = sc.eip; ++ regs->skas.regs[CS] = sc.cs; ++ regs->skas.regs[EFL] = sc.eflags; ++ regs->skas.regs[UESP] = sc.esp_at_signal; ++ regs->skas.regs[SS] = sc.ss; ++ regs->skas.fault_addr = sc.cr2; ++ regs->skas.fault_type = FAULT_WRITE(sc.err); ++ regs->skas.trap_type = sc.trapno; ++ ++ err = ptrace(PTRACE_SETFPREGS, userspace_pid, 0, fpregs); ++ if(err < 0){ ++ printk("copy_sc_to_user - PTRACE_SETFPREGS failed, " ++ "errno = %d\n", errno); ++ return(1); ++ } ++ ++ return(0); ++} ++ ++int copy_sc_to_user_skas(void *to_ptr, void *fp, union uml_pt_regs *regs, ++ unsigned long fault_addr, int fault_type) ++{ ++ struct sigcontext sc, *to = to_ptr; ++ struct _fpstate *to_fp; ++ unsigned long fpregs[FP_FRAME_SIZE]; ++ int err; ++ ++ sc.gs = regs->skas.regs[GS]; ++ sc.fs = regs->skas.regs[FS]; ++ sc.es = regs->skas.regs[ES]; ++ sc.ds = regs->skas.regs[DS]; ++ sc.edi = regs->skas.regs[EDI]; ++ sc.esi = regs->skas.regs[ESI]; ++ sc.ebp = regs->skas.regs[EBP]; ++ sc.esp = regs->skas.regs[UESP]; ++ 
sc.ebx = regs->skas.regs[EBX]; ++ sc.edx = regs->skas.regs[EDX]; ++ sc.ecx = regs->skas.regs[ECX]; ++ sc.eax = regs->skas.regs[EAX]; ++ sc.eip = regs->skas.regs[EIP]; ++ sc.cs = regs->skas.regs[CS]; ++ sc.eflags = regs->skas.regs[EFL]; ++ sc.esp_at_signal = regs->skas.regs[UESP]; ++ sc.ss = regs->skas.regs[SS]; ++ sc.cr2 = fault_addr; ++ sc.err = TO_SC_ERR(fault_type); ++ sc.trapno = regs->skas.trap_type; ++ ++ err = ptrace(PTRACE_GETFPREGS, userspace_pid, 0, fpregs); ++ if(err < 0){ ++ printk("copy_sc_to_user - PTRACE_GETFPREGS failed, " ++ "errno = %d\n", errno); ++ return(1); ++ } ++ to_fp = (struct _fpstate *) ++ (fp ? (unsigned long) fp : ((unsigned long) to + sizeof(*to))); ++ sc.fpstate = to_fp; ++ ++ if(err) ++ return(err); ++ ++ return(copy_to_user_proc(to, &sc, sizeof(sc)) || ++ copy_to_user_proc(to_fp, fpregs, sizeof(fpregs))); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/syscall_kern.c um/arch/um/kernel/skas/syscall_kern.c +--- orig/arch/um/kernel/skas/syscall_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/syscall_kern.c Sun Dec 8 21:01:44 2002 +@@ -0,0 +1,42 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/sys.h" ++#include "asm/errno.h" ++#include "asm/unistd.h" ++#include "asm/ptrace.h" ++#include "asm/current.h" ++#include "sysdep/syscalls.h" ++#include "kern_util.h" ++ ++extern syscall_handler_t *sys_call_table[]; ++ ++long execute_syscall_skas(void *r) ++{ ++ struct pt_regs *regs = r; ++ long res; ++ int syscall; ++ ++ current->thread.nsyscalls++; ++ nsyscalls++; ++ syscall = UPT_SYSCALL_NR(&regs->regs); ++ ++ if((syscall >= NR_syscalls) || (syscall < 0)) ++ res = -ENOSYS; ++ else res = EXECUTE_SYSCALL(syscall, regs); ++ ++ return(res); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/syscall_user.c um/arch/um/kernel/skas/syscall_user.c +--- orig/arch/um/kernel/skas/syscall_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/syscall_user.c Sun Dec 8 21:00:12 2002 +@@ -0,0 +1,46 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdlib.h> ++#include <signal.h> ++#include "kern_util.h" ++#include "syscall_user.h" ++#include "sysdep/ptrace.h" ++#include "sysdep/sigcontext.h" ++ ++/* XXX Bogus */ ++#define ERESTARTSYS 512 ++#define ERESTARTNOINTR 513 ++#define ERESTARTNOHAND 514 ++ ++void handle_syscall(union uml_pt_regs *regs) ++{ ++ long result; ++ int index; ++ ++ index = record_syscall_start(UPT_SYSCALL_NR(regs)); ++ ++ syscall_trace(); ++ result = execute_syscall(regs); ++ ++ REGS_SET_SYSCALL_RETURN(regs->skas.regs, result); ++ if((result == -ERESTARTNOHAND) || (result == -ERESTARTSYS) || ++ (result == -ERESTARTNOINTR)) ++ do_signal(result); ++ ++ syscall_trace(); ++ record_syscall_end(index, result); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/time.c um/arch/um/kernel/skas/time.c +--- orig/arch/um/kernel/skas/time.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/time.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,30 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <sys/signal.h> ++#include <sys/time.h> ++#include "time_user.h" ++#include "process.h" ++#include "user.h" ++ ++void user_time_init_skas(void) ++{ ++ if(signal(SIGALRM, (__sighandler_t) alarm_handler) == SIG_ERR) ++ panic("Couldn't set SIGALRM handler"); ++ if(signal(SIGVTALRM, (__sighandler_t) alarm_handler) == SIG_ERR) ++ panic("Couldn't set SIGVTALRM handler"); ++ set_interval(ITIMER_VIRTUAL); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/tlb.c um/arch/um/kernel/skas/tlb.c +--- orig/arch/um/kernel/skas/tlb.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/tlb.c Sun Dec 22 18:30:35 2002 +@@ -0,0 +1,153 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/stddef.h" ++#include "linux/sched.h" ++#include "asm/page.h" ++#include "asm/pgtable.h" ++#include "asm/mmu.h" ++#include "user_util.h" ++#include "mem_user.h" ++#include "skas.h" ++#include "os.h" ++ ++static void fix_range(struct mm_struct *mm, unsigned long start_addr, ++ unsigned long end_addr, int force) ++{ ++ pgd_t *npgd; ++ pmd_t *npmd; ++ pte_t *npte; ++ unsigned long addr; ++ int r, w, x, err, fd; ++ ++ if(mm == NULL) return; ++ fd = mm->context.skas.mm_fd; ++ for(addr = start_addr; addr < end_addr;){ ++ npgd = pgd_offset(mm, addr); ++ npmd = pmd_offset(npgd, addr); ++ if(pmd_present(*npmd)){ ++ npte = pte_offset(npmd, addr); ++ r = pte_read(*npte); ++ w = pte_write(*npte); ++ x = pte_exec(*npte); ++ if(!pte_dirty(*npte)) w = 0; ++ if(!pte_young(*npte)){ ++ r = 0; ++ w = 0; ++ } ++ if(force || pte_newpage(*npte)){ ++ err = unmap(fd, (void *) addr, PAGE_SIZE); ++ if(err < 0) ++ panic("munmap failed, errno = %d\n", ++ -err); ++ if(pte_present(*npte)) ++ map(fd, addr, ++ pte_val(*npte) & PAGE_MASK, ++ PAGE_SIZE, r, w, x); ++ } ++ else if(pte_newprot(*npte)){ ++ protect(fd, addr, PAGE_SIZE, r, w, x, 1); ++ } ++ *npte = pte_mkuptodate(*npte); ++ addr += PAGE_SIZE; ++ } ++ else { ++ if(force || pmd_newpage(*npmd)){ ++ err = unmap(fd, (void *) addr, PMD_SIZE); ++ if(err < 0) ++ panic("munmap failed, errno = %d\n", ++ -err); ++ pmd_mkuptodate(*npmd); ++ } ++ addr += PMD_SIZE; ++ } ++ } ++} ++ ++static void flush_kernel_vm_range(unsigned long start, unsigned long end) ++{ ++ struct 
mm_struct *mm; ++ pgd_t *pgd; ++ pmd_t *pmd; ++ pte_t *pte; ++ unsigned long addr; ++ int updated = 0, err; ++ ++ mm = &init_mm; ++ for(addr = start_vm; addr < end_vm;){ ++ pgd = pgd_offset(mm, addr); ++ pmd = pmd_offset(pgd, addr); ++ if(pmd_present(*pmd)){ ++ pte = pte_offset(pmd, addr); ++ if(!pte_present(*pte) || pte_newpage(*pte)){ ++ updated = 1; ++ err = os_unmap_memory((void *) addr, ++ PAGE_SIZE); ++ if(err < 0) ++ panic("munmap failed, errno = %d\n", ++ -err); ++ if(pte_present(*pte)) ++ map_memory(addr, ++ pte_val(*pte) & PAGE_MASK, ++ PAGE_SIZE, 1, 1, 1); ++ } ++ else if(pte_newprot(*pte)){ ++ updated = 1; ++ protect_memory(addr, PAGE_SIZE, 1, 1, 1, 1); ++ } ++ addr += PAGE_SIZE; ++ } ++ else { ++ if(pmd_newpage(*pmd)){ ++ updated = 1; ++ err = os_unmap_memory((void *) addr, PMD_SIZE); ++ if(err < 0) ++ panic("munmap failed, errno = %d\n", ++ -err); ++ } ++ addr += PMD_SIZE; ++ } ++ } ++} ++ ++void flush_tlb_kernel_vm_skas(void) ++{ ++ flush_kernel_vm_range(start_vm, end_vm); ++} ++ ++void __flush_tlb_one_skas(unsigned long addr) ++{ ++ flush_kernel_vm_range(addr, addr + PAGE_SIZE); ++} ++ ++void flush_tlb_range_skas(struct mm_struct *mm, unsigned long start, ++ unsigned long end) ++{ ++ if(mm == NULL) ++ flush_kernel_vm_range(start, end); ++ else fix_range(mm, start, end, 0); ++} ++ ++void flush_tlb_mm_skas(struct mm_struct *mm) ++{ ++ flush_tlb_kernel_vm_skas(); ++ fix_range(mm, 0, host_task_size, 0); ++} ++ ++void force_flush_all_skas(void) ++{ ++ fix_range(current->mm, 0, host_task_size, 1); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/trap_user.c um/arch/um/kernel/skas/trap_user.c +--- orig/arch/um/kernel/skas/trap_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/trap_user.c Sun Dec 15 13:28:41 2002 +@@ -0,0 +1,65 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <signal.h> ++#include <errno.h> ++#include <asm/sigcontext.h> ++#include "sysdep/ptrace.h" ++#include "signal_user.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "task.h" ++#include "sigcontext.h" ++ ++void sig_handler_common_skas(int sig, void *sc_ptr) ++{ ++ struct sigcontext *sc = sc_ptr; ++ struct skas_regs *r; ++ struct signal_info *info; ++ int save_errno = errno; ++ ++ r = &TASK_REGS(get_current())->skas; ++ r->is_user = 0; ++ r->fault_addr = SC_FAULT_ADDR(sc); ++ r->fault_type = SC_FAULT_TYPE(sc); ++ r->trap_type = SC_TRAP_TYPE(sc); ++ ++ change_sig(SIGUSR1, 1); ++ info = &sig_info[sig]; ++ if(!info->is_irq) unblock_signals(); ++ ++ (*info->handler)(sig, (union uml_pt_regs *) r); ++ ++ errno = save_errno; ++} ++ ++extern int missed_ticks[]; ++ ++void user_signal(int sig, union uml_pt_regs *regs) ++{ ++ struct signal_info *info; ++ ++ if(sig == SIGVTALRM) ++ missed_ticks[cpu()]++; ++ regs->skas.is_user = 1; ++ regs->skas.fault_addr = 0; ++ regs->skas.fault_type = 0; ++ regs->skas.trap_type = 0; ++ info = &sig_info[sig]; ++ (*info->handler)(sig, regs); ++ ++ unblock_signals(); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/util/Makefile um/arch/um/kernel/skas/util/Makefile +--- orig/arch/um/kernel/skas/util/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/util/Makefile Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,10 @@ ++all: mk_ptregs ++ ++mk_ptregs : mk_ptregs.o ++ $(CC) -o mk_ptregs mk_ptregs.o ++ ++mk_ptregs.o : mk_ptregs.c ++ $(CC) -c $< ++ ++clean : ++ $(RM) -f mk_ptregs *.o *~ +diff -Naur -X ../exclude-files orig/arch/um/kernel/skas/util/mk_ptregs.c um/arch/um/kernel/skas/util/mk_ptregs.c +--- orig/arch/um/kernel/skas/util/mk_ptregs.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/skas/util/mk_ptregs.c Mon Nov 11 12:10:06 2002 +@@ -0,0 +1,50 @@ ++#include <asm/ptrace.h> ++#include <asm/user.h> ++ ++#define PRINT_REG(name, val) printf("#define HOST_%s %d\n", (name), (val)) ++ ++int main(int argc, char **argv) ++{ ++ printf("/* Automatically generated by " ++ "arch/um/kernel/skas/util/mk_ptregs */\n"); ++ printf("\n"); ++ printf("#ifndef __SKAS_PT_REGS_\n"); ++ printf("#define __SKAS_PT_REGS_\n"); ++ printf("\n"); ++ printf("#define HOST_FRAME_SIZE %d\n", FRAME_SIZE); ++ printf("#define HOST_FP_SIZE %d\n", ++ sizeof(struct user_i387_struct) / sizeof(unsigned long)); ++ printf("#define HOST_XFP_SIZE %d\n", ++ sizeof(struct user_fxsr_struct) / sizeof(unsigned long)); ++ ++ PRINT_REG("IP", EIP); ++ PRINT_REG("SP", UESP); ++ PRINT_REG("EFLAGS", EFL); ++ PRINT_REG("EAX", EAX); ++ PRINT_REG("EBX", EBX); ++ PRINT_REG("ECX", ECX); ++ PRINT_REG("EDX", EDX); ++ PRINT_REG("ESI", ESI); ++ PRINT_REG("EDI", EDI); ++ PRINT_REG("EBP", EBP); ++ PRINT_REG("CS", CS); ++ PRINT_REG("SS", SS); ++ PRINT_REG("DS", DS); ++ PRINT_REG("FS", FS); ++ PRINT_REG("ES", ES); ++ PRINT_REG("GS", GS); ++ printf("\n"); ++ printf("#endif\n"); ++ return(0); ++} ++ ++/* ++ * Overrides for Emacs so that we follow 
Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/smp.c um/arch/um/kernel/smp.c +--- orig/arch/um/kernel/smp.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/smp.c Sat Feb 22 14:28:45 2003 +@@ -0,0 +1,324 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/config.h" ++ ++ ++#ifdef CONFIG_SMP ++ ++#include "linux/sched.h" ++#include "linux/threads.h" ++#include "linux/interrupt.h" ++#include "asm/smp.h" ++#include "asm/processor.h" ++#include "asm/spinlock.h" ++#include "asm/softirq.h" ++#include "asm/hardirq.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "irq_user.h" ++#include "kern.h" ++#include "os.h" ++ ++/* Total count of live CPUs, set by smp_boot_cpus */ ++int smp_num_cpus = 1; ++ ++/* The 'big kernel lock' */ ++spinlock_cacheline_t kernel_flag_cacheline = {SPIN_LOCK_UNLOCKED}; ++ ++/* Per CPU bogomips and other parameters */ ++ ++/* The only piece used here is the ipi pipe, which is set before SMP is ++ * started and never changed. 
++ */ ++struct cpuinfo_um cpu_data[NR_CPUS]; ++ ++/* CPU online map, set by smp_boot_cpus */ ++unsigned long cpu_online_map; ++ ++atomic_t global_bh_count; ++ ++/* Set when the idlers are all forked */ ++int smp_threads_ready = 0; ++ ++/* Not used by UML */ ++unsigned char global_irq_holder = 0; ++unsigned volatile long global_irq_lock; ++ ++/* A statistic, can be a little off */ ++static int num_reschedules_sent = 0; ++ ++void smp_send_reschedule(int cpu) ++{ ++ write(cpu_data[cpu].ipi_pipe[1], "R", 1); ++ num_reschedules_sent++; ++} ++ ++static void show(char * str) ++{ ++ int cpu = smp_processor_id(); ++ ++ printk(KERN_INFO "\n%s, CPU %d:\n", str, cpu); ++} ++ ++#define MAXCOUNT 100000000 ++ ++static inline void wait_on_bh(void) ++{ ++ int count = MAXCOUNT; ++ do { ++ if (!--count) { ++ show("wait_on_bh"); ++ count = ~0; ++ } ++ /* nothing .. wait for the other bh's to go away */ ++ } while (atomic_read(&global_bh_count) != 0); ++} ++ ++/* ++ * This is called when we want to synchronize with ++ * bottom half handlers. We need to wait until ++ * no other CPU is executing any bottom half handler. ++ * ++ * Don't wait if we're already running in an interrupt ++ * context or are inside a bh handler. 
++ */ ++void synchronize_bh(void) ++{ ++ if (atomic_read(&global_bh_count) && !in_interrupt()) ++ wait_on_bh(); ++} ++ ++void smp_send_stop(void) ++{ ++ int i; ++ ++ printk(KERN_INFO "Stopping all CPUs..."); ++ for(i = 0; i < ncpus; i++){ ++ if(i == current->processor) ++ continue; ++ write(cpu_data[i].ipi_pipe[1], "S", 1); ++ } ++ printk("done\n"); ++} ++ ++ ++static atomic_t smp_commenced = ATOMIC_INIT(0); ++static volatile unsigned long smp_callin_map = 0; ++ ++void smp_commence(void) ++{ ++ printk("All CPUs are go!\n"); ++ ++ wmb(); ++ atomic_set(&smp_commenced, 1); ++} ++ ++static int idle_proc(void *unused) ++{ ++ int cpu, err; ++ ++ set_current(current); ++ del_from_runqueue(current); ++ unhash_process(current); ++ ++ cpu = current->processor; ++ err = os_pipe(cpu_data[cpu].ipi_pipe, 1, 1); ++ if(err) ++ panic("CPU#%d failed to create IPI pipe, errno = %d", cpu, ++ -err); ++ ++ activate_ipi(cpu_data[cpu].ipi_pipe[0], ++ current->thread.mode.tt.extern_pid); ++ ++ wmb(); ++ if (test_and_set_bit(current->processor, &smp_callin_map)) { ++ printk("huh, CPU#%d already present??\n", current->processor); ++ BUG(); ++ } ++ ++ while (!atomic_read(&smp_commenced)) ++ cpu_relax(); ++ ++ init_idle(); ++ cpu_idle(); ++ return(0); ++} ++ ++static int idle_thread(int (*fn)(void *), int cpu) ++{ ++ struct task_struct *new_task; ++ int pid; ++ unsigned char c; ++ ++ current->thread.request.u.thread.proc = fn; ++ current->thread.request.u.thread.arg = NULL; ++ pid = do_fork(CLONE_VM | CLONE_PID, 0, NULL, 0); ++ if(pid < 0) panic("do_fork failed in idle_thread"); ++ new_task = get_task(pid, 1); ++ ++ cpu_tasks[cpu].pid = new_task->thread.mode.tt.extern_pid; ++ cpu_tasks[cpu].task = new_task; ++ init_tasks[cpu] = new_task; ++ new_task->processor = cpu; ++ new_task->cpus_allowed = 1 << cpu; ++ new_task->cpus_runnable = new_task->cpus_allowed; ++ CHOOSE_MODE(write(new_task->thread.mode.tt.switch_pipe[1], &c, ++ sizeof(c)), ++ ({ panic("skas mode doesn't support SMP"); })); ++ 
return(new_task->thread.mode.tt.extern_pid); ++} ++ ++void smp_boot_cpus(void) ++{ ++ int err; ++ ++ set_bit(0, &cpu_online_map); ++ set_bit(0, &smp_callin_map); ++ ++ err = os_pipe(cpu_data[0].ipi_pipe, 1, 1); ++ if(err) panic("CPU#0 failed to create IPI pipe, errno = %d", -err); ++ ++ activate_ipi(cpu_data[0].ipi_pipe[0], ++ current->thread.mode.tt.extern_pid); ++ ++ if(ncpus < 1){ ++ printk(KERN_INFO "ncpus set to 1\n"); ++ ncpus = 1; ++ } ++ else if(ncpus > NR_CPUS){ ++ printk(KERN_INFO ++ "ncpus can't be greater than NR_CPUS, set to %d\n", ++ NR_CPUS); ++ ncpus = NR_CPUS; ++ } ++ ++ if(ncpus > 1){ ++ int i, pid; ++ ++ printk(KERN_INFO "Starting up other processors:\n"); ++ for(i=1;i<ncpus;i++){ ++ int waittime; ++ ++ /* Do this early, for hard_smp_processor_id() */ ++ cpu_tasks[i].pid = -1; ++ set_bit(i, &cpu_online_map); ++ smp_num_cpus++; ++ ++ pid = idle_thread(idle_proc, i); ++ printk(KERN_INFO "\t#%d - idle thread pid = %d.. ", ++ i, pid); ++ ++ waittime = 200000000; ++ while (waittime-- && !test_bit(i, &smp_callin_map)) ++ cpu_relax(); ++ ++ if (test_bit(i, &smp_callin_map)) ++ printk("online\n"); ++ else { ++ printk("failed\n"); ++ clear_bit(i, &cpu_online_map); ++ } ++ } ++ } ++} ++ ++int setup_profiling_timer(unsigned int multiplier) ++{ ++ printk(KERN_INFO "setup_profiling_timer\n"); ++ return(0); ++} ++ ++void smp_call_function_slave(int cpu); ++ ++void IPI_handler(int cpu) ++{ ++ unsigned char c; ++ int fd; ++ ++ fd = cpu_data[cpu].ipi_pipe[0]; ++ while (read(fd, &c, 1) == 1) { ++ switch (c) { ++ case 'C': ++ smp_call_function_slave(cpu); ++ break; ++ ++ case 'R': ++ current->need_resched = 1; ++ break; ++ ++ case 'S': ++ printk("CPU#%d stopping\n", cpu); ++ while(1) ++ pause(); ++ break; ++ ++ default: ++ printk("CPU#%d received unknown IPI [%c]!\n", cpu, c); ++ break; ++ } ++ } ++} ++ ++int hard_smp_processor_id(void) ++{ ++ return(pid_to_processor_id(os_getpid())); ++} ++ ++static spinlock_t call_lock = SPIN_LOCK_UNLOCKED; ++static atomic_t 
scf_started; ++static atomic_t scf_finished; ++static void (*func)(void *info); ++static void *info; ++ ++void smp_call_function_slave(int cpu) ++{ ++ atomic_inc(&scf_started); ++ (*func)(info); ++ atomic_inc(&scf_finished); ++} ++ ++int smp_call_function(void (*_func)(void *info), void *_info, int nonatomic, ++ int wait) ++{ ++ int cpus = smp_num_cpus - 1; ++ int i; ++ ++ if (!cpus) ++ return 0; ++ ++ spin_lock_bh(&call_lock); ++ atomic_set(&scf_started, 0); ++ atomic_set(&scf_finished, 0); ++ func = _func; ++ info = _info; ++ ++ for (i=0;i<NR_CPUS;i++) ++ if (i != current->processor && test_bit(i, &cpu_online_map)) ++ write(cpu_data[i].ipi_pipe[1], "C", 1); ++ ++ while (atomic_read(&scf_started) != cpus) ++ barrier(); ++ ++ if (wait) ++ while (atomic_read(&scf_finished) != cpus) ++ barrier(); ++ ++ spin_unlock_bh(&call_lock); ++ return 0; ++} ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/sys_call_table.c um/arch/um/kernel/sys_call_table.c +--- orig/arch/um/kernel/sys_call_table.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/sys_call_table.c Thu Feb 27 13:33:23 2003 +@@ -0,0 +1,485 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/unistd.h" ++#include "linux/version.h" ++#include "linux/sys.h" ++#include "asm/signal.h" ++#include "sysdep/syscalls.h" ++#include "kern_util.h" ++ ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_exit; ++extern syscall_handler_t sys_fork; ++extern syscall_handler_t sys_creat; ++extern syscall_handler_t sys_link; ++extern syscall_handler_t sys_unlink; ++extern syscall_handler_t sys_chdir; ++extern syscall_handler_t sys_mknod; ++extern syscall_handler_t sys_chmod; ++extern syscall_handler_t sys_lchown16; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_stat; ++extern syscall_handler_t sys_getpid; ++extern syscall_handler_t sys_oldumount; ++extern syscall_handler_t sys_setuid16; ++extern syscall_handler_t sys_getuid16; ++extern syscall_handler_t sys_ptrace; ++extern syscall_handler_t sys_alarm; ++extern syscall_handler_t sys_fstat; ++extern syscall_handler_t sys_pause; ++extern syscall_handler_t sys_utime; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_access; ++extern syscall_handler_t sys_nice; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_sync; ++extern syscall_handler_t sys_kill; ++extern syscall_handler_t sys_rename; ++extern syscall_handler_t sys_mkdir; ++extern syscall_handler_t sys_rmdir; ++extern syscall_handler_t sys_pipe; ++extern syscall_handler_t sys_times; ++extern syscall_handler_t sys_ni_syscall; 
++extern syscall_handler_t sys_brk; ++extern syscall_handler_t sys_setgid16; ++extern syscall_handler_t sys_getgid16; ++extern syscall_handler_t sys_signal; ++extern syscall_handler_t sys_geteuid16; ++extern syscall_handler_t sys_getegid16; ++extern syscall_handler_t sys_acct; ++extern syscall_handler_t sys_umount; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_ioctl; ++extern syscall_handler_t sys_fcntl; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_setpgid; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_olduname; ++extern syscall_handler_t sys_umask; ++extern syscall_handler_t sys_chroot; ++extern syscall_handler_t sys_ustat; ++extern syscall_handler_t sys_dup2; ++extern syscall_handler_t sys_getppid; ++extern syscall_handler_t sys_getpgrp; ++extern syscall_handler_t sys_sigaction; ++extern syscall_handler_t sys_sgetmask; ++extern syscall_handler_t sys_ssetmask; ++extern syscall_handler_t sys_setreuid16; ++extern syscall_handler_t sys_setregid16; ++extern syscall_handler_t sys_sigsuspend; ++extern syscall_handler_t sys_sigpending; ++extern syscall_handler_t sys_sethostname; ++extern syscall_handler_t sys_setrlimit; ++extern syscall_handler_t sys_old_getrlimit; ++extern syscall_handler_t sys_getrusage; ++extern syscall_handler_t sys_gettimeofday; ++extern syscall_handler_t sys_settimeofday; ++extern syscall_handler_t sys_getgroups16; ++extern syscall_handler_t sys_setgroups16; ++extern syscall_handler_t sys_symlink; ++extern syscall_handler_t sys_lstat; ++extern syscall_handler_t sys_readlink; ++extern syscall_handler_t sys_uselib; ++extern syscall_handler_t sys_swapon; ++extern syscall_handler_t sys_reboot; ++extern syscall_handler_t old_readdir; ++extern syscall_handler_t sys_munmap; ++extern syscall_handler_t sys_truncate; ++extern syscall_handler_t sys_ftruncate; ++extern syscall_handler_t sys_fchmod; ++extern syscall_handler_t sys_fchown16; ++extern syscall_handler_t 
sys_getpriority; ++extern syscall_handler_t sys_setpriority; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_statfs; ++extern syscall_handler_t sys_fstatfs; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_socketcall; ++extern syscall_handler_t sys_syslog; ++extern syscall_handler_t sys_setitimer; ++extern syscall_handler_t sys_getitimer; ++extern syscall_handler_t sys_newstat; ++extern syscall_handler_t sys_newlstat; ++extern syscall_handler_t sys_newfstat; ++extern syscall_handler_t sys_uname; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_vhangup; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_swapoff; ++extern syscall_handler_t sys_sysinfo; ++extern syscall_handler_t sys_ipc; ++extern syscall_handler_t sys_fsync; ++extern syscall_handler_t sys_sigreturn; ++extern syscall_handler_t sys_rt_sigreturn; ++extern syscall_handler_t sys_clone; ++extern syscall_handler_t sys_setdomainname; ++extern syscall_handler_t sys_newuname; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_adjtimex; ++extern syscall_handler_t sys_mprotect; ++extern syscall_handler_t sys_sigprocmask; ++extern syscall_handler_t sys_create_module; ++extern syscall_handler_t sys_init_module; ++extern syscall_handler_t sys_delete_module; ++extern syscall_handler_t sys_get_kernel_syms; ++extern syscall_handler_t sys_quotactl; ++extern syscall_handler_t sys_getpgid; ++extern syscall_handler_t sys_fchdir; ++extern syscall_handler_t sys_bdflush; ++extern syscall_handler_t sys_sysfs; ++extern syscall_handler_t sys_personality; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_setfsuid16; ++extern syscall_handler_t sys_setfsgid16; ++extern syscall_handler_t sys_llseek; ++extern syscall_handler_t sys_getdents; ++extern syscall_handler_t sys_flock; ++extern syscall_handler_t sys_msync; ++extern syscall_handler_t 
sys_readv; ++extern syscall_handler_t sys_writev; ++extern syscall_handler_t sys_getsid; ++extern syscall_handler_t sys_fdatasync; ++extern syscall_handler_t sys_sysctl; ++extern syscall_handler_t sys_mlock; ++extern syscall_handler_t sys_munlock; ++extern syscall_handler_t sys_mlockall; ++extern syscall_handler_t sys_munlockall; ++extern syscall_handler_t sys_sched_setparam; ++extern syscall_handler_t sys_sched_getparam; ++extern syscall_handler_t sys_sched_setscheduler; ++extern syscall_handler_t sys_sched_getscheduler; ++extern syscall_handler_t sys_sched_get_priority_max; ++extern syscall_handler_t sys_sched_get_priority_min; ++extern syscall_handler_t sys_sched_rr_get_interval; ++extern syscall_handler_t sys_nanosleep; ++extern syscall_handler_t sys_mremap; ++extern syscall_handler_t sys_setresuid16; ++extern syscall_handler_t sys_getresuid16; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_query_module; ++extern syscall_handler_t sys_poll; ++extern syscall_handler_t sys_nfsservctl; ++extern syscall_handler_t sys_setresgid16; ++extern syscall_handler_t sys_getresgid16; ++extern syscall_handler_t sys_prctl; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_rt_sigaction; ++extern syscall_handler_t sys_rt_sigprocmask; ++extern syscall_handler_t sys_rt_sigpending; ++extern syscall_handler_t sys_rt_sigtimedwait; ++extern syscall_handler_t sys_rt_sigqueueinfo; ++extern syscall_handler_t sys_rt_sigsuspend; ++extern syscall_handler_t sys_pread; ++extern syscall_handler_t sys_pwrite; ++extern syscall_handler_t sys_chown16; ++extern syscall_handler_t sys_getcwd; ++extern syscall_handler_t sys_capget; ++extern syscall_handler_t sys_capset; ++extern syscall_handler_t sys_sigaltstack; ++extern syscall_handler_t sys_sendfile; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_ni_syscall; ++extern syscall_handler_t sys_vfork; ++extern syscall_handler_t sys_getrlimit; ++extern syscall_handler_t 
sys_mmap2; ++extern syscall_handler_t sys_truncate64; ++extern syscall_handler_t sys_ftruncate64; ++extern syscall_handler_t sys_stat64; ++extern syscall_handler_t sys_lstat64; ++extern syscall_handler_t sys_fstat64; ++extern syscall_handler_t sys_lchown; ++extern syscall_handler_t sys_getuid; ++extern syscall_handler_t sys_getgid; ++extern syscall_handler_t sys_geteuid; ++extern syscall_handler_t sys_getegid; ++extern syscall_handler_t sys_setreuid; ++extern syscall_handler_t sys_setregid; ++extern syscall_handler_t sys_getgroups; ++extern syscall_handler_t sys_setgroups; ++extern syscall_handler_t sys_fchown; ++extern syscall_handler_t sys_setresuid; ++extern syscall_handler_t sys_getresuid; ++extern syscall_handler_t sys_setresgid; ++extern syscall_handler_t sys_getresgid; ++extern syscall_handler_t sys_chown; ++extern syscall_handler_t sys_setuid; ++extern syscall_handler_t sys_setgid; ++extern syscall_handler_t sys_setfsuid; ++extern syscall_handler_t sys_setfsgid; ++extern syscall_handler_t sys_pivot_root; ++extern syscall_handler_t sys_mincore; ++extern syscall_handler_t sys_madvise; ++extern syscall_handler_t sys_fcntl64; ++extern syscall_handler_t sys_getdents64; ++extern syscall_handler_t sys_gettid; ++extern syscall_handler_t sys_readahead; ++extern syscall_handler_t sys_tkill; ++extern syscall_handler_t sys_setxattr; ++extern syscall_handler_t sys_lsetxattr; ++extern syscall_handler_t sys_fsetxattr; ++extern syscall_handler_t sys_getxattr; ++extern syscall_handler_t sys_lgetxattr; ++extern syscall_handler_t sys_fgetxattr; ++extern syscall_handler_t sys_listxattr; ++extern syscall_handler_t sys_llistxattr; ++extern syscall_handler_t sys_flistxattr; ++extern syscall_handler_t sys_removexattr; ++extern syscall_handler_t sys_lremovexattr; ++extern syscall_handler_t sys_fremovexattr; ++ ++extern syscall_handler_t um_mount; ++extern syscall_handler_t um_time; ++extern syscall_handler_t um_stime; ++ ++#define LAST_GENERIC_SYSCALL __NR_sched_getaffinity ++ 
++#if LAST_GENERIC_SYSCALL > LAST_ARCH_SYSCALL ++#define LAST_SYSCALL LAST_GENERIC_SYSCALL ++#else ++#define LAST_SYSCALL LAST_ARCH_SYSCALL ++#endif ++ ++syscall_handler_t *sys_call_table[] = { ++ [ 0 ] = sys_ni_syscall, ++ [ __NR_exit ] = sys_exit, ++ [ __NR_fork ] = sys_fork, ++ [ __NR_read ] = (syscall_handler_t *) sys_read, ++ [ __NR_write ] = (syscall_handler_t *) sys_write, ++ ++ /* These three are declared differently in asm/unistd.h */ ++ [ __NR_open ] = (syscall_handler_t *) sys_open, ++ [ __NR_close ] = (syscall_handler_t *) sys_close, ++ [ __NR_waitpid ] = (syscall_handler_t *) sys_waitpid, ++ [ __NR_creat ] = sys_creat, ++ [ __NR_link ] = sys_link, ++ [ __NR_unlink ] = sys_unlink, ++ ++ /* declared differently in kern_util.h */ ++ [ __NR_execve ] = (syscall_handler_t *) sys_execve, ++ [ __NR_chdir ] = sys_chdir, ++ [ __NR_time ] = um_time, ++ [ __NR_mknod ] = sys_mknod, ++ [ __NR_chmod ] = sys_chmod, ++ [ __NR_lchown ] = sys_lchown16, ++ [ __NR_break ] = sys_ni_syscall, ++ [ __NR_oldstat ] = sys_stat, ++ [ __NR_lseek ] = (syscall_handler_t *) sys_lseek, ++ [ __NR_getpid ] = sys_getpid, ++ [ __NR_mount ] = um_mount, ++ [ __NR_umount ] = sys_oldumount, ++ [ __NR_setuid ] = sys_setuid16, ++ [ __NR_getuid ] = sys_getuid16, ++ [ __NR_stime ] = um_stime, ++ [ __NR_ptrace ] = sys_ptrace, ++ [ __NR_alarm ] = sys_alarm, ++ [ __NR_oldfstat ] = sys_fstat, ++ [ __NR_pause ] = sys_pause, ++ [ __NR_utime ] = sys_utime, ++ [ __NR_stty ] = sys_ni_syscall, ++ [ __NR_gtty ] = sys_ni_syscall, ++ [ __NR_access ] = sys_access, ++ [ __NR_nice ] = sys_nice, ++ [ __NR_ftime ] = sys_ni_syscall, ++ [ __NR_sync ] = sys_sync, ++ [ __NR_kill ] = sys_kill, ++ [ __NR_rename ] = sys_rename, ++ [ __NR_mkdir ] = sys_mkdir, ++ [ __NR_rmdir ] = sys_rmdir, ++ ++ /* Declared differently in asm/unistd.h */ ++ [ __NR_dup ] = (syscall_handler_t *) sys_dup, ++ [ __NR_pipe ] = sys_pipe, ++ [ __NR_times ] = sys_times, ++ [ __NR_prof ] = sys_ni_syscall, ++ [ __NR_brk ] = sys_brk, ++ [ __NR_setgid 
] = sys_setgid16, ++ [ __NR_getgid ] = sys_getgid16, ++ [ __NR_signal ] = sys_signal, ++ [ __NR_geteuid ] = sys_geteuid16, ++ [ __NR_getegid ] = sys_getegid16, ++ [ __NR_acct ] = sys_acct, ++ [ __NR_umount2 ] = sys_umount, ++ [ __NR_lock ] = sys_ni_syscall, ++ [ __NR_ioctl ] = sys_ioctl, ++ [ __NR_fcntl ] = sys_fcntl, ++ [ __NR_mpx ] = sys_ni_syscall, ++ [ __NR_setpgid ] = sys_setpgid, ++ [ __NR_ulimit ] = sys_ni_syscall, ++ [ __NR_oldolduname ] = sys_olduname, ++ [ __NR_umask ] = sys_umask, ++ [ __NR_chroot ] = sys_chroot, ++ [ __NR_ustat ] = sys_ustat, ++ [ __NR_dup2 ] = sys_dup2, ++ [ __NR_getppid ] = sys_getppid, ++ [ __NR_getpgrp ] = sys_getpgrp, ++ [ __NR_setsid ] = (syscall_handler_t *) sys_setsid, ++ [ __NR_sigaction ] = sys_sigaction, ++ [ __NR_sgetmask ] = sys_sgetmask, ++ [ __NR_ssetmask ] = sys_ssetmask, ++ [ __NR_setreuid ] = sys_setreuid16, ++ [ __NR_setregid ] = sys_setregid16, ++ [ __NR_sigsuspend ] = sys_sigsuspend, ++ [ __NR_sigpending ] = sys_sigpending, ++ [ __NR_sethostname ] = sys_sethostname, ++ [ __NR_setrlimit ] = sys_setrlimit, ++ [ __NR_getrlimit ] = sys_old_getrlimit, ++ [ __NR_getrusage ] = sys_getrusage, ++ [ __NR_gettimeofday ] = sys_gettimeofday, ++ [ __NR_settimeofday ] = sys_settimeofday, ++ [ __NR_getgroups ] = sys_getgroups16, ++ [ __NR_setgroups ] = sys_setgroups16, ++ [ __NR_symlink ] = sys_symlink, ++ [ __NR_oldlstat ] = sys_lstat, ++ [ __NR_readlink ] = sys_readlink, ++ [ __NR_uselib ] = sys_uselib, ++ [ __NR_swapon ] = sys_swapon, ++ [ __NR_reboot ] = sys_reboot, ++ [ __NR_readdir ] = old_readdir, ++ [ __NR_munmap ] = sys_munmap, ++ [ __NR_truncate ] = sys_truncate, ++ [ __NR_ftruncate ] = sys_ftruncate, ++ [ __NR_fchmod ] = sys_fchmod, ++ [ __NR_fchown ] = sys_fchown16, ++ [ __NR_getpriority ] = sys_getpriority, ++ [ __NR_setpriority ] = sys_setpriority, ++ [ __NR_profil ] = sys_ni_syscall, ++ [ __NR_statfs ] = sys_statfs, ++ [ __NR_fstatfs ] = sys_fstatfs, ++ [ __NR_ioperm ] = sys_ni_syscall, ++ [ __NR_socketcall ] = 
sys_socketcall, ++ [ __NR_syslog ] = sys_syslog, ++ [ __NR_setitimer ] = sys_setitimer, ++ [ __NR_getitimer ] = sys_getitimer, ++ [ __NR_stat ] = sys_newstat, ++ [ __NR_lstat ] = sys_newlstat, ++ [ __NR_fstat ] = sys_newfstat, ++ [ __NR_olduname ] = sys_uname, ++ [ __NR_iopl ] = sys_ni_syscall, ++ [ __NR_vhangup ] = sys_vhangup, ++ [ __NR_idle ] = sys_ni_syscall, ++ [ __NR_wait4 ] = (syscall_handler_t *) sys_wait4, ++ [ __NR_swapoff ] = sys_swapoff, ++ [ __NR_sysinfo ] = sys_sysinfo, ++ [ __NR_ipc ] = sys_ipc, ++ [ __NR_fsync ] = sys_fsync, ++ [ __NR_sigreturn ] = sys_sigreturn, ++ [ __NR_clone ] = sys_clone, ++ [ __NR_setdomainname ] = sys_setdomainname, ++ [ __NR_uname ] = sys_newuname, ++ [ __NR_adjtimex ] = sys_adjtimex, ++ [ __NR_mprotect ] = sys_mprotect, ++ [ __NR_sigprocmask ] = sys_sigprocmask, ++ [ __NR_create_module ] = sys_create_module, ++ [ __NR_init_module ] = sys_init_module, ++ [ __NR_delete_module ] = sys_delete_module, ++ [ __NR_get_kernel_syms ] = sys_get_kernel_syms, ++ [ __NR_quotactl ] = sys_quotactl, ++ [ __NR_getpgid ] = sys_getpgid, ++ [ __NR_fchdir ] = sys_fchdir, ++ [ __NR_bdflush ] = sys_bdflush, ++ [ __NR_sysfs ] = sys_sysfs, ++ [ __NR_personality ] = sys_personality, ++ [ __NR_afs_syscall ] = sys_ni_syscall, ++ [ __NR_setfsuid ] = sys_setfsuid16, ++ [ __NR_setfsgid ] = sys_setfsgid16, ++ [ __NR__llseek ] = sys_llseek, ++ [ __NR_getdents ] = sys_getdents, ++ [ __NR__newselect ] = (syscall_handler_t *) sys_select, ++ [ __NR_flock ] = sys_flock, ++ [ __NR_msync ] = sys_msync, ++ [ __NR_readv ] = sys_readv, ++ [ __NR_writev ] = sys_writev, ++ [ __NR_getsid ] = sys_getsid, ++ [ __NR_fdatasync ] = sys_fdatasync, ++ [ __NR__sysctl ] = sys_sysctl, ++ [ __NR_mlock ] = sys_mlock, ++ [ __NR_munlock ] = sys_munlock, ++ [ __NR_mlockall ] = sys_mlockall, ++ [ __NR_munlockall ] = sys_munlockall, ++ [ __NR_sched_setparam ] = sys_sched_setparam, ++ [ __NR_sched_getparam ] = sys_sched_getparam, ++ [ __NR_sched_setscheduler ] = sys_sched_setscheduler, 
++ [ __NR_sched_getscheduler ] = sys_sched_getscheduler, ++ [ __NR_sched_yield ] = (syscall_handler_t *) yield, ++ [ __NR_sched_get_priority_max ] = sys_sched_get_priority_max, ++ [ __NR_sched_get_priority_min ] = sys_sched_get_priority_min, ++ [ __NR_sched_rr_get_interval ] = sys_sched_rr_get_interval, ++ [ __NR_nanosleep ] = sys_nanosleep, ++ [ __NR_mremap ] = sys_mremap, ++ [ __NR_setresuid ] = sys_setresuid16, ++ [ __NR_getresuid ] = sys_getresuid16, ++ [ __NR_vm86 ] = sys_ni_syscall, ++ [ __NR_query_module ] = sys_query_module, ++ [ __NR_poll ] = sys_poll, ++ [ __NR_nfsservctl ] = sys_nfsservctl, ++ [ __NR_setresgid ] = sys_setresgid16, ++ [ __NR_getresgid ] = sys_getresgid16, ++ [ __NR_prctl ] = sys_prctl, ++ [ __NR_rt_sigreturn ] = sys_rt_sigreturn, ++ [ __NR_rt_sigaction ] = sys_rt_sigaction, ++ [ __NR_rt_sigprocmask ] = sys_rt_sigprocmask, ++ [ __NR_rt_sigpending ] = sys_rt_sigpending, ++ [ __NR_rt_sigtimedwait ] = sys_rt_sigtimedwait, ++ [ __NR_rt_sigqueueinfo ] = sys_rt_sigqueueinfo, ++ [ __NR_rt_sigsuspend ] = sys_rt_sigsuspend, ++ [ __NR_pread ] = sys_pread, ++ [ __NR_pwrite ] = sys_pwrite, ++ [ __NR_chown ] = sys_chown16, ++ [ __NR_getcwd ] = sys_getcwd, ++ [ __NR_capget ] = sys_capget, ++ [ __NR_capset ] = sys_capset, ++ [ __NR_sigaltstack ] = sys_sigaltstack, ++ [ __NR_sendfile ] = sys_sendfile, ++ [ __NR_getpmsg ] = sys_ni_syscall, ++ [ __NR_putpmsg ] = sys_ni_syscall, ++ [ __NR_vfork ] = sys_vfork, ++ [ __NR_ugetrlimit ] = sys_getrlimit, ++ [ __NR_mmap2 ] = sys_mmap2, ++ [ __NR_truncate64 ] = sys_truncate64, ++ [ __NR_ftruncate64 ] = sys_ftruncate64, ++ [ __NR_stat64 ] = sys_stat64, ++ [ __NR_lstat64 ] = sys_lstat64, ++ [ __NR_fstat64 ] = sys_fstat64, ++ [ __NR_fcntl64 ] = sys_fcntl64, ++ [ __NR_getdents64 ] = sys_getdents64, ++ [ __NR_security ] = sys_ni_syscall, ++ [ __NR_gettid ] = sys_gettid, ++ [ __NR_readahead ] = sys_readahead, ++ [ __NR_setxattr ] = sys_setxattr, ++ [ __NR_lsetxattr ] = sys_lsetxattr, ++ [ __NR_fsetxattr ] = sys_fsetxattr, 
++ [ __NR_getxattr ] = sys_getxattr, ++ [ __NR_lgetxattr ] = sys_lgetxattr, ++ [ __NR_fgetxattr ] = sys_fgetxattr, ++ [ __NR_listxattr ] = sys_listxattr, ++ [ __NR_llistxattr ] = sys_llistxattr, ++ [ __NR_flistxattr ] = sys_flistxattr, ++ [ __NR_removexattr ] = sys_removexattr, ++ [ __NR_lremovexattr ] = sys_lremovexattr, ++ [ __NR_fremovexattr ] = sys_fremovexattr, ++ [ __NR_tkill ] = sys_tkill, ++ [ __NR_sendfile64 ] = sys_ni_syscall, ++ [ __NR_futex ] = sys_ni_syscall, ++ [ __NR_sched_setaffinity ] = sys_ni_syscall, ++ [ __NR_sched_getaffinity ] = sys_ni_syscall, ++ ++ ARCH_SYSCALLS ++ [ LAST_SYSCALL + 1 ... NR_syscalls ] = ++ (syscall_handler_t *) sys_ni_syscall ++}; ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/syscall_kern.c um/arch/um/kernel/syscall_kern.c +--- orig/arch/um/kernel/syscall_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/syscall_kern.c Fri Nov 8 14:04:10 2002 +@@ -0,0 +1,343 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/sched.h" ++#include "linux/file.h" ++#include "linux/smp_lock.h" ++#include "linux/mm.h" ++#include "linux/utsname.h" ++#include "linux/msg.h" ++#include "linux/shm.h" ++#include "linux/sys.h" ++#include "linux/unistd.h" ++#include "linux/slab.h" ++#include "linux/utime.h" ++#include "asm/mman.h" ++#include "asm/uaccess.h" ++#include "asm/ipc.h" ++#include "kern_util.h" ++#include "user_util.h" ++#include "sysdep/syscalls.h" ++#include "mode_kern.h" ++#include "choose-mode.h" ++ ++/* Unlocked, I don't care if this is a bit off */ ++int nsyscalls = 0; ++ ++long 
um_mount(char * dev_name, char * dir_name, char * type, ++ unsigned long new_flags, void * data) ++{ ++ if(type == NULL) type = ""; ++ return(sys_mount(dev_name, dir_name, type, new_flags, data)); ++} ++ ++long sys_fork(void) ++{ ++ long ret; ++ ++ current->thread.forking = 1; ++ ret = do_fork(SIGCHLD, 0, NULL, 0); ++ current->thread.forking = 0; ++ return(ret); ++} ++ ++long sys_clone(unsigned long clone_flags, unsigned long newsp) ++{ ++ long ret; ++ ++ current->thread.forking = 1; ++ ret = do_fork(clone_flags, newsp, NULL, 0); ++ current->thread.forking = 0; ++ return(ret); ++} ++ ++long sys_vfork(void) ++{ ++ long ret; ++ ++ current->thread.forking = 1; ++ ret = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, NULL, 0); ++ current->thread.forking = 0; ++ return(ret); ++} ++ ++/* common code for old and new mmaps */ ++long do_mmap2(struct mm_struct *mm, unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flags, unsigned long fd, ++ unsigned long pgoff) ++{ ++ int error = -EBADF; ++ struct file * file = NULL; ++ ++ flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); ++ if (!(flags & MAP_ANONYMOUS)) { ++ file = fget(fd); ++ if (!file) ++ goto out; ++ } ++ ++ down_write(&mm->mmap_sem); ++ error = do_mmap_pgoff(mm, file, addr, len, prot, flags, pgoff); ++ up_write(&mm->mmap_sem); ++ ++ if (file) ++ fput(file); ++ out: ++ return error; ++} ++ ++long sys_mmap2(unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flags, ++ unsigned long fd, unsigned long pgoff) ++{ ++ return do_mmap2(current->mm, addr, len, prot, flags, fd, pgoff); ++} ++ ++/* ++ * Perform the select(nd, in, out, ex, tv) and mmap() system ++ * calls. Linux/i386 didn't use to be able to handle more than ++ * 4 system call parameters, so these system calls used a memory ++ * block for parameter passing.. 
++ */ ++ ++struct mmap_arg_struct { ++ unsigned long addr; ++ unsigned long len; ++ unsigned long prot; ++ unsigned long flags; ++ unsigned long fd; ++ unsigned long offset; ++}; ++ ++int old_mmap(unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flags, ++ unsigned long fd, unsigned long offset) ++{ ++ int err = -EINVAL; ++ if (offset & ~PAGE_MASK) ++ goto out; ++ ++ err = do_mmap2(current->mm, addr, len, prot, flags, fd, ++ offset >> PAGE_SHIFT); ++ out: ++ return err; ++} ++/* ++ * sys_pipe() is the normal C calling standard for creating ++ * a pipe. It's not the way unix traditionally does this, though. ++ */ ++int sys_pipe(unsigned long * fildes) ++{ ++ int fd[2]; ++ int error; ++ ++ error = do_pipe(fd); ++ if (!error) { ++ if (copy_to_user(fildes, fd, 2*sizeof(int))) ++ error = -EFAULT; ++ } ++ return error; ++} ++ ++int sys_pause(void) ++{ ++ current->state = TASK_INTERRUPTIBLE; ++ schedule(); ++ return -ERESTARTNOHAND; ++} ++ ++int sys_sigaction(int sig, const struct old_sigaction *act, ++ struct old_sigaction *oact) ++{ ++ struct k_sigaction new_ka, old_ka; ++ int ret; ++ ++ if (act) { ++ old_sigset_t mask; ++ if (verify_area(VERIFY_READ, act, sizeof(*act)) || ++ __get_user(new_ka.sa.sa_handler, &act->sa_handler) || ++ __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) ++ return -EFAULT; ++ __get_user(new_ka.sa.sa_flags, &act->sa_flags); ++ __get_user(mask, &act->sa_mask); ++ siginitset(&new_ka.sa.sa_mask, mask); ++ } ++ ++ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); ++ ++ if (!ret && oact) { ++ if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) || ++ __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || ++ __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) ++ return -EFAULT; ++ __put_user(old_ka.sa.sa_flags, &oact->sa_flags); ++ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); ++ } ++ ++ return ret; ++} ++ ++/* ++ * sys_ipc() is the de-multiplexer for the SysV IPC calls.. 
++ * ++ * This is really horribly ugly. ++ */ ++int sys_ipc (uint call, int first, int second, ++ int third, void *ptr, long fifth) ++{ ++ int version, ret; ++ ++ version = call >> 16; /* hack for backward compatibility */ ++ call &= 0xffff; ++ ++ switch (call) { ++ case SEMOP: ++ return sys_semop (first, (struct sembuf *)ptr, second); ++ case SEMGET: ++ return sys_semget (first, second, third); ++ case SEMCTL: { ++ union semun fourth; ++ if (!ptr) ++ return -EINVAL; ++ if (get_user(fourth.__pad, (void **) ptr)) ++ return -EFAULT; ++ return sys_semctl (first, second, third, fourth); ++ } ++ ++ case MSGSND: ++ return sys_msgsnd (first, (struct msgbuf *) ptr, ++ second, third); ++ case MSGRCV: ++ switch (version) { ++ case 0: { ++ struct ipc_kludge tmp; ++ if (!ptr) ++ return -EINVAL; ++ ++ if (copy_from_user(&tmp, ++ (struct ipc_kludge *) ptr, ++ sizeof (tmp))) ++ return -EFAULT; ++ return sys_msgrcv (first, tmp.msgp, second, ++ tmp.msgtyp, third); ++ } ++ default: ++ panic("msgrcv with version != 0"); ++ return sys_msgrcv (first, ++ (struct msgbuf *) ptr, ++ second, fifth, third); ++ } ++ case MSGGET: ++ return sys_msgget ((key_t) first, second); ++ case MSGCTL: ++ return sys_msgctl (first, second, (struct msqid_ds *) ptr); ++ ++ case SHMAT: ++ switch (version) { ++ default: { ++ ulong raddr; ++ ret = sys_shmat (first, (char *) ptr, second, &raddr); ++ if (ret) ++ return ret; ++ return put_user (raddr, (ulong *) third); ++ } ++ case 1: /* iBCS2 emulator entry point */ ++ if (!segment_eq(get_fs(), get_ds())) ++ return -EINVAL; ++ return sys_shmat (first, (char *) ptr, second, (ulong *) third); ++ } ++ case SHMDT: ++ return sys_shmdt ((char *)ptr); ++ case SHMGET: ++ return sys_shmget (first, second, third); ++ case SHMCTL: ++ return sys_shmctl (first, second, ++ (struct shmid_ds *) ptr); ++ default: ++ return -EINVAL; ++ } ++} ++ ++int sys_uname(struct old_utsname * name) ++{ ++ int err; ++ if (!name) ++ return -EFAULT; ++ down_read(&uts_sem); ++ 
err=copy_to_user(name, &system_utsname, sizeof (*name)); ++ up_read(&uts_sem); ++ return err?-EFAULT:0; ++} ++ ++int sys_olduname(struct oldold_utsname * name) ++{ ++ int error; ++ ++ if (!name) ++ return -EFAULT; ++ if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) ++ return -EFAULT; ++ ++ down_read(&uts_sem); ++ ++ error = __copy_to_user(&name->sysname,&system_utsname.sysname, ++ __OLD_UTS_LEN); ++ error |= __put_user(0,name->sysname+__OLD_UTS_LEN); ++ error |= __copy_to_user(&name->nodename,&system_utsname.nodename, ++ __OLD_UTS_LEN); ++ error |= __put_user(0,name->nodename+__OLD_UTS_LEN); ++ error |= __copy_to_user(&name->release,&system_utsname.release, ++ __OLD_UTS_LEN); ++ error |= __put_user(0,name->release+__OLD_UTS_LEN); ++ error |= __copy_to_user(&name->version,&system_utsname.version, ++ __OLD_UTS_LEN); ++ error |= __put_user(0,name->version+__OLD_UTS_LEN); ++ error |= __copy_to_user(&name->machine,&system_utsname.machine, ++ __OLD_UTS_LEN); ++ error |= __put_user(0,name->machine+__OLD_UTS_LEN); ++ ++ up_read(&uts_sem); ++ ++ error = error ? -EFAULT : 0; ++ ++ return error; ++} ++ ++int sys_sigaltstack(const stack_t *uss, stack_t *uoss) ++{ ++ return(do_sigaltstack(uss, uoss, PT_REGS_SP(&current->thread.regs))); ++} ++ ++long execute_syscall(void *r) ++{ ++ return(CHOOSE_MODE_PROC(execute_syscall_tt, execute_syscall_skas, r)); ++} ++ ++spinlock_t syscall_lock = SPIN_LOCK_UNLOCKED; ++ ++static int syscall_index = 0; ++ ++int next_syscall_index(int limit) ++{ ++ int ret; ++ ++ spin_lock(&syscall_lock); ++ ret = syscall_index; ++ if(++syscall_index == limit) ++ syscall_index = 0; ++ spin_unlock(&syscall_lock); ++ return(ret); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/syscall_user.c um/arch/um/kernel/syscall_user.c +--- orig/arch/um/kernel/syscall_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/syscall_user.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,48 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdlib.h> ++#include <sys/time.h> ++#include "kern_util.h" ++#include "syscall_user.h" ++ ++struct { ++ int syscall; ++ int pid; ++ int result; ++ struct timeval start; ++ struct timeval end; ++} syscall_record[1024]; ++ ++int record_syscall_start(int syscall) ++{ ++ int max, index; ++ ++ max = sizeof(syscall_record)/sizeof(syscall_record[0]); ++ index = next_syscall_index(max); ++ ++ syscall_record[index].syscall = syscall; ++ syscall_record[index].pid = current_pid(); ++ syscall_record[index].result = 0xdeadbeef; ++ gettimeofday(&syscall_record[index].start, NULL); ++ return(index); ++} ++ ++void record_syscall_end(int index, int result) ++{ ++ syscall_record[index].result = result; ++ gettimeofday(&syscall_record[index].end, NULL); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/sysrq.c um/arch/um/kernel/sysrq.c +--- orig/arch/um/kernel/sysrq.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/sysrq.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,98 @@ ++/* ++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/sched.h" ++#include "linux/kernel.h" ++#include "linux/module.h" ++#include "asm/page.h" ++#include "asm/processor.h" ++#include "sysrq.h" ++#include "user_util.h" ++ ++ /* ++ * If the address is either in the .text section of the ++ * kernel, or in the vmalloc'ed module regions, it *may* ++ * be the address of a calling routine ++ */ ++ ++#ifdef CONFIG_MODULES ++ ++extern struct module *module_list; ++extern struct module kernel_module; ++ ++static inline int kernel_text_address(unsigned long addr) ++{ ++ int retval = 0; ++ struct module *mod; ++ ++ if (addr >= (unsigned long) &_stext && ++ addr <= (unsigned long) &_etext) ++ return 1; ++ ++ for (mod = module_list; mod != &kernel_module; mod = mod->next) { ++ /* mod_bound tests for addr being inside the vmalloc'ed ++ * module area. Of course it'd be better to test only ++ * for the .text subset... 
*/ ++ if (mod_bound(addr, 0, mod)) { ++ retval = 1; ++ break; ++ } ++ } ++ ++ return retval; ++} ++ ++#else ++ ++static inline int kernel_text_address(unsigned long addr) ++{ ++ return (addr >= (unsigned long) &_stext && ++ addr <= (unsigned long) &_etext); ++} ++ ++#endif ++ ++void show_trace(unsigned long * stack) ++{ ++ int i; ++ unsigned long addr; ++ ++ if (!stack) ++ stack = (unsigned long*) &stack; ++ ++ printk("Call Trace: "); ++ i = 1; ++ while (((long) stack & (THREAD_SIZE-1)) != 0) { ++ addr = *stack++; ++ if (kernel_text_address(addr)) { ++ if (i && ((i % 6) == 0)) ++ printk("\n "); ++ printk("[<%08lx>] ", addr); ++ i++; ++ } ++ } ++ printk("\n"); ++} ++ ++void show_trace_task(struct task_struct *tsk) ++{ ++ unsigned long esp = PT_REGS_SP(&tsk->thread.regs); ++ ++ /* User space on another CPU? */ ++ if ((esp ^ (unsigned long)tsk) & (PAGE_MASK<<1)) ++ return; ++ show_trace((unsigned long *)esp); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tempfile.c um/arch/um/kernel/tempfile.c +--- orig/arch/um/kernel/tempfile.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tempfile.c Fri Jan 17 23:16:19 2003 +@@ -0,0 +1,80 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++#include <unistd.h> ++#include <string.h> ++#include <errno.h> ++#include <sys/param.h> ++#include "init.h" ++ ++/* Modified from create_mem_file and start_debugger */ ++static char *tempdir = NULL; ++ ++static void __init find_tempdir(void) ++{ ++ char *dirs[] = { "TMP", "TEMP", "TMPDIR", NULL }; ++ int i; ++ char *dir = NULL; ++ ++ if(tempdir != NULL) return; /* We've already been called */ ++ for(i = 0; dirs[i]; i++){ ++ dir = getenv(dirs[i]); ++ if((dir != NULL) && (*dir != '\0')) ++ break; ++ } ++ if((dir == NULL) || (*dir == '\0')) ++ dir = "/tmp"; ++ ++ tempdir = malloc(strlen(dir) + 2); ++ if(tempdir == NULL){ ++ fprintf(stderr, "Failed to malloc tempdir, " ++ "errno = %d\n", errno); ++ return; ++ } ++ strcpy(tempdir, dir); ++ strcat(tempdir, "/"); ++} ++ ++int make_tempfile(const char *template, char **out_tempname, int do_unlink) ++{ ++ char tempname[MAXPATHLEN]; ++ int fd; ++ ++ find_tempdir(); ++ if (*template != '/') ++ strcpy(tempname, tempdir); ++ else ++ *tempname = 0; ++ strcat(tempname, template); ++ if((fd = mkstemp(tempname)) < 0){ ++ fprintf(stderr, "open - cannot create %s: %s\n", tempname, ++ strerror(errno)); ++ return -1; ++ } ++ if(do_unlink && (unlink(tempname) < 0)){ ++ perror("unlink"); ++ return -1; ++ } ++ if(out_tempname){ ++ if((*out_tempname = strdup(tempname)) == NULL){ ++ perror("strdup"); ++ return -1; ++ } ++ } ++ return(fd); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/time.c um/arch/um/kernel/time.c +--- orig/arch/um/kernel/time.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/time.c Wed Apr 23 20:45:19 2003 +@@ -0,0 +1,127 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdio.h> ++#include <unistd.h> ++#include <time.h> ++#include <sys/time.h> ++#include <signal.h> ++#include <errno.h> ++#include "user_util.h" ++#include "kern_util.h" ++#include "user.h" ++#include "process.h" ++#include "signal_user.h" ++#include "time_user.h" ++ ++extern struct timeval xtime; ++ ++struct timeval local_offset = { 0, 0 }; ++ ++void timer(void) ++{ ++ gettimeofday(&xtime, NULL); ++ timeradd(&xtime, &local_offset, &xtime); ++} ++ ++void set_interval(int timer_type) ++{ ++ int usec = 1000000/hz(); ++ struct itimerval interval = ((struct itimerval) { { 0, usec }, ++ { 0, usec } }); ++ ++ if(setitimer(timer_type, &interval, NULL) == -1) ++ panic("setitimer failed - errno = %d\n", errno); ++} ++ ++void enable_timer(void) ++{ ++ int usec = 1000000/hz(); ++ struct itimerval enable = ((struct itimerval) { { 0, usec }, ++ { 0, usec }}); ++ if(setitimer(ITIMER_VIRTUAL, &enable, NULL)) ++ printk("enable_timer - setitimer failed, errno = %d\n", ++ errno); ++} ++ ++void switch_timers(int to_real) ++{ ++ struct itimerval disable = ((struct itimerval) { { 0, 0 }, { 0, 0 }}); ++ struct itimerval enable = ((struct itimerval) { { 0, 1000000/hz() }, ++ { 0, 1000000/hz() }}); ++ int old, new; ++ ++ if(to_real){ ++ old = ITIMER_VIRTUAL; ++ new = ITIMER_REAL; ++ } ++ else { ++ old = ITIMER_REAL; ++ new = ITIMER_VIRTUAL; ++ } ++ ++ 
if((setitimer(old, &disable, NULL) < 0) || ++ (setitimer(new, &enable, NULL))) ++ printk("switch_timers - setitimer failed, errno = %d\n", ++ errno); ++} ++ ++void idle_timer(void) ++{ ++ if(signal(SIGVTALRM, SIG_IGN) == SIG_ERR) ++ panic("Couldn't unset SIGVTALRM handler"); ++ ++ set_handler(SIGALRM, (__sighandler_t) alarm_handler, ++ SA_RESTART, SIGUSR1, SIGIO, SIGWINCH, SIGVTALRM, -1); ++ set_interval(ITIMER_REAL); ++} ++ ++void time_init(void) ++{ ++ if(signal(SIGVTALRM, boot_timer_handler) == SIG_ERR) ++ panic("Couldn't set SIGVTALRM handler"); ++ set_interval(ITIMER_VIRTUAL); ++} ++ ++void do_gettimeofday(struct timeval *tv) ++{ ++ unsigned long flags; ++ ++ flags = time_lock(); ++ gettimeofday(tv, NULL); ++ timeradd(tv, &local_offset, tv); ++ time_unlock(flags); ++} ++ ++void do_settimeofday(struct timeval *tv) ++{ ++ struct timeval now; ++ unsigned long flags; ++ ++ flags = time_lock(); ++ gettimeofday(&now, NULL); ++ timersub(tv, &now, &local_offset); ++ time_unlock(flags); ++} ++ ++void idle_sleep(int secs) ++{ ++ struct timespec ts; ++ ++ ts.tv_sec = secs; ++ ts.tv_nsec = 0; ++ nanosleep(&ts, NULL); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/time_kern.c um/arch/um/kernel/time_kern.c +--- orig/arch/um/kernel/time_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/time_kern.c Wed Apr 23 22:19:08 2003 +@@ -0,0 +1,172 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/kernel.h" ++#include "linux/unistd.h" ++#include "linux/stddef.h" ++#include "linux/spinlock.h" ++#include "linux/sched.h" ++#include "linux/interrupt.h" ++#include "linux/init.h" ++#include "linux/delay.h" ++#include "asm/irq.h" ++#include "asm/param.h" ++#include "asm/current.h" ++#include "kern_util.h" ++#include "user_util.h" ++#include "time_user.h" ++#include "mode.h" ++ ++extern rwlock_t xtime_lock; ++ ++int hz(void) ++{ ++ return(HZ); ++} ++ ++/* Changed at early boot */ ++int timer_irq_inited = 0; ++ ++/* missed_ticks will be modified after kernel memory has been ++ * write-protected, so this puts it in a section which will be left ++ * write-enabled. 
++ */ ++int __attribute__ ((__section__ (".unprotected"))) missed_ticks[NR_CPUS]; ++ ++void timer_irq(union uml_pt_regs *regs) ++{ ++ int cpu = current->processor, ticks = missed_ticks[cpu]; ++ ++ if(!timer_irq_inited) return; ++ missed_ticks[cpu] = 0; ++ while(ticks--) do_IRQ(TIMER_IRQ, regs); ++} ++ ++void boot_timer_handler(int sig) ++{ ++ struct pt_regs regs; ++ ++ CHOOSE_MODE((void) ++ (UPT_SC(&regs.regs) = (struct sigcontext *) (&sig + 1)), ++ (void) (regs.regs.skas.is_user = 0)); ++ do_timer(&regs); ++} ++ ++void um_timer(int irq, void *dev, struct pt_regs *regs) ++{ ++ do_timer(regs); ++ write_lock(&xtime_lock); ++ vxtime_lock(); ++ timer(); ++ vxtime_unlock(); ++ write_unlock(&xtime_lock); ++} ++ ++long um_time(int * tloc) ++{ ++ struct timeval now; ++ ++ do_gettimeofday(&now); ++ if (tloc) { ++ if (put_user(now.tv_sec,tloc)) ++ now.tv_sec = -EFAULT; ++ } ++ return now.tv_sec; ++} ++ ++long um_stime(int * tptr) ++{ ++ int value; ++ struct timeval new; ++ ++ if (get_user(value, tptr)) ++ return -EFAULT; ++ new.tv_sec = value; ++ new.tv_usec = 0; ++ do_settimeofday(&new); ++ return 0; ++} ++ ++/* XXX Needs to be moved under sys-i386 */ ++void __delay(um_udelay_t time) ++{ ++ /* Stolen from the i386 __loop_delay */ ++ int d0; ++ __asm__ __volatile__( ++ "\tjmp 1f\n" ++ ".align 16\n" ++ "1:\tjmp 2f\n" ++ ".align 16\n" ++ "2:\tdecl %0\n\tjns 2b" ++ :"=&a" (d0) ++ :"0" (time)); ++} ++ ++void __udelay(um_udelay_t usecs) ++{ ++ int i, n; ++ ++ n = (loops_per_jiffy * HZ * usecs) / 1000000; ++ for(i=0;i<n;i++) ; ++} ++ ++void __const_udelay(um_udelay_t usecs) ++{ ++ int i, n; ++ ++ n = (loops_per_jiffy * HZ * usecs) / 1000000; ++ for(i=0;i<n;i++) ; ++} ++ ++void timer_handler(int sig, union uml_pt_regs *regs) ++{ ++#ifdef CONFIG_SMP ++ update_process_times(user_context(UPT_SP(regs))); ++#endif ++ if(current->processor == 0) ++ timer_irq(regs); ++} ++ ++static spinlock_t timer_spinlock = SPIN_LOCK_UNLOCKED; ++ ++unsigned long time_lock(void) ++{ ++ unsigned long flags; ++ 
++ spin_lock_irqsave(&timer_spinlock, flags); ++ return(flags); ++} ++ ++void time_unlock(unsigned long flags) ++{ ++ spin_unlock_irqrestore(&timer_spinlock, flags); ++} ++ ++int __init timer_init(void) ++{ ++ int err; ++ ++ CHOOSE_MODE(user_time_init_tt(), user_time_init_skas()); ++ if((err = request_irq(TIMER_IRQ, um_timer, SA_INTERRUPT, "timer", ++ NULL)) != 0) ++ printk(KERN_ERR "timer_init : request_irq failed - " ++ "errno = %d\n", -err); ++ timer_irq_inited = 1; ++ return(0); ++} ++ ++__initcall(timer_init); ++ ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tlb.c um/arch/um/kernel/tlb.c +--- orig/arch/um/kernel/tlb.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tlb.c Wed Oct 23 22:15:51 2002 +@@ -0,0 +1,80 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/mm.h" ++#include "asm/page.h" ++#include "asm/pgalloc.h" ++#include "choose-mode.h" ++#include "mode_kern.h" ++ ++void flush_tlb_page(struct vm_area_struct *vma, unsigned long address) ++{ ++ address &= PAGE_MASK; ++ flush_tlb_range(vma->vm_mm, address, address + PAGE_SIZE); ++} ++ ++void flush_tlb_all(void) ++{ ++ flush_tlb_mm(current->mm); ++} ++ ++void flush_tlb_kernel_vm(void) ++{ ++ CHOOSE_MODE(flush_tlb_kernel_vm_tt(), flush_tlb_kernel_vm_skas()); ++} ++ ++void __flush_tlb_one(unsigned long addr) ++{ ++ CHOOSE_MODE_PROC(__flush_tlb_one_tt, __flush_tlb_one_skas, addr); ++} ++ ++void flush_tlb_range(struct mm_struct *mm, unsigned long start, ++ unsigned long end) ++{ ++ CHOOSE_MODE_PROC(flush_tlb_range_tt, flush_tlb_range_skas, mm, start, 
++ end); ++} ++ ++void flush_tlb_mm(struct mm_struct *mm) ++{ ++ CHOOSE_MODE_PROC(flush_tlb_mm_tt, flush_tlb_mm_skas, mm); ++} ++ ++void force_flush_all(void) ++{ ++ CHOOSE_MODE(force_flush_all_tt(), force_flush_all_skas()); ++} ++ ++ ++pgd_t *pgd_offset_proc(struct mm_struct *mm, unsigned long address) ++{ ++ return(pgd_offset(mm, address)); ++} ++ ++pmd_t *pmd_offset_proc(pgd_t *pgd, unsigned long address) ++{ ++ return(pmd_offset(pgd, address)); ++} ++ ++pte_t *pte_offset_proc(pmd_t *pmd, unsigned long address) ++{ ++ return(pte_offset(pmd, address)); ++} ++ ++pte_t *addr_pte(struct task_struct *task, unsigned long addr) ++{ ++ return(pte_offset(pmd_offset(pgd_offset(task->mm, addr), addr), addr)); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/trap_kern.c um/arch/um/kernel/trap_kern.c +--- orig/arch/um/kernel/trap_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/trap_kern.c Wed Mar 26 13:26:00 2003 +@@ -0,0 +1,192 @@ ++/* ++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/kernel.h" ++#include "linux/sched.h" ++#include "linux/mm.h" ++#include "linux/spinlock.h" ++#include "linux/config.h" ++#include "linux/init.h" ++#include "asm/semaphore.h" ++#include "asm/pgtable.h" ++#include "asm/pgalloc.h" ++#include "asm/a.out.h" ++#include "asm/current.h" ++#include "asm/irq.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "kern.h" ++#include "chan_kern.h" ++#include "mconsole_kern.h" ++#include "2_5compat.h" ++ ++unsigned long handle_page_fault(unsigned long address, unsigned long ip, ++ int 
is_write, int is_user, int *code_out) ++{ ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ pgd_t *pgd; ++ pmd_t *pmd; ++ pte_t *pte; ++ unsigned long page; ++ int handled = 0; ++ ++ *code_out = SEGV_MAPERR; ++ down_read(&mm->mmap_sem); ++ vma = find_vma(mm, address); ++ if(!vma) ++ goto out; ++ else if(vma->vm_start <= address) ++ goto good_area; ++ else if(!(vma->vm_flags & VM_GROWSDOWN)) ++ goto out; ++ else if(expand_stack(vma, address)) ++ goto out; ++ ++ good_area: ++ *code_out = SEGV_ACCERR; ++ if(is_write && !(vma->vm_flags & VM_WRITE)) ++ goto out; ++ page = address & PAGE_MASK; ++ if(page == (unsigned long) current + PAGE_SIZE) ++ panic("Kernel stack overflow"); ++ pgd = pgd_offset(mm, page); ++ pmd = pmd_offset(pgd, page); ++ do { ++ survive: ++ switch (handle_mm_fault(mm, vma, address, is_write)) { ++ case 1: ++ current->min_flt++; ++ break; ++ case 2: ++ current->maj_flt++; ++ break; ++ default: ++ if (current->pid == 1) { ++ up_read(&mm->mmap_sem); ++ yield(); ++ down_read(&mm->mmap_sem); ++ goto survive; ++ } ++ /* Fall through to bad area case */ ++ case 0: ++ goto out; ++ } ++ pte = pte_offset(pmd, page); ++ } while(!pte_present(*pte)); ++ handled = 1; ++ *pte = pte_mkyoung(*pte); ++ if(pte_write(*pte)) *pte = pte_mkdirty(*pte); ++ flush_tlb_page(vma, page); ++ out: ++ up_read(&mm->mmap_sem); ++ return(handled); ++} ++ ++unsigned long segv(unsigned long address, unsigned long ip, int is_write, ++ int is_user, void *sc) ++{ ++ struct siginfo si; ++ void *catcher; ++ int handled; ++ ++ if(!is_user && (address >= start_vm) && (address < end_vm)){ ++ flush_tlb_kernel_vm(); ++ return(0); ++ } ++ if(current->mm == NULL) ++ panic("Segfault with no mm"); ++ ++ handled = handle_page_fault(address, ip, is_write, is_user, ++ &si.si_code); ++ ++ catcher = current->thread.fault_catcher; ++ if(handled) ++ return(0); ++ else if(catcher != NULL){ ++ current->thread.fault_addr = (void *) address; ++ do_longjmp(catcher, 1); ++ } ++ else 
if(current->thread.fault_addr != NULL){ ++ panic("fault_addr set but no fault catcher"); ++ } ++ else if(arch_fixup(ip, sc)) ++ return(0); ++ ++ if(!is_user) ++ panic("Kernel mode fault at addr 0x%lx, ip 0x%lx", ++ address, ip); ++ si.si_signo = SIGSEGV; ++ si.si_addr = (void *) address; ++ current->thread.cr2 = address; ++ current->thread.err = is_write; ++ force_sig_info(SIGSEGV, &si, current); ++ return(0); ++} ++ ++void bad_segv(unsigned long address, unsigned long ip, int is_write) ++{ ++ struct siginfo si; ++ ++ printk(KERN_ERR "Unfixable SEGV in '%s' (pid %d) at 0x%lx " ++ "(ip 0x%lx)\n", current->comm, current->pid, address, ip); ++ si.si_signo = SIGSEGV; ++ si.si_code = SEGV_ACCERR; ++ si.si_addr = (void *) address; ++ current->thread.cr2 = address; ++ current->thread.err = is_write; ++ force_sig_info(SIGSEGV, &si, current); ++} ++ ++void relay_signal(int sig, union uml_pt_regs *regs) ++{ ++ if(arch_handle_signal(sig, regs)) return; ++ if(!UPT_IS_USER(regs)) ++ panic("Kernel mode signal %d", sig); ++ force_sig(sig, current); ++} ++ ++void bus_handler(int sig, union uml_pt_regs *regs) ++{ ++ if(current->thread.fault_catcher != NULL) ++ do_longjmp(current->thread.fault_catcher, 1); ++ else relay_signal(sig, regs); ++} ++ ++void winch(int sig, union uml_pt_regs *regs) ++{ ++ do_IRQ(WINCH_IRQ, regs); ++} ++ ++void trap_init(void) ++{ ++} ++ ++spinlock_t trap_lock = SPIN_LOCK_UNLOCKED; ++ ++static int trap_index = 0; ++ ++int next_trap_index(int limit) ++{ ++ int ret; ++ ++ spin_lock(&trap_lock); ++ ret = trap_index; ++ if(++trap_index == limit) ++ trap_index = 0; ++ spin_unlock(&trap_lock); ++ return(ret); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/trap_user.c um/arch/um/kernel/trap_user.c +--- orig/arch/um/kernel/trap_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/trap_user.c Wed Mar 26 13:25:50 2003 +@@ -0,0 +1,140 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdlib.h> ++#include <errno.h> ++#include <fcntl.h> ++#include <setjmp.h> ++#include <signal.h> ++#include <sys/time.h> ++#include <sys/ioctl.h> ++#include <sys/ptrace.h> ++#include <sys/wait.h> ++#include <asm/page.h> ++#include <asm/unistd.h> ++#include <asm/ptrace.h> ++#include "init.h" ++#include "sysdep/ptrace.h" ++#include "sigcontext.h" ++#include "sysdep/sigcontext.h" ++#include "irq_user.h" ++#include "frame_user.h" ++#include "signal_user.h" ++#include "time_user.h" ++#include "task.h" ++#include "mode.h" ++#include "choose-mode.h" ++#include "kern_util.h" ++#include "user_util.h" ++#include "os.h" ++ ++void kill_child_dead(int pid) ++{ ++ kill(pid, SIGKILL); ++ kill(pid, SIGCONT); ++ while(waitpid(pid, NULL, 0) > 0) kill(pid, SIGCONT); ++} ++ ++/* Unlocked - don't care if this is a bit off */ ++int nsegfaults = 0; ++ ++struct { ++ unsigned long address; ++ int is_write; ++ int pid; ++ unsigned long sp; ++ int is_user; ++} segfault_record[1024]; ++ ++void segv_handler(int sig, union uml_pt_regs *regs) ++{ ++ int index, max; ++ ++ if(UPT_IS_USER(regs) && !UPT_SEGV_IS_FIXABLE(regs)){ ++ bad_segv(UPT_FAULT_ADDR(regs), UPT_IP(regs), ++ UPT_FAULT_WRITE(regs)); ++ return; ++ } ++ max = sizeof(segfault_record)/sizeof(segfault_record[0]); ++ index = next_trap_index(max); ++ ++ nsegfaults++; ++ segfault_record[index].address = UPT_FAULT_ADDR(regs); ++ segfault_record[index].pid = os_getpid(); ++ segfault_record[index].is_write = UPT_FAULT_WRITE(regs); ++ 
segfault_record[index].sp = UPT_SP(regs); ++ segfault_record[index].is_user = UPT_IS_USER(regs); ++ segv(UPT_FAULT_ADDR(regs), UPT_IP(regs), UPT_FAULT_WRITE(regs), ++ UPT_IS_USER(regs), regs); ++} ++ ++void usr2_handler(int sig, union uml_pt_regs *regs) ++{ ++ CHOOSE_MODE(syscall_handler_tt(sig, regs), (void) 0); ++} ++ ++struct signal_info sig_info[] = { ++ [ SIGTRAP ] { .handler = relay_signal, ++ .is_irq = 0 }, ++ [ SIGFPE ] { .handler = relay_signal, ++ .is_irq = 0 }, ++ [ SIGILL ] { .handler = relay_signal, ++ .is_irq = 0 }, ++ [ SIGWINCH ] { .handler = winch, ++ .is_irq = 1 }, ++ [ SIGBUS ] { .handler = bus_handler, ++ .is_irq = 0 }, ++ [ SIGSEGV] { .handler = segv_handler, ++ .is_irq = 0 }, ++ [ SIGIO ] { .handler = sigio_handler, ++ .is_irq = 1 }, ++ [ SIGVTALRM ] { .handler = timer_handler, ++ .is_irq = 1 }, ++ [ SIGALRM ] { .handler = timer_handler, ++ .is_irq = 1 }, ++ [ SIGUSR2 ] { .handler = usr2_handler, ++ .is_irq = 0 }, ++}; ++ ++void sig_handler(int sig, struct sigcontext sc) ++{ ++ CHOOSE_MODE_PROC(sig_handler_common_tt, sig_handler_common_skas, ++ sig, &sc); ++} ++ ++extern int timer_irq_inited, missed_ticks[]; ++ ++void alarm_handler(int sig, struct sigcontext sc) ++{ ++ if(!timer_irq_inited) return; ++ missed_ticks[cpu()]++; ++ ++ if(sig == SIGALRM) ++ switch_timers(0); ++ ++ CHOOSE_MODE_PROC(sig_handler_common_tt, sig_handler_common_skas, ++ sig, &sc); ++ ++ if(sig == SIGALRM) ++ switch_timers(1); ++} ++ ++void do_longjmp(void *b, int val) ++{ ++ jmp_buf *buf = b; ++ ++ longjmp(*buf, val); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/Makefile um/arch/um/kernel/tt/Makefile +--- orig/arch/um/kernel/tt/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/Makefile Fri Dec 20 23:29:42 2002 +@@ -0,0 +1,39 @@ ++# ++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++O_TARGET = tt.o ++ ++obj-y = exec_kern.o exec_user.o gdb.o ksyms.o mem.o mem_user.o process_kern.o \ ++ syscall_kern.o syscall_user.o time.o tlb.o tracer.o trap_user.o \ ++ uaccess_user.o ++ ++obj-$(CONFIG_PT_PROXY) += gdb_kern.o ++ ++subdir-y = sys-$(SUBARCH) ++subdir-$(CONFIG_PT_PROXY) += ptproxy ++ ++obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o)) ++ ++export-objs = ksyms.o ++ ++USER_OBJS = $(filter %_user.o,$(obj-y)) gdb.o time.o tracer.o ++ ++UNMAP_CFLAGS := $(patsubst -pg -DPROFILING,,$(USER_CFLAGS)) ++UNMAP_CFLAGS := $(patsubst -fprofile-arcs -ftest-coverage,,$(UNMAP_CFLAGS)) ++ ++include $(TOPDIR)/Rules.make ++ ++$(USER_OBJS) : %.o: %.c ++ $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $< ++ ++$(O_TARGET) : unmap_fin.o ++ ++unmap.o: unmap.c ++ $(CC) $(UNMAP_CFLAGS) -c -o $@ $< ++ ++unmap_fin.o : unmap.o ++ ld -r -o $@ $< -lc -L/usr/lib ++ ++clean : +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/exec_kern.c um/arch/um/kernel/tt/exec_kern.c +--- orig/arch/um/kernel/tt/exec_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/exec_kern.c Thu Oct 24 19:22:17 2002 +@@ -0,0 +1,83 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/kernel.h" ++#include "linux/mm.h" ++#include "asm/signal.h" ++#include "asm/ptrace.h" ++#include "asm/uaccess.h" ++#include "asm/pgalloc.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "irq_user.h" ++#include "time_user.h" ++#include "mem_user.h" ++#include "os.h" ++#include "tlb.h" ++ 
++static int exec_tramp(void *sig_stack) ++{ ++ init_new_thread_stack(sig_stack, NULL); ++ init_new_thread_signals(1); ++ os_stop_process(os_getpid()); ++ return(0); ++} ++ ++void flush_thread_tt(void) ++{ ++ unsigned long stack; ++ int new_pid; ++ ++ stack = alloc_stack(0, 0); ++ if(stack == 0){ ++ printk(KERN_ERR ++ "flush_thread : failed to allocate temporary stack\n"); ++ do_exit(SIGKILL); ++ } ++ ++ new_pid = start_fork_tramp((void *) current->thread.kernel_stack, ++ stack, 0, exec_tramp); ++ if(new_pid < 0){ ++ printk(KERN_ERR ++ "flush_thread : new thread failed, errno = %d\n", ++ -new_pid); ++ do_exit(SIGKILL); ++ } ++ ++ if(current->processor == 0) ++ forward_interrupts(new_pid); ++ current->thread.request.op = OP_EXEC; ++ current->thread.request.u.exec.pid = new_pid; ++ unprotect_stack((unsigned long) current); ++ os_usr1_process(os_getpid()); ++ ++ enable_timer(); ++ free_page(stack); ++ protect_memory(uml_reserved, high_physmem - uml_reserved, 1, 1, 0, 1); ++ task_protections((unsigned long) current); ++ force_flush_all(); ++ unblock_signals(); ++} ++ ++void start_thread_tt(struct pt_regs *regs, unsigned long eip, ++ unsigned long esp) ++{ ++ set_fs(USER_DS); ++ flush_tlb_mm(current->mm); ++ PT_REGS_IP(regs) = eip; ++ PT_REGS_SP(regs) = esp; ++ PT_FIX_EXEC_STACK(esp); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/exec_user.c um/arch/um/kernel/tt/exec_user.c +--- orig/arch/um/kernel/tt/exec_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/exec_user.c Thu Dec 5 19:36:57 2002 +@@ -0,0 +1,49 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdio.h> ++#include <unistd.h> ++#include <stdlib.h> ++#include <sched.h> ++#include <errno.h> ++#include <sys/wait.h> ++#include <sys/ptrace.h> ++#include <signal.h> ++#include "user_util.h" ++#include "kern_util.h" ++#include "user.h" ++#include "ptrace_user.h" ++ ++void do_exec(int old_pid, int new_pid) ++{ ++ unsigned long regs[FRAME_SIZE]; ++ ++ if((ptrace(PTRACE_ATTACH, new_pid, 0, 0) < 0) || ++ (ptrace(PTRACE_CONT, new_pid, 0, 0) < 0) || ++ (waitpid(new_pid, 0, WUNTRACED) < 0)) ++ tracer_panic("do_exec failed to attach proc - errno = %d", ++ errno); ++ ++ if(ptrace_getregs(old_pid, regs) < 0) ++ tracer_panic("do_exec failed to get registers - errno = %d", ++ errno); ++ ++ kill(old_pid, SIGKILL); ++ ++ if(ptrace_setregs(new_pid, regs) < 0) ++ tracer_panic("do_exec failed to start new proc - errno = %d", ++ errno); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/gdb.c um/arch/um/kernel/tt/gdb.c +--- orig/arch/um/kernel/tt/gdb.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/gdb.c Fri Jan 17 13:23:31 2003 +@@ -0,0 +1,278 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++#include <errno.h> ++#include <string.h> ++#include <signal.h> ++#include <sys/ptrace.h> ++#include <sys/types.h> ++#include "uml-config.h" ++#include "kern_constants.h" ++#include "chan_user.h" ++#include "init.h" ++#include "user.h" ++#include "debug.h" ++#include "kern_util.h" ++#include "user_util.h" ++#include "tt.h" ++#include "sysdep/thread.h" ++ ++extern int debugger_pid; ++extern int debugger_fd; ++extern int debugger_parent; ++ ++int detach(int pid, int sig) ++{ ++ return(ptrace(PTRACE_DETACH, pid, 0, sig)); ++} ++ ++int attach(int pid) ++{ ++ int err; ++ ++ err = ptrace(PTRACE_ATTACH, pid, 0, 0); ++ if(err < 0) return(-errno); ++ else return(err); ++} ++ ++int cont(int pid) ++{ ++ return(ptrace(PTRACE_CONT, pid, 0, 0)); ++} ++ ++#ifdef UML_CONFIG_PT_PROXY ++ ++int debugger_signal(int status, pid_t pid) ++{ ++ return(debugger_proxy(status, pid)); ++} ++ ++void child_signal(pid_t pid, int status) ++{ ++ child_proxy(pid, status); ++} ++ ++static void gdb_announce(char *dev_name, int dev) ++{ ++ printf("gdb assigned device '%s'\n", dev_name); ++} ++ ++static struct chan_opts opts = { ++ .announce = gdb_announce, ++ .xterm_title = "UML kernel debugger", ++ .raw = 0, ++ .tramp_stack = 0, ++ .in_kernel = 0, ++}; ++ ++/* Accessed by the tracing thread, which automatically serializes access */ ++static void *xterm_data; ++static int xterm_fd; ++ ++extern void *xterm_init(char *, int, struct chan_opts *); ++extern int xterm_open(int, int, int, void *, char **); 
++extern void xterm_close(int, void *); ++ ++int open_gdb_chan(void) ++{ ++ char stack[UM_KERN_PAGE_SIZE], *dummy; ++ ++ opts.tramp_stack = (unsigned long) stack; ++ xterm_data = xterm_init("", 0, &opts); ++ xterm_fd = xterm_open(1, 1, 1, xterm_data, &dummy); ++ return(xterm_fd); ++} ++ ++static void exit_debugger_cb(void *unused) ++{ ++ if(debugger_pid != -1){ ++ if(gdb_pid != -1){ ++ fake_child_exit(); ++ gdb_pid = -1; ++ } ++ else kill_child_dead(debugger_pid); ++ debugger_pid = -1; ++ if(debugger_parent != -1) ++ detach(debugger_parent, SIGINT); ++ } ++ if(xterm_data != NULL) xterm_close(xterm_fd, xterm_data); ++} ++ ++static void exit_debugger(void) ++{ ++ initial_thread_cb(exit_debugger_cb, NULL); ++} ++ ++__uml_exitcall(exit_debugger); ++ ++struct gdb_data { ++ char *str; ++ int err; ++}; ++ ++static void config_gdb_cb(void *arg) ++{ ++ struct gdb_data *data = arg; ++ void *task; ++ int pid; ++ ++ data->err = -1; ++ if(debugger_pid != -1) exit_debugger_cb(NULL); ++ if(!strncmp(data->str, "pid,", strlen("pid,"))){ ++ data->str += strlen("pid,"); ++ pid = strtoul(data->str, NULL, 0); ++ task = cpu_tasks[0].task; ++ debugger_pid = attach_debugger(TASK_EXTERN_PID(task), pid, 0); ++ if(debugger_pid != -1){ ++ data->err = 0; ++ gdb_pid = pid; ++ } ++ return; ++ } ++ data->err = 0; ++ debugger_pid = start_debugger(linux_prog, 0, 0, &debugger_fd); ++ init_proxy(debugger_pid, 0, 0); ++} ++ ++int gdb_config(char *str) ++{ ++ struct gdb_data data; ++ ++ if(*str++ != '=') return(-1); ++ data.str = str; ++ initial_thread_cb(config_gdb_cb, &data); ++ return(data.err); ++} ++ ++void remove_gdb_cb(void *unused) ++{ ++ exit_debugger_cb(NULL); ++} ++ ++int gdb_remove(char *unused) ++{ ++ initial_thread_cb(remove_gdb_cb, NULL); ++ return(0); ++} ++ ++void signal_usr1(int sig) ++{ ++ if(debugger_pid != -1){ ++ printk(UM_KERN_ERR "The debugger is already running\n"); ++ return; ++ } ++ debugger_pid = start_debugger(linux_prog, 0, 0, &debugger_fd); ++ init_proxy(debugger_pid, 0, 
0); ++} ++ ++int init_ptrace_proxy(int idle_pid, int startup, int stop) ++{ ++ int pid, status; ++ ++ pid = start_debugger(linux_prog, startup, stop, &debugger_fd); ++ status = wait_for_stop(idle_pid, SIGSTOP, PTRACE_CONT, NULL); ++ if(pid < 0){ ++ cont(idle_pid); ++ return(-1); ++ } ++ init_proxy(pid, 1, status); ++ return(pid); ++} ++ ++int attach_debugger(int idle_pid, int pid, int stop) ++{ ++ int status = 0, err; ++ ++ err = attach(pid); ++ if(err < 0){ ++ printf("Failed to attach pid %d, errno = %d\n", pid, -err); ++ return(-1); ++ } ++ if(stop) status = wait_for_stop(idle_pid, SIGSTOP, PTRACE_CONT, NULL); ++ init_proxy(pid, 1, status); ++ return(pid); ++} ++ ++#ifdef notdef /* Put this back in when it does something useful */ ++static int __init uml_gdb_init_setup(char *line, int *add) ++{ ++ gdb_init = uml_strdup(line); ++ return 0; ++} ++ ++__uml_setup("gdb=", uml_gdb_init_setup, ++"gdb=<channel description>\n\n" ++); ++#endif ++ ++static int __init uml_gdb_pid_setup(char *line, int *add) ++{ ++ gdb_pid = strtoul(line, NULL, 0); ++ *add = 0; ++ return 0; ++} ++ ++__uml_setup("gdb-pid=", uml_gdb_pid_setup, ++"gdb-pid=<pid>\n" ++" gdb-pid is used to attach an external debugger to UML. 
This may be\n" ++" an already-running gdb or a debugger-like process like strace.\n\n" ++); ++ ++#else ++ ++int debugger_signal(int status, pid_t pid){ return(0); } ++void child_signal(pid_t pid, int status){ } ++int init_ptrace_proxy(int idle_pid, int startup, int stop) ++{ ++ printk(UM_KERN_ERR "debug requested when CONFIG_PT_PROXY is off\n"); ++ kill_child_dead(idle_pid); ++ exit(1); ++} ++ ++void signal_usr1(int sig) ++{ ++ printk(UM_KERN_ERR "debug requested when CONFIG_PT_PROXY is off\n"); ++} ++ ++int attach_debugger(int idle_pid, int pid, int stop) ++{ ++ printk(UM_KERN_ERR "attach_debugger called when CONFIG_PT_PROXY " ++ "is off\n"); ++ return(-1); ++} ++ ++int config_gdb(char *str) ++{ ++ return(-1); ++} ++ ++int remove_gdb(void) ++{ ++ return(-1); ++} ++ ++int init_parent_proxy(int pid) ++{ ++ return(-1); ++} ++ ++void debugger_parent_signal(int status, int pid) ++{ ++} ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/gdb_kern.c um/arch/um/kernel/tt/gdb_kern.c +--- orig/arch/um/kernel/tt/gdb_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/gdb_kern.c Sun Dec 15 21:16:17 2002 +@@ -0,0 +1,40 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/init.h" ++#include "linux/config.h" ++#include "mconsole_kern.h" ++ ++#ifdef CONFIG_MCONSOLE ++ ++extern int gdb_config(char *str); ++extern int gdb_remove(char *unused); ++ ++static struct mc_device gdb_mc = { ++ .name = "gdb", ++ .config = gdb_config, ++ .remove = gdb_remove, ++}; ++ ++int gdb_mc_init(void) ++{ ++ mconsole_register_dev(&gdb_mc); ++ return(0); ++} ++ ++__initcall(gdb_mc_init); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/debug.h um/arch/um/kernel/tt/include/debug.h +--- orig/arch/um/kernel/tt/include/debug.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/include/debug.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,29 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) and ++ * Lars Brinkhoff. 
++ * Licensed under the GPL ++ */ ++ ++#ifndef __DEBUG_H ++#define __DEBUG_H ++ ++extern int debugger_proxy(int status, pid_t pid); ++extern void child_proxy(pid_t pid, int status); ++extern void init_proxy (pid_t pid, int waiting, int status); ++extern int start_debugger(char *prog, int startup, int stop, int *debugger_fd); ++extern void fake_child_exit(void); ++extern int gdb_config(char *str); ++extern int gdb_remove(char *unused); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/mmu.h um/arch/um/kernel/tt/include/mmu.h +--- orig/arch/um/kernel/tt/include/mmu.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/include/mmu.h Sat Nov 9 12:51:32 2002 +@@ -0,0 +1,23 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __TT_MMU_H ++#define __TT_MMU_H ++ ++struct mmu_context_tt { ++}; ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/mode.h um/arch/um/kernel/tt/include/mode.h +--- orig/arch/um/kernel/tt/include/mode.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/include/mode.h Mon Dec 9 00:34:40 2002 +@@ -0,0 +1,36 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __MODE_TT_H__ ++#define __MODE_TT_H__ ++ ++#include "sysdep/ptrace.h" ++ ++extern int tracing_pid; ++ ++extern int tracer(int (*init_proc)(void *), void *sp); ++extern void user_time_init_tt(void); ++extern int copy_sc_from_user_tt(void *to_ptr, void *from_ptr, void *data); ++extern int copy_sc_to_user_tt(void *to_ptr, void *fp, void *from_ptr, ++ void *data); ++extern void sig_handler_common_tt(int sig, void *sc); ++extern void syscall_handler_tt(int sig, union uml_pt_regs *regs); ++extern void reboot_tt(void); ++extern void halt_tt(void); ++extern int is_tracer_winch(int pid, int fd, void *data); ++extern void kill_off_processes_tt(void); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/mode_kern.h um/arch/um/kernel/tt/include/mode_kern.h +--- orig/arch/um/kernel/tt/include/mode_kern.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/include/mode_kern.h Mon Dec 16 21:49:18 2002 +@@ -0,0 +1,52 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __TT_MODE_KERN_H__ ++#define __TT_MODE_KERN_H__ ++ ++#include "linux/sched.h" ++#include "asm/page.h" ++#include "asm/ptrace.h" ++#include "asm/uaccess.h" ++ ++extern void *_switch_to_tt(void *prev, void *next); ++extern void flush_thread_tt(void); ++extern void start_thread_tt(struct pt_regs *regs, unsigned long eip, ++ unsigned long esp); ++extern int copy_thread_tt(int nr, unsigned long clone_flags, unsigned long sp, ++ unsigned long stack_top, struct task_struct *p, ++ struct pt_regs *regs); ++extern void release_thread_tt(struct task_struct *task); ++extern void exit_thread_tt(void); ++extern void initial_thread_cb_tt(void (*proc)(void *), void *arg); ++extern void init_idle_tt(void); ++extern void flush_tlb_kernel_vm_tt(void); ++extern void __flush_tlb_one_tt(unsigned long addr); ++extern void flush_tlb_range_tt(struct mm_struct *mm, unsigned long start, ++ unsigned long end); ++extern void flush_tlb_mm_tt(struct mm_struct *mm); ++extern void force_flush_all_tt(void); ++extern long execute_syscall_tt(void *r); ++extern void before_mem_tt(unsigned long brk_start); ++extern unsigned long set_task_sizes_tt(int arg, unsigned long *host_size_out, ++ unsigned long *task_size_out); ++extern int start_uml_tt(void); ++extern int external_pid_tt(struct task_struct *task); ++extern int thread_pid_tt(struct thread_struct *thread); ++ ++#define kmem_end_tt (host_task_size - ABOVE_KMEM) ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow 
Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/ptrace-tt.h um/arch/um/kernel/tt/include/ptrace-tt.h +--- orig/arch/um/kernel/tt/include/ptrace-tt.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/include/ptrace-tt.h Fri Jan 17 13:23:30 2003 +@@ -0,0 +1,26 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __PTRACE_TT_H ++#define __PTRACE_TT_H ++ ++#include "uml-config.h" ++ ++#ifdef UML_CONFIG_MODE_TT ++#include "sysdep/sc.h" ++#endif ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/tt.h um/arch/um/kernel/tt/include/tt.h +--- orig/arch/um/kernel/tt/include/tt.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/include/tt.h Fri Dec 20 23:29:11 2002 +@@ -0,0 +1,46 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __TT_H__ ++#define __TT_H__ ++ ++#include "sysdep/ptrace.h" ++ ++extern int gdb_pid; ++extern int debug; ++extern int debug_stop; ++extern int debug_trace; ++ ++extern int honeypot; ++ ++extern int fork_tramp(void *sig_stack); ++extern int do_proc_op(void *t, int proc_id); ++extern int tracer(int (*init_proc)(void *), void *sp); ++extern void attach_process(int pid); ++extern void tracer_panic(char *format, ...); ++extern void set_init_pid(int pid); ++extern int set_user_mode(void *task); ++extern void set_tracing(void *t, int tracing); ++extern int is_tracing(void *task); ++extern int singlestepping_tt(void *t); ++extern void clear_singlestep(void *t); ++extern void syscall_handler(int sig, union uml_pt_regs *regs); ++extern void exit_kernel(int pid, void *task); ++extern int do_syscall(void *task, int pid); ++extern int is_valid_pid(int pid); ++extern void remap_data(void *segment_start, void *segment_end, int w); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/include/uaccess.h um/arch/um/kernel/tt/include/uaccess.h +--- orig/arch/um/kernel/tt/include/uaccess.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/include/uaccess.h Tue Mar 25 16:58:42 2003 +@@ -0,0 +1,122 @@ ++/* ++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __TT_UACCESS_H ++#define __TT_UACCESS_H ++ ++#include "linux/string.h" ++#include "linux/sched.h" ++#include "asm/processor.h" ++#include "asm/errno.h" ++#include "asm/current.h" ++#include "asm/a.out.h" ++#include "uml_uaccess.h" ++ ++#define ABOVE_KMEM (16 * 1024 * 1024) ++ ++extern unsigned long end_vm; ++extern unsigned long uml_physmem; ++ ++#define under_task_size(addr, size) \ ++ (((unsigned long) (addr) < TASK_SIZE) && \ ++ (((unsigned long) (addr) + (size)) < TASK_SIZE)) ++ ++#define is_stack(addr, size) \ ++ (((unsigned long) (addr) < STACK_TOP) && \ ++ ((unsigned long) (addr) >= STACK_TOP - ABOVE_KMEM) && \ ++ (((unsigned long) (addr) + (size)) <= STACK_TOP)) ++ ++#define access_ok_tt(type, addr, size) \ ++ ((type == VERIFY_READ) || (segment_eq(get_fs(), KERNEL_DS)) || \ ++ (((unsigned long) (addr) <= ((unsigned long) (addr) + (size))) && \ ++ (under_task_size(addr, size) || is_stack(addr, size)))) ++ ++static inline int verify_area_tt(int type, const void * addr, ++ unsigned long size) ++{ ++ return(access_ok_tt(type, addr, size) ? 
0 : -EFAULT); ++} ++ ++extern unsigned long get_fault_addr(void); ++ ++extern int __do_copy_from_user(void *to, const void *from, int n, ++ void **fault_addr, void **fault_catcher); ++ ++static inline int copy_from_user_tt(void *to, const void *from, int n) ++{ ++ if(!access_ok_tt(VERIFY_READ, from, n)) ++ return(n); ++ ++ return(__do_copy_from_user(to, from, n, &current->thread.fault_addr, ++ &current->thread.fault_catcher)); ++} ++ ++static inline int copy_to_user_tt(void *to, const void *from, int n) ++{ ++ if(!access_ok_tt(VERIFY_WRITE, to, n)) ++ return(n); ++ ++ return(__do_copy_to_user(to, from, n, &current->thread.fault_addr, ++ &current->thread.fault_catcher)); ++} ++ ++extern int __do_strncpy_from_user(char *dst, const char *src, size_t n, ++ void **fault_addr, void **fault_catcher); ++ ++static inline int strncpy_from_user_tt(char *dst, const char *src, int count) ++{ ++ int n; ++ ++ if(!access_ok_tt(VERIFY_READ, src, 1)) ++ return(-EFAULT); ++ ++ n = __do_strncpy_from_user(dst, src, count, ++ &current->thread.fault_addr, ++ &current->thread.fault_catcher); ++ if(n < 0) return(-EFAULT); ++ return(n); ++} ++ ++extern int __do_clear_user(void *mem, size_t len, void **fault_addr, ++ void **fault_catcher); ++ ++static inline int __clear_user_tt(void *mem, int len) ++{ ++ return(__do_clear_user(mem, len, ++ &current->thread.fault_addr, ++ &current->thread.fault_catcher)); ++} ++ ++static inline int clear_user_tt(void *mem, int len) ++{ ++ if(!access_ok_tt(VERIFY_WRITE, mem, len)) ++ return(len); ++ ++ return(__do_clear_user(mem, len, &current->thread.fault_addr, ++ &current->thread.fault_catcher)); ++} ++ ++extern int __do_strnlen_user(const char *str, unsigned long n, ++ void **fault_addr, void **fault_catcher); ++ ++static inline int strnlen_user_tt(const void *str, int len) ++{ ++ return(__do_strnlen_user(str, len, ++ &current->thread.fault_addr, ++ &current->thread.fault_catcher)); ++} ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ksyms.c um/arch/um/kernel/tt/ksyms.c +--- orig/arch/um/kernel/tt/ksyms.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/ksyms.c Sun Oct 27 17:01:56 2002 +@@ -0,0 +1,28 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/module.h" ++#include "asm/uaccess.h" ++#include "mode.h" ++ ++EXPORT_SYMBOL(__do_copy_from_user); ++EXPORT_SYMBOL(__do_copy_to_user); ++EXPORT_SYMBOL(__do_strncpy_from_user); ++EXPORT_SYMBOL(__do_strnlen_user); ++EXPORT_SYMBOL(__do_clear_user); ++ ++EXPORT_SYMBOL(tracing_pid); ++EXPORT_SYMBOL(honeypot); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/mem.c um/arch/um/kernel/tt/mem.c +--- orig/arch/um/kernel/tt/mem.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/mem.c Mon Dec 16 21:49:51 2002 +@@ -0,0 +1,51 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/stddef.h" ++#include "linux/config.h" ++#include "linux/mm.h" ++#include "asm/uaccess.h" ++#include "mem_user.h" ++#include "kern_util.h" ++#include "user_util.h" ++#include "kern.h" ++#include "tt.h" ++ ++void before_mem_tt(unsigned long brk_start) ++{ ++ if(!jail || debug) ++ remap_data(UML_ROUND_DOWN(&_stext), UML_ROUND_UP(&_etext), 1); ++ remap_data(UML_ROUND_DOWN(&_sdata), UML_ROUND_UP(&_edata), 1); ++ remap_data(UML_ROUND_DOWN(&__bss_start), UML_ROUND_UP(brk_start), 1); ++} ++ ++#ifdef CONFIG_HOST_2G_2G ++#define TOP 0x80000000 ++#else ++#define TOP 0xc0000000 ++#endif ++ ++#define SIZE ((CONFIG_NEST_LEVEL + CONFIG_KERNEL_HALF_GIGS) * 0x20000000) ++#define START (TOP - SIZE) ++ ++unsigned long set_task_sizes_tt(int arg, unsigned long *host_size_out, ++ unsigned long *task_size_out) ++{ ++ /* Round up to the nearest 4M */ ++ *host_size_out = ROUND_4M((unsigned long) &arg); ++ *task_size_out = START; ++ return(START); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/mem_user.c um/arch/um/kernel/tt/mem_user.c +--- orig/arch/um/kernel/tt/mem_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/mem_user.c Fri Jan 17 22:07:31 2003 +@@ -0,0 +1,50 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdlib.h> ++#include <stdio.h> ++#include <unistd.h> ++#include <string.h> ++#include <errno.h> ++#include <sys/mman.h> ++#include "tt.h" ++#include "mem_user.h" ++#include "user_util.h" ++ ++void remap_data(void *segment_start, void *segment_end, int w) ++{ ++ void *addr; ++ unsigned long size; ++ int data, prot; ++ ++ if(w) prot = PROT_WRITE; ++ else prot = 0; ++ prot |= PROT_READ | PROT_EXEC; ++ size = (unsigned long) segment_end - ++ (unsigned long) segment_start; ++ data = create_mem_file(size); ++ if((addr = mmap(NULL, size, PROT_WRITE | PROT_READ, ++ MAP_SHARED, data, 0)) == MAP_FAILED){ ++ perror("mapping new data segment"); ++ exit(1); ++ } ++ memcpy(addr, segment_start, size); ++ if(switcheroo(data, prot, addr, segment_start, ++ size) < 0){ ++ printf("switcheroo failed\n"); ++ exit(1); ++ } ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/process_kern.c um/arch/um/kernel/tt/process_kern.c +--- orig/arch/um/kernel/tt/process_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/process_kern.c Sun Feb 16 21:34:23 2003 +@@ -0,0 +1,516 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/sched.h" ++#include "linux/signal.h" ++#include "linux/kernel.h" ++#include "asm/system.h" ++#include "asm/pgalloc.h" ++#include "asm/ptrace.h" ++#include "irq_user.h" ++#include "signal_user.h" ++#include "kern_util.h" ++#include "user_util.h" ++#include "os.h" ++#include "kern.h" ++#include "sigcontext.h" ++#include "time_user.h" ++#include "mem_user.h" ++#include "tlb.h" ++#include "mode.h" ++#include "init.h" ++#include "tt.h" ++ ++void *_switch_to_tt(void *prev, void *next) ++{ ++ struct task_struct *from, *to; ++ unsigned long flags; ++ int err, vtalrm, alrm, prof, cpu; ++ char c; ++ /* jailing and SMP are incompatible, so this doesn't need to be ++ * made per-cpu ++ */ ++ static int reading; ++ ++ from = prev; ++ to = next; ++ ++ to->thread.prev_sched = from; ++ ++ cpu = from->processor; ++ if(cpu == 0) ++ forward_interrupts(to->thread.mode.tt.extern_pid); ++#ifdef CONFIG_SMP ++ forward_ipi(cpu_data[cpu].ipi_pipe[0], to->thread.mode.tt.extern_pid); ++#endif ++ local_irq_save(flags); ++ ++ vtalrm = change_sig(SIGVTALRM, 0); ++ alrm = change_sig(SIGALRM, 0); ++ prof = change_sig(SIGPROF, 0); ++ ++ c = 0; ++ set_current(to); ++ ++ reading = 0; ++ err = os_write_file(to->thread.mode.tt.switch_pipe[1], &c, sizeof(c)); ++ if(err != sizeof(c)) ++ panic("write of switch_pipe failed, errno = %d", -err); ++ ++ reading = 1; ++ if(from->state == TASK_ZOMBIE) ++ os_kill_process(os_getpid(), 0); ++ ++ err = os_read_file(from->thread.mode.tt.switch_pipe[0], &c, 
sizeof(c)); ++ if(err != sizeof(c)) ++ panic("read of switch_pipe failed, errno = %d", -err); ++ ++ /* This works around a nasty race with 'jail'. If we are switching ++ * between two threads of a threaded app and the incoming process ++ * runs before the outgoing process reaches the read, and it makes ++ * it all the way out to userspace, then it will have write-protected ++ * the outgoing process stack. Then, when the outgoing process ++ * returns from the write, it will segfault because it can no longer ++ * write its own stack. So, in order to avoid that, the incoming ++ * thread sits in a loop yielding until 'reading' is set. This ++ * isn't entirely safe, since there may be a reschedule from a timer ++ * happening between setting 'reading' and sleeping in read. But, ++ * it should get a whole quantum in which to reach the read and sleep, ++ * which should be enough. ++ */ ++ ++ if(jail){ ++ while(!reading) sched_yield(); ++ } ++ ++ change_sig(SIGVTALRM, vtalrm); ++ change_sig(SIGALRM, alrm); ++ change_sig(SIGPROF, prof); ++ ++ arch_switch(); ++ ++ flush_tlb_all(); ++ local_irq_restore(flags); ++ ++ return(current->thread.prev_sched); ++} ++ ++void release_thread_tt(struct task_struct *task) ++{ ++ os_kill_process(task->thread.mode.tt.extern_pid, 0); ++} ++ ++void exit_thread_tt(void) ++{ ++ close(current->thread.mode.tt.switch_pipe[0]); ++ close(current->thread.mode.tt.switch_pipe[1]); ++} ++ ++extern void schedule_tail(struct task_struct *prev); ++ ++static void new_thread_handler(int sig) ++{ ++ int (*fn)(void *); ++ void *arg; ++ ++ fn = current->thread.request.u.thread.proc; ++ arg = current->thread.request.u.thread.arg; ++ UPT_SC(&current->thread.regs.regs) = (void *) (&sig + 1); ++ suspend_new_thread(current->thread.mode.tt.switch_pipe[0]); ++ ++ init_new_thread_signals(1); ++ enable_timer(); ++ free_page(current->thread.temp_stack); ++ set_cmdline("(kernel thread)"); ++ force_flush_all(); ++ ++ if(current->thread.prev_sched != NULL) ++ 
schedule_tail(current->thread.prev_sched); ++ current->thread.prev_sched = NULL; ++ ++ change_sig(SIGUSR1, 1); ++ change_sig(SIGVTALRM, 1); ++ change_sig(SIGPROF, 1); ++ sti(); ++ if(!run_kernel_thread(fn, arg, &current->thread.exec_buf)) ++ do_exit(0); ++} ++ ++static int new_thread_proc(void *stack) ++{ ++ cli(); ++ init_new_thread_stack(stack, new_thread_handler); ++ os_usr1_process(os_getpid()); ++ return(0); ++} ++ ++/* Signal masking - signals are blocked at the start of fork_tramp. They ++ * are re-enabled when finish_fork_handler is entered by fork_tramp hitting ++ * itself with a SIGUSR1. set_user_mode has to be run with SIGUSR1 off, ++ * so it is blocked before it's called. They are re-enabled on sigreturn ++ * despite the fact that they were blocked when the SIGUSR1 was issued because ++ * copy_thread copies the parent's signcontext, including the signal mask ++ * onto the signal frame. ++ */ ++ ++static void finish_fork_handler(int sig) ++{ ++ UPT_SC(&current->thread.regs.regs) = (void *) (&sig + 1); ++ suspend_new_thread(current->thread.mode.tt.switch_pipe[0]); ++ ++ init_new_thread_signals(1); ++ enable_timer(); ++ sti(); ++ force_flush_all(); ++ if(current->mm != current->p_pptr->mm) ++ protect_memory(uml_reserved, high_physmem - uml_reserved, 1, ++ 1, 0, 1); ++ task_protections((unsigned long) current); ++ ++ if(current->thread.prev_sched != NULL) ++ schedule_tail(current->thread.prev_sched); ++ current->thread.prev_sched = NULL; ++ ++ free_page(current->thread.temp_stack); ++ cli(); ++ change_sig(SIGUSR1, 0); ++ set_user_mode(current); ++} ++ ++int fork_tramp(void *stack) ++{ ++ cli(); ++ init_new_thread_stack(stack, finish_fork_handler); ++ os_usr1_process(os_getpid()); ++ return(0); ++} ++ ++int copy_thread_tt(int nr, unsigned long clone_flags, unsigned long sp, ++ unsigned long stack_top, struct task_struct * p, ++ struct pt_regs *regs) ++{ ++ int (*tramp)(void *); ++ int new_pid, err; ++ unsigned long stack; ++ ++ if(current->thread.forking) ++ tramp = 
fork_tramp; ++ else { ++ tramp = new_thread_proc; ++ p->thread.request.u.thread = current->thread.request.u.thread; ++ } ++ ++ err = os_pipe(p->thread.mode.tt.switch_pipe, 1, 1); ++ if(err){ ++ printk("copy_thread : pipe failed, errno = %d\n", -err); ++ return(err); ++ } ++ ++ stack = alloc_stack(0, 0); ++ if(stack == 0){ ++ printk(KERN_ERR "copy_thread : failed to allocate " ++ "temporary stack\n"); ++ return(-ENOMEM); ++ } ++ ++ clone_flags &= CLONE_VM; ++ p->thread.temp_stack = stack; ++ new_pid = start_fork_tramp((void *) p->thread.kernel_stack, stack, ++ clone_flags, tramp); ++ if(new_pid < 0){ ++ printk(KERN_ERR "copy_thread : clone failed - errno = %d\n", ++ -new_pid); ++ return(new_pid); ++ } ++ ++ if(current->thread.forking){ ++ sc_to_sc(UPT_SC(&p->thread.regs.regs), ++ UPT_SC(&current->thread.regs.regs)); ++ SC_SET_SYSCALL_RETURN(UPT_SC(&p->thread.regs.regs), 0); ++ if(sp != 0) SC_SP(UPT_SC(&p->thread.regs.regs)) = sp; ++ } ++ p->thread.mode.tt.extern_pid = new_pid; ++ ++ current->thread.request.op = OP_FORK; ++ current->thread.request.u.fork.pid = new_pid; ++ os_usr1_process(os_getpid()); ++ return(0); ++} ++ ++void reboot_tt(void) ++{ ++ current->thread.request.op = OP_REBOOT; ++ os_usr1_process(os_getpid()); ++ os_kill_process(os_getpid(), 0); ++} ++ ++void halt_tt(void) ++{ ++ current->thread.request.op = OP_HALT; ++ os_usr1_process(os_getpid()); ++ os_kill_process(os_getpid(), 0); ++} ++ ++void kill_off_processes_tt(void) ++{ ++ struct task_struct *p; ++ int me; ++ ++ me = os_getpid(); ++ for_each_task(p){ ++ int pid = p->thread.mode.tt.extern_pid; ++ if((pid != me) && (pid != -1)) ++ os_kill_process(p->thread.mode.tt.extern_pid, 0); ++ } ++ if((init_task.thread.mode.tt.extern_pid != me) && ++ (init_task.thread.mode.tt.extern_pid != -1)) ++ os_kill_process(init_task.thread.mode.tt.extern_pid, 0); ++} ++ ++void initial_thread_cb_tt(void (*proc)(void *), void *arg) ++{ ++ if(os_getpid() == tracing_pid){ ++ (*proc)(arg); ++ } ++ else { ++ 
current->thread.request.op = OP_CB; ++ current->thread.request.u.cb.proc = proc; ++ current->thread.request.u.cb.arg = arg; ++ os_usr1_process(os_getpid()); ++ } ++} ++ ++int do_proc_op(void *t, int proc_id) ++{ ++ struct task_struct *task; ++ struct thread_struct *thread; ++ int op, pid; ++ ++ task = t; ++ thread = &task->thread; ++ op = thread->request.op; ++ switch(op){ ++ case OP_NONE: ++ case OP_TRACE_ON: ++ break; ++ case OP_EXEC: ++ pid = thread->request.u.exec.pid; ++ do_exec(thread->mode.tt.extern_pid, pid); ++ thread->mode.tt.extern_pid = pid; ++ cpu_tasks[task->processor].pid = pid; ++ break; ++ case OP_FORK: ++ attach_process(thread->request.u.fork.pid); ++ break; ++ case OP_CB: ++ (*thread->request.u.cb.proc)(thread->request.u.cb.arg); ++ break; ++ case OP_REBOOT: ++ case OP_HALT: ++ break; ++ default: ++ tracer_panic("Bad op in do_proc_op"); ++ break; ++ } ++ thread->request.op = OP_NONE; ++ return(op); ++} ++ ++void init_idle_tt(void) ++{ ++ idle_timer(); ++} ++ ++/* Changed by jail_setup, which is a setup */ ++int jail = 0; ++ ++int __init jail_setup(char *line, int *add) ++{ ++ int ok = 1; ++ ++ if(jail) return(0); ++#ifdef CONFIG_SMP ++ printf("'jail' may not used used in a kernel with CONFIG_SMP " ++ "enabled\n"); ++ ok = 0; ++#endif ++#ifdef CONFIG_HOSTFS ++ printf("'jail' may not used used in a kernel with CONFIG_HOSTFS " ++ "enabled\n"); ++ ok = 0; ++#endif ++#ifdef CONFIG_MODULES ++ printf("'jail' may not used used in a kernel with CONFIG_MODULES " ++ "enabled\n"); ++ ok = 0; ++#endif ++ if(!ok) exit(1); ++ ++ /* CAP_SYS_RAWIO controls the ability to open /dev/mem and /dev/kmem. ++ * Removing it from the bounding set eliminates the ability of anything ++ * to acquire it, and thus read or write kernel memory. 
++ */ ++ cap_lower(cap_bset, CAP_SYS_RAWIO); ++ jail = 1; ++ return(0); ++} ++ ++__uml_setup("jail", jail_setup, ++"jail\n" ++" Enables the protection of kernel memory from processes.\n\n" ++); ++ ++static void mprotect_kernel_mem(int w) ++{ ++ unsigned long start, end; ++ int pages; ++ ++ if(!jail || (current == &init_task)) return; ++ ++ pages = (1 << CONFIG_KERNEL_STACK_ORDER); ++ ++ start = (unsigned long) current + PAGE_SIZE; ++ end = (unsigned long) current + PAGE_SIZE * pages; ++ protect_memory(uml_reserved, start - uml_reserved, 1, w, 1, 1); ++ protect_memory(end, high_physmem - end, 1, w, 1, 1); ++ ++ start = (unsigned long) UML_ROUND_DOWN(&_stext); ++ end = (unsigned long) UML_ROUND_UP(&_etext); ++ protect_memory(start, end - start, 1, w, 1, 1); ++ ++ start = (unsigned long) UML_ROUND_DOWN(&_unprotected_end); ++ end = (unsigned long) UML_ROUND_UP(&_edata); ++ protect_memory(start, end - start, 1, w, 1, 1); ++ ++ start = (unsigned long) UML_ROUND_DOWN(&__bss_start); ++ end = (unsigned long) UML_ROUND_UP(brk_start); ++ protect_memory(start, end - start, 1, w, 1, 1); ++ ++ mprotect_kernel_vm(w); ++} ++ ++void unprotect_kernel_mem(void) ++{ ++ mprotect_kernel_mem(1); ++} ++ ++void protect_kernel_mem(void) ++{ ++ mprotect_kernel_mem(0); ++} ++ ++extern void start_kernel(void); ++ ++static int start_kernel_proc(void *unused) ++{ ++ int pid; ++ ++ block_signals(); ++ pid = os_getpid(); ++ ++ cpu_tasks[0].pid = pid; ++ cpu_tasks[0].task = current; ++#ifdef CONFIG_SMP ++ cpu_online_map = 1; ++#endif ++ if(debug) os_stop_process(pid); ++ start_kernel(); ++ return(0); ++} ++ ++void set_tracing(void *task, int tracing) ++{ ++ ((struct task_struct *) task)->thread.mode.tt.tracing = tracing; ++} ++ ++int is_tracing(void *t) ++{ ++ return (((struct task_struct *) t)->thread.mode.tt.tracing); ++} ++ ++int set_user_mode(void *t) ++{ ++ struct task_struct *task; ++ ++ task = t ? 
t : current; ++ if(task->thread.mode.tt.tracing) ++ return(1); ++ task->thread.request.op = OP_TRACE_ON; ++ os_usr1_process(os_getpid()); ++ return(0); ++} ++ ++void set_init_pid(int pid) ++{ ++ int err; ++ ++ init_task.thread.mode.tt.extern_pid = pid; ++ err = os_pipe(init_task.thread.mode.tt.switch_pipe, 1, 1); ++ if(err) panic("Can't create switch pipe for init_task, errno = %d", ++ err); ++} ++ ++int singlestepping_tt(void *t) ++{ ++ struct task_struct *task = t; ++ ++ if(task->thread.mode.tt.singlestep_syscall) ++ return(0); ++ return(task->ptrace & PT_DTRACE); ++} ++ ++void clear_singlestep(void *t) ++{ ++ struct task_struct *task = t; ++ ++ task->ptrace &= ~PT_DTRACE; ++} ++ ++int start_uml_tt(void) ++{ ++ void *sp; ++ int pages; ++ ++ pages = (1 << CONFIG_KERNEL_STACK_ORDER) - 2; ++ sp = (void *) init_task.thread.kernel_stack + pages * PAGE_SIZE - ++ sizeof(unsigned long); ++ return(tracer(start_kernel_proc, sp)); ++} ++ ++int external_pid_tt(struct task_struct *task) ++{ ++ return(task->thread.mode.tt.extern_pid); ++} ++ ++int thread_pid_tt(struct thread_struct *thread) ++{ ++ return(thread->mode.tt.extern_pid); ++} ++ ++int is_valid_pid(int pid) ++{ ++ struct task_struct *task; ++ ++ read_lock(&tasklist_lock); ++ for_each_task(task){ ++ if(task->thread.mode.tt.extern_pid == pid){ ++ read_unlock(&tasklist_lock); ++ return(1); ++ } ++ } ++ read_unlock(&tasklist_lock); ++ return(0); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/Makefile um/arch/um/kernel/tt/ptproxy/Makefile +--- orig/arch/um/kernel/tt/ptproxy/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/ptproxy/Makefile Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,12 @@ ++O_TARGET = ptproxy.o ++ ++obj-y = proxy.o ptrace.o sysdep.o wait.o ++ ++USER_OBJS = $(obj-y) ++ ++include $(TOPDIR)/Rules.make ++ ++$(USER_OBJS) : %.o: %.c ++ $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $< ++ ++clean: +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/proxy.c um/arch/um/kernel/tt/ptproxy/proxy.c +--- orig/arch/um/kernel/tt/ptproxy/proxy.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/ptproxy/proxy.c Wed Apr 16 14:01:03 2003 +@@ -0,0 +1,370 @@ ++/********************************************************************** ++proxy.c ++ ++Copyright (C) 1999 Lars Brinkhoff. See the file COPYING for licensing ++terms and conditions. 
++ ++Jeff Dike (jdike@karaya.com) : Modified for integration into uml ++**********************************************************************/ ++ ++/* XXX This file shouldn't refer to CONFIG_* */ ++ ++#include <errno.h> ++#include <stdio.h> ++#include <stdlib.h> ++#include <unistd.h> ++#include <signal.h> ++#include <string.h> ++#include <fcntl.h> ++#include <termios.h> ++#include <sys/wait.h> ++#include <sys/types.h> ++#include <sys/ptrace.h> ++#include <sys/ioctl.h> ++#include <asm/unistd.h> ++ ++#include "ptproxy.h" ++#include "sysdep.h" ++#include "wait.h" ++ ++#include "user_util.h" ++#include "user.h" ++#include "os.h" ++#include "tempfile.h" ++ ++static int debugger_wait(debugger_state *debugger, int *status, int options, ++ int (*syscall)(debugger_state *debugger, pid_t child), ++ int (*normal_return)(debugger_state *debugger, ++ pid_t unused), ++ int (*wait_return)(debugger_state *debugger, ++ pid_t unused)) ++{ ++ if(debugger->real_wait){ ++ debugger->handle_trace = normal_return; ++ syscall_continue(debugger->pid); ++ debugger->real_wait = 0; ++ return(1); ++ } ++ debugger->wait_status_ptr = status; ++ debugger->wait_options = options; ++ if((debugger->debugee != NULL) && debugger->debugee->event){ ++ syscall_continue(debugger->pid); ++ wait_for_stop(debugger->pid, SIGTRAP, PTRACE_SYSCALL, ++ NULL); ++ (*wait_return)(debugger, -1); ++ return(0); ++ } ++ else if(debugger->wait_options & WNOHANG){ ++ syscall_cancel(debugger->pid, 0); ++ debugger->handle_trace = syscall; ++ return(0); ++ } ++ else { ++ syscall_pause(debugger->pid); ++ debugger->handle_trace = wait_return; ++ debugger->waiting = 1; ++ } ++ return(1); ++} ++ ++/* ++ * Handle debugger trap, i.e. syscall. 
++ */ ++ ++int debugger_syscall(debugger_state *debugger, pid_t child) ++{ ++ long arg1, arg2, arg3, arg4, arg5, result; ++ int syscall, ret = 0; ++ ++ syscall = get_syscall(debugger->pid, &arg1, &arg2, &arg3, &arg4, ++ &arg5); ++ ++ switch(syscall){ ++ case __NR_execve: ++ /* execve never returns */ ++ debugger->handle_trace = debugger_syscall; ++ break; ++ ++ case __NR_ptrace: ++ if(debugger->debugee->pid != 0) arg2 = debugger->debugee->pid; ++ if(!debugger->debugee->in_context) ++ child = debugger->debugee->pid; ++ result = proxy_ptrace(debugger, arg1, arg2, arg3, arg4, child, ++ &ret); ++ syscall_cancel(debugger->pid, result); ++ debugger->handle_trace = debugger_syscall; ++ return(ret); ++ ++ case __NR_waitpid: ++ case __NR_wait4: ++ if(!debugger_wait(debugger, (int *) arg2, arg3, ++ debugger_syscall, debugger_normal_return, ++ proxy_wait_return)) ++ return(0); ++ break; ++ ++ case __NR_kill: ++ if(!debugger->debugee->in_context) ++ child = debugger->debugee->pid; ++ if(arg1 == debugger->debugee->pid){ ++ result = kill(child, arg2); ++ syscall_cancel(debugger->pid, result); ++ debugger->handle_trace = debugger_syscall; ++ return(0); ++ } ++ else debugger->handle_trace = debugger_normal_return; ++ break; ++ ++ default: ++ debugger->handle_trace = debugger_normal_return; ++ } ++ ++ syscall_continue(debugger->pid); ++ return(0); ++} ++ ++/* Used by the tracing thread */ ++static debugger_state parent; ++static int parent_syscall(debugger_state *debugger, int pid); ++ ++int init_parent_proxy(int pid) ++{ ++ parent = ((debugger_state) { .pid = pid, ++ .wait_options = 0, ++ .wait_status_ptr = NULL, ++ .waiting = 0, ++ .real_wait = 0, ++ .expecting_child = 0, ++ .handle_trace = parent_syscall, ++ .debugee = NULL } ); ++ return(0); ++} ++ ++int parent_normal_return(debugger_state *debugger, pid_t unused) ++{ ++ debugger->handle_trace = parent_syscall; ++ syscall_continue(debugger->pid); ++ return(0); ++} ++ ++static int parent_syscall(debugger_state *debugger, int 
pid) ++{ ++ long arg1, arg2, arg3, arg4, arg5; ++ int syscall; ++ ++ syscall = get_syscall(pid, &arg1, &arg2, &arg3, &arg4, &arg5); ++ ++ if((syscall == __NR_waitpid) || (syscall == __NR_wait4)){ ++ debugger_wait(&parent, (int *) arg2, arg3, parent_syscall, ++ parent_normal_return, parent_wait_return); ++ } ++ else ptrace(PTRACE_SYSCALL, pid, 0, 0); ++ return(0); ++} ++ ++int debugger_normal_return(debugger_state *debugger, pid_t unused) ++{ ++ debugger->handle_trace = debugger_syscall; ++ syscall_continue(debugger->pid); ++ return(0); ++} ++ ++void debugger_cancelled_return(debugger_state *debugger, int result) ++{ ++ debugger->handle_trace = debugger_syscall; ++ syscall_set_result(debugger->pid, result); ++ syscall_continue(debugger->pid); ++} ++ ++/* Used by the tracing thread */ ++static debugger_state debugger; ++static debugee_state debugee; ++ ++void init_proxy (pid_t debugger_pid, int stopped, int status) ++{ ++ debugger.pid = debugger_pid; ++ debugger.handle_trace = debugger_syscall; ++ debugger.debugee = &debugee; ++ debugger.waiting = 0; ++ debugger.real_wait = 0; ++ debugger.expecting_child = 0; ++ ++ debugee.pid = 0; ++ debugee.traced = 0; ++ debugee.stopped = stopped; ++ debugee.event = 0; ++ debugee.zombie = 0; ++ debugee.died = 0; ++ debugee.wait_status = status; ++ debugee.in_context = 1; ++} ++ ++int debugger_proxy(int status, int pid) ++{ ++ int ret = 0, sig; ++ ++ if(WIFSTOPPED(status)){ ++ sig = WSTOPSIG(status); ++ if (sig == SIGTRAP) ++ ret = (*debugger.handle_trace)(&debugger, pid); ++ ++ else if(sig == SIGCHLD){ ++ if(debugger.expecting_child){ ++ ptrace(PTRACE_SYSCALL, debugger.pid, 0, sig); ++ debugger.expecting_child = 0; ++ } ++ else if(debugger.waiting) ++ real_wait_return(&debugger); ++ else { ++ ptrace(PTRACE_SYSCALL, debugger.pid, 0, sig); ++ debugger.real_wait = 1; ++ } ++ } ++ else ptrace(PTRACE_SYSCALL, debugger.pid, 0, sig); ++ } ++ else if(WIFEXITED(status)){ ++ tracer_panic("debugger (pid %d) exited with status %d", ++ 
debugger.pid, WEXITSTATUS(status)); ++ } ++ else if(WIFSIGNALED(status)){ ++ tracer_panic("debugger (pid %d) exited with signal %d", ++ debugger.pid, WTERMSIG(status)); ++ } ++ else { ++ tracer_panic("proxy got unknown status (0x%x) on debugger " ++ "(pid %d)", status, debugger.pid); ++ } ++ return(ret); ++} ++ ++void child_proxy(pid_t pid, int status) ++{ ++ debugee.event = 1; ++ debugee.wait_status = status; ++ ++ if(WIFSTOPPED(status)){ ++ debugee.stopped = 1; ++ debugger.expecting_child = 1; ++ kill(debugger.pid, SIGCHLD); ++ } ++ else if(WIFEXITED(status) || WIFSIGNALED(status)){ ++ debugee.zombie = 1; ++ debugger.expecting_child = 1; ++ kill(debugger.pid, SIGCHLD); ++ } ++ else panic("proxy got unknown status (0x%x) on child (pid %d)", ++ status, pid); ++} ++ ++void debugger_parent_signal(int status, int pid) ++{ ++ int sig; ++ ++ if(WIFSTOPPED(status)){ ++ sig = WSTOPSIG(status); ++ if(sig == SIGTRAP) (*parent.handle_trace)(&parent, pid); ++ else ptrace(PTRACE_SYSCALL, pid, 0, sig); ++ } ++} ++ ++void fake_child_exit(void) ++{ ++ int status, pid; ++ ++ child_proxy(1, W_EXITCODE(0, 0)); ++ while(debugger.waiting == 1){ ++ pid = waitpid(debugger.pid, &status, WUNTRACED); ++ if(pid != debugger.pid){ ++ printk("fake_child_exit - waitpid failed, " ++ "errno = %d\n", errno); ++ return; ++ } ++ debugger_proxy(status, debugger.pid); ++ } ++ pid = waitpid(debugger.pid, &status, WUNTRACED); ++ if(pid != debugger.pid){ ++ printk("fake_child_exit - waitpid failed, " ++ "errno = %d\n", errno); ++ return; ++ } ++ if(ptrace(PTRACE_DETACH, debugger.pid, 0, SIGCONT) < 0) ++ printk("fake_child_exit - PTRACE_DETACH failed, errno = %d\n", ++ errno); ++} ++ ++char gdb_init_string[] = ++"att 1 \n\ ++b panic \n\ ++b stop \n\ ++handle SIGWINCH nostop noprint pass \n\ ++"; ++ ++int start_debugger(char *prog, int startup, int stop, int *fd_out) ++{ ++ int slave, child; ++ ++ slave = open_gdb_chan(); ++ if((child = fork()) == 0){ ++ char *tempname = NULL; ++ int fd; ++ ++ if(setsid() 
< 0) perror("setsid"); ++ if((dup2(slave, 0) < 0) || (dup2(slave, 1) < 0) || ++ (dup2(slave, 2) < 0)){ ++ printk("start_debugger : dup2 failed, errno = %d\n", ++ errno); ++ exit(1); ++ } ++ if(ioctl(0, TIOCSCTTY, 0) < 0){ ++ printk("start_debugger : TIOCSCTTY failed, " ++ "errno = %d\n", errno); ++ exit(1); ++ } ++ if(tcsetpgrp (1, os_getpid()) < 0){ ++ printk("start_debugger : tcsetpgrp failed, " ++ "errno = %d\n", errno); ++#ifdef notdef ++ exit(1); ++#endif ++ } ++ if((fd = make_tempfile("/tmp/gdb_init-XXXXXX", &tempname, 0)) < 0){ ++ printk("start_debugger : make_tempfile failed, errno = %d\n", ++ errno); ++ exit(1); ++ } ++ write(fd, gdb_init_string, sizeof(gdb_init_string) - 1); ++ if(startup){ ++ if(stop){ ++ write(fd, "b start_kernel\n", ++ strlen("b start_kernel\n")); ++ } ++ write(fd, "c\n", strlen("c\n")); ++ } ++ if(ptrace(PTRACE_TRACEME, 0, 0, 0) < 0){ ++ printk("start_debugger : PTRACE_TRACEME failed, " ++ "errno = %d\n", errno); ++ exit(1); ++ } ++ execlp("gdb", "gdb", "--command", tempname, prog, NULL); ++ printk("start_debugger : exec of gdb failed, errno = %d\n", ++ errno); ++ } ++ if(child < 0){ ++ printk("start_debugger : fork for gdb failed, errno = %d\n", ++ errno); ++ return(-1); ++ } ++ *fd_out = slave; ++ return(child); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/ptproxy.h um/arch/um/kernel/tt/ptproxy/ptproxy.h +--- orig/arch/um/kernel/tt/ptproxy/ptproxy.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/ptproxy/ptproxy.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,61 @@ ++/********************************************************************** ++ptproxy.h ++ ++Copyright (C) 1999 Lars Brinkhoff. See the file COPYING for licensing ++terms and conditions. ++**********************************************************************/ ++ ++#ifndef __PTPROXY_H ++#define __PTPROXY_H ++ ++#include <sys/types.h> ++ ++typedef struct debugger debugger_state; ++typedef struct debugee debugee_state; ++ ++struct debugger ++{ ++ pid_t pid; ++ int wait_options; ++ int *wait_status_ptr; ++ unsigned int waiting : 1; ++ unsigned int real_wait : 1; ++ unsigned int expecting_child : 1; ++ int (*handle_trace) (debugger_state *, pid_t); ++ ++ debugee_state *debugee; ++}; ++ ++struct debugee ++{ ++ pid_t pid; ++ int wait_status; ++ unsigned int died : 1; ++ unsigned int event : 1; ++ unsigned int stopped : 1; ++ unsigned int trace_singlestep : 1; ++ unsigned int trace_syscall : 1; ++ unsigned int traced : 1; ++ unsigned int zombie : 1; ++ unsigned int in_context : 1; ++}; ++ ++extern int debugger_syscall(debugger_state *debugger, pid_t pid); ++extern int debugger_normal_return (debugger_state *debugger, pid_t unused); ++ ++extern long proxy_ptrace (struct debugger *, int, pid_t, long, long, pid_t, ++ int *strace_out); ++extern void debugger_cancelled_return(debugger_state *debugger, int result); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/ptrace.c um/arch/um/kernel/tt/ptproxy/ptrace.c
+--- orig/arch/um/kernel/tt/ptproxy/ptrace.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ptproxy/ptrace.c Mon Nov 11 13:06:03 2002
+@@ -0,0 +1,249 @@
++/**********************************************************************
++ptrace.c
++
++Copyright (C) 1999 Lars Brinkhoff. See the file COPYING for licensing
++terms and conditions.
++
++Jeff Dike (jdike@karaya.com) : Modified for integration into uml
++**********************************************************************/
++
++#include <errno.h>
++#include <unistd.h>
++#include <signal.h>
++#include <sys/types.h>
++#include <sys/time.h>
++#include <sys/ptrace.h>
++#include <sys/wait.h>
++#include <asm/ptrace.h>
++
++#include "ptproxy.h"
++#include "debug.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "ptrace_user.h"
++#include "tt.h"
++
++/* Carry out the ptrace request arg1 that the debugger made, applying it
++ * to child.
++ *
++ * arg2, arg3 and arg4 are the request's pid, address and data arguments.
++ * Data returned to the debugger is written into it at address arg4 with
++ * PTRACE_POKEDATA; register sets it supplies are read from there with
++ * PTRACE_PEEKDATA.  *ret is 0 except for PTRACE_CONT and a successful
++ * PTRACE_SYSCALL.  Failures are returned as negative errno values.
++ */
++long proxy_ptrace(struct debugger *debugger, int arg1, pid_t arg2,
++		  long arg3, long arg4, pid_t child, int *ret)
++{
++	sigset_t relay;
++	long result;
++	int status;
++
++	*ret = 0;
++	if(debugger->debugee->died) return(-ESRCH);
++
++	switch(arg1){
++	case PTRACE_ATTACH:
++		if(debugger->debugee->traced) return(-EPERM);
++
++		debugger->debugee->pid = arg2;
++		debugger->debugee->traced = 1;
++
++		if(is_valid_pid(arg2) && (arg2 != child)){
++			debugger->debugee->in_context = 0;
++			kill(arg2, SIGSTOP);
++			debugger->debugee->event = 1;
++			debugger->debugee->wait_status = W_STOPCODE(SIGSTOP);
++		}
++		else {
++			debugger->debugee->in_context = 1;
++			if(debugger->debugee->stopped)
++				child_proxy(child, W_STOPCODE(SIGSTOP));
++			else kill(child, SIGSTOP);
++		}
++
++		return(0);
++
++	case PTRACE_DETACH:
++		if(!debugger->debugee->traced) return(-EPERM);
++
++		debugger->debugee->traced = 0;
++		debugger->debugee->pid = 0;
++		if(!debugger->debugee->in_context)
++			kill(child, SIGCONT);
++
++		return(0);
++
++	case PTRACE_CONT:
++		if(!debugger->debugee->in_context) return(-EPERM);
++		*ret = PTRACE_CONT;
++		result = ptrace(PTRACE_CONT, child, arg3, arg4);
++		if(result == -1) return(-errno);
++		return(result);
++
++#ifdef UM_HAVE_GETFPREGS
++	case PTRACE_GETFPREGS:
++	{
++		long regs[FP_FRAME_SIZE];
++		int i;
++
++		result = ptrace(PTRACE_GETFPREGS, child, 0, regs);
++		if(result == -1) return(-errno);
++
++		for (i = 0; i < sizeof(regs)/sizeof(regs[0]); i++)
++			ptrace(PTRACE_POKEDATA, debugger->pid,
++			       arg4 + i * (long) sizeof(regs[0]), regs[i]);
++		return(result);
++	}
++#endif
++
++#ifdef UM_HAVE_GETFPXREGS
++	case PTRACE_GETFPXREGS:
++	{
++		long regs[FPX_FRAME_SIZE];
++		int i;
++
++		result = ptrace(PTRACE_GETFPXREGS, child, 0, regs);
++		if(result == -1) return(-errno);
++
++		for (i = 0; i < sizeof(regs)/sizeof(regs[0]); i++)
++			ptrace(PTRACE_POKEDATA, debugger->pid,
++			       arg4 + i * (long) sizeof(regs[0]), regs[i]);
++		return(result);
++	}
++#endif
++
++#ifdef UM_HAVE_GETREGS
++	case PTRACE_GETREGS:
++	{
++		long regs[FRAME_SIZE];
++		int i;
++
++		result = ptrace(PTRACE_GETREGS, child, 0, regs);
++		if(result == -1) return(-errno);
++
++		for (i = 0; i < sizeof(regs)/sizeof(regs[0]); i++)
++			ptrace (PTRACE_POKEDATA, debugger->pid,
++				arg4 + i * (long) sizeof(regs[0]), regs[i]);
++		return(result);
++	}
++#endif
++
++	case PTRACE_KILL:
++		result = ptrace(PTRACE_KILL, child, arg3, arg4);
++		if(result == -1) return(-errno);
++
++		return(result);
++
++	case PTRACE_PEEKDATA:
++	case PTRACE_PEEKTEXT:
++	case PTRACE_PEEKUSER:
++		/* The value being read out could be -1, so we have to
++		 * check errno to see if there's an error, and zero it
++		 * beforehand so we're not faked out by an old error
++		 */
++
++		errno = 0;
++		result = ptrace(arg1, child, arg3, 0);
++		if((result == -1) && (errno != 0)) return(-errno);
++
++		result = ptrace(PTRACE_POKEDATA, debugger->pid, arg4, result);
++		if(result == -1) return(-errno);
++
++		return(result);
++
++	case PTRACE_POKEDATA:
++	case PTRACE_POKETEXT:
++	case PTRACE_POKEUSER:
++		result = ptrace(arg1, child, arg3, arg4);
++		if(result == -1) return(-errno);
++
++		if(arg1 == PTRACE_POKEUSER) ptrace_pokeuser(arg3, arg4);
++		return(result);
++
++#ifdef UM_HAVE_SETFPREGS
++	case PTRACE_SETFPREGS:
++	{
++		long regs[FP_FRAME_SIZE];
++		int i;
++
++		for (i = 0; i < sizeof(regs)/sizeof(regs[0]); i++)
++			regs[i] = ptrace (PTRACE_PEEKDATA, debugger->pid,
++					  arg4 + i * (long) sizeof(regs[0]), 0);
++		result = ptrace(PTRACE_SETFPREGS, child, 0, regs);
++		if(result == -1) return(-errno);
++
++		return(result);
++	}
++#endif
++
++#ifdef UM_HAVE_SETFPXREGS
++	case PTRACE_SETFPXREGS:
++	{
++		long regs[FPX_FRAME_SIZE];
++		int i;
++
++		for (i = 0; i < sizeof(regs)/sizeof(regs[0]); i++)
++			regs[i] = ptrace (PTRACE_PEEKDATA, debugger->pid,
++					  arg4 + i * (long) sizeof(regs[0]), 0);
++		result = ptrace(PTRACE_SETFPXREGS, child, 0, regs);
++		if(result == -1) return(-errno);
++
++		return(result);
++	}
++#endif
++
++#ifdef UM_HAVE_SETREGS
++	case PTRACE_SETREGS:
++	{
++		long regs[FRAME_SIZE];
++		int i;
++
++		for (i = 0; i < sizeof(regs)/sizeof(regs[0]); i++)
++			regs[i] = ptrace(PTRACE_PEEKDATA, debugger->pid,
++					 arg4 + i * (long) sizeof(regs[0]), 0);
++		result = ptrace(PTRACE_SETREGS, child, 0, regs);
++		if(result == -1) return(-errno);
++
++		return(result);
++	}
++#endif
++
++	case PTRACE_SINGLESTEP:
++		if(!debugger->debugee->in_context) return(-EPERM);
++		sigemptyset(&relay);
++		sigaddset(&relay, SIGSEGV);
++		sigaddset(&relay, SIGILL);
++		sigaddset(&relay, SIGBUS);
++		result = ptrace(PTRACE_SINGLESTEP, child, arg3, arg4);
++		if(result == -1) return(-errno);
++
++		status = wait_for_stop(child, SIGTRAP, PTRACE_SINGLESTEP,
++				       &relay);
++		child_proxy(child, status);
++		return(result);
++
++	case PTRACE_SYSCALL:
++		if(!debugger->debugee->in_context) return(-EPERM);
++		result = ptrace(PTRACE_SYSCALL, child, arg3, arg4);
++		if(result == -1) return(-errno);
++
++		*ret = PTRACE_SYSCALL;
++		return(result);
++
++	case PTRACE_TRACEME:
++	default:
++		return(-EINVAL);
++	}
++}
++
++/*
++ * Overrides
for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/sysdep.c um/arch/um/kernel/tt/ptproxy/sysdep.c
+--- orig/arch/um/kernel/tt/ptproxy/sysdep.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ptproxy/sysdep.c Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,84 @@
++/**********************************************************************
++sysdep.c
++
++Copyright (C) 1999 Lars Brinkhoff. See the file COPYING for licensing
++terms and conditions.
++**********************************************************************/
++
++#include <stdio.h>
++#include <string.h>
++#include <stdlib.h>
++#include <signal.h>
++#include <errno.h>
++#include <sys/types.h>
++#include <sys/ptrace.h>
++#include <asm/ptrace.h>
++#include <linux/unistd.h>
++#include "ptrace_user.h"
++#include "user_util.h"
++#include "user.h"
++
++/* Fetch the five system call arguments of pid with PTRACE_PEEKUSER and
++ * return its system call number.  The reads are not checked for errors,
++ * so a failed one shows up as -1.
++ */
++int get_syscall(pid_t pid, long *arg1, long *arg2, long *arg3, long *arg4,
++		long *arg5)
++{
++	*arg1 = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_ARG1_OFFSET, 0);
++	*arg2 = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_ARG2_OFFSET, 0);
++	*arg3 = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_ARG3_OFFSET, 0);
++	*arg4 = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_ARG4_OFFSET, 0);
++	*arg5 = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_ARG5_OFFSET, 0);
++	return(ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET, 0));
++}
++
++/* Replace the system call number of pid with getpid, run that call to its
++ * next stop, and then store result as the value the call returns.
++ */
++void syscall_cancel(pid_t pid, long result)
++{
++	if((ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
++		   __NR_getpid) < 0) ||
++	   (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) ||
++	   (wait_for_stop(pid, SIGTRAP, PTRACE_SYSCALL, NULL) < 0) ||
++	   (ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET, result) < 0) ||
++	   (ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0))
++		printk("ptproxy: couldn't cancel syscall: errno = %d\n",
++		       errno);
++}
++
++/* Store result in the system call return slot of pid. */
++void syscall_set_result(pid_t pid, long result)
++{
++	ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_RET_OFFSET, result);
++}
++
++/* Restart pid with PTRACE_SYSCALL. */
++void syscall_continue(pid_t pid)
++{
++	ptrace(PTRACE_SYSCALL, pid, 0, 0);
++}
++
++/* Replace the system call number of pid with pause.  Returns 0, or -1 if
++ * the ptrace write failed.
++ */
++int syscall_pause(pid_t pid)
++{
++	if(ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, __NR_pause) < 0){
++		printk("syscall_pause - ptrace failed, errno = %d\n", errno);
++		return(-1);
++	}
++	return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/sysdep.h um/arch/um/kernel/tt/ptproxy/sysdep.h
+--- orig/arch/um/kernel/tt/ptproxy/sysdep.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ptproxy/sysdep.h Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,25 @@
++/**********************************************************************
++sysdep.h
++
++Copyright (C) 1999 Lars Brinkhoff.
++Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++See the file COPYING for licensing terms and conditions.
++**********************************************************************/
++
++extern int get_syscall(pid_t pid, long *arg1, long *arg2, long *arg3,
++		       long *arg4, long *arg5);
++extern void syscall_cancel (pid_t pid, long result);
++extern void syscall_set_result (pid_t pid, long result);
++extern void syscall_continue (pid_t pid);
++extern int syscall_pause(pid_t pid);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only.
This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/wait.c um/arch/um/kernel/tt/ptproxy/wait.c
+--- orig/arch/um/kernel/tt/ptproxy/wait.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ptproxy/wait.c Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,97 @@
++/**********************************************************************
++wait.c
++
++Copyright (C) 1999 Lars Brinkhoff. See the file COPYING for licensing
++terms and conditions.
++
++**********************************************************************/
++
++#include <errno.h>
++#include <signal.h>
++#include <sys/wait.h>
++#include <sys/ptrace.h>
++#include <asm/ptrace.h>
++
++#include "ptproxy.h"
++#include "sysdep.h"
++#include "wait.h"
++#include "user_util.h"
++#include "sysdep/ptrace.h"
++#include "sysdep/ptrace_user.h"
++#include "sysdep/sigcontext.h"
++
++/* Finish a wait made by the debugger.  If the debugee has died, or the
++ * wait asked for __WCLONE, debugger_cancelled_return() is called with
++ * -ECHILD.  If the debugee has an event pending, its wait status is poked
++ * into the debugger at wait_status_ptr and debugger_cancelled_return() is
++ * called with the debugee's pid.  Otherwise the normal return path runs.
++ */
++int proxy_wait_return(struct debugger *debugger, pid_t unused)
++{
++	debugger->waiting = 0;
++
++	if(debugger->debugee->died || (debugger->wait_options & __WCLONE)){
++		debugger_cancelled_return(debugger, -ECHILD);
++		return(0);
++	}
++
++	if(debugger->debugee->zombie && debugger->debugee->event)
++		debugger->debugee->died = 1;
++
++	if(debugger->debugee->event){
++		debugger->debugee->event = 0;
++		ptrace(PTRACE_POKEDATA, debugger->pid,
++		       debugger->wait_status_ptr,
++		       debugger->debugee->wait_status);
++		/* if (wait4)
++		   ptrace (PTRACE_POKEDATA, pid, rusage_ptr, ...); */
++		debugger_cancelled_return(debugger, debugger->debugee->pid);
++		return(0);
++	}
++
++	/* pause will return -EINTR, which happens to be right for wait */
++	debugger_normal_return(debugger, -1);
++	return(0);
++}
++
++/* Always takes the normal return path. */
++int parent_wait_return(struct debugger *debugger, pid_t unused)
++{
++	return(debugger_normal_return(debugger, -1));
++}
++
++/* Rewrite the debugger's instruction pointer with IP_RESTART_SYSCALL and
++ * continue it with PTRACE_SYSCALL three times, passing SIGCHLD on the
++ * first.  Any failure ends in tracer_panic(), which is given errno.
++ */
++int real_wait_return(struct debugger *debugger)
++{
++	unsigned long ip;
++	int err, pid;
++
++	pid = debugger->pid;
++	ip = ptrace(PTRACE_PEEKUSER, pid, PT_IP_OFFSET, 0);
++	ip = IP_RESTART_SYSCALL(ip);
++	err = ptrace(PTRACE_POKEUSER, pid, PT_IP_OFFSET, ip);
++	if(err < 0)
++		tracer_panic("real_wait_return : Failed to restart system "
++			     "call, errno = %d\n", errno);
++	if((ptrace(PTRACE_SYSCALL, debugger->pid, 0, SIGCHLD) < 0) ||
++	   (ptrace(PTRACE_SYSCALL, debugger->pid, 0, 0) < 0) ||
++	   (ptrace(PTRACE_SYSCALL, debugger->pid, 0, 0) < 0) ||
++	   debugger_normal_return(debugger, -1))
++		tracer_panic("real_wait_return : gdb failed to wait, "
++			     "errno = %d\n", errno);
++	return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/ptproxy/wait.h um/arch/um/kernel/tt/ptproxy/wait.h
+--- orig/arch/um/kernel/tt/ptproxy/wait.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/ptproxy/wait.h Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,15 @@
++/**********************************************************************
++wait.h
++
++Copyright (C) 1999 Lars Brinkhoff. See the file COPYING for licensing
++terms and conditions.
++**********************************************************************/
++
++#ifndef __PTPROXY_WAIT_H
++#define __PTPROXY_WAIT_H
++
++extern int proxy_wait_return(struct debugger *debugger, pid_t unused);
++extern int real_wait_return(struct debugger *debugger);
++extern int parent_wait_return(struct debugger *debugger, pid_t unused);
++
++#endif
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/sys-i386/Makefile um/arch/um/kernel/tt/sys-i386/Makefile
+--- orig/arch/um/kernel/tt/sys-i386/Makefile Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/sys-i386/Makefile Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,17 @@
++#
++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++O_TARGET = sys-i386.o
++
++obj-y = sigcontext.o
++
++USER_OBJS = sigcontext.o
++
++include $(TOPDIR)/Rules.make
++
++$(USER_OBJS) : %.o: %.c
++	$(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $<
++
++clean :
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/sys-i386/sigcontext.c um/arch/um/kernel/tt/sys-i386/sigcontext.c
+--- orig/arch/um/kernel/tt/sys-i386/sigcontext.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/sys-i386/sigcontext.c Sun Dec 1 23:33:52 2002
+@@ -0,0 +1,60 @@
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdlib.h>
++#include <asm/sigcontext.h>
++#include "kern_util.h"
++#include "sysdep/frame.h"
++
++int copy_sc_from_user_tt(void *to_ptr, void *from_ptr, void *data)
++{
++	struct arch_frame_data *arch = data;
++	struct sigcontext *to = to_ptr, *from = from_ptr;
++	struct _fpstate *to_fp, *from_fp;
++	unsigned long sigs;
++	int err;
++
++	to_fp = to->fpstate;
++	from_fp = from->fpstate;
++	sigs = to->oldmask;
++	err = copy_from_user_proc(to, from, sizeof(*to));
++	to->oldmask = sigs;
++	if(to_fp != NULL){
++		err |= copy_from_user_proc(&to->fpstate, &to_fp,
++					   sizeof(to->fpstate));
++		err |= copy_from_user_proc(to_fp, from_fp, arch->fpstate_size);
++	}
++	return(err);
++}
++
++int copy_sc_to_user_tt(void *to_ptr, void *fp, void *from_ptr, void *data)
++{
++	struct arch_frame_data *arch = data;
++	struct sigcontext *to = to_ptr, *from = from_ptr;
++	struct _fpstate *to_fp, *from_fp;
++	int err;
++
++	to_fp = (struct _fpstate *)
++		(fp ? (unsigned long) fp : ((unsigned long) to + sizeof(*to)));
++	from_fp = from->fpstate;
++	err = copy_to_user_proc(to, from, sizeof(*to));
++	if(from_fp != NULL){
++		err |= copy_to_user_proc(&to->fpstate, &to_fp,
++					 sizeof(to->fpstate));
++		err |= copy_to_user_proc(to_fp, from_fp, arch->fpstate_size);
++	}
++	return(err);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/syscall_kern.c um/arch/um/kernel/tt/syscall_kern.c
+--- orig/arch/um/kernel/tt/syscall_kern.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/syscall_kern.c Sun Dec 8 19:32:53 2002
+@@ -0,0 +1,155 @@
++/*
++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/types.h"
++#include "linux/utime.h"
++#include "linux/sys.h"
++#include "asm/unistd.h"
++#include "asm/ptrace.h"
++#include "asm/uaccess.h"
++#include "sysdep/syscalls.h"
++#include "kern_util.h"
++
++/* Result of verify_area() for a write of size bytes at ptr. */
++static inline int check_area(void *ptr, int size)
++{
++	return(verify_area(VERIFY_WRITE, ptr, size));
++}
++
++/* Per-syscall parameter checks referenced from this_is_bogus[]; each one
++ * hands a system call argument of regs to check_area().
++ */
++static int check_readlink(struct pt_regs *regs)
++{
++	return(check_area((void *) UPT_SYSCALL_ARG1(&regs->regs),
++			  UPT_SYSCALL_ARG2(&regs->regs)));
++}
++
++static int check_utime(struct pt_regs *regs)
++{
++	return(check_area((void *) UPT_SYSCALL_ARG1(&regs->regs),
++			  sizeof(struct utimbuf)));
++}
++
++static int check_oldstat(struct pt_regs *regs)
++{
++	return(check_area((void *) UPT_SYSCALL_ARG1(&regs->regs),
++			  sizeof(struct __old_kernel_stat)));
++}
++
++static int check_stat(struct pt_regs *regs)
++{
++	return(check_area((void *) UPT_SYSCALL_ARG1(&regs->regs),
++			  sizeof(struct stat)));
++}
++
++static int check_stat64(struct pt_regs *regs)
++{
++	return(check_area((void *) UPT_SYSCALL_ARG1(&regs->regs),
++			  sizeof(struct stat64)));
++}
++
++struct bogus {
++	int kernel_ds;
++	int (*check_params)(struct pt_regs *);
++};
++
++struct bogus this_is_bogus[256] = {
++	[ __NR_mknod ] = { 1, NULL },
++	[ __NR_mkdir ] = { 1, NULL },
++	[ __NR_rmdir ] = { 1, NULL },
++	[ __NR_unlink ] = { 1, NULL },
++	[ __NR_symlink ] = { 1, NULL },
++	[ __NR_link ] = { 1, NULL },
++	[ __NR_rename ] = { 1, NULL },
++	[ __NR_umount ] = { 1, NULL },
++	[ __NR_mount ] = { 1, NULL },
++	[ __NR_pivot_root ] = { 1, NULL },
++	[ __NR_chdir ] = { 1, NULL },
++	[ __NR_chroot ] = { 1, NULL },
++	[ __NR_open ] = { 1, NULL },
++	[ __NR_quotactl ] = { 1, NULL },
++	[ __NR_sysfs ] = { 1, NULL },
++	[ __NR_readlink ] = { 1, check_readlink },
++	[ __NR_acct ] = { 1, NULL },
++	[ __NR_execve ] = { 1, NULL },
++	[ __NR_uselib ] = { 1, NULL },
++	[ __NR_statfs ] = { 1, NULL },
++	[ __NR_truncate ] = { 1, NULL },
++	[ __NR_access ] = { 1, NULL },
++	[ __NR_chmod ] = { 1, NULL },
++	[ __NR_chown ] = { 1, NULL },
++	[ __NR_lchown ] = { 1, NULL },
++	[ __NR_utime ] = { 1, check_utime },
++	[ __NR_oldlstat ] = { 1, check_oldstat },
++	[ __NR_oldstat ] = { 1, check_oldstat },
++	[ __NR_stat ] = { 1, check_stat },
++	[ __NR_lstat ] = { 1, check_stat },
++	[ __NR_stat64 ] = { 1, check_stat64 },
++	[ __NR_lstat64 ] = { 1, check_stat64 },
++	[ __NR_chown32 ] = { 1, NULL },
++};
++
++/* sys_utimes */
++
++/* Pre-check for the system call in regs, used when honeypot is set.
++ * Calls whose this_is_bogus[] entry has kernel_ds clear pass untouched.
++ * For the others the optional check_params hook runs first; if it
++ * succeeds, or there is none, the address limit becomes KERNEL_DS.
++ * Returns 0, or -EFAULT if the hook rejected the call.
++ */
++static int check_bogosity(struct pt_regs *regs)
++{
++	struct bogus *bogon = &this_is_bogus[UPT_SYSCALL_NR(&regs->regs)];
++
++	if(!bogon->kernel_ds) return(0);
++	if(bogon->check_params && (*bogon->check_params)(regs))
++		return(-EFAULT);
++	set_fs(KERNEL_DS);
++	return(0);
++}
++
++extern syscall_handler_t *sys_call_table[];
++
++/* Run the system call described by r (a struct pt_regs *) and return its
++ * result: -ENOSYS for an out-of-range number, -EFAULT if the honeypot
++ * check fails.  The address limit is reset to USER_DS in every case.
++ */
++long execute_syscall_tt(void *r)
++{
++	struct pt_regs *regs = r;
++	long res;
++	int syscall;
++
++	current->thread.nsyscalls++;
++	nsyscalls++;
++	syscall = UPT_SYSCALL_NR(&regs->regs);
++
++	if((syscall >= NR_syscalls) || (syscall < 0))
++		res = -ENOSYS;
++	else if(honeypot && check_bogosity(regs))
++		res = -EFAULT;
++	else res = EXECUTE_SYSCALL(syscall, regs);
++
++	set_fs(USER_DS);
++
++	if(current->thread.mode.tt.singlestep_syscall){
++		current->thread.mode.tt.singlestep_syscall = 0;
++		current->ptrace &= ~PT_DTRACE;
++		force_sig(SIGTRAP, current);
++	}
++
++	return(res);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/syscall_user.c um/arch/um/kernel/tt/syscall_user.c
+--- orig/arch/um/kernel/tt/syscall_user.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/syscall_user.c Sun Dec 8 21:00:11 2002
+@@ -0,0 +1,89 @@
++/*
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <unistd.h>
++#include <signal.h>
++#include <errno.h>
++#include <sys/ptrace.h>
++#include <asm/unistd.h>
++#include "sysdep/ptrace.h"
++#include "sigcontext.h"
++#include "ptrace_user.h"
++#include "task.h"
++#include "user_util.h"
++#include "kern_util.h"
++#include "syscall_user.h"
++#include "tt.h"
++
++/* XXX Bogus */
++#define ERESTARTSYS 512
++#define ERESTARTNOINTR 513
++#define ERESTARTNOHAND 514
++
++void syscall_handler_tt(int sig, union uml_pt_regs *regs)
++{
++	void *sc;
++	long result;
++	int index, syscall;
++
++	syscall = UPT_SYSCALL_NR(regs);
++	sc = UPT_SC(regs);
++	SC_START_SYSCALL(sc);
++
++	index = record_syscall_start(syscall);
++	syscall_trace();
++	result = execute_syscall(regs);
++
++	/* regs->sc may have changed while the system call ran (there may
++	 * have been an interrupt or segfault), so it needs to be refreshed.
++	 */
++	UPT_SC(regs) = sc;
++
++	SC_SET_SYSCALL_RETURN(sc, result);
++	if((result == -ERESTARTNOHAND) || (result == -ERESTARTSYS) ||
++	   (result == -ERESTARTNOINTR))
++		do_signal(result);
++
++	syscall_trace();
++	record_syscall_end(index, result);
++}
++
++int do_syscall(void *task, int pid)
++{
++	unsigned long proc_regs[FRAME_SIZE];
++	union uml_pt_regs *regs;
++	int syscall;
++
++	if(ptrace_getregs(pid, proc_regs) < 0)
++		tracer_panic("Couldn't read registers");
++	syscall = PT_SYSCALL_NR(proc_regs);
++
++	regs = TASK_REGS(task);
++	UPT_SYSCALL_NR(regs) = syscall;
++
++	if(syscall < 1) return(0);
++
++	if((syscall != __NR_sigreturn) &&
++	   ((unsigned long *) PT_IP(proc_regs) >= &_stext) &&
++	   ((unsigned long *) PT_IP(proc_regs) <= &_etext))
++		tracer_panic("I'm tracing myself and I can't get out");
++
++	if(ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
++		  __NR_getpid) < 0)
++		tracer_panic("do_syscall : Nullifying syscall failed, "
++			     "errno = %d", errno);
++	return(1);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/time.c um/arch/um/kernel/tt/time.c
+--- orig/arch/um/kernel/tt/time.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/kernel/tt/time.c Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,33 @@
++/*
++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <signal.h>
++#include <sys/time.h>
++#include <time_user.h>
++#include "process.h"
++#include "user.h"
++
++/* Install alarm_handler as the SIGVTALRM handler, panicking if signal()
++ * reports an error, and then call set_interval(ITIMER_VIRTUAL).
++ */
++void user_time_init_tt(void)
++{
++	__sighandler_t handler = (__sighandler_t) alarm_handler;
++
++	if(signal(SIGVTALRM, handler) == SIG_ERR)
++		panic("Couldn't set SIGVTALRM handler");
++	set_interval(ITIMER_VIRTUAL);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/tlb.c um/arch/um/kernel/tt/tlb.c +--- orig/arch/um/kernel/tt/tlb.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/tlb.c Thu Dec 19 13:03:11 2002 +@@ -0,0 +1,220 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/stddef.h" ++#include "linux/kernel.h" ++#include "linux/sched.h" ++#include "asm/page.h" ++#include "asm/pgtable.h" ++#include "asm/uaccess.h" ++#include "user_util.h" ++#include "mem_user.h" ++#include "os.h" ++ ++static void fix_range(struct mm_struct *mm, unsigned long start_addr, ++ unsigned long end_addr, int force) ++{ ++ pgd_t *npgd; ++ pmd_t *npmd; ++ pte_t *npte; ++ unsigned long addr; ++ int r, w, x, err; ++ ++ if((current->thread.mode.tt.extern_pid != -1) && ++ (current->thread.mode.tt.extern_pid != os_getpid())) ++ panic("fix_range fixing wrong address space, current = 0x%p", ++ current); ++ if(mm == NULL) return; ++ for(addr=start_addr;addr<end_addr;){ ++ if(addr == TASK_SIZE){ ++ /* Skip over kernel text, kernel data, and physical ++ * memory, which don't have ptes, plus kernel virtual ++ * memory, which is flushed separately, and remap ++ * the process stack. The only way to get here is ++ * if (end_addr == STACK_TOP) > TASK_SIZE, which is ++ * only true in the honeypot case. 
++ */ ++ addr = STACK_TOP - ABOVE_KMEM; ++ continue; ++ } ++ npgd = pgd_offset(mm, addr); ++ npmd = pmd_offset(npgd, addr); ++ if(pmd_present(*npmd)){ ++ npte = pte_offset(npmd, addr); ++ r = pte_read(*npte); ++ w = pte_write(*npte); ++ x = pte_exec(*npte); ++ if(!pte_dirty(*npte)) w = 0; ++ if(!pte_young(*npte)){ ++ r = 0; ++ w = 0; ++ } ++ if(force || pte_newpage(*npte)){ ++ err = os_unmap_memory((void *) addr, ++ PAGE_SIZE); ++ if(err < 0) ++ panic("munmap failed, errno = %d\n", ++ -err); ++ if(pte_present(*npte)) ++ map_memory(addr, ++ pte_val(*npte) & PAGE_MASK, ++ PAGE_SIZE, r, w, x); ++ } ++ else if(pte_newprot(*npte)){ ++ protect_memory(addr, PAGE_SIZE, r, w, x, 1); ++ } ++ *npte = pte_mkuptodate(*npte); ++ addr += PAGE_SIZE; ++ } ++ else { ++ if(force || pmd_newpage(*npmd)){ ++ err = os_unmap_memory((void *) addr, PMD_SIZE); ++ if(err < 0) ++ panic("munmap failed, errno = %d\n", ++ -err); ++ pmd_mkuptodate(*npmd); ++ } ++ addr += PMD_SIZE; ++ } ++ } ++} ++ ++atomic_t vmchange_seq = ATOMIC_INIT(1); ++ ++static void flush_kernel_vm_range(unsigned long start, unsigned long end, ++ int update_seq) ++{ ++ struct mm_struct *mm; ++ pgd_t *pgd; ++ pmd_t *pmd; ++ pte_t *pte; ++ unsigned long addr; ++ int updated = 0, err; ++ ++ mm = &init_mm; ++ for(addr = start; addr < end;){ ++ pgd = pgd_offset(mm, addr); ++ pmd = pmd_offset(pgd, addr); ++ if(pmd_present(*pmd)){ ++ pte = pte_offset(pmd, addr); ++ if(!pte_present(*pte) || pte_newpage(*pte)){ ++ updated = 1; ++ err = os_unmap_memory((void *) addr, ++ PAGE_SIZE); ++ if(err < 0) ++ panic("munmap failed, errno = %d\n", ++ -err); ++ if(pte_present(*pte)) ++ map_memory(addr, ++ pte_val(*pte) & PAGE_MASK, ++ PAGE_SIZE, 1, 1, 1); ++ } ++ else if(pte_newprot(*pte)){ ++ updated = 1; ++ protect_memory(addr, PAGE_SIZE, 1, 1, 1, 1); ++ } ++ addr += PAGE_SIZE; ++ } ++ else { ++ if(pmd_newpage(*pmd)){ ++ updated = 1; ++ err = os_unmap_memory((void *) addr, PMD_SIZE); ++ if(err < 0) ++ panic("munmap failed, errno = %d\n", ++ 
-err); ++ } ++ addr += PMD_SIZE; ++ } ++ } ++ if(updated && update_seq) atomic_inc(&vmchange_seq); ++} ++ ++static void protect_vm_page(unsigned long addr, int w, int must_succeed) ++{ ++ int err; ++ ++ err = protect_memory(addr, PAGE_SIZE, 1, w, 1, must_succeed); ++ if(err == 0) return; ++ else if((err == -EFAULT) || (err == -ENOMEM)){ ++ flush_kernel_vm_range(addr, addr + PAGE_SIZE, 1); ++ protect_vm_page(addr, w, 1); ++ } ++ else panic("protect_vm_page : protect failed, errno = %d\n", err); ++} ++ ++void mprotect_kernel_vm(int w) ++{ ++ struct mm_struct *mm; ++ pgd_t *pgd; ++ pmd_t *pmd; ++ pte_t *pte; ++ unsigned long addr; ++ ++ mm = &init_mm; ++ for(addr = start_vm; addr < end_vm;){ ++ pgd = pgd_offset(mm, addr); ++ pmd = pmd_offset(pgd, addr); ++ if(pmd_present(*pmd)){ ++ pte = pte_offset(pmd, addr); ++ if(pte_present(*pte)) protect_vm_page(addr, w, 0); ++ addr += PAGE_SIZE; ++ } ++ else addr += PMD_SIZE; ++ } ++} ++ ++void flush_tlb_kernel_vm_tt(void) ++{ ++ flush_kernel_vm_range(start_vm, end_vm, 1); ++} ++ ++void __flush_tlb_one_tt(unsigned long addr) ++{ ++ flush_kernel_vm_range(addr, addr + PAGE_SIZE, 1); ++} ++ ++void flush_tlb_range_tt(struct mm_struct *mm, unsigned long start, ++ unsigned long end) ++{ ++ if(mm != current->mm) return; ++ ++ /* Assumes that the range start ... 
end is entirely within ++ * either process memory or kernel vm ++ */ ++ if((start >= start_vm) && (start < end_vm)) ++ flush_kernel_vm_range(start, end, 1); ++ else fix_range(mm, start, end, 0); ++} ++ ++void flush_tlb_mm_tt(struct mm_struct *mm) ++{ ++ unsigned long seq; ++ ++ if(mm != current->mm) return; ++ ++ fix_range(mm, 0, STACK_TOP, 0); ++ ++ seq = atomic_read(&vmchange_seq); ++ if(current->thread.mode.tt.vm_seq == seq) return; ++ current->thread.mode.tt.vm_seq = seq; ++ flush_kernel_vm_range(start_vm, end_vm, 0); ++} ++ ++void force_flush_all_tt(void) ++{ ++ fix_range(current->mm, 0, STACK_TOP, 1); ++ flush_kernel_vm_range(start_vm, end_vm, 0); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/tracer.c um/arch/um/kernel/tt/tracer.c +--- orig/arch/um/kernel/tt/tracer.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/tracer.c Wed Mar 26 10:01:33 2003 +@@ -0,0 +1,453 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++#include <stdarg.h> ++#include <unistd.h> ++#include <signal.h> ++#include <errno.h> ++#include <sched.h> ++#include <string.h> ++#include <sys/mman.h> ++#include <sys/ptrace.h> ++#include <sys/time.h> ++#include <sys/wait.h> ++#include "user.h" ++#include "sysdep/ptrace.h" ++#include "sigcontext.h" ++#include "sysdep/sigcontext.h" ++#include "os.h" ++#include "signal_user.h" ++#include "user_util.h" ++#include "mem_user.h" ++#include "process.h" ++#include "kern_util.h" ++#include "frame.h" ++#include "chan_user.h" ++#include "ptrace_user.h" ++#include "mode.h" 
++#include "tt.h"
++
++static int tracer_winch[2];
++
++int is_tracer_winch(int pid, int fd, void *data)
++{
++	if(pid != tracing_pid)
++		return(0);
++
++	register_winch_irq(tracer_winch[0], fd, -1, data);
++	return(1);
++}
++
++static void tracer_winch_handler(int sig)
++{
++	char c = 1;
++
++	if(write(tracer_winch[1], &c, sizeof(c)) != sizeof(c))
++		printk("tracer_winch_handler - write failed, errno = %d\n",
++		       errno);
++}
++
++/* Called only by the tracing thread during initialization */
++
++static void setup_tracer_winch(void)
++{
++	int err;
++
++	err = os_pipe(tracer_winch, 1, 1);
++	if(err){
++		printk("setup_tracer_winch : os_pipe failed, errno = %d\n",
++		       -err);
++		return;
++	}
++	signal(SIGWINCH, tracer_winch_handler);
++}
++
++void attach_process(int pid)
++{
++	if((ptrace(PTRACE_ATTACH, pid, 0, 0) < 0) ||
++	   (ptrace(PTRACE_CONT, pid, 0, 0) < 0))
++		tracer_panic("OP_FORK failed to attach pid");
++	wait_for_stop(pid, SIGSTOP, PTRACE_CONT, NULL);
++	if(ptrace(PTRACE_CONT, pid, 0, 0) < 0)
++		tracer_panic("OP_FORK failed to continue process");
++}
++
++void tracer_panic(char *format, ...)
++{
++	va_list ap;
++
++	va_start(ap, format);
++	vprintf(format, ap);
++	printf("\n");
++	while(1) pause();
++}
++
++static void tracer_segv(int sig, struct sigcontext sc)
++{
++	printf("Tracing thread segfault at address 0x%lx, ip 0x%lx\n",
++	       SC_FAULT_ADDR(&sc), SC_IP(&sc));
++	while(1)
++		pause();
++}
++
++/* Changed early in boot, and then only read */
++int debug = 0;
++int debug_stop = 1;
++int debug_parent = 0;
++int honeypot = 0;
++
++static int signal_tramp(void *arg)
++{
++	int (*proc)(void *);
++
++	if(honeypot && munmap((void *) (host_task_size - 0x10000000),
++			      0x10000000))
++		panic("Unmapping stack failed");
++	if(ptrace(PTRACE_TRACEME, 0, 0, 0) < 0)
++		panic("ptrace PTRACE_TRACEME failed");
++	os_stop_process(os_getpid());
++	change_sig(SIGWINCH, 0);
++	signal(SIGUSR1, SIG_IGN);
++	change_sig(SIGCHLD, 0);
++	signal(SIGSEGV, (__sighandler_t) sig_handler);
++	set_cmdline("(idle thread)");
++	set_init_pid(os_getpid());
++	proc = arg;
++	return((*proc)(NULL));
++}
++
++static void sleeping_process_signal(int pid, int sig)
++{
++	switch(sig){
++	/* These two result from UML being ^Z-ed and bg-ed. PTRACE_CONT is
++	 * right because the process must be in the kernel already.
++	 */
++	case SIGCONT:
++	case SIGTSTP:
++		if(ptrace(PTRACE_CONT, pid, 0, sig) < 0)
++			tracer_panic("sleeping_process_signal : Failed to "
++				     "continue pid %d, errno = %d\n", pid,
++				     errno);
++		break;
++
++	/* This happens when the debugger (e.g. strace) is doing system call
++	 * tracing on the kernel. During a context switch, the current task
++	 * will be set to the incoming process and the outgoing process will
++	 * hop into write and then read. Since it's not the current process
++	 * any more, the trace of those will land here. So, we need to just
++	 * PTRACE_SYSCALL it.
++	 */
++	case SIGTRAP:
++		if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
++			tracer_panic("sleeping_process_signal : Failed to "
++				     "PTRACE_SYSCALL pid %d, errno = %d\n",
++				     pid, errno);
++		break;
++	case SIGSTOP:
++		break;
++	default:
++		tracer_panic("sleeping process %d got unexpected "
++			     "signal : %d\n", pid, sig);
++		break;
++	}
++}
++
++/* Accessed only by the tracing thread */
++int debugger_pid = -1;
++int debugger_parent = -1;
++int debugger_fd = -1;
++int gdb_pid = -1;
++
++struct {
++	int pid;
++	int signal;
++	unsigned long addr;
++	struct timeval time;
++} signal_record[32][1024];
++
++int signal_index[32];
++int nsignals = 0;
++int debug_trace = 0;
++extern int io_nsignals, io_count, intr_count;
++
++extern void signal_usr1(int sig);
++
++int tracing_pid = -1;
++
++int tracer(int (*init_proc)(void *), void *sp)
++{
++	void *task = NULL;
++	unsigned long eip = 0;
++	int status, pid = 0, sig = 0, cont_type, tracing = 0, op = 0;
++	int last_index, proc_id = 0, n, err, old_tracing = 0, strace = 0;
++
++	capture_signal_stack();
++	signal(SIGPIPE, SIG_IGN);
++	setup_tracer_winch();
++	tracing_pid = os_getpid();
++	printf("tracing thread pid = %d\n", tracing_pid);
++
++	pid = clone(signal_tramp, sp, CLONE_FILES | SIGCHLD, init_proc);
++	n = waitpid(pid, &status, WUNTRACED);
++	if(n < 0){
++		printf("waitpid on idle thread failed, errno = %d\n", errno);
++		exit(1);
++	}
++	if((ptrace(PTRACE_CONT, pid, 0, 0) < 0)){
++		printf("Failed to continue idle thread, errno = %d\n", errno);
++		exit(1);
++	}
++
++	signal(SIGSEGV, (sighandler_t) tracer_segv);
++	signal(SIGUSR1, signal_usr1);
++	if(debug_trace){
++		printf("Tracing thread pausing to be attached\n");
++		stop();
++	}
++	if(debug){
++		if(gdb_pid != -1)
++			debugger_pid = attach_debugger(pid, gdb_pid, 1);
++		else debugger_pid = init_ptrace_proxy(pid, 1, debug_stop);
++		if(debug_parent){
++			debugger_parent = os_process_parent(debugger_pid);
++			init_parent_proxy(debugger_parent);
++			err = attach(debugger_parent);
++			
if(err){ ++ printf("Failed to attach debugger parent %d, " ++ "errno = %d\n", debugger_parent, err); ++ debugger_parent = -1; ++ } ++ else { ++ if(ptrace(PTRACE_SYSCALL, debugger_parent, ++ 0, 0) < 0){ ++ printf("Failed to continue debugger " ++ "parent, errno = %d\n", errno); ++ debugger_parent = -1; ++ } ++ } ++ } ++ } ++ set_cmdline("(tracing thread)"); ++ while(1){ ++ if((pid = waitpid(-1, &status, WUNTRACED)) <= 0){ ++ if(errno != ECHILD){ ++ printf("wait failed - errno = %d\n", errno); ++ } ++ continue; ++ } ++ if(pid == debugger_pid){ ++ int cont = 0; ++ ++ if(WIFEXITED(status) || WIFSIGNALED(status)) ++ debugger_pid = -1; ++ /* XXX Figure out how to deal with gdb and SMP */ ++ else cont = debugger_signal(status, cpu_tasks[0].pid); ++ if(cont == PTRACE_SYSCALL) strace = 1; ++ continue; ++ } ++ else if(pid == debugger_parent){ ++ debugger_parent_signal(status, pid); ++ continue; ++ } ++ nsignals++; ++ if(WIFEXITED(status)) ; ++#ifdef notdef ++ { ++ printf("Child %d exited with status %d\n", pid, ++ WEXITSTATUS(status)); ++ } ++#endif ++ else if(WIFSIGNALED(status)){ ++ sig = WTERMSIG(status); ++ if(sig != 9){ ++ printf("Child %d exited with signal %d\n", pid, ++ sig); ++ } ++ } ++ else if(WIFSTOPPED(status)){ ++ proc_id = pid_to_processor_id(pid); ++ sig = WSTOPSIG(status); ++ if(proc_id == -1){ /* must be tested before proc_id indexes the per-cpu signal log */ ++ sleeping_process_signal(pid, sig); ++ continue; ++ } ++ ++ if(signal_index[proc_id] == 1024){ ++ signal_index[proc_id] = 0; ++ last_index = 1023; ++ } ++ else last_index = (signal_index[proc_id] + 1023) % 1024; /* previous slot, never -1 */ ++ if(((sig == SIGPROF) || (sig == SIGVTALRM) || ++ (sig == SIGALRM)) && ++ (signal_record[proc_id][last_index].signal == sig)&& ++ (signal_record[proc_id][last_index].pid == pid)) ++ signal_index[proc_id] = last_index; ++ signal_record[proc_id][signal_index[proc_id]].pid = pid; ++ gettimeofday(&signal_record[proc_id][signal_index[proc_id]].time, NULL); ++ eip = ptrace(PTRACE_PEEKUSER, pid, PT_IP_OFFSET, 0); ++ signal_record[proc_id][signal_index[proc_id]].addr = eip; ++ signal_record[proc_id][signal_index[proc_id]++].signal = sig; ++ 
++ task = cpu_tasks[proc_id].task; ++ tracing = is_tracing(task); ++ old_tracing = tracing; ++ ++ switch(sig){ ++ case SIGUSR1: ++ sig = 0; ++ op = do_proc_op(task, proc_id); ++ switch(op){ ++ case OP_TRACE_ON: ++ arch_leave_kernel(task, pid); ++ tracing = 1; ++ break; ++ case OP_REBOOT: ++ case OP_HALT: ++ unmap_physmem(); ++ kmalloc_ok = 0; ++ ptrace(PTRACE_KILL, pid, 0, 0); ++ return(op == OP_REBOOT); ++ case OP_NONE: ++ printf("Detaching pid %d\n", pid); ++ detach(pid, SIGSTOP); ++ continue; ++ default: ++ break; ++ } ++ /* OP_EXEC switches host processes on us, ++ * we want to continue the new one. ++ */ ++ pid = cpu_tasks[proc_id].pid; ++ break; ++ case SIGTRAP: ++ if(!tracing && (debugger_pid != -1)){ ++ child_signal(pid, status); ++ continue; ++ } ++ tracing = 0; ++ if(do_syscall(task, pid)) sig = SIGUSR2; ++ else clear_singlestep(task); ++ break; ++ case SIGPROF: ++ if(tracing) sig = 0; ++ break; ++ case SIGCHLD: ++ case SIGHUP: ++ sig = 0; ++ break; ++ case SIGSEGV: ++ case SIGIO: ++ case SIGALRM: ++ case SIGVTALRM: ++ case SIGFPE: ++ case SIGBUS: ++ case SIGILL: ++ case SIGWINCH: ++ default: ++ tracing = 0; ++ break; ++ } ++ set_tracing(task, tracing); ++ ++ if(!tracing && old_tracing) ++ arch_enter_kernel(task, pid); ++ ++ if(!tracing && (debugger_pid != -1) && (sig != 0) && ++ (sig != SIGALRM) && (sig != SIGVTALRM) && ++ (sig != SIGSEGV) && (sig != SIGTRAP) && ++ (sig != SIGUSR2) && (sig != SIGIO) && ++ (sig != SIGFPE)){ ++ child_signal(pid, status); ++ continue; ++ } ++ ++ if(tracing){ ++ if(singlestepping_tt(task)) ++ cont_type = PTRACE_SINGLESTEP; ++ else cont_type = PTRACE_SYSCALL; ++ } ++ else cont_type = PTRACE_CONT; ++ ++ if((cont_type == PTRACE_CONT) && ++ (debugger_pid != -1) && strace) ++ cont_type = PTRACE_SYSCALL; ++ ++ if(ptrace(cont_type, pid, 0, sig) != 0){ ++ tracer_panic("ptrace failed to continue " ++ "process - errno = %d\n", ++ errno); ++ } ++ } ++ } ++ return(0); 
++} ++ ++static int __init uml_debug_setup(char *line, int *add) ++{ ++ char *next; ++ ++ debug = 1; ++ *add = 0; ++ if(*line != '=') return(0); ++ line++; ++ ++ while(line != NULL){ ++ next = strchr(line, ','); ++ if(next) *next++ = '\0'; ++ ++ if(!strcmp(line, "go")) debug_stop = 0; ++ else if(!strcmp(line, "parent")) debug_parent = 1; ++ else printf("Unknown debug option : '%s'\n", line); ++ ++ line = next; ++ } ++ return(0); ++} ++ ++__uml_setup("debug", uml_debug_setup, ++"debug\n" ++" Starts up the kernel under the control of gdb. See the \n" ++" kernel debugging tutorial and the debugging session pages\n" ++" at http://user-mode-linux.sourceforge.net/ for more information.\n\n" ++); ++ ++static int __init uml_debugtrace_setup(char *line, int *add) ++{ ++ debug_trace = 1; ++ return 0; ++} ++__uml_setup("debugtrace", uml_debugtrace_setup, ++"debugtrace\n" ++" Causes the tracing thread to pause until it is attached by a\n" ++" debugger and continued. This is mostly for debugging crashes\n" ++" early during boot, and should be pretty much obsoleted by\n" ++" the debug switch.\n\n" ++); ++ ++static int __init uml_honeypot_setup(char *line, int *add) ++{ ++ jail_setup("", add); ++ honeypot = 1; ++ return 0; ++} ++__uml_setup("honeypot", uml_honeypot_setup, ++"honeypot\n" ++" This makes UML put process stacks in the same location as they are\n" ++" on the host, allowing expoits such as stack smashes to work against\n" ++" UML. This implies 'jail'.\n\n" ++); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/trap_user.c um/arch/um/kernel/tt/trap_user.c +--- orig/arch/um/kernel/tt/trap_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/trap_user.c Mon Dec 9 13:14:42 2002 +@@ -0,0 +1,59 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdlib.h> ++#include <errno.h> ++#include <signal.h> ++#include <asm/sigcontext.h> ++#include "sysdep/ptrace.h" ++#include "signal_user.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "task.h" ++#include "tt.h" ++ ++void sig_handler_common_tt(int sig, void *sc_ptr) ++{ ++ struct sigcontext *sc = sc_ptr; ++ struct tt_regs save_regs, *r; ++ struct signal_info *info; ++ int save_errno = errno, is_user; ++ ++ unprotect_kernel_mem(); ++ ++ r = &TASK_REGS(get_current())->tt; ++ save_regs = *r; ++ is_user = user_context(SC_SP(sc)); ++ r->sc = sc; ++ if(sig != SIGUSR2) ++ r->syscall = -1; ++ ++ change_sig(SIGUSR1, 1); ++ info = &sig_info[sig]; ++ if(!info->is_irq) unblock_signals(); ++ ++ (*info->handler)(sig, (union uml_pt_regs *) r); ++ ++ if(is_user){ ++ interrupt_end(); ++ block_signals(); ++ change_sig(SIGUSR1, 0); ++ set_user_mode(NULL); ++ } ++ *r = save_regs; ++ errno = save_errno; ++ if(is_user) protect_kernel_mem(); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/uaccess_user.c um/arch/um/kernel/tt/uaccess_user.c +--- orig/arch/um/kernel/tt/uaccess_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/uaccess_user.c Tue Mar 25 17:10:54 2003 +@@ -0,0 +1,100 @@ ++/* ++ * Copyright (C) 2001 Chris Emerson (cemerson@chiark.greenend.org.uk) ++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <setjmp.h> ++#include <string.h> ++#include "user_util.h" ++#include "uml_uaccess.h" ++#include "task.h" ++#include "kern_util.h" ++ ++int __do_copy_from_user(void *to, const void *from, int n, ++ void **fault_addr, void **fault_catcher) ++{ ++ struct tt_regs save = TASK_REGS(get_current())->tt; ++ unsigned long fault; ++ int faulted; ++ ++ fault = __do_user_copy(to, from, n, fault_addr, fault_catcher, ++ __do_copy, &faulted); ++ TASK_REGS(get_current())->tt = save; ++ ++ if(!faulted) return(0); ++ else return(n - (fault - (unsigned long) from)); ++} ++ ++static void __do_strncpy(void *dst, const void *src, int count) ++{ ++ strncpy(dst, src, count); ++} ++ ++int __do_strncpy_from_user(char *dst, const char *src, unsigned long count, ++ void **fault_addr, void **fault_catcher) ++{ ++ struct tt_regs save = TASK_REGS(get_current())->tt; ++ unsigned long fault; ++ int faulted; ++ ++ fault = __do_user_copy(dst, src, count, fault_addr, fault_catcher, ++ __do_strncpy, &faulted); ++ TASK_REGS(get_current())->tt = save; ++ ++ if(!faulted) return(strlen(dst)); ++ else return(-1); ++} ++ ++static void __do_clear(void *to, const void *from, int n) ++{ ++ memset(to, 0, n); ++} ++ ++int __do_clear_user(void *mem, unsigned long len, ++ void **fault_addr, void **fault_catcher) ++{ ++ struct tt_regs save = TASK_REGS(get_current())->tt; ++ unsigned long fault; ++ int faulted; ++ ++ fault = __do_user_copy(mem, 
NULL, len, fault_addr, fault_catcher, ++ __do_clear, &faulted); ++ TASK_REGS(get_current())->tt = save; ++ ++ if(!faulted) return(0); ++ else return(len - (fault - (unsigned long) mem)); ++} ++ ++int __do_strnlen_user(const char *str, unsigned long n, ++ void **fault_addr, void **fault_catcher) ++{ ++ struct tt_regs save = TASK_REGS(get_current())->tt; ++ int ret; ++ unsigned long *faddrp = (unsigned long *)fault_addr; ++ jmp_buf jbuf; ++ ++ *fault_catcher = &jbuf; ++ if(setjmp(jbuf) == 0){ ++ ret = strlen(str) + 1; ++ } ++ else { ++ ret = *faddrp - (unsigned long) str; ++ } ++ *fault_addr = NULL; ++ *fault_catcher = NULL; ++ ++ TASK_REGS(get_current())->tt = save; ++ return ret; ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tt/unmap.c um/arch/um/kernel/tt/unmap.c +--- orig/arch/um/kernel/tt/unmap.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tt/unmap.c Wed Dec 11 10:42:21 2002 +@@ -0,0 +1,31 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <sys/mman.h> ++ ++int switcheroo(int fd, int prot, void *from, void *to, int size) ++{ ++ if(munmap(to, size) < 0){ ++ return(-1); ++ } ++ if(mmap(to, size, prot, MAP_SHARED | MAP_FIXED, fd, 0) != to){ ++ return(-1); ++ } ++ if(munmap(from, size) < 0){ ++ return(-1); ++ } ++ return(0); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/tty_log.c um/arch/um/kernel/tty_log.c +--- orig/arch/um/kernel/tty_log.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/tty_log.c Wed Apr 16 16:35:20 2003 +@@ -0,0 +1,213 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) and ++ * geoffrey hing <ghing@net.ohio-state.edu> ++ * Licensed under the GPL ++ */ ++ ++#include <errno.h> ++#include <string.h> ++#include <stdio.h> ++#include <stdlib.h> ++#include <unistd.h> ++#include <fcntl.h> ++#include <sys/time.h> ++#include "init.h" ++#include "user.h" ++#include "kern_util.h" ++#include "os.h" ++ ++#define TTY_LOG_DIR "./" ++ ++/* Set early in boot and then unchanged */ ++static char *tty_log_dir = TTY_LOG_DIR; ++static int tty_log_fd = -1; ++ ++#define TTY_LOG_OPEN 1 ++#define TTY_LOG_CLOSE 2 ++#define TTY_LOG_WRITE 3 ++#define TTY_LOG_EXEC 4 ++ ++#define TTY_READ 1 ++#define TTY_WRITE 2 ++ ++struct tty_log_buf { ++ int what; ++ unsigned long tty; ++ int len; ++ int direction; ++ unsigned long sec; ++ unsigned long usec; ++}; ++ ++int open_tty_log(void *tty, void *current_tty) ++{ ++ struct timeval tv; ++ struct tty_log_buf data; ++ char buf[strlen(tty_log_dir) + sizeof("01234567890-01234567\0")]; ++ int fd; ++ ++ gettimeofday(&tv, NULL); ++ if(tty_log_fd != -1){ ++ data = ((struct tty_log_buf) { .what = TTY_LOG_OPEN, ++ .tty = (unsigned long) tty, ++ .len = sizeof(current_tty), ++ .direction = 0, ++ .sec = tv.tv_sec, ++ .usec = tv.tv_usec } ); ++ write(tty_log_fd, &data, sizeof(data)); ++ write(tty_log_fd, &current_tty, data.len); /* logs the pointer value itself; len is sizeof(current_tty) */ ++ return(tty_log_fd); ++ } ++ ++ sprintf(buf, "%s/%0u-%0u", tty_log_dir, (unsigned int) tv.tv_sec, ++ (unsigned int) tv.tv_usec); ++ ++ fd = os_open_file(buf, of_append(of_create(of_rdwr(OPENFLAGS()))), ++ 0644); ++ if(fd < 0){ ++ printk("open_tty_log : couldn't open '%s', errno = %d\n", ++ 
buf, -fd); ++ } ++ return(fd); ++} ++ ++void close_tty_log(int fd, void *tty) ++{ ++ struct tty_log_buf data; ++ struct timeval tv; ++ ++ if(tty_log_fd != -1){ ++ gettimeofday(&tv, NULL); ++ data = ((struct tty_log_buf) { .what = TTY_LOG_CLOSE, ++ .tty = (unsigned long) tty, ++ .len = 0, ++ .direction = 0, ++ .sec = tv.tv_sec, ++ .usec = tv.tv_usec } ); ++ write(tty_log_fd, &data, sizeof(data)); ++ return; ++ } ++ close(fd); ++} ++ ++static int log_chunk(int fd, char *buf, int len) ++{ ++ int total = 0, try, missed, n; ++ char chunk[64]; ++ ++ while(len > 0){ ++ try = (len > sizeof(chunk)) ? sizeof(chunk) : len; ++ missed = copy_from_user_proc(chunk, buf, try); ++ try -= missed; ++ n = write(fd, chunk, try); ++ if(n != try) ++ return(-errno); ++ if(missed != 0) ++ return(-EFAULT); ++ ++ len -= try; ++ total += try; ++ buf += try; ++ } ++ ++ return(total); ++} ++ ++int write_tty_log(int fd, char *buf, int len, void *tty, int is_read) ++{ ++ struct timeval tv; ++ struct tty_log_buf data; ++ int direction; ++ ++ if(fd == tty_log_fd){ ++ gettimeofday(&tv, NULL); ++ direction = is_read ? 
TTY_READ : TTY_WRITE; ++ data = ((struct tty_log_buf) { .what = TTY_LOG_WRITE, ++ .tty = (unsigned long) tty, ++ .len = len, ++ .direction = direction, ++ .sec = tv.tv_sec, ++ .usec = tv.tv_usec } ); ++ write(tty_log_fd, &data, sizeof(data)); ++ } ++ ++ return(log_chunk(fd, buf, len)); ++} ++ ++void log_exec(char **argv, void *tty) ++{ ++ struct timeval tv; ++ struct tty_log_buf data; ++ char **ptr,*arg; ++ int len; ++ ++ if(tty_log_fd == -1) return; ++ ++ gettimeofday(&tv, NULL); ++ ++ len = 0; ++ for(ptr = argv; ; ptr++){ ++ if(copy_from_user_proc(&arg, ptr, sizeof(arg))) ++ return; ++ if(arg == NULL) break; ++ len += strlen_user_proc(arg); ++ } ++ ++ data = ((struct tty_log_buf) { .what = TTY_LOG_EXEC, ++ .tty = (unsigned long) tty, ++ .len = len, ++ .direction = 0, ++ .sec = tv.tv_sec, ++ .usec = tv.tv_usec } ); ++ write(tty_log_fd, &data, sizeof(data)); ++ ++ for(ptr = argv; ; ptr++){ ++ if(copy_from_user_proc(&arg, ptr, sizeof(arg))) ++ return; ++ if(arg == NULL) break; ++ log_chunk(tty_log_fd, arg, strlen_user_proc(arg)); ++ } ++} ++ ++static int __init set_tty_log_dir(char *name, int *add) ++{ ++ tty_log_dir = name; ++ return 0; ++} ++ ++__uml_setup("tty_log_dir=", set_tty_log_dir, ++"tty_log_dir=<directory>\n" ++" This is used to specify the directory where the logs of all pty\n" ++" data from this UML machine will be written.\n\n" ++); ++ ++static int __init set_tty_log_fd(char *name, int *add) ++{ ++ char *end; ++ ++ tty_log_fd = strtoul(name, &end, 0); ++ if((*end != '\0') || (end == name)){ ++ printf("set_tty_log_fd - strtoul failed on '%s'\n", name); ++ tty_log_fd = -1; ++ } ++ return 0; ++} ++ ++__uml_setup("tty_log_fd=", set_tty_log_fd, ++"tty_log_fd=<fd>\n" ++" This is used to specify a preconfigured file descriptor to which all\n" ++" tty data will be written. Preconfigure the descriptor with something\n" ++" like '10>tty_log tty_log_fd=10'.\n\n" ++); ++ ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/uaccess_user.c um/arch/um/kernel/uaccess_user.c +--- orig/arch/um/kernel/uaccess_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/uaccess_user.c Tue Mar 25 17:06:05 2003 +@@ -0,0 +1,64 @@ ++/* ++ * Copyright (C) 2001 Chris Emerson (cemerson@chiark.greenend.org.uk) ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <setjmp.h> ++#include <string.h> ++ ++/* These are here rather than tt/uaccess.c because skas mode needs them in ++ * order to do SIGBUS recovery when a tmpfs mount runs out of room. ++ */ ++ ++unsigned long __do_user_copy(void *to, const void *from, int n, ++ void **fault_addr, void **fault_catcher, ++ void (*op)(void *to, const void *from, ++ int n), int *faulted_out) ++{ ++ unsigned long *faddrp = (unsigned long *) fault_addr, ret; ++ ++ jmp_buf jbuf; ++ *fault_catcher = &jbuf; ++ if(setjmp(jbuf) == 0){ ++ (*op)(to, from, n); ++ ret = 0; ++ *faulted_out = 0; ++ } ++ else { ++ ret = *faddrp; ++ *faulted_out = 1; ++ } ++ *fault_addr = NULL; ++ *fault_catcher = NULL; ++ return ret; ++} ++ ++void __do_copy(void *to, const void *from, int n) ++{ ++ memcpy(to, from, n); ++} ++ ++ ++int __do_copy_to_user(void *to, const void *from, int n, ++ void **fault_addr, void **fault_catcher) ++{ ++ unsigned long fault; ++ int faulted; ++ ++ fault = __do_user_copy(to, from, n, fault_addr, fault_catcher, ++ __do_copy, &faulted); ++ if(!faulted) return(0); ++ else return(n - (fault - (unsigned long) to)); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/um_arch.c um/arch/um/kernel/um_arch.c +--- orig/arch/um/kernel/um_arch.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/um_arch.c Thu Mar 6 19:06:09 2003 +@@ -0,0 +1,425 @@ ++/* ++ * Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/config.h" ++#include "linux/kernel.h" ++#include "linux/sched.h" ++#include "linux/notifier.h" ++#include "linux/mm.h" ++#include "linux/types.h" ++#include "linux/tty.h" ++#include "linux/init.h" ++#include "linux/bootmem.h" ++#include "linux/spinlock.h" ++#include "linux/utsname.h" ++#include "linux/sysrq.h" ++#include "linux/seq_file.h" ++#include "linux/delay.h" ++#include "asm/page.h" ++#include "asm/pgtable.h" ++#include "asm/ptrace.h" ++#include "asm/elf.h" ++#include "asm/user.h" ++#include "ubd_user.h" ++#include "asm/current.h" ++#include "user_util.h" ++#include "kern_util.h" ++#include "kern.h" ++#include "mprot.h" ++#include "mem_user.h" ++#include "mem.h" ++#include "umid.h" ++#include "initrd.h" ++#include "init.h" ++#include "os.h" ++#include "choose-mode.h" ++#include "mode_kern.h" ++#include "mode.h" ++ ++#define DEFAULT_COMMAND_LINE "root=/dev/ubd0" ++ ++struct cpuinfo_um boot_cpu_data = { ++ .loops_per_jiffy = 0, ++ .pgd_quick = NULL, ++ .pmd_quick = NULL, ++ .pte_quick = NULL, ++ .pgtable_cache_sz = 0, ++ .ipi_pipe = { -1, -1 } ++}; ++ ++unsigned long thread_saved_pc(struct thread_struct *thread) ++{ ++ return(os_process_pc(CHOOSE_MODE_PROC(thread_pid_tt, thread_pid_skas, ++ thread))); ++} ++ ++static int show_cpuinfo(struct seq_file *m, void *v) ++{ ++ int index; ++ ++ index = (struct cpuinfo_um 
*)v - cpu_data; ++#ifdef CONFIG_SMP ++ if (!(cpu_online_map & (1 << index))) ++ return 0; ++#endif ++ ++ seq_printf(m, "processor\t: %d\n", index); ++ seq_printf(m, "vendor_id\t: User Mode Linux\n"); ++ seq_printf(m, "model name\t: UML\n"); ++ seq_printf(m, "mode\t\t: %s\n", CHOOSE_MODE("tt", "skas")); ++ seq_printf(m, "host\t\t: %s\n", host_info); ++ seq_printf(m, "bogomips\t: %lu.%02lu\n\n", ++ loops_per_jiffy/(500000/HZ), ++ (loops_per_jiffy/(5000/HZ)) % 100); ++ ++ return(0); ++} ++ ++static void *c_start(struct seq_file *m, loff_t *pos) ++{ ++ return *pos < NR_CPUS ? cpu_data + *pos : NULL; ++} ++ ++static void *c_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ ++*pos; ++ return c_start(m, pos); ++} ++ ++static void c_stop(struct seq_file *m, void *v) ++{ ++} ++ ++struct seq_operations cpuinfo_op = { ++ .start = c_start, ++ .next = c_next, ++ .stop = c_stop, ++ .show = show_cpuinfo, ++}; ++ ++pte_t * __bad_pagetable(void) ++{ ++ panic("Someone should implement __bad_pagetable"); ++ return(NULL); ++} ++ ++/* Set in linux_main */ ++unsigned long host_task_size; ++unsigned long task_size; ++unsigned long uml_start; ++ ++/* Set in early boot */ ++unsigned long uml_physmem; ++unsigned long uml_reserved; ++unsigned long start_vm; ++unsigned long end_vm; ++int ncpus = 1; ++ ++#ifdef CONFIG_MODE_TT ++/* Pointer set in linux_main, the array itself is private to each thread, ++ * and changed at address space creation time so this poses no concurrency ++ * problems. 
++ */ ++static char *argv1_begin = NULL; ++static char *argv1_end = NULL; ++#endif ++ ++/* Set in early boot */ ++static int have_root __initdata = 0; ++long physmem_size = 32 * 1024 * 1024; ++ ++void set_cmdline(char *cmd) ++{ ++#ifdef CONFIG_MODE_TT ++ char *umid, *ptr; ++ ++ if(CHOOSE_MODE(honeypot, 0)) return; ++ ++ umid = get_umid(1); ++ if(umid != NULL){ ++ snprintf(argv1_begin, ++ (argv1_end - argv1_begin) * sizeof(*ptr), ++ "(%s) ", umid); ++ ptr = &argv1_begin[strlen(argv1_begin)]; ++ } ++ else ptr = argv1_begin; ++ ++ snprintf(ptr, (argv1_end - ptr) * sizeof(*ptr), "[%s]", cmd); ++ memset(argv1_begin + strlen(argv1_begin), '\0', ++ argv1_end - argv1_begin - strlen(argv1_begin)); ++#endif ++} ++ ++static char *usage_string = ++"User Mode Linux v%s\n" ++" available at http://user-mode-linux.sourceforge.net/\n\n"; ++ ++static int __init uml_version_setup(char *line, int *add) ++{ ++ printf("%s\n", system_utsname.release); ++ exit(0); ++} ++ ++__uml_setup("--version", uml_version_setup, ++"--version\n" ++" Prints the version number of the kernel.\n\n" ++); ++ ++static int __init uml_root_setup(char *line, int *add) ++{ ++ have_root = 1; ++ return 0; ++} ++ ++__uml_setup("root=", uml_root_setup, ++"root=<file containing the root fs>\n" ++" This is actually used by the generic kernel in exactly the same\n" ++" way as in any other kernel. 
If you configure a number of block\n" ++" devices and want to boot off something other than ubd0, you \n" ++" would use something like:\n" ++" root=/dev/ubd5\n\n" ++); ++ ++#ifdef CONFIG_SMP ++static int __init uml_ncpus_setup(char *line, int *add) ++{ ++ if (!sscanf(line, "%d", &ncpus)) { ++ printf("Couldn't parse [%s]\n", line); ++ return -1; ++ } ++ ++ return 0; ++} ++ ++__uml_setup("ncpus=", uml_ncpus_setup, ++"ncpus=<# of desired CPUs>\n" ++" This tells an SMP kernel how many virtual processors to start.\n\n" ++); ++#endif ++ ++int force_tt = 0; ++ ++#if defined(CONFIG_MODE_TT) && defined(CONFIG_MODE_SKAS) ++#define DEFAULT_TT 0 ++ ++static int __init mode_tt_setup(char *line, int *add) ++{ ++ force_tt = 1; ++ return(0); ++} ++ ++#else ++#ifdef CONFIG_MODE_SKAS ++ ++#define DEFAULT_TT 0 ++ ++static int __init mode_tt_setup(char *line, int *add) ++{ ++ printf("CONFIG_MODE_TT disabled - 'mode=tt' ignored\n"); ++ return(0); ++} ++ ++#else ++#ifdef CONFIG_MODE_TT ++ ++#define DEFAULT_TT 1 ++ ++static int __init mode_tt_setup(char *line, int *add) ++{ ++ printf("CONFIG_MODE_SKAS disabled - 'mode=tt' redundant\n"); ++ return(0); ++} ++ ++#else ++ ++#error Either CONFIG_MODE_TT or CONFIG_MODE_SKAS must be enabled ++ ++#endif ++#endif ++#endif ++ ++__uml_setup("mode=tt", mode_tt_setup, ++"mode=tt\n" ++" When both CONFIG_MODE_TT and CONFIG_MODE_SKAS are enabled, this option\n" ++" forces UML to run in tt (tracing thread) mode. 
It is not the default\n" ++" because it's slower and less secure than skas mode.\n\n" ++); ++ ++int mode_tt = DEFAULT_TT; ++ ++static int __init Usage(char *line, int *add) ++{ ++ const char **p; ++ ++ printf(usage_string, system_utsname.release); ++ p = &__uml_help_start; ++ while (p < &__uml_help_end) { ++ printf("%s", *p); ++ p++; ++ } ++ exit(0); ++} ++ ++__uml_setup("--help", Usage, ++"--help\n" ++" Prints this message.\n\n" ++); ++ ++static int __init uml_checksetup(char *line, int *add) ++{ ++ struct uml_param *p; ++ ++ p = &__uml_setup_start; ++ while(p < &__uml_setup_end) { ++ int n; ++ ++ n = strlen(p->str); ++ if(!strncmp(line, p->str, n)){ ++ if (p->setup_func(line + n, add)) return 1; ++ } ++ p++; ++ } ++ return 0; ++} ++ ++static void __init uml_postsetup(void) ++{ ++ initcall_t *p; ++ ++ p = &__uml_postsetup_start; ++ while(p < &__uml_postsetup_end){ ++ (*p)(); ++ p++; ++ } ++ return; ++} ++ ++/* Set during early boot */ ++unsigned long brk_start; ++static struct vm_reserved kernel_vm_reserved; ++ ++#define MIN_VMALLOC (32 * 1024 * 1024) ++ ++int linux_main(int argc, char **argv) ++{ ++ unsigned long avail; ++ unsigned long virtmem_size, max_physmem; ++ unsigned int i, add, err; ++ ++ for (i = 1; i < argc; i++){ ++ if((i == 1) && (argv[i][0] == ' ')) continue; ++ add = 1; ++ uml_checksetup(argv[i], &add); ++ if(add) add_arg(saved_command_line, argv[i]); ++ } ++ if(have_root == 0) add_arg(saved_command_line, DEFAULT_COMMAND_LINE); ++ ++ mode_tt = force_tt ? 
1 : !can_do_skas(); ++ uml_start = CHOOSE_MODE_PROC(set_task_sizes_tt, set_task_sizes_skas, 0, ++ &host_task_size, &task_size); ++ ++ brk_start = (unsigned long) sbrk(0); ++ CHOOSE_MODE_PROC(before_mem_tt, before_mem_skas, brk_start); ++ ++ uml_physmem = uml_start; ++ ++ /* Reserve up to 4M after the current brk */ ++ uml_reserved = ROUND_4M(brk_start) + (1 << 22); ++ ++ setup_machinename(system_utsname.machine); ++ ++#ifdef CONFIG_MODE_TT ++ argv1_begin = argv[1]; ++ argv1_end = &argv[1][strlen(argv[1])]; ++#endif ++ ++ set_usable_vm(uml_physmem, get_kmem_end()); ++ ++ highmem = 0; ++ max_physmem = get_kmem_end() - uml_physmem - MIN_VMALLOC; ++ if(physmem_size > max_physmem){ ++ highmem = physmem_size - max_physmem; ++ physmem_size -= highmem; ++#ifndef CONFIG_HIGHMEM ++ highmem = 0; ++ printf("CONFIG_HIGHMEM not enabled - physical memory shrunk " ++ "to %ld bytes\n", physmem_size); ++#endif ++ } ++ ++ high_physmem = uml_physmem + physmem_size; ++ high_memory = (void *) high_physmem; ++ ++ start_vm = VMALLOC_START; ++ ++ setup_physmem(uml_physmem, uml_reserved, physmem_size); ++ virtmem_size = physmem_size; ++ avail = get_kmem_end() - start_vm; ++ if(physmem_size > avail) virtmem_size = avail; ++ end_vm = start_vm + virtmem_size; ++ ++ if(virtmem_size < physmem_size) ++ printf("Kernel virtual memory size shrunk to %ld bytes\n", ++ virtmem_size); ++ ++ err = reserve_vm(high_physmem, end_vm, &kernel_vm_reserved); ++ if(err){ ++ printf("Failed to reserve VM area for kernel VM\n"); ++ exit(1); ++ } ++ ++ uml_postsetup(); ++ ++ init_task.thread.kernel_stack = (unsigned long) &init_task + ++ 2 * PAGE_SIZE; ++ ++ task_protections((unsigned long) &init_task); ++ os_flush_stdout(); ++ ++ return(CHOOSE_MODE(start_uml_tt(), start_uml_skas())); ++} ++ ++static int panic_exit(struct notifier_block *self, unsigned long unused1, ++ void *unused2) ++{ /* notifier callback: optional sysrq 'p' dump, then machine_halt() */ ++#ifdef CONFIG_SYSRQ ++ handle_sysrq('p', &current->thread.regs, NULL, NULL); ++#endif ++ machine_halt(); ++ return(0); ++} ++ ++static 
struct notifier_block panic_exit_notifier = { ++ .notifier_call = panic_exit, ++ .next = NULL, ++ .priority = 0 ++}; ++ ++void __init setup_arch(char **cmdline_p) ++{ ++ notifier_chain_register(&panic_notifier_list, &panic_exit_notifier); ++ paging_init(); ++ strcpy(command_line, saved_command_line); ++ *cmdline_p = command_line; ++ setup_hostinfo(); ++} ++ ++void __init check_bugs(void) ++{ ++ arch_check_bugs(); ++ check_ptrace(); ++ check_sigio(); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/umid.c um/arch/um/kernel/umid.c +--- orig/arch/um/kernel/umid.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/umid.c Mon Feb 24 23:11:23 2003 +@@ -0,0 +1,319 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdio.h> ++#include <unistd.h> ++#include <fcntl.h> ++#include <errno.h> ++#include <string.h> ++#include <stdlib.h> ++#include <dirent.h> ++#include <signal.h> ++#include <sys/stat.h> ++#include <sys/param.h> ++#include "user.h" ++#include "umid.h" ++#include "init.h" ++#include "os.h" ++#include "user_util.h" ++#include "choose-mode.h" ++ ++#define UMID_LEN 64 ++#define UML_DIR "~/.uml/" ++ ++/* Changed by set_umid and make_umid, which are run early in boot */ ++static char umid[UMID_LEN] = { 0 }; ++ ++/* Changed by set_uml_dir and make_uml_dir, which are run early in boot */ ++static char *uml_dir = UML_DIR; ++ ++/* Changed by set_umid */ ++static int umid_is_random = 1; ++static int umid_inited = 0; ++ ++static int make_umid(int (*printer)(const char *fmt, ...)); ++ ++static int __init set_umid(char *name, int 
is_random, ++ int (*printer)(const char *fmt, ...)) ++{ ++ if(umid_inited){ ++ (*printer)("Unique machine name can't be set twice\n"); ++ return(-1); ++ } ++ ++ if(strlen(name) > UMID_LEN - 1) ++ (*printer)("Unique machine name is being truncated to %s " ++ "characters\n", UMID_LEN); ++ strncpy(umid, name, UMID_LEN - 1); ++ umid[UMID_LEN - 1] = '\0'; ++ ++ umid_is_random = is_random; ++ umid_inited = 1; ++ return 0; ++} ++ ++static int __init set_umid_arg(char *name, int *add) ++{ ++ return(set_umid(name, 0, printf)); ++} ++ ++__uml_setup("umid=", set_umid_arg, ++"umid=<name>\n" ++" This is used to assign a unique identity to this UML machine and\n" ++" is used for naming the pid file and management console socket.\n\n" ++); ++ ++int __init umid_file_name(char *name, char *buf, int len) ++{ ++ int n; ++ ++ if(!umid_inited && make_umid(printk)) return(-1); ++ ++ n = strlen(uml_dir) + strlen(umid) + strlen(name) + 1; ++ if(n > len){ ++ printk("umid_file_name : buffer too short\n"); ++ return(-1); ++ } ++ ++ sprintf(buf, "%s%s/%s", uml_dir, umid, name); ++ return(0); ++} ++ ++extern int tracing_pid; ++ ++static int __init create_pid_file(void) ++{ ++ char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")]; ++ char pid[sizeof("nnnnn\0")]; ++ int fd; ++ ++ if(umid_file_name("pid", file, sizeof(file))) return 0; ++ ++ fd = os_open_file(file, of_create(of_excl(of_rdwr(OPENFLAGS()))), ++ 0644); ++ if(fd < 0){ ++ printf("Open of machine pid file \"%s\" failed - " ++ "errno = %d\n", file, -fd); ++ return 0; ++ } ++ ++ sprintf(pid, "%d\n", os_getpid()); ++ if(write(fd, pid, strlen(pid)) != strlen(pid)) ++ printf("Write of pid file failed - errno = %d\n", errno); ++ close(fd); ++ return 0; ++} ++ ++static int actually_do_remove(char *dir) ++{ ++ DIR *directory; ++ struct dirent *ent; ++ int len; ++ char file[256]; ++ ++ if((directory = opendir(dir)) == NULL){ ++ printk("actually_do_remove : couldn't open directory '%s', " ++ "errno = %d\n", dir, errno); ++ return(1); ++ } ++ 
while((ent = readdir(directory)) != NULL){ ++ if(!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, "..")) ++ continue; ++ len = strlen(dir) + sizeof("/") + strlen(ent->d_name) + 1; ++ if(len > sizeof(file)){ ++ printk("Not deleting '%s' from '%s' - name too long\n", ++ ent->d_name, dir); ++ continue; ++ } ++ sprintf(file, "%s/%s", dir, ent->d_name); ++ if(unlink(file) < 0){ ++ printk("actually_do_remove : couldn't remove '%s' " ++ "from '%s', errno = %d\n", ent->d_name, dir, ++ errno); ++ return(1); ++ } ++ } ++ if(rmdir(dir) < 0){ ++ printk("actually_do_remove : couldn't rmdir '%s', " ++ "errno = %d\n", dir, errno); ++ return(1); ++ } ++ return(0); ++} ++ ++void remove_umid_dir(void) ++{ ++ char dir[strlen(uml_dir) + UMID_LEN + 1]; ++ if(!umid_inited) return; ++ ++ sprintf(dir, "%s%s", uml_dir, umid); ++ actually_do_remove(dir); ++} ++ ++char *get_umid(int only_if_set) ++{ ++ if(only_if_set && umid_is_random) return(NULL); ++ return(umid); ++} ++ ++int not_dead_yet(char *dir) ++{ ++ char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")]; ++ char pid[sizeof("nnnnn\0")], *end; ++ int dead, fd, p; ++ ++ sprintf(file, "%s/pid", dir); ++ dead = 0; ++ if((fd = os_open_file(file, of_read(OPENFLAGS()), 0)) < 0){ ++ if(fd != -ENOENT){ ++ printk("not_dead_yet : couldn't open pid file '%s', " ++ "errno = %d\n", file, -fd); ++ return(1); ++ } ++ dead = 1; ++ } ++ if(fd > 0){ ++ if(read(fd, pid, sizeof(pid)) < 0){ ++ printk("not_dead_yet : couldn't read pid file '%s', " ++ "errno = %d\n", file, errno); ++ return(1); ++ } ++ p = strtoul(pid, &end, 0); ++ if(end == pid){ ++ printk("not_dead_yet : couldn't parse pid file '%s', " ++ "errno = %d\n", file, errno); ++ dead = 1; ++ } ++ if(((kill(p, 0) < 0) && (errno == ESRCH)) || ++ (p == CHOOSE_MODE(tracing_pid, os_getpid()))) ++ dead = 1; ++ } ++ if(!dead) return(1); ++ return(actually_do_remove(dir)); ++} ++ ++static int __init set_uml_dir(char *name, int *add) ++{ ++ if((strlen(name) > 0) && (name[strlen(name) - 1] != '/')){ ++ 
uml_dir = malloc(strlen(name) + 1); ++ if(uml_dir == NULL){ ++ printf("Failed to malloc uml_dir - error = %d\n", ++ errno); ++ uml_dir = name; ++ return(0); ++ } ++ sprintf(uml_dir, "%s/", name); ++ } ++ else uml_dir = name; ++ return 0; ++} ++ ++static int __init make_uml_dir(void) ++{ ++ char dir[MAXPATHLEN + 1] = { '\0' }; ++ int len; ++ ++ if(*uml_dir == '~'){ ++ char *home = getenv("HOME"); ++ ++ if(home == NULL){ ++ printf("make_uml_dir : no value in environment for " ++ "$HOME\n"); ++ exit(1); ++ } ++ strncpy(dir, home, sizeof(dir)); ++ uml_dir++; ++ } ++ len = strlen(dir); ++ strncat(dir, uml_dir, sizeof(dir) - len); ++ len = strlen(dir); ++ if((len > 0) && (len < sizeof(dir) - 1) && (dir[len - 1] != '/')){ ++ dir[len] = '/'; ++ dir[len + 1] = '\0'; ++ } ++ ++ if((uml_dir = malloc(strlen(dir) + 1)) == NULL){ ++ printf("make_uml_dir : malloc failed, errno = %d\n", errno); ++ exit(1); ++ } ++ strcpy(uml_dir, dir); ++ ++ if((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)){ ++ printf("Failed to mkdir %s - errno = %i\n", uml_dir, errno); ++ return(-1); ++ } ++ return 0; ++} ++ ++static int __init make_umid(int (*printer)(const char *fmt, ...)) ++{ ++ int fd, err; ++ char tmp[strlen(uml_dir) + UMID_LEN + 1]; ++ ++ strncpy(tmp, uml_dir, sizeof(tmp) - 1); ++ tmp[sizeof(tmp) - 1] = '\0'; ++ ++ if(!umid_inited){ ++ strcat(tmp, "XXXXXX"); ++ fd = mkstemp(tmp); ++ if(fd < 0){ ++ (*printer)("make_umid - mkstemp failed, errno = %d\n", ++ errno); ++ return(1); ++ } ++ ++ close(fd); ++ /* There's a nice tiny little race between this unlink and ++ * the mkdir below. It'd be nice if there were a mkstemp ++ * for directories. 
++ */ ++ unlink(tmp); ++ set_umid(&tmp[strlen(uml_dir)], 1, printer); ++ } ++ ++ sprintf(tmp, "%s%s", uml_dir, umid); ++ ++ if((err = mkdir(tmp, 0777)) < 0){ ++ if(errno == EEXIST){ ++ if(not_dead_yet(tmp)){ ++ (*printer)("umid '%s' is in use\n", umid); ++ return(-1); ++ } ++ err = mkdir(tmp, 0777); ++ } ++ } ++ if(err < 0){ ++ (*printer)("Failed to create %s - errno = %d\n", umid, errno); ++ return(-1); ++ } ++ ++ return(0); ++} ++ ++__uml_setup("uml_dir=", set_uml_dir, ++"uml_dir=<directory>\n" ++" The location to place the pid and umid files.\n\n" ++); ++ ++__uml_postsetup(make_uml_dir); ++ ++static int __init make_umid_setup(void) ++{ ++ return(make_umid(printf)); ++} ++ ++__uml_postsetup(make_umid_setup); ++__uml_postsetup(create_pid_file); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/kernel/user_syms.c um/arch/um/kernel/user_syms.c +--- orig/arch/um/kernel/user_syms.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/user_syms.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,116 @@ ++#include <stdio.h> ++#include <unistd.h> ++#include <fcntl.h> ++#include <dirent.h> ++#include <errno.h> ++#include <utime.h> ++#include <string.h> ++#include <sys/stat.h> ++#include <sys/vfs.h> ++#include <sys/ioctl.h> ++#include "user_util.h" ++#include "mem_user.h" ++ ++/* XXX All the __CONFIG_* stuff is broken because this file can't include ++ * config.h ++ */ ++ ++/* Had to steal this from linux/module.h because that file can't be included ++ * since this includes various user-level headers. 
++ */ ++ ++struct module_symbol ++{ ++ unsigned long value; ++ const char *name; ++}; ++ ++/* Indirect stringification. */ ++ ++#define __MODULE_STRING_1(x) #x ++#define __MODULE_STRING(x) __MODULE_STRING_1(x) ++ ++#if !defined(__AUTOCONF_INCLUDED__) ++ ++#define __EXPORT_SYMBOL(sym,str) error config_must_be_included_before_module ++#define EXPORT_SYMBOL(var) error config_must_be_included_before_module ++#define EXPORT_SYMBOL_NOVERS(var) error config_must_be_included_before_module ++ ++#elif !defined(__CONFIG_MODULES__) ++ ++#define __EXPORT_SYMBOL(sym,str) ++#define EXPORT_SYMBOL(var) ++#define EXPORT_SYMBOL_NOVERS(var) ++ ++#else ++ ++#define __EXPORT_SYMBOL(sym, str) \ ++const char __kstrtab_##sym[] \ ++__attribute__((section(".kstrtab"))) = str; \ ++const struct module_symbol __ksymtab_##sym \ ++__attribute__((section("__ksymtab"))) = \ ++{ (unsigned long)&sym, __kstrtab_##sym } ++ ++#if defined(__MODVERSIONS__) || !defined(__CONFIG_MODVERSIONS__) ++#define EXPORT_SYMBOL(var) __EXPORT_SYMBOL(var, __MODULE_STRING(var)) ++#else ++#define EXPORT_SYMBOL(var) __EXPORT_SYMBOL(var, __MODULE_STRING(__VERSIONED_SYMBOL(var))) ++#endif ++ ++#define EXPORT_SYMBOL_NOVERS(var) __EXPORT_SYMBOL(var, __MODULE_STRING(var)) ++ ++#endif ++ ++EXPORT_SYMBOL(__errno_location); ++ ++EXPORT_SYMBOL(access); ++EXPORT_SYMBOL(open); ++EXPORT_SYMBOL(open64); ++EXPORT_SYMBOL(close); ++EXPORT_SYMBOL(read); ++EXPORT_SYMBOL(write); ++EXPORT_SYMBOL(dup2); ++EXPORT_SYMBOL(__xstat); ++EXPORT_SYMBOL(__lxstat); ++EXPORT_SYMBOL(__lxstat64); ++EXPORT_SYMBOL(lseek); ++EXPORT_SYMBOL(lseek64); ++EXPORT_SYMBOL(chown); ++EXPORT_SYMBOL(truncate); ++EXPORT_SYMBOL(utime); ++EXPORT_SYMBOL(chmod); ++EXPORT_SYMBOL(rename); ++EXPORT_SYMBOL(__xmknod); ++ ++EXPORT_SYMBOL(symlink); ++EXPORT_SYMBOL(link); ++EXPORT_SYMBOL(unlink); ++EXPORT_SYMBOL(readlink); ++ ++EXPORT_SYMBOL(mkdir); ++EXPORT_SYMBOL(rmdir); ++EXPORT_SYMBOL(opendir); ++EXPORT_SYMBOL(readdir); ++EXPORT_SYMBOL(closedir); ++EXPORT_SYMBOL(seekdir); 
++EXPORT_SYMBOL(telldir); ++ ++EXPORT_SYMBOL(ioctl); ++ ++extern ssize_t pread64 (int __fd, void *__buf, size_t __nbytes, ++ __off64_t __offset); ++extern ssize_t pwrite64 (int __fd, __const void *__buf, size_t __n, ++ __off64_t __offset); ++EXPORT_SYMBOL(pread64); ++EXPORT_SYMBOL(pwrite64); ++ ++EXPORT_SYMBOL(statfs); ++EXPORT_SYMBOL(statfs64); ++ ++EXPORT_SYMBOL(memcpy); ++EXPORT_SYMBOL(getuid); ++ ++EXPORT_SYMBOL(memset); ++EXPORT_SYMBOL(strstr); ++ ++EXPORT_SYMBOL(find_iomem); +diff -Naur -X ../exclude-files orig/arch/um/kernel/user_util.c um/arch/um/kernel/user_util.c +--- orig/arch/um/kernel/user_util.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/kernel/user_util.c Wed Apr 23 20:41:54 2003 +@@ -0,0 +1,164 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdio.h> ++#include <stdlib.h> ++#include <fcntl.h> ++#include <unistd.h> ++#include <limits.h> ++#include <sys/mman.h> ++#include <sys/stat.h> ++#include <sys/ptrace.h> ++#include <sys/utsname.h> ++#include <sys/param.h> ++#include <sys/time.h> ++#include "asm/types.h" ++#include <ctype.h> ++#include <signal.h> ++#include <wait.h> ++#include <errno.h> ++#include <stdarg.h> ++#include <sched.h> ++#include <termios.h> ++#include <string.h> ++#include "user_util.h" ++#include "kern_util.h" ++#include "user.h" ++#include "mem_user.h" ++#include "init.h" ++#include "helper.h" ++#include "uml-config.h" ++ ++#define COMMAND_LINE_SIZE _POSIX_ARG_MAX ++ ++/* Changed in linux_main and setup_arch, which run before SMP is started */ ++char saved_command_line[COMMAND_LINE_SIZE] = { 0 }; ++char command_line[COMMAND_LINE_SIZE] = { 0 }; ++ ++void add_arg(char *cmd_line, char *arg) ++{ ++ if (strlen(cmd_line) + strlen(arg) + 1 > COMMAND_LINE_SIZE) { ++ printf("add_arg: Too much command line!\n"); ++ exit(1); ++ } ++ if(strlen(cmd_line) > 0) strcat(cmd_line, " "); ++ strcat(cmd_line, arg); ++} ++ ++void stop(void) ++{ ++ while(1) sleep(1000000); ++} ++ 
++void stack_protections(unsigned long address) ++{ ++ int prot = PROT_READ | PROT_WRITE | PROT_EXEC; ++ ++ if(mprotect((void *) address, page_size(), prot) < 0) ++ panic("protecting stack failed, errno = %d", errno); ++} ++ ++void task_protections(unsigned long address) ++{ ++ unsigned long guard = address + page_size(); ++ unsigned long stack = guard + page_size(); ++ int prot = 0, pages; ++#ifdef notdef ++ if(mprotect((void *) guard, page_size(), prot) < 0) ++ panic("protecting guard page failed, errno = %d", errno); ++#endif ++ pages = (1 << UML_CONFIG_KERNEL_STACK_ORDER) - 2; ++ prot = PROT_READ | PROT_WRITE | PROT_EXEC; ++ if(mprotect((void *) stack, pages * page_size(), prot) < 0) ++ panic("protecting stack failed, errno = %d", errno); ++} ++ ++int wait_for_stop(int pid, int sig, int cont_type, void *relay) ++{ ++ sigset_t *relay_signals = relay; ++ int status, ret; ++ ++ while(1){ ++ if(((ret = waitpid(pid, &status, WUNTRACED)) < 0) || ++ !WIFSTOPPED(status) || (WSTOPSIG(status) != sig)){ ++ if(ret < 0){ ++ if(errno == EINTR) continue; ++ printk("wait failed, errno = %d\n", ++ errno); ++ } ++ else if(WIFEXITED(status)) ++ printk("process exited with status %d\n", ++ WEXITSTATUS(status)); ++ else if(WIFSIGNALED(status)) ++ printk("process exited with signal %d\n", ++ WTERMSIG(status)); ++ else if((WSTOPSIG(status) == SIGVTALRM) || ++ (WSTOPSIG(status) == SIGALRM) || ++ (WSTOPSIG(status) == SIGIO) || ++ (WSTOPSIG(status) == SIGPROF) || ++ (WSTOPSIG(status) == SIGCHLD) || ++ (WSTOPSIG(status) == SIGWINCH) || ++ (WSTOPSIG(status) == SIGINT)){ ++ ptrace(cont_type, pid, 0, WSTOPSIG(status)); ++ continue; ++ } ++ else if((relay_signals != NULL) && ++ sigismember(relay_signals, WSTOPSIG(status))){ ++ ptrace(cont_type, pid, 0, WSTOPSIG(status)); ++ continue; ++ } ++ else printk("process stopped with signal %d\n", ++ WSTOPSIG(status)); ++ panic("wait_for_stop failed to wait for %d to stop " ++ "with %d\n", pid, sig); ++ } ++ return(status); ++ } ++} ++ ++int raw(int 
fd, int complain) ++{ ++ struct termios tt; ++ int err; ++ ++ tcgetattr(fd, &tt); ++ cfmakeraw(&tt); ++ err = tcsetattr(fd, TCSANOW, &tt); ++ if((err < 0) && complain){ ++ printk("tcsetattr failed, errno = %d\n", errno); ++ return(-errno); ++ } ++ return(0); ++} ++ ++void setup_machinename(char *machine_out) ++{ ++ struct utsname host; ++ ++ uname(&host); ++ strcpy(machine_out, host.machine); ++} ++ ++char host_info[(_UTSNAME_LENGTH + 1) * 4 + _UTSNAME_NODENAME_LENGTH + 1]; ++ ++void setup_hostinfo(void) ++{ ++ struct utsname host; ++ ++ uname(&host); ++ sprintf(host_info, "%s %s %s %s %s", host.sysname, host.nodename, ++ host.release, host.version, host.machine); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/link.ld.in um/arch/um/link.ld.in +--- orig/arch/um/link.ld.in Wed Dec 31 19:00:00 1969 ++++ um/arch/um/link.ld.in Fri Jan 17 23:11:30 2003 +@@ -0,0 +1,95 @@ ++OUTPUT_FORMAT("ELF_FORMAT") ++OUTPUT_ARCH(ELF_ARCH) ++ENTRY(_start) ++ ++SECTIONS ++{ ++ . = START() + SIZEOF_HEADERS; ++ ++ . = ALIGN(4096); ++ __binary_start = .; ++ifdef(`MODE_TT', ` ++ .thread_private : { ++ __start_thread_private = .; ++ errno = .; ++ . += 4; ++ arch/um/kernel/tt/unmap_fin.o (.data) ++ __end_thread_private = .; ++ } ++ . = ALIGN(4096); ++ .remap : { arch/um/kernel/tt/unmap_fin.o (.text) } ++') ++ . = ALIGN(4096); /* Init code and data */ ++ _stext = .; ++ __init_begin = .; ++ .text.init : { *(.text.init) } ++ . = ALIGN(4096); ++ .text : ++ { ++ *(.text) ++ /* .gnu.warning sections are handled specially by elf32.em. 
*/ ++ *(.gnu.warning) ++ *(.gnu.linkonce.t*) ++ } ++ .fini : { *(.fini) } =0x9090 ++ .rodata : { *(.rodata) *(.gnu.linkonce.r*) } ++ .rodata1 : { *(.rodata1) } ++ _etext = .; ++ PROVIDE (etext = .); ++ ++ . = ALIGN(4096); ++ PROVIDE (_sdata = .); ++ ++include(`arch/um/common.ld.in') ++ ++ .data : ++ { ++ . = ALIGN(KERNEL_STACK_SIZE); /* init_task */ ++ *(.data.init_task) ++ *(.data) ++ *(.gnu.linkonce.d*) ++ CONSTRUCTORS ++ } ++ .data1 : { *(.data1) } ++ .ctors : ++ { ++ *(.ctors) ++ } ++ .dtors : ++ { ++ *(.dtors) ++ } ++ ++ .got : { *(.got.plt) *(.got) } ++ .dynamic : { *(.dynamic) } ++ /* We want the small data sections together, so single-instruction offsets ++ can access them all, and initialized data all before uninitialized, so ++ we can shorten the on-disk segment size. */ ++ .sdata : { *(.sdata) } ++ _edata = .; ++ PROVIDE (edata = .); ++ . = ALIGN(0x1000); ++ .sbss : ++ { ++ __bss_start = .; ++ PROVIDE(_bss_start = .); ++ *(.sbss) ++ *(.scommon) ++ } ++ .bss : ++ { ++ *(.dynbss) ++ *(.bss) ++ *(COMMON) ++ } ++ _end = . ; ++ PROVIDE (end = .); ++ /* Stabs debugging sections. 
*/ ++ .stab 0 : { *(.stab) } ++ .stabstr 0 : { *(.stabstr) } ++ .stab.excl 0 : { *(.stab.excl) } ++ .stab.exclstr 0 : { *(.stab.exclstr) } ++ .stab.index 0 : { *(.stab.index) } ++ .stab.indexstr 0 : { *(.stab.indexstr) } ++ .comment 0 : { *(.comment) } ++} +diff -Naur -X ../exclude-files orig/arch/um/main.c um/arch/um/main.c +--- orig/arch/um/main.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/main.c Fri Jan 17 13:22:40 2003 +@@ -0,0 +1,195 @@ ++/* ++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <unistd.h> ++#include <stdio.h> ++#include <stdlib.h> ++#include <string.h> ++#include <signal.h> ++#include <sys/resource.h> ++#include <sys/mman.h> ++#include <sys/user.h> ++#include <asm/page.h> ++#include "user_util.h" ++#include "kern_util.h" ++#include "mem_user.h" ++#include "signal_user.h" ++#include "user.h" ++#include "init.h" ++#include "mode.h" ++#include "choose-mode.h" ++#include "uml-config.h" ++ ++/* Set in set_stklim, which is called from main and __wrap_malloc. ++ * __wrap_malloc only calls it if main hasn't started. 
++ */ ++unsigned long stacksizelim; ++ ++/* Set in main */ ++char *linux_prog; ++ ++#define PGD_BOUND (4 * 1024 * 1024) ++#define STACKSIZE (8 * 1024 * 1024) ++#define THREAD_NAME_LEN (256) ++ ++static void set_stklim(void) ++{ ++ struct rlimit lim; ++ ++ if(getrlimit(RLIMIT_STACK, &lim) < 0){ ++ perror("getrlimit"); ++ exit(1); ++ } ++ if((lim.rlim_cur == RLIM_INFINITY) || (lim.rlim_cur > STACKSIZE)){ ++ lim.rlim_cur = STACKSIZE; ++ if(setrlimit(RLIMIT_STACK, &lim) < 0){ ++ perror("setrlimit"); ++ exit(1); ++ } ++ } ++ stacksizelim = (lim.rlim_cur + PGD_BOUND - 1) & ~(PGD_BOUND - 1); ++} ++ ++static __init void do_uml_initcalls(void) ++{ ++ initcall_t *call; ++ ++ call = &__uml_initcall_start; ++ while (call < &__uml_initcall_end){; ++ (*call)(); ++ call++; ++ } ++} ++ ++static void last_ditch_exit(int sig) ++{ ++ CHOOSE_MODE(kmalloc_ok = 0, (void) 0); ++ signal(SIGINT, SIG_DFL); ++ signal(SIGTERM, SIG_DFL); ++ signal(SIGHUP, SIG_DFL); ++ uml_cleanup(); ++ exit(1); ++} ++ ++extern int uml_exitcode; ++ ++int main(int argc, char **argv, char **envp) ++{ ++ char **new_argv; ++ sigset_t mask; ++ int ret, i; ++ ++ /* Enable all signals except SIGIO - in some environments, we can ++ * enter with some signals blocked ++ */ ++ ++ sigemptyset(&mask); ++ sigaddset(&mask, SIGIO); ++ if(sigprocmask(SIG_SETMASK, &mask, NULL) < 0){ ++ perror("sigprocmask"); ++ exit(1); ++ } ++ ++#ifdef UML_CONFIG_MODE_TT ++ /* Allocate memory for thread command lines */ ++ if(argc < 2 || strlen(argv[1]) < THREAD_NAME_LEN - 1){ ++ ++ char padding[THREAD_NAME_LEN] = { ++ [ 0 ... 
THREAD_NAME_LEN - 2] = ' ', '\0' ++ }; ++ ++ new_argv = malloc((argc + 2) * sizeof(char*)); ++ if(!new_argv) { ++ perror("Allocating extended argv"); ++ exit(1); ++ } ++ ++ new_argv[0] = argv[0]; ++ new_argv[1] = padding; ++ ++ for(i = 2; i <= argc; i++) ++ new_argv[i] = argv[i - 1]; ++ new_argv[argc + 1] = NULL; ++ ++ execvp(new_argv[0], new_argv); ++ perror("execing with extended args"); ++ exit(1); ++ } ++#endif ++ ++ linux_prog = argv[0]; ++ ++ set_stklim(); ++ ++ if((new_argv = malloc((argc + 1) * sizeof(char *))) == NULL){ ++ perror("Mallocing argv"); ++ exit(1); ++ } ++ for(i=0;i<argc;i++){ ++ if((new_argv[i] = strdup(argv[i])) == NULL){ ++ perror("Mallocing an arg"); ++ exit(1); ++ } ++ } ++ new_argv[argc] = NULL; ++ ++ set_handler(SIGINT, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1); ++ set_handler(SIGTERM, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1); ++ set_handler(SIGHUP, last_ditch_exit, SA_ONESHOT | SA_NODEFER, -1); ++ ++ do_uml_initcalls(); ++ ret = linux_main(argc, argv); ++ ++ /* Reboot */ ++ if(ret){ ++ printf("\n"); ++ execvp(new_argv[0], new_argv); ++ perror("Failed to exec kernel"); ++ ret = 1; ++ } ++ printf("\n"); ++ return(uml_exitcode); ++} ++ ++#define CAN_KMALLOC() \ ++ (kmalloc_ok && CHOOSE_MODE((getpid() != tracing_pid), 1)) ++ ++extern void *__real_malloc(int); ++ ++void *__wrap_malloc(int size) ++{ ++ if(CAN_KMALLOC()) ++ return(um_kmalloc(size)); ++ else ++ return(__real_malloc(size)); ++} ++ ++void *__wrap_calloc(int n, int size) ++{ ++ void *ptr = __wrap_malloc(n * size); ++ ++ if(ptr == NULL) return(NULL); ++ memset(ptr, 0, n * size); ++ return(ptr); ++} ++ ++extern void __real_free(void *); ++ ++void __wrap_free(void *ptr) ++{ ++ if(CAN_KMALLOC()) kfree(ptr); ++ else __real_free(ptr); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/os-Linux/Makefile um/arch/um/os-Linux/Makefile +--- orig/arch/um/os-Linux/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/os-Linux/Makefile Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,17 @@ ++# ++# Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++O_TARGET = built-in.o ++ ++obj-y = file.o process.o tty.o ++ ++include $(TOPDIR)/Rules.make ++ ++$(obj-y) : %.o: %.c ++ $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $< ++ ++clean : ++ ++archmrproper: +diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/Makefile um/arch/um/os-Linux/drivers/Makefile +--- orig/arch/um/os-Linux/drivers/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/os-Linux/drivers/Makefile Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,31 @@ ++# ++# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++O_TARGET := drivers.o ++ ++list-multi := tuntap.o ethertap.o ++ ++ethertap-objs := ethertap_kern.o ethertap_user.o ++tuntap-objs := tuntap_kern.o tuntap_user.o ++ ++obj-y = ++obj-$(CONFIG_UML_NET_ETHERTAP) += ethertap.o ++obj-$(CONFIG_UML_NET_TUNTAP) += tuntap.o ++ ++USER_SINGLE_OBJS = $(foreach f,$(patsubst %.o,%,$(obj-y)),$($(f)-objs)) ++ ++USER_OBJS = $(filter %_user.o,$(obj-y) $(USER_SINGLE_OBJS)) ++ ++include $(TOPDIR)/Rules.make ++ ++$(USER_OBJS) : %.o: %.c ++ $(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $< ++ ++ethertap.o : $(ethertap-objs) ++ ++tuntap.o : $(tuntap-objs) ++ ++$(list-multi) : # This doesn't work, but should : '%.o : $(%-objs)' ++ $(LD) $(LD_RFLAG) -r -o $@ $($(patsubst %.o,%,$@)-objs) +diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/etap.h um/arch/um/os-Linux/drivers/etap.h +--- orig/arch/um/os-Linux/drivers/etap.h Wed Dec 31 19:00:00 1969 ++++ um/arch/um/os-Linux/drivers/etap.h Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,27 @@ ++/* 
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "net_user.h" ++ ++struct ethertap_data { ++ char *dev_name; ++ char *gate_addr; ++ int data_fd; ++ int control_fd; ++ void *dev; ++}; ++ ++extern struct net_user_info ethertap_user_info; ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/ethertap_kern.c um/arch/um/os-Linux/drivers/ethertap_kern.c +--- orig/arch/um/os-Linux/drivers/ethertap_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/os-Linux/drivers/ethertap_kern.c Sun Dec 15 21:17:37 2002 +@@ -0,0 +1,122 @@ ++/* ++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and ++ * James Leu (jleu@mindspring.net). ++ * Copyright (C) 2001 by various other people who didn't put their name here. ++ * Licensed under the GPL. 
++ */ ++ ++#include "linux/init.h" ++#include "linux/netdevice.h" ++#include "linux/etherdevice.h" ++#include "linux/init.h" ++#include "net_kern.h" ++#include "net_user.h" ++#include "etap.h" ++ ++struct ethertap_init { ++ char *dev_name; ++ char *gate_addr; ++}; ++ ++static void etap_init(struct net_device *dev, void *data) ++{ ++ struct uml_net_private *pri; ++ struct ethertap_data *epri; ++ struct ethertap_init *init = data; ++ ++ init_etherdev(dev, 0); ++ pri = dev->priv; ++ epri = (struct ethertap_data *) pri->user; ++ *epri = ((struct ethertap_data) ++ { .dev_name = init->dev_name, ++ .gate_addr = init->gate_addr, ++ .data_fd = -1, ++ .control_fd = -1, ++ .dev = dev }); ++ ++ printk("ethertap backend - %s", epri->dev_name); ++ if(epri->gate_addr != NULL) ++ printk(", IP = %s", epri->gate_addr); ++ printk("\n"); ++} ++ ++static int etap_read(int fd, struct sk_buff **skb, struct uml_net_private *lp) ++{ ++ int len; ++ ++ *skb = ether_adjust_skb(*skb, ETH_HEADER_ETHERTAP); ++ if(*skb == NULL) return(-ENOMEM); ++ len = net_recvfrom(fd, (*skb)->mac.raw, ++ (*skb)->dev->mtu + 2 * ETH_HEADER_ETHERTAP); ++ if(len <= 0) return(len); ++ skb_pull(*skb, 2); ++ len -= 2; ++ return(len); ++} ++ ++static int etap_write(int fd, struct sk_buff **skb, struct uml_net_private *lp) ++{ ++ if(skb_headroom(*skb) < 2){ ++ struct sk_buff *skb2; ++ ++ skb2 = skb_realloc_headroom(*skb, 2); ++ dev_kfree_skb(*skb); ++ if (skb2 == NULL) return(-ENOMEM); ++ *skb = skb2; ++ } ++ skb_push(*skb, 2); ++ return(net_send(fd, (*skb)->data, (*skb)->len)); ++} ++ ++struct net_kern_info ethertap_kern_info = { ++ .init = etap_init, ++ .protocol = eth_protocol, ++ .read = etap_read, ++ .write = etap_write, ++}; ++ ++int ethertap_setup(char *str, char **mac_out, void *data) ++{ ++ struct ethertap_init *init = data; ++ ++ *init = ((struct ethertap_init) ++ { .dev_name = NULL, ++ .gate_addr = NULL }); ++ if(tap_setup_common(str, "ethertap", &init->dev_name, mac_out, ++ &init->gate_addr)) ++ return(0); 
++ if(init->dev_name == NULL){ ++ printk("ethertap_setup : Missing tap device name\n"); ++ return(0); ++ } ++ ++ return(1); ++} ++ ++static struct transport ethertap_transport = { ++ .list = LIST_HEAD_INIT(ethertap_transport.list), ++ .name = "ethertap", ++ .setup = ethertap_setup, ++ .user = &ethertap_user_info, ++ .kern = &ethertap_kern_info, ++ .private_size = sizeof(struct ethertap_data), ++}; ++ ++static int register_ethertap(void) ++{ ++ register_transport(&ethertap_transport); ++ return(1); ++} ++ ++__initcall(register_ethertap); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/ethertap_user.c um/arch/um/os-Linux/drivers/ethertap_user.c +--- orig/arch/um/os-Linux/drivers/ethertap_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/os-Linux/drivers/ethertap_user.c Sun Dec 15 21:17:52 2002 +@@ -0,0 +1,238 @@ ++/* ++ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and ++ * James Leu (jleu@mindspring.net). ++ * Copyright (C) 2001 by various other people who didn't put their name here. ++ * Licensed under the GPL.
++ */ ++ ++#include <stdio.h> ++#include <unistd.h> ++#include <stddef.h> ++#include <fcntl.h> ++#include <stdlib.h> ++#include <sys/errno.h> ++#include <sys/socket.h> ++#include <sys/wait.h> ++#include <sys/un.h> ++#include <net/if.h> ++#include "user.h" ++#include "kern_util.h" ++#include "net_user.h" ++#include "etap.h" ++#include "helper.h" ++#include "os.h" ++ ++#define MAX_PACKET ETH_MAX_PACKET ++ ++void etap_user_init(void *data, void *dev) ++{ ++ struct ethertap_data *pri = data; ++ ++ pri->dev = dev; ++} ++ ++struct addr_change { ++ enum { ADD_ADDR, DEL_ADDR } what; ++ unsigned char addr[4]; ++ unsigned char netmask[4]; ++}; ++ ++static void etap_change(int op, unsigned char *addr, unsigned char *netmask, ++ int fd) ++{ ++ struct addr_change change; ++ void *output; ++ ++ change.what = op; ++ memcpy(change.addr, addr, sizeof(change.addr)); ++ memcpy(change.netmask, netmask, sizeof(change.netmask)); ++ if(write(fd, &change, sizeof(change)) != sizeof(change)) ++ printk("etap_change - request failed, errno = %d\n", ++ errno); ++ output = um_kmalloc(page_size()); ++ if(output == NULL) ++ printk("etap_change : Failed to allocate output buffer\n"); ++ read_output(fd, output, page_size()); ++ if(output != NULL){ ++ printk("%s", output); ++ kfree(output); ++ } ++} ++ ++static void etap_open_addr(unsigned char *addr, unsigned char *netmask, ++ void *arg) ++{ ++ etap_change(ADD_ADDR, addr, netmask, *((int *) arg)); ++} ++ ++static void etap_close_addr(unsigned char *addr, unsigned char *netmask, ++ void *arg) ++{ ++ etap_change(DEL_ADDR, addr, netmask, *((int *) arg)); ++} ++ ++struct etap_pre_exec_data { ++ int control_remote; ++ int control_me; ++ int data_me; ++}; ++ ++static void etap_pre_exec(void *arg) ++{ ++ struct etap_pre_exec_data *data = arg; ++ ++ dup2(data->control_remote, 1); ++ close(data->data_me); ++ close(data->control_me); ++} ++ ++static int etap_tramp(char *dev, char *gate, int control_me, ++ int control_remote, int data_me, int data_remote) ++{ 
++ struct etap_pre_exec_data pe_data; ++ int pid, status, err; ++ char version_buf[sizeof("nnnnn\0")]; ++ char data_fd_buf[sizeof("nnnnnn\0")]; ++ char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")]; ++ char *setup_args[] = { "uml_net", version_buf, "ethertap", dev, ++ data_fd_buf, gate_buf, NULL }; ++ char *nosetup_args[] = { "uml_net", version_buf, "ethertap", ++ dev, data_fd_buf, NULL }; ++ char **args, c; ++ ++ sprintf(data_fd_buf, "%d", data_remote); ++ sprintf(version_buf, "%d", UML_NET_VERSION); ++ if(gate != NULL){ ++ strcpy(gate_buf, gate); ++ args = setup_args; ++ } ++ else args = nosetup_args; ++ ++ err = 0; ++ pe_data.control_remote = control_remote; ++ pe_data.control_me = control_me; ++ pe_data.data_me = data_me; ++ pid = run_helper(etap_pre_exec, &pe_data, args, NULL); ++ ++ if(pid < 0) err = errno; ++ close(data_remote); ++ close(control_remote); ++ if(read(control_me, &c, sizeof(c)) != sizeof(c)){ ++ printk("etap_tramp : read of status failed, errno = %d\n", ++ errno); ++ return(EINVAL); ++ } ++ if(c != 1){ ++ printk("etap_tramp : uml_net failed\n"); ++ err = EINVAL; ++ if(waitpid(pid, &status, 0) < 0) err = errno; ++ else if(!WIFEXITED(status) || (WEXITSTATUS(status) != 1)){ ++ printk("uml_net didn't exit with status 1\n"); ++ } ++ } ++ return(err); ++} ++ ++static int etap_open(void *data) ++{ ++ struct ethertap_data *pri = data; ++ char *output; ++ int data_fds[2], control_fds[2], err, output_len; ++ ++ err = tap_open_common(pri->dev, pri->gate_addr); ++ if(err) return(err); ++ ++ err = os_pipe(data_fds, 0, 0); ++ if(err){ ++ printk("data os_pipe failed - errno = %d\n", -err); ++ return(err); ++ } ++ ++ err = os_pipe(control_fds, 1, 0); ++ if(err){ ++ printk("control os_pipe failed - errno = %d\n", -err); ++ return(err); ++ } ++ ++ err = etap_tramp(pri->dev_name, pri->gate_addr, control_fds[0], ++ control_fds[1], data_fds[0], data_fds[1]); ++ output_len = page_size(); ++ output = um_kmalloc(output_len); ++ read_output(control_fds[0], output, output_len); 
++ ++ if(output == NULL) ++ printk("etap_open : failed to allocate output buffer\n"); ++ else { ++ printk("%s", output); ++ kfree(output); ++ } ++ ++ if(err != 0){ ++ printk("etap_tramp failed - errno = %d\n", err); ++ return(-err); ++ } ++ ++ pri->data_fd = data_fds[0]; ++ pri->control_fd = control_fds[0]; ++ iter_addresses(pri->dev, etap_open_addr, &pri->control_fd); ++ return(data_fds[0]); ++} ++ ++static void etap_close(int fd, void *data) ++{ ++ struct ethertap_data *pri = data; ++ ++ iter_addresses(pri->dev, etap_close_addr, &pri->control_fd); ++ close(fd); ++ os_shutdown_socket(pri->data_fd, 1, 1); ++ close(pri->data_fd); ++ pri->data_fd = -1; ++ close(pri->control_fd); ++ pri->control_fd = -1; ++} ++ ++static int etap_set_mtu(int mtu, void *data) ++{ ++ return(mtu); ++} ++ ++static void etap_add_addr(unsigned char *addr, unsigned char *netmask, ++ void *data) ++{ ++ struct ethertap_data *pri = data; ++ ++ tap_check_ips(pri->gate_addr, addr); ++ if(pri->control_fd == -1) return; ++ etap_open_addr(addr, netmask, &pri->control_fd); ++} ++ ++static void etap_del_addr(unsigned char *addr, unsigned char *netmask, ++ void *data) ++{ ++ struct ethertap_data *pri = data; ++ ++ if(pri->control_fd == -1) return; ++ etap_close_addr(addr, netmask, &pri->control_fd); ++} ++ ++struct net_user_info ethertap_user_info = { ++ .init = etap_user_init, ++ .open = etap_open, ++ .close = etap_close, ++ .remove = NULL, ++ .set_mtu = etap_set_mtu, ++ .add_address = etap_add_addr, ++ .delete_address = etap_del_addr, ++ .max_packet = MAX_PACKET - ETH_HEADER_ETHERTAP ++}; ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/tuntap.h um/arch/um/os-Linux/drivers/tuntap.h
+--- orig/arch/um/os-Linux/drivers/tuntap.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/drivers/tuntap.h Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,32 @@
++/*
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __UM_TUNTAP_H
++#define __UM_TUNTAP_H
++
++#include "net_user.h"
++
++struct tuntap_data {		/* per-device state of the TUN/TAP transport */
++	char *dev_name;		/* host interface name; from setup, or filled in by tuntap_open */
++	int fixed_config;	/* set when dev_name was given at setup: no helper, no address hooks */
++	char *gate_addr;	/* address string from setup, may be NULL; handed to the uml_net helper */
++	int fd;			/* TUN/TAP descriptor, -1 while the device is closed */
++	void *dev;		/* opaque device handle stored by tuntap_user_init */
++};
++
++extern struct net_user_info tuntap_user_info;
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/tuntap_kern.c um/arch/um/os-Linux/drivers/tuntap_kern.c
+--- orig/arch/um/os-Linux/drivers/tuntap_kern.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/drivers/tuntap_kern.c Sun Dec 15 21:18:16 2002
+@@ -0,0 +1,105 @@
++/*
++ * Copyright (C) 2001 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include "linux/stddef.h"
++#include "linux/netdevice.h"
++#include "linux/etherdevice.h"
++#include "linux/skbuff.h"
++#include "linux/init.h"
++#include "asm/errno.h"
++#include "net_kern.h"
++#include "net_user.h"
++#include "tuntap.h"
++
++struct tuntap_init {
++	char *dev_name;
++	char *gate_addr;
++};
++
++static void tuntap_init(struct net_device *dev, void *data)
++{
++	struct uml_net_private *pri;
++	struct tuntap_data *tpri;
++	struct tuntap_init *init = data;
++
++	init_etherdev(dev, 0);
++	pri = dev->priv;
++	tpri = (struct tuntap_data *) pri->user;
++	*tpri = ((struct tuntap_data)
++		{ .dev_name		= init->dev_name,
++		  .fixed_config		= (init->dev_name != NULL),
++		  .gate_addr		= init->gate_addr,
++		  .fd			= -1,
++		  .dev			= dev });
++	printk("TUN/TAP backend - ");
++	if(tpri->gate_addr != NULL)
++		printk("IP = %s", tpri->gate_addr);
++	printk("\n");
++}
++
++static int tuntap_read(int fd, struct sk_buff **skb,
++		       struct uml_net_private *lp)
++{
++	*skb = ether_adjust_skb(*skb, ETH_HEADER_OTHER);
++	if(*skb == NULL) return(-ENOMEM);
++	return(net_read(fd, (*skb)->mac.raw,
++			(*skb)->dev->mtu + ETH_HEADER_OTHER));
++}
++
++static int tuntap_write(int fd, struct sk_buff **skb,
++			struct uml_net_private *lp)
++{
++	return(net_write(fd, (*skb)->data, (*skb)->len));
++}
++
++struct net_kern_info tuntap_kern_info = {
++	.init			= tuntap_init,
++	.protocol		= eth_protocol,
++	.read			= tuntap_read,
++	.write			= tuntap_write,
++};
++
++int tuntap_setup(char *str, char **mac_out, void *data)
++{
++	struct tuntap_init *init = data;
++
++	*init = ((struct tuntap_init)
++		{ .dev_name	= NULL,
++		  .gate_addr	= NULL });
++	if(tap_setup_common(str, "tuntap", &init->dev_name, mac_out,
++			    &init->gate_addr))
++		return(0);
++
++	return(1);
++}
++
++static struct transport tuntap_transport = {
++	.list		= LIST_HEAD_INIT(tuntap_transport.list),
++	.name		= "tuntap",
++	.setup		= tuntap_setup,
++	.user		= &tuntap_user_info,
++	.kern		= &tuntap_kern_info,
++	.private_size	= sizeof(struct tuntap_data),
++	.setup_size	= sizeof(struct tuntap_init),
++};
++
++static int register_tuntap(void)
++{
++	register_transport(&tuntap_transport);
++	return(1);
++}
++
++__initcall(register_tuntap);
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/drivers/tuntap_user.c um/arch/um/os-Linux/drivers/tuntap_user.c
+--- orig/arch/um/os-Linux/drivers/tuntap_user.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/drivers/tuntap_user.c Sun Dec 15 21:18:25 2002
+@@ -0,0 +1,260 @@
++/*
++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <stddef.h>
++#include <stdlib.h>
++#include <string.h>
++#include <unistd.h>
++#include <errno.h>
++#include <fcntl.h>
++#include <sys/wait.h>
++#include <sys/socket.h>
++#include <sys/un.h>
++#include <sys/uio.h>
++#include <sys/ioctl.h>
++#include <net/if.h>
++#include <linux/if_tun.h>
++#include "net_user.h"
++#include "tuntap.h"
++#include "kern_util.h"
++#include "user.h"
++#include "helper.h"
++#include "os.h"
++
++#define MAX_PACKET ETH_MAX_PACKET
++
++void tuntap_user_init(void *data, void *dev)
++{
++	struct tuntap_data *pri = data;
++
++	pri->dev = dev;
++}
++
++static void tuntap_add_addr(unsigned char *addr, unsigned char *netmask,
++			    void *data)
++{
++	struct tuntap_data *pri = data;
++
++	tap_check_ips(pri->gate_addr, addr);
++	if((pri->fd == -1) || pri->fixed_config) return;
++	open_addr(addr, netmask, pri->dev_name);
++}
++
++static void tuntap_del_addr(unsigned char *addr, unsigned char *netmask,
++			    void *data)
++{
++	struct tuntap_data *pri = data;
++
++	if((pri->fd == -1) || pri->fixed_config) return;
++	close_addr(addr, netmask, pri->dev_name);
++}
++
++struct tuntap_pre_exec_data {
++	int stdout;
++	int close_me;
++};
++
++static void tuntap_pre_exec(void *arg)
++{
++	struct tuntap_pre_exec_data *data = arg;
++
++	dup2(data->stdout, 1);
++	close(data->close_me);
++}
++
++/*
++ * Run "uml_net <version> tuntap up <gate>" with remote as its stdout and
++ * receive its reply on me: output lands in buffer (up to buffer_len bytes)
++ * and the descriptor passed as SCM_RIGHTS ancillary data in *fd_out.
++ * *used_out gets the recvmsg() result once that call is reached.  remote is
++ * always closed.  Returns 0 or a positive errno.
++ */
++static int tuntap_open_tramp(char *gate, int *fd_out, int me, int remote,
++			     char *buffer, int buffer_len, int *used_out)
++{
++	struct tuntap_pre_exec_data data;
++	char version_buf[sizeof("nnnnn\0")];
++	char *argv[] = { "uml_net", version_buf, "tuntap", "up", gate,
++			 NULL };
++	char buf[CMSG_SPACE(sizeof(*fd_out))];
++	struct msghdr msg;
++	struct cmsghdr *cmsg;
++	struct iovec iov;
++	int pid, n, err;
++
++	sprintf(version_buf, "%d", UML_NET_VERSION);
++
++	data.stdout = remote;
++	data.close_me = me;
++
++	pid = run_helper(tuntap_pre_exec, &data, argv, NULL);
++
++	/* Only the helper uses remote, so drop it whether or not it started */
++	close(remote);
++
++	if(pid < 0) return(-pid);
++
++	msg.msg_name = NULL;
++	msg.msg_namelen = 0;
++	if(buffer != NULL){
++		iov = ((struct iovec) { buffer, buffer_len });
++		msg.msg_iov = &iov;
++		msg.msg_iovlen = 1;
++	}
++	else {
++		msg.msg_iov = NULL;
++		msg.msg_iovlen = 0;
++	}
++	msg.msg_control = buf;
++	msg.msg_controllen = sizeof(buf);
++	msg.msg_flags = 0;
++	n = recvmsg(me, &msg, 0);
++	*used_out = n;
++	if(n < 0){
++		err = errno;	/* printk may change errno */
++		printk("tuntap_open_tramp : recvmsg failed - errno = %d\n",
++		       err);
++		return(err);
++	}
++	waitpid(pid, NULL, 0);
++
++	cmsg = CMSG_FIRSTHDR(&msg);
++	if(cmsg == NULL){
++		printk("tuntap_open_tramp : didn't receive a message\n");
++		return(EINVAL);
++	}
++	if((cmsg->cmsg_level != SOL_SOCKET) ||
++	   (cmsg->cmsg_type != SCM_RIGHTS)){
++		printk("tuntap_open_tramp : didn't receive a descriptor\n");
++		return(EINVAL);
++	}
++	*fd_out = ((int *) CMSG_DATA(cmsg))[0];
++	return(0);
++}
++
++/*
++ * Open the host side of the device and return its descriptor, or a
++ * negative value on failure.  With a fixed configuration /dev/net/tun is
++ * opened and bound to dev_name directly; otherwise the uml_net helper
++ * creates the interface and its output supplies dev_name.
++ */
++static int tuntap_open(void *data)
++{
++	struct ifreq ifr;
++	struct tuntap_data *pri = data;
++	char *output, *buffer;
++	int err, fds[2], len, used;
++
++	err = tap_open_common(pri->dev, pri->gate_addr);
++	if(err) return(err);
++
++	if(pri->fixed_config){
++		if((pri->fd = open("/dev/net/tun", O_RDWR)) < 0){
++			err = errno;
++			printk("Failed to open /dev/net/tun, errno = %d\n",
++			       err);
++			return(-err);
++		}
++		memset(&ifr, 0, sizeof(ifr));
++		ifr.ifr_flags = IFF_TAP;
++		strncpy(ifr.ifr_name, pri->dev_name, sizeof(ifr.ifr_name) - 1);
++		if(ioctl(pri->fd, TUNSETIFF, (void *) &ifr) < 0){
++			err = errno;	/* close() may overwrite errno */
++			printk("TUNSETIFF failed, errno = %d", err);
++			close(pri->fd);
++			/* Don't leave a closed descriptor looking open */
++			pri->fd = -1;
++			return(-err);
++		}
++	}
++	else {
++		err = os_pipe(fds, 0, 0);
++		if(err){
++			printk("tuntap_open : os_pipe failed - errno = %d\n",
++			       -err);
++			return(err);
++		}
++
++		buffer = get_output_buffer(&len);
++		if(buffer != NULL) len--;
++		used = 0;
++
++		err = tuntap_open_tramp(pri->gate_addr, &pri->fd, fds[0],
++					fds[1], buffer, len, &used);
++		/* Our end of the pipe is finished with on every path */
++		close(fds[0]);
++
++		/* len was reduced above, so the terminator always fits */
++		if(buffer != NULL)
++			buffer[(used > 0) ? used : 0] = '\0';
++
++		output = buffer;
++		if(err != 0){
++			/* Helper output is data, never a format string */
++			if(output != NULL){
++				printk("%s", output);
++				free_output_buffer(buffer);
++			}
++			printk("tuntap_open_tramp failed - errno = %d\n", err);
++			return(-err);
++		}
++		if(buffer == NULL){
++			/* No output means no interface name to work with */
++			printk("tuntap_open : no output buffer\n");
++			close(pri->fd);
++			pri->fd = -1;
++			return(-ENOMEM);
++		}
++
++		pri->dev_name = uml_strdup(buffer);
++		/* The text after the IFNAMSIZ-byte name is helper output */
++		if(used > IFNAMSIZ){
++			output += IFNAMSIZ;
++			printk("%s", output);
++		}
++		free_output_buffer(buffer);
++		iter_addresses(pri->dev, open_addr, pri->dev_name);
++	}
++
++	return(pri->fd);
++}
++
++static void tuntap_close(int fd, void *data)
++{
++	struct tuntap_data *pri = data;
++
++	if(!pri->fixed_config)
++		iter_addresses(pri->dev, close_addr, pri->dev_name);
++	close(fd);
++	pri->fd = -1;
++}
++
++static int tuntap_set_mtu(int mtu, void *data)
++{
++	return(mtu);
++}
++
++struct net_user_info tuntap_user_info = {
++	.init		= tuntap_user_init,
++	.open		= tuntap_open,
++	.close		= tuntap_close,
++	.remove		= NULL,
++	.set_mtu	= tuntap_set_mtu,
++	.add_address	= tuntap_add_addr,
++	.delete_address = tuntap_del_addr,
++	.max_packet	= MAX_PACKET
++};
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/file.c um/arch/um/os-Linux/file.c
+--- orig/arch/um/os-Linux/file.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/file.c Tue Feb 4 19:32:10 2003
+@@ -0,0 +1,437 @@
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdio.h>
++#include <unistd.h>
++#include <errno.h>
++#include <fcntl.h>
++#include <signal.h>
++#include <sys/socket.h>
++#include <sys/un.h>
++#include <sys/ioctl.h>
++#include <sys/mount.h>
++#include <sys/uio.h>
++#include "os.h"
++#include "user.h"
++#include "kern_util.h"
++
++int os_file_type(char *file)
++{
++	struct stat64 buf;
++
++	if(stat64(file, &buf) == -1)
++		return(-errno);
++
++	if(S_ISDIR(buf.st_mode)) return(OS_TYPE_DIR);
++	else if(S_ISLNK(buf.st_mode)) return(OS_TYPE_SYMLINK);
++	else if(S_ISCHR(buf.st_mode)) return(OS_TYPE_CHARDEV);
++	else if(S_ISBLK(buf.st_mode)) return(OS_TYPE_BLOCKDEV);
++	else if(S_ISFIFO(buf.st_mode)) return(OS_TYPE_FIFO);
++	else if(S_ISSOCK(buf.st_mode)) return(OS_TYPE_SOCK);
++	else return(OS_TYPE_FILE);
++}
++
++int os_file_mode(char *file, struct openflags *mode_out)
++{
++	*mode_out = OPENFLAGS();
++
++	if(!access(file, W_OK)) *mode_out = of_write(*mode_out);
++	else if(errno != EACCES)
++		return(-errno);
++
++	if(!access(file, R_OK)) *mode_out = of_read(*mode_out);
++	else if(errno != EACCES)
++		return(-errno);
++
++	return(0);
++}
++
++/*
++ * Open file with the access mode and options described by flags.  Returns
++ * the descriptor or -errno; if close-on-exec was requested but can't be
++ * set, the descriptor is closed and the fcntl() error is returned.
++ */
++int os_open_file(char *file, struct openflags flags, int mode)
++{
++	int fd, err, f = 0;
++
++	if(flags.r && flags.w) f = O_RDWR;
++	else if(flags.r) f = O_RDONLY;
++	else if(flags.w) f = O_WRONLY;
++	else f = 0;
++
++	if(flags.s) f |= O_SYNC;
++	if(flags.c) f |= O_CREAT;
++	if(flags.t) f |= O_TRUNC;
++	if(flags.e) f |= O_EXCL;
++
++	fd = open64(file, f, mode);
++	if(fd < 0) return(-errno);
++
++	if(flags.cl){
++		if(fcntl(fd, F_SETFD, 1)){
++			err = errno;	/* close() may overwrite errno */
++			close(fd);
++			return(-err);
++		}
++	}
++
++	return(fd);
++}
++
++/*
++ * Connect a new AF_UNIX stream socket to the path name.  Returns the
++ * connected descriptor or -errno.
++ */
++int os_connect_socket(char *name)
++{
++	struct sockaddr_un sock;
++	int fd, err;
++
++	sock.sun_family = AF_UNIX;
++	snprintf(sock.sun_path, sizeof(sock.sun_path), "%s", name);
++
++	fd = socket(AF_UNIX, SOCK_STREAM, 0);
++	if(fd < 0)
++		return(-errno);
++
++	err = connect(fd, (struct sockaddr *) &sock, sizeof(sock));
++	if(err){
++		err = -errno;
++		/* Don't leak the socket when the connect fails */
++		close(fd);
++		return(err);
++	}
++
++	return(fd);
++}
++
++void os_close_file(int fd)
++{
++	close(fd);
++}
++
++int os_seek_file(int fd, __u64 offset)
++{
++	__u64 actual;
++
++	actual = lseek64(fd, offset, SEEK_SET);
++	if(actual != offset) return(-errno);
++	return(0);
++}
++
++int os_read_file(int fd, void *buf, int len)
++{
++	int n;
++
++	/* Force buf into memory if it's not already. */
++
++	/* XXX This fails if buf is kernel memory */
++#ifdef notdef
++	if(copy_to_user_proc(buf, &c, sizeof(c)))
++		return(-EFAULT);
++#endif
++
++	n = read(fd, buf, len);
++	if(n < 0)
++		return(-errno);
++	return(n);
++}
++
++int os_write_file(int fd, void *buf, int count)
++{
++	int n;
++
++	/* Force buf into memory if it's not already. */
++
++	/* XXX This fails if buf is kernel memory */
++#ifdef notdef
++	if(copy_to_user_proc(buf, buf, buf[0]))
++		return(-EFAULT);
++#endif
++
++	n = write(fd, buf, count);
++	if(n < 0)
++		return(-errno);
++	return(n);
++}
++
++/*
++ * Store the size of file in bytes in *size_out.  For a block device the
++ * size is the BLKGETSIZE result multiplied by 512; for anything else it is
++ * st_size.  Returns 0 or -errno.
++ */
++int os_file_size(char *file, long long *size_out)
++{
++	struct stat64 buf;
++	int err;
++
++	if(stat64(file, &buf) == -1){
++		err = errno;
++		printk("Couldn't stat \"%s\" : errno = %d\n", file, err);
++		return(-err);
++	}
++	if(S_ISBLK(buf.st_mode)){
++		/* BLKGETSIZE stores an unsigned long, not an int */
++		unsigned long blocks;
++		int fd;
++
++		if((fd = open64(file, O_RDONLY)) < 0){
++			err = errno;
++			printk("Couldn't open \"%s\", errno = %d\n", file,
++			       err);
++			return(-err);
++		}
++		if(ioctl(fd, BLKGETSIZE, &blocks) < 0){
++			err = errno;	/* close() may overwrite errno */
++			printk("Couldn't get the block size of \"%s\", "
++			       "errno = %d\n", file, err);
++			close(fd);
++			return(-err);
++		}
++		*size_out = ((long long) blocks) * 512;
++		close(fd);
++		return(0);
++	}
++	*size_out = buf.st_size;
++	return(0);
++}
++
++int os_pipe(int *fds, int stream, int close_on_exec)
++{
++	int err, type = stream ? SOCK_STREAM : SOCK_DGRAM;
++
++	err = socketpair(AF_UNIX, type, 0, fds);
++	if(err)
++		return(-errno);
++
++	if(!close_on_exec)
++		return(0);
++
++	if((fcntl(fds[0], F_SETFD, 1) < 0) || (fcntl(fds[1], F_SETFD, 1) < 0))
++		printk("os_pipe : Setting FD_CLOEXEC failed, errno = %d",
++		       errno);
++
++	return(0);
++}
++
++int os_set_fd_async(int fd, int owner)
++{
++	/* XXX This should do F_GETFL first */
++	if(fcntl(fd, F_SETFL, O_ASYNC | O_NONBLOCK) < 0){
++		printk("os_set_fd_async : failed to set O_ASYNC and "
++		       "O_NONBLOCK on fd # %d, errno = %d\n", fd, errno);
++		return(-errno);
++	}
++#ifdef notdef
++	if(fcntl(fd, F_SETFD, 1) < 0){
++		printk("os_set_fd_async : Setting FD_CLOEXEC failed, "
++		       "errno = %d\n", errno);
++	}
++#endif
++
++	if((fcntl(fd, F_SETSIG, SIGIO) < 0) ||
++	   (fcntl(fd, F_SETOWN, owner) < 0)){
++		printk("os_set_fd_async : Failed to fcntl F_SETOWN "
++		       "(or F_SETSIG) fd %d to pid %d, errno = %d\n", fd,
++		       owner, errno);
++		return(-errno);
++	}
++
++	return(0);
++}
++
++/*
++ * Make fd blocking (blocking != 0) or non-blocking, leaving its other
++ * status flags alone.  Returns 0 or -errno.
++ */
++int os_set_fd_block(int fd, int blocking)
++{
++	int flags, err;
++
++	flags = fcntl(fd, F_GETFL);
++	if(flags < 0){
++		/* -1 must not be fed back to F_SETFL as a flag word */
++		err = errno;
++		printk("Failed to get flags of fd # %d, errno = %d\n",
++		       fd, err);
++		return(-err);
++	}
++
++	if(blocking) flags &= ~O_NONBLOCK;
++	else flags |= O_NONBLOCK;
++
++	if(fcntl(fd, F_SETFL, flags) < 0){
++		err = errno;
++		printk("Failed to change blocking on fd # %d, errno = %d\n",
++		       fd, err);
++		return(-err);
++	}
++	return(0);
++}
++
++int os_accept_connection(int fd)
++{
++	int new;
++
++	new = accept(fd, NULL, 0);
++	if(new < 0)
++		return(-errno);
++	return(new);
++}
++
++#ifndef SHUT_RD
++#define SHUT_RD 0
++#endif
++
++#ifndef SHUT_WR
++#define SHUT_WR 1
++#endif
++
++#ifndef SHUT_RDWR
++#define SHUT_RDWR 2
++#endif
++
++int os_shutdown_socket(int fd, int r, int w)
++{
++	int what, err;
++
++	if(r && w) what = SHUT_RDWR;
++	else if(r) what = SHUT_RD;
++	else if(w) what = SHUT_WR;
++	else {
++		printk("os_shutdown_socket : neither r or w was set\n");
++		return(-EINVAL);
++	}
++	err = shutdown(fd, what);
++	if(err)
++		return(-errno);
++	return(0);
++}
++
++/*
++ * Receive a descriptor passed over fd as SCM_RIGHTS ancillary data.  The
++ * message payload is stored in *helper_pid_out, which is set to -1 if the
++ * payload isn't exactly one int.  Returns the new descriptor, -errno if
++ * recvmsg() fails, or -1 if no descriptor arrived.
++ */
++int os_rcv_fd(int fd, int *helper_pid_out)
++{
++	int new, n;
++	char buf[CMSG_SPACE(sizeof(new))];
++	struct msghdr msg;
++	struct cmsghdr *cmsg;
++	struct iovec iov;
++
++	msg.msg_name = NULL;
++	msg.msg_namelen = 0;
++	iov = ((struct iovec) { .iov_base  = helper_pid_out,
++				.iov_len   = sizeof(*helper_pid_out) });
++	msg.msg_iov = &iov;
++	msg.msg_iovlen = 1;
++	msg.msg_control = buf;
++	msg.msg_controllen = sizeof(buf);
++	msg.msg_flags = 0;
++
++	n = recvmsg(fd, &msg, 0);
++	if(n < 0)
++		return(-errno);
++
++	/* Compare with the payload size, not with sizeof(size_t) */
++	else if(n != (int) sizeof(*helper_pid_out))
++		*helper_pid_out = -1;
++
++	cmsg = CMSG_FIRSTHDR(&msg);
++	if(cmsg == NULL){
++		printk("rcv_fd didn't receive anything, error = %d\n", errno);
++		return(-1);
++	}
++	if((cmsg->cmsg_level != SOL_SOCKET) ||
++	   (cmsg->cmsg_type != SCM_RIGHTS)){
++		printk("rcv_fd didn't receive a descriptor\n");
++		return(-1);
++	}
++
++	new = ((int *) CMSG_DATA(cmsg))[0];
++	return(new);
++}
++
++/*
++ * Create an AF_UNIX datagram socket bound to file.  The path is truncated
++ * to fit both len and sun_path.  Returns the socket or -errno; the socket
++ * is closed again if the bind fails.
++ */
++int create_unix_socket(char *file, int len)
++{
++	struct sockaddr_un addr = { .sun_family = AF_UNIX };
++	size_t size = sizeof(addr.sun_path);
++	int sock, err;
++
++	sock = socket(PF_UNIX, SOCK_DGRAM, 0);
++	if (sock < 0){
++		err = errno;
++		printk("create_unix_socket - socket failed, errno = %d\n",
++		       err);
++		return(-err);
++	}
++
++	/* Never let the caller's length run past the end of sun_path */
++	if((len >= 0) && ((size_t) len < size))
++		size = len;
++	snprintf(addr.sun_path, size, "%s", file);
++
++	err = bind(sock, (struct sockaddr *) &addr, sizeof(addr));
++	if (err < 0){
++		err = errno;	/* close() may overwrite errno */
++		printk("create_listening_socket - bind failed, errno = %d\n",
++		       err);
++		close(sock);
++		return(-err);
++	}
++
++	return(sock);
++}
++
++void os_flush_stdout(void)
++{
++	fflush(stdout);
++}
++
++int os_lock_file(int fd, int excl)
++{
++	int type = excl ? F_WRLCK : F_RDLCK;
++	struct flock lock = ((struct flock) { .l_type	= type,
++					      .l_whence	= SEEK_SET,
++					      .l_start	= 0,
++					      .l_len	= 0 } );
++	int err, save;
++
++	err = fcntl(fd, F_SETLK, &lock);
++	if(!err)
++		goto out;
++
++	save = -errno;
++	err = fcntl(fd, F_GETLK, &lock);
++	if(err){
++		err = -errno;
++		goto out;
++	}
++
++	printk("F_SETLK failed, file already locked by pid %d\n", lock.l_pid);
++	err = save;
++ out:
++	return(err);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/include/file.h um/arch/um/os-Linux/include/file.h
+--- orig/arch/um/os-Linux/include/file.h Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/include/file.h Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,22 @@
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#ifndef __OS_FILE_H__
++#define __OS_FILE_H__
++
++#define DEV_NULL "/dev/null"
++
++#endif
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/process.c um/arch/um/os-Linux/process.c
+--- orig/arch/um/os-Linux/process.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/process.c Wed Jan 8 14:19:00 2003
+@@ -0,0 +1,167 @@
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <unistd.h>
++#include <stdio.h>
++#include <errno.h>
++#include <signal.h>
++#include <sys/mman.h>
++#include <sys/wait.h>
++#include "os.h"
++#include "user.h"
++
++/*
++ * Return the 30th field of /proc/<pid>/stat (the value this file uses as
++ * the process's pc), or (unsigned long) -1 if the file can't be opened,
++ * read or parsed.
++ */
++unsigned long os_process_pc(int pid)
++{
++	/* Sized for any int pid, so the path is never cut short */
++	char proc_stat[sizeof("/proc/-2147483648/stat")], buf[256];
++	unsigned long pc;
++	int fd, n, err;
++
++	snprintf(proc_stat, sizeof(proc_stat), "/proc/%d/stat", pid);
++	fd = os_open_file(proc_stat, of_read(OPENFLAGS()), 0);
++	if(fd < 0){
++		printk("os_process_pc - couldn't open '%s', errno = %d\n",
++		       proc_stat, -fd);
++		return(-1);
++	}
++	/* Leave room for the terminator that sscanf and printk rely on */
++	n = read(fd, buf, sizeof(buf) - 1);
++	err = errno;	/* close() may overwrite errno */
++	close(fd);
++	if(n < 0){
++		printk("os_process_pc - couldn't read '%s', errno = %d\n",
++		       proc_stat, err);
++		return(-1);
++	}
++	buf[n] = '\0';
++	pc = -1;
++	if(sscanf(buf, "%*d %*s %*c %*d %*d %*d %*d %*d %*d %*d %*d "
++		  "%*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d "
++		  "%*d %*d %*d %*d %lu", &pc) != 1){
++		printk("os_process_pc - couldn't find pc in '%s'\n", buf);
++	}
++	return(pc);
++}
++
++/*
++ * Return the parent pid of pid as reported by /proc/<pid>/stat, or -1 if
++ * pid is -1 or the file can't be opened, read or parsed.
++ */
++int os_process_parent(int pid)
++{
++	char stat[sizeof("/proc/-2147483648/stat")];
++	char data[256];
++	int parent, n, fd, err;
++
++	if(pid == -1) return(-1);
++
++	snprintf(stat, sizeof(stat), "/proc/%d/stat", pid);
++	fd = os_open_file(stat, of_read(OPENFLAGS()), 0);
++	if(fd < 0){
++		printk("Couldn't open '%s', errno = %d\n", stat, -fd);
++		return(-1);
++	}
++
++	n = read(fd, data, sizeof(data) - 1);
++	err = errno;	/* close() may overwrite errno */
++	close(fd);
++
++	if(n < 0){
++		printk("Couldn't read '%s', errno = %d\n", stat, err);
++		return(-1);
++	}
++	data[n] = '\0';
++
++	parent = -1;
++	/* XXX This will break if there is a space in the command */
++	n = sscanf(data, "%*d %*s %*c %d", &parent);
++	if(n != 1) printk("Failed to scan '%s'\n", data);
++
++	return(parent);
++}
++
++/* Send SIGSTOP to pid */
++void os_stop_process(int pid)
++{
++	kill(pid, SIGSTOP);
++}
++
++/* Send SIGKILL to pid and, if reap_child is set, wait for it to exit */
++void os_kill_process(int pid, int reap_child)
++{
++	kill(pid, SIGKILL);
++	if(reap_child)
++		waitpid(pid, NULL, 0);
++
++}
++
++/* Send SIGUSR1 to pid */
++void os_usr1_process(int pid)
++{
++	kill(pid, SIGUSR1);
++}
++
++int os_getpid(void)
++{
++	return(getpid());
++}
++
++/*
++ * Map len bytes of fd starting at offset off at exactly virt (MAP_SHARED |
++ * MAP_FIXED), with the protection selected by r, w and x.  Returns 0 or
++ * -errno.
++ */
++int os_map_memory(void *virt, int fd, unsigned long off, unsigned long len,
++		  int r, int w, int x)
++{
++	void *loc;
++	int prot;
++
++	prot = (r ? PROT_READ : 0) | (w ? PROT_WRITE : 0) |
++		(x ? PROT_EXEC : 0);
++
++	loc = mmap((void *) virt, len, prot, MAP_SHARED | MAP_FIXED,
++		   fd, off);
++	if(loc == MAP_FAILED)
++		return(-errno);
++	return(0);
++}
++
++/* Set the protection of len bytes at addr from r, w and x; 0 or -errno */
++int os_protect_memory(void *addr, unsigned long len, int r, int w, int x)
++{
++	int prot = ((r ? PROT_READ : 0) | (w ? PROT_WRITE : 0) |
++		    (x ? PROT_EXEC : 0));
++
++	if(mprotect(addr, len, prot) < 0)
++		return(-errno);
++	return(0);
++}
++
++/* Unmap len bytes at addr; 0 or -errno */
++int os_unmap_memory(void *addr, int len)
++{
++	int err;
++
++	err = munmap(addr, len);
++	if(err < 0) return(-errno);
++	return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/os-Linux/tty.c um/arch/um/os-Linux/tty.c
+--- orig/arch/um/os-Linux/tty.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/os-Linux/tty.c Wed Oct 23 21:08:04 2002
+@@ -0,0 +1,73 @@
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <stdlib.h>
++#include <unistd.h>
++#include <errno.h>
++#include "os.h"
++#include "user.h"
++#include "kern_util.h"
++
++/* Argument and result block for grantpt_cb */
++struct grantpt_info {
++	int fd;		/* descriptor passed to grantpt() */
++	int res;	/* grantpt() return value */
++	int err;	/* errno sampled right after grantpt() */
++};
++
++/* Run grantpt() on info->fd and record its result and errno in *arg */
++static void grantpt_cb(void *arg)
++{
++	struct grantpt_info *info = arg;
++
++	info->res = grantpt(info->fd);
++	info->err = errno;
++}
++
++/*
++ * Open /dev/ptmx, then grant and unlock the pty; grantpt() is run through
++ * initial_thread_cb().  Returns the descriptor, or -1 on failure, in which
++ * case the descriptor is closed again rather than leaked.
++ */
++int get_pty(void)
++{
++	struct grantpt_info info;
++	int fd;
++
++	if((fd = os_open_file("/dev/ptmx", of_rdwr(OPENFLAGS()), 0)) < 0){
++		printk("get_pty : Couldn't open /dev/ptmx - errno = %d\n",
++		       -fd);
++		return(-1);
++	}
++
++	info.fd = fd;
++	initial_thread_cb(grantpt_cb, &info);
++
++	if(info.res < 0){
++		printk("get_pty : Couldn't grant pty - errno = %d\n",
++		       info.err);
++		goto out_close;
++	}
++	if(unlockpt(fd) < 0){
++		printk("get_pty : Couldn't unlock pty - errno = %d\n", errno);
++		goto out_close;
++	}
++	return(fd);
++
++ out_close:
++	close(fd);
++	return(-1);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/Makefile um/arch/um/sys-i386/Makefile
+--- orig/arch/um/sys-i386/Makefile Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/Makefile Sat Nov 23 23:34:24 2002
+@@ -0,0 +1,46 @@
++#
++# Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++# Licensed under the GPL
++#
++
++O_TARGET = built-in.o
++
++obj-y = bugs.o checksum.o extable.o fault.o ksyms.o ldt.o ptrace.o \
++	ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o
++export-objs = ksyms.o
++
++USER_OBJS = bugs.o ptrace_user.o sigcontext.o fault.o
++
++SYMLINKS = semaphore.c extable.c
++
++semaphore.c-dir = kernel
++extable.c-dir = mm
++
++include $(TOPDIR)/Rules.make
++
++$(USER_OBJS) : %.o: %.c
++	$(CC) $(CFLAGS_$@) $(USER_CFLAGS) -c -o $@ $<
++
++define make_link
++	-rm -f $1
++	ln -sf $(TOPDIR)/arch/i386/$($1-dir)/$1 $1
++endef
++
++$(SYMLINKS):
++	$(call make_link,$@)
++
++clean:
++	$(MAKE) -C util clean
++	rm -f $(SYMLINKS)
++
++fastdep:
++
++dep:
++
++archmrproper:
++
++archclean:
++
++archdep:
++
++modules:
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/bugs.c um/arch/um/sys-i386/bugs.c
+--- orig/arch/um/sys-i386/bugs.c Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/bugs.c Sun Dec 8 20:38:45 2002
+@@ -0,0 +1,179 @@
++/*
++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
++ * Licensed under the GPL
++ */
++
++#include <unistd.h>
++#include <fcntl.h>
++#include <errno.h>
++#include <string.h>
++#include <sys/signal.h>
++#include "kern_util.h"
++#include "user.h"
++#include "sysdep/ptrace.h"
++#include "task.h"
++
++#define MAXTOKEN 64
++
++/* Set during early boot */
++int cpu_has_cmov = 1;
++int cpu_has_xmm = 0;
++
++/*
++ * Read one token from fd into buf, a byte at a time, stopping at stop or a
++ * newline, and replace that delimiter with a NUL.  Returns the delimiter
++ * that ended the token, 0 on end of file, or a negative value if the read
++ * fails or no delimiter shows up within len bytes.
++ */
++static char token(int fd, char *buf, int len, char stop)
++{
++	int n;
++	char *ptr, *end, c;
++
++	ptr = buf;
++	end = &buf[len];
++	do {
++		n = read(fd, ptr, sizeof(*ptr));
++		if(n == 0) return(0);
++		else if(n != sizeof(*ptr)){
++			printk("Reading /proc/cpuinfo failed, "
++			       "errno = %d\n", errno);
++			return(-errno);
++		}
++		/* Only look at the byte once read() has stored one */
++		c = *ptr++;
++	} while((c != '\n') && (c != stop) && (ptr < end));
++
++	/* A delimiter stored in the last slot still ends the token */
++	if((c != '\n') && (c != stop)){
++		printk("Failed to find '%c' in /proc/cpuinfo\n", stop);
++		return(-1);
++	}
++	*(ptr - 1) = '\0';
++	return(c);
++}
++
++/*
++ * Look for feature on the "flags" line of /proc/cpuinfo and store 1 or 0 in
++ * *have_it.  Returns 0 if /proc/cpuinfo can't be opened (*have_it is then
++ * left alone) and 1 otherwise; a parse failure leaves *have_it at 0.
++ */
++static int check_cpu_feature(char *feature, int *have_it)
++{
++	char buf[MAXTOKEN], c;
++	int fd, len = sizeof(buf)/sizeof(buf[0]), n;
++
++	printk("Checking for host processor %s support...", feature);
++	fd = open("/proc/cpuinfo", O_RDONLY);
++	if(fd < 0){
++		printk("Couldn't open /proc/cpuinfo, errno = %d\n", errno);
++		return(0);
++	}
++
++	*have_it = 0;
++	buf[len - 1] = '\0';
++	while(1){
++		c = token(fd, buf, len - 1, ':');
++		if(c <= 0) goto out;
++		else if(c != ':'){
++			printk("Failed to find ':' in /proc/cpuinfo\n");
++			goto out;
++		}
++
++		if(!strncmp(buf, "flags", strlen("flags"))) break;
++
++		do {
++			n = read(fd, &c, sizeof(c));
++			if(n != sizeof(c)){
++				printk("Failed to find newline in "
++				       "/proc/cpuinfo, n = %d, errno = %d\n",
++				       n, errno);
++				goto out;
++			}
++		} while(c != '\n');
++	}
++
++	c = token(fd, buf, len - 1, ' ');
++	if(c < 0) goto out;
++	else if(c != ' '){
++		printk("Failed to find ' ' in /proc/cpuinfo\n");
++		goto out;
++	}
++
++	while(1){
++		c = token(fd, buf, len - 1, ' ');
++		/* 0 is end of file; looping on it would never terminate */
++		if(c <= 0) goto out;
++
++		/* Compare before the newline test so that the last flag on
++		 * the line is checked as well.
++		 */
++		if(!strcmp(buf, feature)){
++			*have_it = 1;
++			goto out;
++		}
++		if(c == '\n') break;
++	}
++ out:
++	if(*have_it == 0) printk("No\n");
++	else if(*have_it == 1) printk("Yes\n");
++	close(fd);
++	return(1);
++}
++
++/*
++ * Probe the host for cmov and xmm support and record the answers in
++ * cpu_has_cmov and cpu_has_xmm.  Both keep their defaults if
++ * /proc/cpuinfo isn't readable.
++ */
++void arch_check_bugs(void)
++{
++	int have_it;
++
++	if(access("/proc/cpuinfo", R_OK)){
++		printk("/proc/cpuinfo not available - skipping CPU capability "
++		       "checks\n");
++		return;
++	}
++	if(check_cpu_feature("cmov", &have_it)) cpu_has_cmov = have_it;
++	if(check_cpu_feature("xmm", &have_it)) cpu_has_xmm = have_it;
++}
++
++int arch_handle_signal(int sig, union uml_pt_regs *regs)
++{
++	unsigned long ip;
++
++	/* This is testing for a cmov (0x0f 0x4x) instruction causing a
++	 * SIGILL in init.
++	 */
++	if((sig != SIGILL) || (TASK_PID(get_current()) != 1)) return(0);
++
++	ip = UPT_IP(regs);
++	if((*((char *) ip) != 0x0f) || ((*((char *) (ip + 1)) & 0xf0) != 0x40))
++		return(0);
++
++	if(cpu_has_cmov == 0)
++		panic("SIGILL caused by cmov, which this processor doesn't "
++		      "implement, boot a filesystem compiled for older "
++		      "processors");
++	else if(cpu_has_cmov == 1)
++		panic("SIGILL caused by cmov, which this processor claims to "
++		      "implement");
++	else if(cpu_has_cmov == -1)
++		panic("SIGILL caused by cmov, couldn't tell if this processor "
++		      "implements it, boot a filesystem compiled for older "
++		      "processors");
++	else panic("Bad value for cpu_has_cmov (%d)", cpu_has_cmov);
++	return(0);
++}
++
++/*
++ * Overrides for Emacs so that we follow Linus's tabbing style.
++ * Emacs will notice this stuff at the end of the file and automatically
++ * adjust the settings for this buffer only. This must remain at the end
++ * of the file.
++ * ---------------------------------------------------------------------------
++ * Local variables:
++ * c-file-style: "linux"
++ * End:
++ */
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/checksum.S um/arch/um/sys-i386/checksum.S
+--- orig/arch/um/sys-i386/checksum.S Wed Dec 31 19:00:00 1969
++++ um/arch/um/sys-i386/checksum.S Thu Oct 31 20:17:50 2002
+@@ -0,0 +1,460 @@
++/*
++ * INET An implementation of the TCP/IP protocol suite for the LINUX
++ * operating system. INET is implemented using the BSD Socket
++ * interface as the means of communication with the user level.
++ * ++ * IP/TCP/UDP checksumming routines ++ * ++ * Authors: Jorge Cwik, <jorge@laser.satlink.net> ++ * Arnt Gulbrandsen, <agulbra@nvg.unit.no> ++ * Tom May, <ftom@netcom.com> ++ * Pentium Pro/II routines: ++ * Alexander Kjeldaas <astor@guardian.no> ++ * Finn Arne Gangstad <finnag@guardian.no> ++ * Lots of code moved from tcp.c and ip.c; see those files ++ * for more names. ++ * ++ * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception ++ * handling. ++ * Andi Kleen, add zeroing on error ++ * converted to pure assembler ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. ++ */ ++ ++#include <linux/config.h> ++#include <asm/errno.h> ++ ++/* ++ * computes a partial checksum, e.g. for TCP/UDP fragments ++ */ ++ ++/* ++unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) ++ */ ++ ++.text ++.align 4 ++.globl arch_csum_partial ++ ++#ifndef CONFIG_X86_USE_PPRO_CHECKSUM ++ ++ /* ++ * Experiments with Ethernet and SLIP connections show that buff ++ * is aligned on either a 2-byte or 4-byte boundary. We get at ++ * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. ++ * Fortunately, it is easy to convert 2-byte alignment to 4-byte ++ * alignment for the unrolled loop. ++ */ ++arch_csum_partial: ++ pushl %esi ++ pushl %ebx ++ movl 20(%esp),%eax # Function arg: unsigned int sum ++ movl 16(%esp),%ecx # Function arg: int len ++ movl 12(%esp),%esi # Function arg: unsigned char *buff ++ testl $2, %esi # Check alignment. ++ jz 2f # Jump if alignment is ok. ++ subl $2, %ecx # Alignment uses up two bytes. ++ jae 1f # Jump if we had at least two bytes. ++ addl $2, %ecx # ecx was < 2. Deal with it. 
++ jmp 4f ++1: movw (%esi), %bx ++ addl $2, %esi ++ addw %bx, %ax ++ adcl $0, %eax ++2: ++ movl %ecx, %edx ++ shrl $5, %ecx ++ jz 2f ++ testl %esi, %esi ++1: movl (%esi), %ebx ++ adcl %ebx, %eax ++ movl 4(%esi), %ebx ++ adcl %ebx, %eax ++ movl 8(%esi), %ebx ++ adcl %ebx, %eax ++ movl 12(%esi), %ebx ++ adcl %ebx, %eax ++ movl 16(%esi), %ebx ++ adcl %ebx, %eax ++ movl 20(%esi), %ebx ++ adcl %ebx, %eax ++ movl 24(%esi), %ebx ++ adcl %ebx, %eax ++ movl 28(%esi), %ebx ++ adcl %ebx, %eax ++ lea 32(%esi), %esi ++ dec %ecx ++ jne 1b ++ adcl $0, %eax ++2: movl %edx, %ecx ++ andl $0x1c, %edx ++ je 4f ++ shrl $2, %edx # This clears CF ++3: adcl (%esi), %eax ++ lea 4(%esi), %esi ++ dec %edx ++ jne 3b ++ adcl $0, %eax ++4: andl $3, %ecx ++ jz 7f ++ cmpl $2, %ecx ++ jb 5f ++ movw (%esi),%cx ++ leal 2(%esi),%esi ++ je 6f ++ shll $16,%ecx ++5: movb (%esi),%cl ++6: addl %ecx,%eax ++ adcl $0, %eax ++7: ++ popl %ebx ++ popl %esi ++ ret ++ ++#else ++ ++/* Version for PentiumII/PPro */ ++ ++arch_csum_partial: ++ pushl %esi ++ pushl %ebx ++ movl 20(%esp),%eax # Function arg: unsigned int sum ++ movl 16(%esp),%ecx # Function arg: int len ++ movl 12(%esp),%esi # Function arg: const unsigned char *buf ++ ++ testl $2, %esi ++ jnz 30f ++10: ++ movl %ecx, %edx ++ movl %ecx, %ebx ++ andl $0x7c, %ebx ++ shrl $7, %ecx ++ addl %ebx,%esi ++ shrl $2, %ebx ++ negl %ebx ++ lea 45f(%ebx,%ebx,2), %ebx ++ testl %esi, %esi ++ jmp *%ebx ++ ++ # Handle 2-byte-aligned regions ++20: addw (%esi), %ax ++ lea 2(%esi), %esi ++ adcl $0, %eax ++ jmp 10b ++ ++30: subl $2, %ecx ++ ja 20b ++ je 32f ++ movzbl (%esi),%ebx # csumming 1 byte, 2-aligned ++ addl %ebx, %eax ++ adcl $0, %eax ++ jmp 80f ++32: ++ addw (%esi), %ax # csumming 2 bytes, 2-aligned ++ adcl $0, %eax ++ jmp 80f ++ ++40: ++ addl -128(%esi), %eax ++ adcl -124(%esi), %eax ++ adcl -120(%esi), %eax ++ adcl -116(%esi), %eax ++ adcl -112(%esi), %eax ++ adcl -108(%esi), %eax ++ adcl -104(%esi), %eax ++ adcl -100(%esi), %eax ++ adcl -96(%esi), %eax ++ adcl 
-92(%esi), %eax ++ adcl -88(%esi), %eax ++ adcl -84(%esi), %eax ++ adcl -80(%esi), %eax ++ adcl -76(%esi), %eax ++ adcl -72(%esi), %eax ++ adcl -68(%esi), %eax ++ adcl -64(%esi), %eax ++ adcl -60(%esi), %eax ++ adcl -56(%esi), %eax ++ adcl -52(%esi), %eax ++ adcl -48(%esi), %eax ++ adcl -44(%esi), %eax ++ adcl -40(%esi), %eax ++ adcl -36(%esi), %eax ++ adcl -32(%esi), %eax ++ adcl -28(%esi), %eax ++ adcl -24(%esi), %eax ++ adcl -20(%esi), %eax ++ adcl -16(%esi), %eax ++ adcl -12(%esi), %eax ++ adcl -8(%esi), %eax ++ adcl -4(%esi), %eax ++45: ++ lea 128(%esi), %esi ++ adcl $0, %eax ++ dec %ecx ++ jge 40b ++ movl %edx, %ecx ++50: andl $3, %ecx ++ jz 80f ++ ++ # Handle the last 1-3 bytes without jumping ++ notl %ecx # 1->2, 2->1, 3->0, higher bits are masked ++ movl $0xffffff,%ebx # by the shll and shrl instructions ++ shll $3,%ecx ++ shrl %cl,%ebx ++ andl -128(%esi),%ebx # esi is 4-aligned so should be ok ++ addl %ebx,%eax ++ adcl $0,%eax ++80: ++ popl %ebx ++ popl %esi ++ ret ++ ++#endif ++ ++/* ++unsigned int csum_partial_copy_generic (const char *src, char *dst, ++ int len, int sum, int *src_err_ptr, int *dst_err_ptr) ++ */ ++ ++/* ++ * Copy from ds while checksumming, otherwise like csum_partial ++ * ++ * The macros SRC and DST specify the type of access for the instruction. ++ * thus we can call a custom exception handler for all access types. ++ * ++ * FIXME: could someone double-check whether I haven't mixed up some SRC and ++ * DST definitions? It's damn hard to trigger all cases. I hope I got ++ * them all but there's no guarantee. ++ */ ++ ++#define SRC(y...) \ ++ 9999: y; \ ++ .section __ex_table, "a"; \ ++ .long 9999b, 6001f ; \ ++ .previous ++ ++#define DST(y...) 
\ ++ 9999: y; \ ++ .section __ex_table, "a"; \ ++ .long 9999b, 6002f ; \ ++ .previous ++ ++.align 4 ++.globl csum_partial_copy_generic_i386 ++ ++#ifndef CONFIG_X86_USE_PPRO_CHECKSUM ++ ++#define ARGBASE 16 ++#define FP 12 ++ ++csum_partial_copy_generic_i386: ++ subl $4,%esp ++ pushl %edi ++ pushl %esi ++ pushl %ebx ++ movl ARGBASE+16(%esp),%eax # sum ++ movl ARGBASE+12(%esp),%ecx # len ++ movl ARGBASE+4(%esp),%esi # src ++ movl ARGBASE+8(%esp),%edi # dst ++ ++ testl $2, %edi # Check alignment. ++ jz 2f # Jump if alignment is ok. ++ subl $2, %ecx # Alignment uses up two bytes. ++ jae 1f # Jump if we had at least two bytes. ++ addl $2, %ecx # ecx was < 2. Deal with it. ++ jmp 4f ++SRC(1: movw (%esi), %bx ) ++ addl $2, %esi ++DST( movw %bx, (%edi) ) ++ addl $2, %edi ++ addw %bx, %ax ++ adcl $0, %eax ++2: ++ movl %ecx, FP(%esp) ++ shrl $5, %ecx ++ jz 2f ++ testl %esi, %esi ++SRC(1: movl (%esi), %ebx ) ++SRC( movl 4(%esi), %edx ) ++ adcl %ebx, %eax ++DST( movl %ebx, (%edi) ) ++ adcl %edx, %eax ++DST( movl %edx, 4(%edi) ) ++ ++SRC( movl 8(%esi), %ebx ) ++SRC( movl 12(%esi), %edx ) ++ adcl %ebx, %eax ++DST( movl %ebx, 8(%edi) ) ++ adcl %edx, %eax ++DST( movl %edx, 12(%edi) ) ++ ++SRC( movl 16(%esi), %ebx ) ++SRC( movl 20(%esi), %edx ) ++ adcl %ebx, %eax ++DST( movl %ebx, 16(%edi) ) ++ adcl %edx, %eax ++DST( movl %edx, 20(%edi) ) ++ ++SRC( movl 24(%esi), %ebx ) ++SRC( movl 28(%esi), %edx ) ++ adcl %ebx, %eax ++DST( movl %ebx, 24(%edi) ) ++ adcl %edx, %eax ++DST( movl %edx, 28(%edi) ) ++ ++ lea 32(%esi), %esi ++ lea 32(%edi), %edi ++ dec %ecx ++ jne 1b ++ adcl $0, %eax ++2: movl FP(%esp), %edx ++ movl %edx, %ecx ++ andl $0x1c, %edx ++ je 4f ++ shrl $2, %edx # This clears CF ++SRC(3: movl (%esi), %ebx ) ++ adcl %ebx, %eax ++DST( movl %ebx, (%edi) ) ++ lea 4(%esi), %esi ++ lea 4(%edi), %edi ++ dec %edx ++ jne 3b ++ adcl $0, %eax ++4: andl $3, %ecx ++ jz 7f ++ cmpl $2, %ecx ++ jb 5f ++SRC( movw (%esi), %cx ) ++ leal 2(%esi), %esi ++DST( movw %cx, (%edi) ) ++ leal 2(%edi), %edi 
++ je 6f ++ shll $16,%ecx ++SRC(5: movb (%esi), %cl ) ++DST( movb %cl, (%edi) ) ++6: addl %ecx, %eax ++ adcl $0, %eax ++7: ++5000: ++ ++# Exception handler: ++.section .fixup, "ax" ++ ++6001: ++ movl ARGBASE+20(%esp), %ebx # src_err_ptr ++ movl $-EFAULT, (%ebx) ++ ++ # zero the complete destination - computing the rest ++ # is too much work ++ movl ARGBASE+8(%esp), %edi # dst ++ movl ARGBASE+12(%esp), %ecx # len ++ xorl %eax,%eax ++ rep ; stosb ++ ++ jmp 5000b ++ ++6002: ++ movl ARGBASE+24(%esp), %ebx # dst_err_ptr ++ movl $-EFAULT,(%ebx) ++ jmp 5000b ++ ++.previous ++ ++ popl %ebx ++ popl %esi ++ popl %edi ++ popl %ecx # equivalent to addl $4,%esp ++ ret ++ ++#else ++ ++/* Version for PentiumII/PPro */ ++ ++#define ROUND1(x) \ ++ SRC(movl x(%esi), %ebx ) ; \ ++ addl %ebx, %eax ; \ ++ DST(movl %ebx, x(%edi) ) ; ++ ++#define ROUND(x) \ ++ SRC(movl x(%esi), %ebx ) ; \ ++ adcl %ebx, %eax ; \ ++ DST(movl %ebx, x(%edi) ) ; ++ ++#define ARGBASE 12 ++ ++csum_partial_copy_generic_i386: ++ pushl %ebx ++ pushl %edi ++ pushl %esi ++ movl ARGBASE+4(%esp),%esi #src ++ movl ARGBASE+8(%esp),%edi #dst ++ movl ARGBASE+12(%esp),%ecx #len ++ movl ARGBASE+16(%esp),%eax #sum ++# movl %ecx, %edx ++ movl %ecx, %ebx ++ movl %esi, %edx ++ shrl $6, %ecx ++ andl $0x3c, %ebx ++ negl %ebx ++ subl %ebx, %esi ++ subl %ebx, %edi ++ lea -1(%esi),%edx ++ andl $-32,%edx ++ lea 3f(%ebx,%ebx), %ebx ++ testl %esi, %esi ++ jmp *%ebx ++1: addl $64,%esi ++ addl $64,%edi ++ SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) ++ ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) ++ ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) ++ ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) ++ ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4) ++3: adcl $0,%eax ++ addl $64, %edx ++ dec %ecx ++ jge 1b ++4: movl ARGBASE+12(%esp),%edx #len ++ andl $3, %edx ++ jz 7f ++ cmpl $2, %edx ++ jb 5f ++SRC( movw (%esi), %dx ) ++ leal 2(%esi), %esi ++DST( movw %dx, (%edi) ) ++ leal 2(%edi), %edi ++ je 6f ++ shll $16,%edx ++5: ++SRC( movb (%esi), %dl ) 
++DST( movb %dl, (%edi) ) ++6: addl %edx, %eax ++ adcl $0, %eax ++7: ++.section .fixup, "ax" ++6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr ++ movl $-EFAULT, (%ebx) ++ # zero the complete destination (computing the rest is too much work) ++ movl ARGBASE+8(%esp),%edi # dst ++ movl ARGBASE+12(%esp),%ecx # len ++ xorl %eax,%eax ++ rep; stosb ++ jmp 7b ++6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr ++ movl $-EFAULT, (%ebx) ++ jmp 7b ++.previous ++ ++ popl %esi ++ popl %edi ++ popl %ebx ++ ret ++ ++#undef ROUND ++#undef ROUND1 ++ ++#endif +diff -Naur -X ../exclude-files orig/arch/um/sys-i386/fault.c um/arch/um/sys-i386/fault.c +--- orig/arch/um/sys-i386/fault.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-i386/fault.c Sun Oct 27 16:49:35 2002 +@@ -0,0 +1,34 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <signal.h> ++#include "sysdep/ptrace.h" ++#include "sysdep/sigcontext.h" ++ ++extern unsigned long search_exception_table(unsigned long addr); ++ ++int arch_fixup(unsigned long address, void *sc_ptr) ++{ ++ struct sigcontext *sc = sc_ptr; ++ unsigned long fixup; ++ ++ fixup = search_exception_table(address); ++ if(fixup != 0){ ++ sc->eip = fixup; ++ return(1); ++ } ++ return(0); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/sys-i386/ksyms.c um/arch/um/sys-i386/ksyms.c +--- orig/arch/um/sys-i386/ksyms.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-i386/ksyms.c Tue Oct 29 21:01:45 2002 +@@ -0,0 +1,17 @@ ++#include "linux/module.h" ++#include "linux/in6.h" ++#include "linux/rwsem.h" ++#include "asm/byteorder.h" ++#include "asm/semaphore.h" ++#include "asm/uaccess.h" ++#include "asm/checksum.h" ++#include "asm/errno.h" ++ ++EXPORT_SYMBOL(__down_failed); ++EXPORT_SYMBOL(__down_failed_interruptible); ++EXPORT_SYMBOL(__down_failed_trylock); ++EXPORT_SYMBOL(__up_wakeup); ++ ++/* Networking helper routines. */ ++EXPORT_SYMBOL(csum_partial_copy_from); ++EXPORT_SYMBOL(csum_partial_copy_to); +diff -Naur -X ../exclude-files orig/arch/um/sys-i386/ldt.c um/arch/um/sys-i386/ldt.c +--- orig/arch/um/sys-i386/ldt.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-i386/ldt.c Wed Nov 13 12:43:04 2002 +@@ -0,0 +1,92 @@ ++/* ++ * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/config.h" ++#include "linux/slab.h" ++#include "asm/uaccess.h" ++#include "asm/ptrace.h" ++#include "choose-mode.h" ++#include "kern.h" ++ ++#ifdef CONFIG_MODE_TT ++extern int modify_ldt(int func, void *ptr, unsigned long bytecount); ++ ++int sys_modify_ldt_tt(int func, void *ptr, unsigned long bytecount) ++{ ++ if(verify_area(VERIFY_READ, ptr, bytecount)) return(-EFAULT); ++ return(modify_ldt(func, ptr, bytecount)); ++} ++#endif ++ ++#ifdef CONFIG_MODE_SKAS ++extern int userspace_pid; ++ ++int sys_modify_ldt_skas(int func, void *ptr, unsigned long bytecount) ++{ ++ struct ptrace_ldt ldt; ++ void *buf; ++ int res, n; ++ ++ buf = kmalloc(bytecount, GFP_KERNEL); ++ if(buf == NULL) ++ return(-ENOMEM); ++ ++ res = 0; ++ ++ switch(func){ ++ case 1: ++ case 0x11: ++ res = copy_from_user(buf, ptr, 
bytecount); ++ break; ++ } ++ ++ if(res != 0){ ++ res = -EFAULT; ++ goto out; ++ } ++ ++ ldt = ((struct ptrace_ldt) { .func = func, ++ .ptr = buf, ++ .bytecount = bytecount }); ++ res = ptrace(PTRACE_LDT, userspace_pid, 0, (unsigned long) &ldt); ++ if(res < 0) ++ goto out; ++ ++ switch(func){ ++ case 0: ++ case 2: ++ n = res; ++ res = copy_to_user(ptr, buf, n); ++ if(res != 0) ++ res = -EFAULT; ++ else ++ res = n; ++ break; ++ } ++ ++ out: ++ kfree(buf); ++ return(res); ++} ++#endif ++ ++int sys_modify_ldt(int func, void *ptr, unsigned long bytecount) ++{ ++ return(CHOOSE_MODE_PROC(sys_modify_ldt_tt, sys_modify_ldt_skas, func, ++ ptr, bytecount)); ++} ++ ++ ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/sys-i386/ptrace.c um/arch/um/sys-i386/ptrace.c +--- orig/arch/um/sys-i386/ptrace.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-i386/ptrace.c Sun Oct 27 16:49:35 2002 +@@ -0,0 +1,365 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/sched.h" ++#include "asm/elf.h" ++#include "asm/ptrace.h" ++#include "asm/uaccess.h" ++#include "ptrace_user.h" ++#include "sysdep/sigcontext.h" ++#include "sysdep/sc.h" ++ ++void arch_switch(void) ++{ ++ update_debugregs(current->thread.arch.debugregs_seq); ++} ++ ++int is_syscall(unsigned long addr) /* Returns nonzero when the two bytes at user address addr are cd 80 ("int $0x80"); returns 0 if they cannot be read. */ ++{ ++ unsigned short instr; ++ int n; ++ ++ n = copy_from_user(&instr, (void *) addr, sizeof(instr)); ++ if(n){ ++ printk("is_syscall : failed to read instruction from 0x%lx\n", /* hex, to match the 0x prefix */ ++ addr); ++ return(0); ++ } ++ return(instr == 0x80cd); /* bytes cd 80 read as a little-endian 16-bit word */ ++} ++ ++/* determines which flags the user has
access to. */ ++/* 1 = access 0 = no access */ ++#define FLAG_MASK 0x00044dd5 ++ ++int putreg(struct task_struct *child, int regno, unsigned long value) /* Stores value into child's saved register selected by byte offset regno; returns 0, or -EIO when a segment selector value is rejected. */ ++{ ++ regno >>= 2; ++ switch (regno) { ++ case FS: ++ if (value && (value & 3) != 3) ++ return -EIO; ++ PT_REGS_FS(&child->thread.regs) = value; ++ return 0; ++ case GS: ++ if (value && (value & 3) != 3) ++ return -EIO; ++ PT_REGS_GS(&child->thread.regs) = value; ++ return 0; ++ case DS: ++ case ES: ++ if (value && (value & 3) != 3) ++ return -EIO; ++ value &= 0xffff; ++ break; ++ case SS: ++ case CS: ++ if ((value & 3) != 3) ++ return -EIO; ++ value &= 0xffff; ++ break; ++ case EFL: ++ value &= FLAG_MASK; /* take only the user-accessible bits from the caller */ ++ value |= PT_REGS_EFLAGS(&child->thread.regs) & ~FLAG_MASK; /* keep only the protected bits of the current flags, so accessible bits can also be cleared */ ++ break; ++ } ++ PT_REGS_SET(&child->thread.regs, regno, value); ++ return 0; ++} ++ ++unsigned long getreg(struct task_struct *child, int regno) ++{ ++ unsigned long retval = ~0UL; ++ ++ regno >>= 2; ++ switch (regno) { ++ case FS: ++ case GS: ++ case DS: ++ case ES: ++ case SS: ++ case CS: ++ retval = 0xffff; ++ /* fall through */ ++ default: ++ retval &= PT_REG(&child->thread.regs, regno); ++ } ++ return retval; ++} ++ ++struct i387_fxsave_struct { ++ unsigned short cwd; ++ unsigned short swd; ++ unsigned short twd; ++ unsigned short fop; ++ long fip; ++ long fcs; ++ long foo; ++ long fos; ++ long mxcsr; ++ long reserved; ++ long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ ++ long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ ++ long padding[56]; ++}; ++ ++/* ++ * FPU tag word conversions. ++ */ ++ ++static inline unsigned short twd_i387_to_fxsr( unsigned short twd ) ++{ ++ unsigned int tmp; /* to avoid 16 bit prefixes in the code */ ++ ++ /* Transform each pair of bits into 01 (valid) or 00 (empty) */ ++ tmp = ~twd; ++ tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ ++ /* and move the valid bits to the lower byte.
*/ ++ tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ ++ tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ ++ tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ ++ return tmp; ++} ++ ++static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave ) /* Expands the one-bit-per-register tag in fxsave->twd into a two-bit-per-register tag word (0 valid, 1 zero, 2 special, 3 empty) with the upper 16 bits set. */ ++{ ++ struct _fpxreg *st = NULL; ++ unsigned long twd = (unsigned long) fxsave->twd; ++ unsigned long tag; ++ unsigned long ret = 0xffff0000; ++ int i; ++ ++#define FPREG_ADDR(f, n) ((char *)&(f)->st_space + (n) * 16) /* address of the n-th 16-byte slot of st_space; no trailing ';' so the macro is usable inside an expression */ ++ ++ for ( i = 0 ; i < 8 ; i++ ) { ++ if ( twd & 0x1 ) { ++ st = (struct _fpxreg *) FPREG_ADDR( fxsave, i ); ++ ++ switch ( st->exponent & 0x7fff ) { ++ case 0x7fff: ++ tag = 2; /* Special */ ++ break; ++ case 0x0000: ++ if ( !st->significand[0] && ++ !st->significand[1] && ++ !st->significand[2] && ++ !st->significand[3] ) { ++ tag = 1; /* Zero */ ++ } else { ++ tag = 2; /* Special */ ++ } ++ break; ++ default: ++ if ( st->significand[3] & 0x8000 ) { ++ tag = 0; /* Valid */ ++ } else { ++ tag = 2; /* Special */ ++ } ++ break; ++ } ++ } else { ++ tag = 3; /* Empty */ ++ } ++ ret |= (tag << (2 * i)); ++ twd = twd >> 1; ++ } ++ return ret; ++} ++ ++/* ++ * FXSR floating point environment conversions.
++ */ ++ ++#ifdef CONFIG_MODE_TT ++static inline int convert_fxsr_to_user_tt(struct _fpstate *buf, ++ struct pt_regs *regs) ++{ ++ struct i387_fxsave_struct *fxsave = SC_FXSR_ENV(PT_REGS_SC(regs)); ++ unsigned long env[7]; ++ struct _fpreg *to; ++ struct _fpxreg *from; ++ int i; ++ ++ env[0] = (unsigned long)fxsave->cwd | 0xffff0000; ++ env[1] = (unsigned long)fxsave->swd | 0xffff0000; ++ env[2] = twd_fxsr_to_i387(fxsave); ++ env[3] = fxsave->fip; ++ env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16); ++ env[5] = fxsave->foo; ++ env[6] = fxsave->fos; ++ ++ if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) ) ++ return 1; ++ ++ to = &buf->_st[0]; ++ from = (struct _fpxreg *) &fxsave->st_space[0]; ++ for ( i = 0 ; i < 8 ; i++, to++, from++ ) { ++ if ( __copy_to_user( to, from, sizeof(*to) ) ) ++ return 1; ++ } ++ return 0; ++} ++#endif ++ ++static inline int convert_fxsr_to_user(struct _fpstate *buf, ++ struct pt_regs *regs) ++{ ++ return(CHOOSE_MODE(convert_fxsr_to_user_tt(buf, regs), 0)); ++} ++ ++#ifdef CONFIG_MODE_TT ++static inline int convert_fxsr_from_user_tt(struct pt_regs *regs, ++ struct _fpstate *buf) ++{ ++ struct i387_fxsave_struct *fxsave = SC_FXSR_ENV(PT_REGS_SC(regs)); ++ unsigned long env[7]; ++ struct _fpxreg *to; ++ struct _fpreg *from; ++ int i; ++ ++ if ( __copy_from_user( env, buf, 7 * sizeof(long) ) ) ++ return 1; ++ ++ fxsave->cwd = (unsigned short)(env[0] & 0xffff); ++ fxsave->swd = (unsigned short)(env[1] & 0xffff); ++ fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff)); ++ fxsave->fip = env[3]; ++ fxsave->fop = (unsigned short)((env[4] & 0xffff0000) >> 16); ++ fxsave->fcs = (env[4] & 0xffff); ++ fxsave->foo = env[5]; ++ fxsave->fos = env[6]; ++ ++ to = (struct _fpxreg *) &fxsave->st_space[0]; ++ from = &buf->_st[0]; ++ for ( i = 0 ; i < 8 ; i++, to++, from++ ) { ++ if ( __copy_from_user( to, from, sizeof(*from) ) ) ++ return 1; ++ } ++ return 0; ++} ++#endif ++ ++static inline int convert_fxsr_from_user(struct 
pt_regs *regs, ++ struct _fpstate *buf) ++{ ++ return(CHOOSE_MODE(convert_fxsr_from_user_tt(regs, buf), 0)); ++} ++ ++int get_fpregs(unsigned long buf, struct task_struct *child) ++{ ++ int err; ++ ++ err = convert_fxsr_to_user((struct _fpstate *) buf, ++ &child->thread.regs); ++ if(err) return(-EFAULT); ++ else return(0); ++} ++ ++int set_fpregs(unsigned long buf, struct task_struct *child) ++{ ++ int err; ++ ++ err = convert_fxsr_from_user(&child->thread.regs, ++ (struct _fpstate *) buf); ++ if(err) return(-EFAULT); ++ else return(0); ++} ++ ++#ifdef CONFIG_MODE_TT ++int get_fpxregs_tt(unsigned long buf, struct task_struct *tsk) ++{ ++ struct pt_regs *regs = &tsk->thread.regs; ++ struct i387_fxsave_struct *fxsave = SC_FXSR_ENV(PT_REGS_SC(regs)); ++ int err; ++ ++ err = __copy_to_user((void *) buf, fxsave, ++ sizeof(struct user_fxsr_struct)); ++ if(err) return -EFAULT; ++ else return 0; ++} ++#endif ++ ++int get_fpxregs(unsigned long buf, struct task_struct *tsk) ++{ ++ return(CHOOSE_MODE(get_fpxregs_tt(buf, tsk), 0)); ++} ++ ++#ifdef CONFIG_MODE_TT ++int set_fpxregs_tt(unsigned long buf, struct task_struct *tsk) ++{ ++ struct pt_regs *regs = &tsk->thread.regs; ++ struct i387_fxsave_struct *fxsave = SC_FXSR_ENV(PT_REGS_SC(regs)); ++ int err; ++ ++ err = __copy_from_user(fxsave, (void *) buf, ++ sizeof(struct user_fxsr_struct) ); ++ if(err) return -EFAULT; ++ else return 0; ++} ++#endif ++ ++int set_fpxregs(unsigned long buf, struct task_struct *tsk) ++{ ++ return(CHOOSE_MODE(set_fpxregs_tt(buf, tsk), 0)); ++} ++ ++#ifdef notdef ++int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu) ++{ ++ fpu->cwd = (((SC_FP_CW(PT_REGS_SC(regs)) & 0xffff) << 16) | ++ (SC_FP_SW(PT_REGS_SC(regs)) & 0xffff)); ++ fpu->swd = SC_FP_CSSEL(PT_REGS_SC(regs)) & 0xffff; ++ fpu->twd = SC_FP_IPOFF(PT_REGS_SC(regs)); ++ fpu->fip = SC_FP_CSSEL(PT_REGS_SC(regs)) & 0xffff; ++ fpu->fcs = SC_FP_DATAOFF(PT_REGS_SC(regs)); ++ fpu->foo = SC_FP_DATASEL(PT_REGS_SC(regs)); ++ fpu->fos = 0; ++ 
memcpy(fpu->st_space, (void *) SC_FP_ST(PT_REGS_SC(regs)), ++ sizeof(fpu->st_space)); ++ return(1); ++} ++#endif ++ ++#ifdef CONFIG_MODE_TT ++static inline void copy_fpu_fxsave_tt(struct pt_regs *regs, ++ struct user_i387_struct *buf) ++{ ++ struct i387_fxsave_struct *fpu = SC_FXSR_ENV(PT_REGS_SC(regs)); ++ unsigned short *to; ++ unsigned short *from; ++ int i; ++ ++ memcpy( buf, fpu, 7 * sizeof(long) ); ++ ++ to = (unsigned short *) &buf->st_space[0]; ++ from = (unsigned short *) &fpu->st_space[0]; ++ for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) { ++ memcpy( to, from, 5 * sizeof(unsigned short) ); ++ } ++} ++#endif ++ ++static inline void copy_fpu_fxsave(struct pt_regs *regs, ++ struct user_i387_struct *buf) ++{ ++ (void) CHOOSE_MODE(copy_fpu_fxsave_tt(regs, buf), 0); ++} ++ ++int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu ) ++{ ++ copy_fpu_fxsave(regs, (struct user_i387_struct *) fpu); ++ return(1); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/sys-i386/ptrace_user.c um/arch/um/sys-i386/ptrace_user.c +--- orig/arch/um/sys-i386/ptrace_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-i386/ptrace_user.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,117 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stdio.h> ++#include <errno.h> ++#include <unistd.h> ++#include <linux/stddef.h> ++#include <sys/ptrace.h> ++#include <asm/ptrace.h> ++#include <asm/user.h> ++#include "kern_util.h" ++#include "sysdep/thread.h" ++#include "user.h" ++#include "os.h" ++ ++int ptrace_getregs(long pid, unsigned long *regs_out) ++{ ++ return(ptrace(PTRACE_GETREGS, pid, 0, regs_out)); ++} ++ ++int ptrace_setregs(long pid, unsigned long *regs) ++{ ++ return(ptrace(PTRACE_SETREGS, pid, 0, regs)); ++} ++ ++int ptrace_getfpregs(long pid, unsigned long *regs) ++{ ++ return(ptrace(PTRACE_GETFPREGS, pid, 0, regs)); ++} ++ ++static void write_debugregs(int pid, unsigned long *regs) ++{ ++ struct user *dummy; ++ int nregs, i; ++ ++ dummy = NULL; ++ nregs = sizeof(dummy->u_debugreg)/sizeof(dummy->u_debugreg[0]); ++ for(i = 0; i < nregs; i++){ ++ if((i == 4) || (i == 5)) continue; ++ if(ptrace(PTRACE_POKEUSR, pid, &dummy->u_debugreg[i], ++ regs[i]) < 0) ++ printk("write_debugregs - ptrace failed, " ++ "errno = %d\n", errno); ++ } ++} ++ ++static void read_debugregs(int pid, unsigned long *regs) ++{ ++ struct user *dummy; ++ int nregs, i; ++ ++ dummy = NULL; ++ nregs = sizeof(dummy->u_debugreg)/sizeof(dummy->u_debugreg[0]); ++ for(i = 0; i < nregs; i++){ ++ regs[i] = ptrace(PTRACE_PEEKUSR, pid, ++ &dummy->u_debugreg[i], 0); ++ } ++} ++ ++/* Accessed only by the tracing thread */ ++static unsigned long kernel_debugregs[8] = { [ 0 ... 
7 ] = 0 }; ++static int debugregs_seq = 0; ++ ++void arch_enter_kernel(void *task, int pid) ++{ ++ read_debugregs(pid, TASK_DEBUGREGS(task)); ++ write_debugregs(pid, kernel_debugregs); ++} ++ ++void arch_leave_kernel(void *task, int pid) ++{ ++ read_debugregs(pid, kernel_debugregs); ++ write_debugregs(pid, TASK_DEBUGREGS(task)); ++} ++ ++void ptrace_pokeuser(unsigned long addr, unsigned long data) ++{ ++ if((addr < offsetof(struct user, u_debugreg[0])) || ++ (addr > offsetof(struct user, u_debugreg[7]))) ++ return; ++ addr -= offsetof(struct user, u_debugreg[0]); ++ addr = addr >> 2; ++ if(kernel_debugregs[addr] == data) return; ++ ++ kernel_debugregs[addr] = data; ++ debugregs_seq++; ++} ++ ++static void update_debugregs_cb(void *arg) ++{ ++ int pid = *((int *) arg); ++ ++ write_debugregs(pid, kernel_debugregs); ++} ++ ++void update_debugregs(int seq) ++{ ++ int me; ++ ++ if(seq == debugregs_seq) return; ++ ++ me = os_getpid(); ++ initial_thread_cb(update_debugregs_cb, &me); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/sys-i386/sigcontext.c um/arch/um/sys-i386/sigcontext.c +--- orig/arch/um/sys-i386/sigcontext.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-i386/sigcontext.c Mon Dec 2 23:20:13 2002 +@@ -0,0 +1,80 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <stddef.h> ++#include <string.h> ++#include <asm/ptrace.h> ++#include <asm/sigcontext.h> ++#include "sysdep/ptrace.h" ++#include "kern_util.h" ++#include "frame_user.h" ++ ++int sc_size(void *data) ++{ ++ struct arch_frame_data *arch = data; ++ ++ return(sizeof(struct sigcontext) + arch->fpstate_size); ++} ++ ++void sc_to_sc(void *to_ptr, void *from_ptr) ++{ ++ struct sigcontext *to = to_ptr, *from = from_ptr; ++ int size = sizeof(*to) + signal_frame_sc.common.arch.fpstate_size; ++ ++ memcpy(to, from, size); ++ if(from->fpstate != NULL) to->fpstate = (struct _fpstate *) (to + 1); ++} ++ ++unsigned long *sc_sigmask(void *sc_ptr) ++{ ++ struct sigcontext *sc = sc_ptr; ++ ++ return(&sc->oldmask); ++} ++ ++int sc_get_fpregs(unsigned long buf, void *sc_ptr) ++{ ++ struct sigcontext *sc = sc_ptr; ++ struct _fpstate *from = sc->fpstate, *to = (struct _fpstate *) buf; ++ int err = 0; ++ ++ if(from == NULL){ ++ err |= clear_user_proc(&to->cw, sizeof(to->cw)); ++ err |= clear_user_proc(&to->sw, sizeof(to->sw)); ++ err |= clear_user_proc(&to->tag, sizeof(to->tag)); ++ err |= clear_user_proc(&to->ipoff, sizeof(to->ipoff)); ++ err |= clear_user_proc(&to->cssel, sizeof(to->cssel)); ++ err |= clear_user_proc(&to->dataoff, sizeof(to->dataoff)); ++ err |= clear_user_proc(&to->datasel, sizeof(to->datasel)); ++ err |= clear_user_proc(&to->_st, sizeof(to->_st)); ++ } ++ else { ++ err |= copy_to_user_proc(&to->cw, &from->cw, sizeof(to->cw)); ++ err |= copy_to_user_proc(&to->sw, 
&from->sw, sizeof(to->sw)); ++ err |= copy_to_user_proc(&to->tag, &from->tag, ++ sizeof(to->tag)); ++ err |= copy_to_user_proc(&to->ipoff, &from->ipoff, ++ sizeof(to->ipoff)); ++ err |= copy_to_user_proc(&to->cssel,& from->cssel, ++ sizeof(to->cssel)); ++ err |= copy_to_user_proc(&to->dataoff, &from->dataoff, ++ sizeof(to->dataoff)); ++ err |= copy_to_user_proc(&to->datasel, &from->datasel, ++ sizeof(to->datasel)); ++ err |= copy_to_user_proc(to->_st, from->_st, sizeof(to->_st)); ++ } ++ return(err); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/sys-i386/syscalls.c um/arch/um/sys-i386/syscalls.c +--- orig/arch/um/sys-i386/syscalls.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-i386/syscalls.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,68 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "asm/mman.h" ++#include "asm/uaccess.h" ++#include "asm/unistd.h" ++ ++/* ++ * Perform the select(nd, in, out, ex, tv) and mmap() system ++ * calls. Linux/i386 didn't use to be able to handle more than ++ * 4 system call parameters, so these system calls used a memory ++ * block for parameter passing.. 
++ */ ++ ++struct mmap_arg_struct { ++ unsigned long addr; ++ unsigned long len; ++ unsigned long prot; ++ unsigned long flags; ++ unsigned long fd; ++ unsigned long offset; ++}; ++ ++extern int old_mmap(unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flags, ++ unsigned long fd, unsigned long offset); ++ ++int old_mmap_i386(struct mmap_arg_struct *arg) ++{ ++ struct mmap_arg_struct a; ++ int err = -EFAULT; ++ ++ if (copy_from_user(&a, arg, sizeof(a))) ++ goto out; ++ ++ err = old_mmap(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); ++ out: ++ return err; ++} ++ ++struct sel_arg_struct { ++ unsigned long n; ++ fd_set *inp, *outp, *exp; ++ struct timeval *tvp; ++}; ++ ++int old_select(struct sel_arg_struct *arg) ++{ ++ struct sel_arg_struct a; ++ ++ if (copy_from_user(&a, arg, sizeof(a))) ++ return -EFAULT; ++ /* sys_select() does the appropriate kernel locking */ ++ return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/sys-i386/sysrq.c um/arch/um/sys-i386/sysrq.c +--- orig/arch/um/sys-i386/sysrq.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-i386/sysrq.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,30 @@ ++#include "linux/kernel.h" ++#include "linux/smp.h" ++#include "linux/sched.h" ++#include "asm/ptrace.h" ++#include "sysrq.h" ++ ++void show_regs(struct pt_regs *regs) /* Prints the register values held in regs with printk, then calls show_trace(). */ ++{ ++ printk("\n"); ++ printk("EIP: %04lx:[<%08lx>] CPU: %d %s", ++ 0xffff & PT_REGS_CS(regs), PT_REGS_IP(regs), ++ smp_processor_id(), print_tainted()); ++ if (PT_REGS_CS(regs) & 3) ++ printk(" ESP: %04lx:%08lx", 0xffff & PT_REGS_SS(regs), ++ PT_REGS_SP(regs)); ++ printk(" EFLAGS: %08lx\n %s\n", PT_REGS_EFLAGS(regs), ++ print_tainted()); ++ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", ++ PT_REGS_EAX(regs), PT_REGS_EBX(regs), ++ PT_REGS_ECX(regs), ++ PT_REGS_EDX(regs)); ++ printk("ESI: %08lx EDI: %08lx EBP: %08lx", ++ PT_REGS_ESI(regs), PT_REGS_EDI(regs), ++ PT_REGS_EBP(regs)); ++ printk(" DS: %04lx ES: %04lx\n", ++ 0xffff & PT_REGS_DS(regs), ++ 0xffff & PT_REGS_ES(regs)); ++ ++ show_trace((unsigned long *) &regs); /* address of the regs parameter itself, i.e. a location on the current stack */ ++} +diff -Naur -X ../exclude-files orig/arch/um/sys-i386/util/Makefile um/arch/um/sys-i386/util/Makefile +--- orig/arch/um/sys-i386/util/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-i386/util/Makefile Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,28 @@ ++EXE = mk_sc mk_thread ++ ++include $(TOPDIR)/Rules.make ++ ++all : $(EXE) ++ ++mk_sc : mk_sc.o ++ $(CC) -o mk_sc mk_sc.o ++ ++mk_sc.o : mk_sc.c ++ $(CC) -c $< ++ ++mk_thread : mk_thread_user.o mk_thread_kern.o ++ $(CC) -o mk_thread mk_thread_user.o mk_thread_kern.o ++ ++mk_thread_user.o : mk_thread_user.c ++ $(CC) -c $< ++ ++mk_thread_kern.o : mk_thread_kern.c ++ $(CC) $(CFLAGS) -c $< ++ ++clean : ++ $(RM) $(EXE) *.o ++ ++archmrproper : clean ++ ++fastdep : ++
+diff -Naur -X ../exclude-files orig/arch/um/sys-i386/util/mk_sc.c um/arch/um/sys-i386/util/mk_sc.c +--- orig/arch/um/sys-i386/util/mk_sc.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-i386/util/mk_sc.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,51 @@ ++#include <stdio.h> ++#include <signal.h> ++#include <linux/stddef.h> ++ ++#define SC_OFFSET(name, field) \ ++ printf("#define " name "(sc) *((unsigned long *) &(((char *) (sc))[%d]))\n",\ ++ offsetof(struct sigcontext, field)) ++ ++#define SC_FP_OFFSET(name, field) \ ++ printf("#define " name \ ++ "(sc) *((unsigned long *) &(((char *) (SC_FPSTATE(sc)))[%d]))\n",\ ++ offsetof(struct _fpstate, field)) ++ ++#define SC_FP_OFFSET_PTR(name, field, type) \ ++ printf("#define " name \ ++ "(sc) ((" type " *) &(((char *) (SC_FPSTATE(sc)))[%d]))\n",\ ++ offsetof(struct _fpstate, field)) ++ ++int main(int argc, char **argv) ++{ ++ SC_OFFSET("SC_IP", eip); ++ SC_OFFSET("SC_SP", esp); ++ SC_OFFSET("SC_FS", fs); ++ SC_OFFSET("SC_GS", gs); ++ SC_OFFSET("SC_DS", ds); ++ SC_OFFSET("SC_ES", es); ++ SC_OFFSET("SC_SS", ss); ++ SC_OFFSET("SC_CS", cs); ++ SC_OFFSET("SC_EFLAGS", eflags); ++ SC_OFFSET("SC_EAX", eax); ++ SC_OFFSET("SC_EBX", ebx); ++ SC_OFFSET("SC_ECX", ecx); ++ SC_OFFSET("SC_EDX", edx); ++ SC_OFFSET("SC_EDI", edi); ++ SC_OFFSET("SC_ESI", esi); ++ SC_OFFSET("SC_EBP", ebp); ++ SC_OFFSET("SC_TRAPNO", trapno); ++ SC_OFFSET("SC_ERR", err); ++ SC_OFFSET("SC_CR2", cr2); ++ SC_OFFSET("SC_FPSTATE", fpstate); ++ SC_FP_OFFSET("SC_FP_CW", cw); ++ SC_FP_OFFSET("SC_FP_SW", sw); ++ SC_FP_OFFSET("SC_FP_TAG", tag); ++ SC_FP_OFFSET("SC_FP_IPOFF", ipoff); ++ SC_FP_OFFSET("SC_FP_CSSEL", cssel); ++ SC_FP_OFFSET("SC_FP_DATAOFF", dataoff); ++ SC_FP_OFFSET("SC_FP_DATASEL", datasel); ++ SC_FP_OFFSET_PTR("SC_FP_ST", _st, "struct _fpstate"); ++ SC_FP_OFFSET_PTR("SC_FXSR_ENV", _fxsr_env, "void"); ++ return(0); ++} +diff -Naur -X ../exclude-files orig/arch/um/sys-i386/util/mk_thread_kern.c um/arch/um/sys-i386/util/mk_thread_kern.c +--- 
orig/arch/um/sys-i386/util/mk_thread_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-i386/util/mk_thread_kern.c Mon Dec 9 23:24:38 2002 +@@ -0,0 +1,22 @@ ++#include "linux/config.h" ++#include "linux/stddef.h" ++#include "linux/sched.h" ++ ++extern void print_head(void); ++extern void print_constant_ptr(char *name, int value); ++extern void print_constant(char *name, char *type, int value); ++extern void print_tail(void); ++ ++#define THREAD_OFFSET(field) offsetof(struct task_struct, thread.field) ++ ++int main(int argc, char **argv) ++{ ++ print_head(); ++ print_constant_ptr("TASK_DEBUGREGS", THREAD_OFFSET(arch.debugregs)); ++#ifdef CONFIG_MODE_TT ++ print_constant("TASK_EXTERN_PID", "int", THREAD_OFFSET(mode.tt.extern_pid)); ++#endif ++ print_tail(); ++ return(0); ++} ++ +diff -Naur -X ../exclude-files orig/arch/um/sys-i386/util/mk_thread_user.c um/arch/um/sys-i386/util/mk_thread_user.c +--- orig/arch/um/sys-i386/util/mk_thread_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-i386/util/mk_thread_user.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,30 @@ ++#include <stdio.h> ++ ++void print_head(void) ++{ ++ printf("/*\n"); ++ printf(" * Generated by mk_thread\n"); ++ printf(" */\n"); ++ printf("\n"); ++ printf("#ifndef __UM_THREAD_H\n"); ++ printf("#define __UM_THREAD_H\n"); ++ printf("\n"); ++} ++ ++void print_constant_ptr(char *name, int value) ++{ ++ printf("#define %s(task) ((unsigned long *) " ++ "&(((char *) (task))[%d]))\n", name, value); ++} ++ ++void print_constant(char *name, char *type, int value) ++{ ++ printf("#define %s(task) *((%s *) &(((char *) (task))[%d]))\n", name, type, ++ value); ++} ++ ++void print_tail(void) ++{ ++ printf("\n"); ++ printf("#endif\n"); ++} +diff -Naur -X ../exclude-files orig/arch/um/sys-ia64/Makefile um/arch/um/sys-ia64/Makefile +--- orig/arch/um/sys-ia64/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-ia64/Makefile Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,26 @@ ++OBJ = sys.o ++ ++OBJS = ++ ++all: $(OBJ) ++ ++$(OBJ): 
$(OBJS) ++ rm -f $@ ++ $(LD) $(LINKFLAGS) --start-group $^ --end-group -o $@ ++clean: ++ rm -f $(OBJS) ++ ++fastdep: ++ ++archmrproper: ++ ++archclean: ++ rm -f link.ld ++ @$(MAKEBOOT) clean ++ ++archdep: ++ @$(MAKEBOOT) dep ++ ++modules: ++ ++include $(TOPDIR)/Rules.make +diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/Makefile um/arch/um/sys-ppc/Makefile +--- orig/arch/um/sys-ppc/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-ppc/Makefile Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,80 @@ ++OBJ = sys.o ++ ++.S.o: ++ $(CC) $(AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o ++ ++OBJS = ptrace.o sigcontext.o semaphore.o checksum.o miscthings.o misc.o \ ++ ptrace_user.o sysrq.o ++ ++EXTRA_AFLAGS := -DCONFIG_ALL_PPC -I. -I$(TOPDIR)/arch/ppc/kernel ++ ++all: $(OBJ) ++ ++$(OBJ): $(OBJS) ++ rm -f $@ ++ $(LD) $(LINKFLAGS) --start-group $^ --end-group -o $@ ++ ++ptrace_user.o: ptrace_user.c ++ $(CC) -D__KERNEL__ $(USER_CFLAGS) $(EXTRA_CFLAGS) -c -o $@ $< ++ ++sigcontext.o: sigcontext.c ++ $(CC) $(USER_CFLAGS) $(EXTRA_CFLAGS) -c -o $@ $< ++ ++semaphore.c: ++ rm -f $@ ++ ln -s $(TOPDIR)/arch/ppc/kernel/$@ $@ ++ ++checksum.S: ++ rm -f $@ ++ ln -s $(TOPDIR)/arch/ppc/lib/$@ $@ ++ ++mk_defs.c: ++ rm -f $@ ++ ln -s $(TOPDIR)/arch/ppc/kernel/$@ $@ ++ ++ppc_defs.head: ++ rm -f $@ ++ ln -s $(TOPDIR)/arch/ppc/kernel/$@ $@ ++ ++ppc_defs.h: mk_defs.c ppc_defs.head \ ++ $(TOPDIR)/include/asm-ppc/mmu.h \ ++ $(TOPDIR)/include/asm-ppc/processor.h \ ++ $(TOPDIR)/include/asm-ppc/pgtable.h \ ++ $(TOPDIR)/include/asm-ppc/ptrace.h ++# $(CC) $(CFLAGS) -S mk_defs.c ++ cp ppc_defs.head ppc_defs.h ++# for bk, this way we can write to the file even if it's not checked out ++ echo '#define THREAD 608' >> ppc_defs.h ++ echo '#define PT_REGS 8' >> ppc_defs.h ++ echo '#define CLONE_VM 256' >> ppc_defs.h ++# chmod u+w ppc_defs.h ++# grep '^#define' mk_defs.s >> ppc_defs.h ++# rm mk_defs.s ++ ++# the asm link is horrible, and breaks the other targets. 
This is also ++# not going to work with parallel makes. ++ ++checksum.o: checksum.S ++ rm -f asm ++ ln -s $(TOPDIR)/include/asm-ppc asm ++ $(CC) $(EXTRA_AFLAGS) $(AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o ++ rm -f asm ++ ++misc.o: misc.S ppc_defs.h ++ rm -f asm ++ ln -s $(TOPDIR)/include/asm-ppc asm ++ $(CC) $(EXTRA_AFLAGS) $(AFLAGS) -D__ASSEMBLY__ -D__UM_PPC__ -c $< -o $*.o ++ rm -f asm ++ ++clean: ++ rm -f $(OBJS) ++ rm -f ppc_defs.h ++ rm -f checksum.S semaphore.c mk_defs.c ++ ++fastdep: ++ ++dep: ++ ++modules: ++ ++include $(TOPDIR)/Rules.make +diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/misc.S um/arch/um/sys-ppc/misc.S +--- orig/arch/um/sys-ppc/misc.S Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-ppc/misc.S Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,116 @@ ++/* ++ * This file contains miscellaneous low-level functions. ++ * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) ++ * ++ * Largely rewritten by Cort Dougan (cort@cs.nmt.edu) ++ * and Paul Mackerras. ++ * ++ * A couple of functions stolen from arch/ppc/kernel/misc.S for UML ++ * by Chris Emerson. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License ++ * as published by the Free Software Foundation; either version ++ * 2 of the License, or (at your option) any later version. 
++ * ++ */ ++ ++#include <linux/config.h> ++#include <asm/processor.h> ++#include "ppc_asm.h" ++ ++#if defined(CONFIG_4xx) || defined(CONFIG_8xx) ++#define CACHE_LINE_SIZE 16 ++#define LG_CACHE_LINE_SIZE 4 ++#define MAX_COPY_PREFETCH 1 ++#elif !defined(CONFIG_PPC64BRIDGE) ++#define CACHE_LINE_SIZE 32 ++#define LG_CACHE_LINE_SIZE 5 ++#define MAX_COPY_PREFETCH 4 ++#else ++#define CACHE_LINE_SIZE 128 ++#define LG_CACHE_LINE_SIZE 7 ++#define MAX_COPY_PREFETCH 1 ++#endif /* CONFIG_4xx || CONFIG_8xx */ ++ ++ .text ++ ++/* ++ * Clear a page using the dcbz instruction, which doesn't cause any ++ * memory traffic (except to write out any cache lines which get ++ * displaced). This only works on cacheable memory. ++ */ ++_GLOBAL(clear_page) ++ li r0,4096/CACHE_LINE_SIZE ++ mtctr r0 ++#ifdef CONFIG_8xx ++ li r4, 0 ++1: stw r4, 0(r3) ++ stw r4, 4(r3) ++ stw r4, 8(r3) ++ stw r4, 12(r3) ++#else ++1: dcbz 0,r3 ++#endif ++ addi r3,r3,CACHE_LINE_SIZE ++ bdnz 1b ++ blr ++ ++/* ++ * Copy a whole page. We use the dcbz instruction on the destination ++ * to reduce memory traffic (it eliminates the unnecessary reads of ++ * the destination into cache). This requires that the destination ++ * is cacheable. 
++ */ ++#define COPY_16_BYTES \ ++ lwz r6,4(r4); \ ++ lwz r7,8(r4); \ ++ lwz r8,12(r4); \ ++ lwzu r9,16(r4); \ ++ stw r6,4(r3); \ ++ stw r7,8(r3); \ ++ stw r8,12(r3); \ ++ stwu r9,16(r3) ++ ++_GLOBAL(copy_page) ++ addi r3,r3,-4 ++ addi r4,r4,-4 ++ li r5,4 ++ ++#ifndef CONFIG_8xx ++#if MAX_COPY_PREFETCH > 1 ++ li r0,MAX_COPY_PREFETCH ++ li r11,4 ++ mtctr r0 ++11: dcbt r11,r4 ++ addi r11,r11,CACHE_LINE_SIZE ++ bdnz 11b ++#else /* MAX_COPY_PREFETCH == 1 */ ++ dcbt r5,r4 ++ li r11,CACHE_LINE_SIZE+4 ++#endif /* MAX_COPY_PREFETCH */ ++#endif /* CONFIG_8xx */ ++ ++ li r0,4096/CACHE_LINE_SIZE ++ mtctr r0 ++1: ++#ifndef CONFIG_8xx ++ dcbt r11,r4 ++ dcbz r5,r3 ++#endif ++ COPY_16_BYTES ++#if CACHE_LINE_SIZE >= 32 ++ COPY_16_BYTES ++#if CACHE_LINE_SIZE >= 64 ++ COPY_16_BYTES ++ COPY_16_BYTES ++#if CACHE_LINE_SIZE >= 128 ++ COPY_16_BYTES ++ COPY_16_BYTES ++ COPY_16_BYTES ++ COPY_16_BYTES ++#endif ++#endif ++#endif ++ bdnz 1b ++ blr +diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/miscthings.c um/arch/um/sys-ppc/miscthings.c +--- orig/arch/um/sys-ppc/miscthings.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-ppc/miscthings.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,53 @@ ++#include "linux/threads.h" ++#include "linux/stddef.h" // for NULL ++#include "linux/elf.h" // for AT_NULL ++ ++/* The following function nicked from arch/ppc/kernel/process.c and ++ * adapted slightly */ ++/* ++ * XXX ld.so expects the auxiliary table to start on ++ * a 16-byte boundary, so we have to find it and ++ * move it up. 
:-( ++ */ ++void shove_aux_table(unsigned long sp) ++{ ++ int argc; ++ char *p; ++ unsigned long e; ++ unsigned long aux_start, offset; ++ ++ argc = *(int *)sp; ++ sp += sizeof(int) + (argc + 1) * sizeof(char *); ++ /* skip over the environment pointers */ ++ do { ++ p = *(char **)sp; ++ sp += sizeof(char *); ++ } while (p != NULL); ++ aux_start = sp; ++ /* skip to the end of the auxiliary table */ ++ do { ++ e = *(unsigned long *)sp; ++ sp += 2 * sizeof(unsigned long); ++ } while (e != AT_NULL); ++ offset = ((aux_start + 15) & ~15) - aux_start; ++ if (offset != 0) { ++ do { ++ sp -= sizeof(unsigned long); ++ e = *(unsigned long *)sp; ++ *(unsigned long *)(sp + offset) = e; ++ } while (sp > aux_start); ++ } ++} ++/* END stuff taken from arch/ppc/kernel/process.c */ ++ ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/ptrace.c um/arch/um/sys-ppc/ptrace.c +--- orig/arch/um/sys-ppc/ptrace.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-ppc/ptrace.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,28 @@ ++#include "linux/sched.h" ++#include "asm/ptrace.h" ++ ++int putreg(struct task_struct *child, unsigned long regno, ++ unsigned long value) ++{ ++ child->thread.process_regs.regs[regno >> 2] = value; ++ return 0; ++} ++ ++unsigned long getreg(struct task_struct *child, unsigned long regno) ++{ ++ unsigned long retval = ~0UL; ++ ++ retval &= child->thread.process_regs.regs[regno >> 2]; ++ return retval; ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. 
This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/ptrace_user.c um/arch/um/sys-ppc/ptrace_user.c +--- orig/arch/um/sys-ppc/ptrace_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-ppc/ptrace_user.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,40 @@ ++#include <sys/ptrace.h> ++#include <errno.h> ++#include <asm/ptrace.h> ++#include "sysdep/ptrace.h" ++ ++int ptrace_getregs(long pid, unsigned long *regs_out) ++{ ++ int i; ++ for (i=0; i < sizeof(struct sys_pt_regs)/sizeof(PPC_REG); ++i) { ++ errno = 0; ++ regs_out->regs[i] = ptrace(PTRACE_PEEKUSER, pid, i*4, 0); ++ if (errno) { ++ return -errno; ++ } ++ } ++ return 0; ++} ++ ++int ptrace_setregs(long pid, unsigned long *regs_in) ++{ ++ int i; ++ for (i=0; i < sizeof(struct sys_pt_regs)/sizeof(PPC_REG); ++i) { ++ if (i != 34 /* FIXME: PT_ORIG_R3 */ && i <= PT_MQ) { ++ if (ptrace(PTRACE_POKEUSER, pid, i*4, regs_in->regs[i]) < 0) { ++ return -errno; ++ } ++ } ++ } ++ return 0; ++} ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/sigcontext.c um/arch/um/sys-ppc/sigcontext.c +--- orig/arch/um/sys-ppc/sigcontext.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-ppc/sigcontext.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,15 @@ ++#include "asm/ptrace.h" ++#include "asm/sigcontext.h" ++#include "sysdep/ptrace.h" ++#include "user_util.h" ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/sys-ppc/sysrq.c um/arch/um/sys-ppc/sysrq.c +--- orig/arch/um/sys-ppc/sysrq.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/sys-ppc/sysrq.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,43 @@ ++/* ++ * Copyright (C) 2001 Chris Emerson (cemerson@chiark.greenend.org.uk) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/kernel.h" ++#include "linux/smp.h" ++#include "asm/ptrace.h" ++#include "sysrq.h" ++ ++void show_regs(struct pt_regs_subarch *regs) ++{ ++ printk("\n"); ++ printk("show_regs(): insert regs here.\n"); ++#if 0 ++ printk("\n"); ++ printk("EIP: %04x:[<%08lx>] CPU: %d",0xffff & regs->xcs, regs->eip, ++ smp_processor_id()); ++ if (regs->xcs & 3) ++ printk(" ESP: %04x:%08lx",0xffff & regs->xss, regs->esp); ++ printk(" EFLAGS: %08lx\n", regs->eflags); ++ printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", ++ regs->eax, regs->ebx, regs->ecx, regs->edx); ++ printk("ESI: %08lx EDI: %08lx EBP: %08lx", ++ regs->esi, regs->edi, regs->ebp); ++ printk(" DS: %04x ES: %04x\n", ++ 0xffff & regs->xds, 0xffff & regs->xes); ++#endif ++ ++ show_trace(®s->gpr[1]); ++} ++ ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/arch/um/util/Makefile um/arch/um/util/Makefile +--- orig/arch/um/util/Makefile Wed Dec 31 19:00:00 1969 ++++ um/arch/um/util/Makefile Wed Oct 23 21:09:14 2002 +@@ -0,0 +1,26 @@ ++ALL = mk_task mk_constants ++ ++all : $(ALL) ++ ++mk_task : mk_task_user.o mk_task_kern.o ++ $(CC) -o mk_task mk_task_user.o mk_task_kern.o ++ ++mk_task_user.o : mk_task_user.c ++ $(CC) -c $< ++ ++mk_task_kern.o : mk_task_kern.c ++ $(CC) $(CFLAGS) -c $< ++ ++mk_constants : mk_constants_user.o mk_constants_kern.o ++ $(CC) -o mk_constants mk_constants_user.o mk_constants_kern.o ++ ++mk_constants_user.o : mk_constants_user.c ++ $(CC) -c $< ++ ++mk_constants_kern.o : mk_constants_kern.c ++ $(CC) $(CFLAGS) -c $< ++ ++clean : ++ $(RM) $(ALL) *.o *~ ++ ++archmrproper : clean +diff -Naur -X ../exclude-files orig/arch/um/util/mk_constants_kern.c um/arch/um/util/mk_constants_kern.c +--- orig/arch/um/util/mk_constants_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/util/mk_constants_kern.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,24 @@ ++#include "linux/kernel.h" ++#include "linux/stringify.h" ++#include "asm/page.h" ++ ++extern void print_head(void); ++extern void print_constant_str(char *name, char *value); ++extern void print_constant_int(char *name, int value); ++extern void print_tail(void); ++ ++int main(int argc, char **argv) ++{ ++ print_head(); ++ print_constant_int("UM_KERN_PAGE_SIZE", PAGE_SIZE); ++ print_constant_str("UM_KERN_EMERG", KERN_EMERG); ++ print_constant_str("UM_KERN_ALERT", KERN_ALERT); ++ print_constant_str("UM_KERN_CRIT", KERN_CRIT); ++ print_constant_str("UM_KERN_ERR", KERN_ERR); ++ print_constant_str("UM_KERN_WARNING", KERN_WARNING); ++ print_constant_str("UM_KERN_NOTICE", KERN_NOTICE); ++ print_constant_str("UM_KERN_INFO", KERN_INFO); ++ print_constant_str("UM_KERN_DEBUG", KERN_DEBUG); ++ 
print_tail(); ++ return(0); ++} +diff -Naur -X ../exclude-files orig/arch/um/util/mk_constants_user.c um/arch/um/util/mk_constants_user.c +--- orig/arch/um/util/mk_constants_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/util/mk_constants_user.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,28 @@ ++#include <stdio.h> ++ ++void print_head(void) ++{ ++ printf("/*\n"); ++ printf(" * Generated by mk_constants\n"); ++ printf(" */\n"); ++ printf("\n"); ++ printf("#ifndef __UM_CONSTANTS_H\n"); ++ printf("#define __UM_CONSTANTS_H\n"); ++ printf("\n"); ++} ++ ++void print_constant_str(char *name, char *value) ++{ ++ printf("#define %s \"%s\"\n", name, value); ++} ++ ++void print_constant_int(char *name, int value) ++{ ++ printf("#define %s %d\n", name, value); ++} ++ ++void print_tail(void) ++{ ++ printf("\n"); ++ printf("#endif\n"); ++} +diff -Naur -X ../exclude-files orig/arch/um/util/mk_task_kern.c um/arch/um/util/mk_task_kern.c +--- orig/arch/um/util/mk_task_kern.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/util/mk_task_kern.c Sun Dec 8 21:03:34 2002 +@@ -0,0 +1,17 @@ ++#include "linux/sched.h" ++#include "linux/stddef.h" ++ ++extern void print(char *name, char *type, int offset); ++extern void print_ptr(char *name, char *type, int offset); ++extern void print_head(void); ++extern void print_tail(void); ++ ++int main(int argc, char **argv) ++{ ++ print_head(); ++ print_ptr("TASK_REGS", "union uml_pt_regs", ++ offsetof(struct task_struct, thread.regs)); ++ print("TASK_PID", "int", offsetof(struct task_struct, pid)); ++ print_tail(); ++ return(0); ++} +diff -Naur -X ../exclude-files orig/arch/um/util/mk_task_user.c um/arch/um/util/mk_task_user.c +--- orig/arch/um/util/mk_task_user.c Wed Dec 31 19:00:00 1969 ++++ um/arch/um/util/mk_task_user.c Wed Oct 23 21:08:04 2002 +@@ -0,0 +1,30 @@ ++#include <stdio.h> ++ ++void print(char *name, char *type, int offset) ++{ ++ printf("#define %s(task) *((%s *) &(((char *) (task))[%d]))\n", name, type, ++ offset); ++} ++ ++void print_ptr(char 
*name, char *type, int offset) ++{ ++ printf("#define %s(task) ((%s *) &(((char *) (task))[%d]))\n", name, type, ++ offset); ++} ++ ++void print_head(void) ++{ ++ printf("/*\n"); ++ printf(" * Generated by mk_task\n"); ++ printf(" */\n"); ++ printf("\n"); ++ printf("#ifndef __TASK_H\n"); ++ printf("#define __TASK_H\n"); ++ printf("\n"); ++} ++ ++void print_tail(void) ++{ ++ printf("\n"); ++ printf("#endif\n"); ++} +diff -Naur -X ../exclude-files orig/drivers/char/Makefile um/drivers/char/Makefile +--- orig/drivers/char/Makefile Thu Feb 27 13:04:15 2003 ++++ um/drivers/char/Makefile Thu Feb 27 13:05:21 2003 +@@ -95,6 +95,12 @@ + endif + endif + ++ifeq ($(ARCH),um) ++ KEYMAP = ++ KEYBD = ++ CONSOLE = ++endif ++ + ifeq ($(ARCH),sh) + KEYMAP = + KEYBD = +diff -Naur -X ../exclude-files orig/drivers/char/tty_io.c um/drivers/char/tty_io.c +--- orig/drivers/char/tty_io.c Thu Feb 27 13:04:15 2003 ++++ um/drivers/char/tty_io.c Thu Feb 27 13:05:21 2003 +@@ -637,6 +637,9 @@ + wake_up_interruptible(&tty->write_wait); + } + ++extern int write_tty_log(int fd, const unsigned char *buf, int len, void *tty, ++ int direction); ++ + static ssize_t tty_read(struct file * file, char * buf, size_t count, + loff_t *ppos) + { +@@ -677,8 +680,13 @@ + else + i = -EIO; + unlock_kernel(); +- if (i > 0) ++ if (i > 0){ + inode->i_atime = CURRENT_TIME; ++#ifdef CONFIG_TTY_LOG ++ if(tty->log_fd >= 0) ++ write_tty_log(tty->log_fd, buf, i, tty, 1); ++#endif ++ } + return i; + } + +@@ -732,6 +740,10 @@ + if (written) { + file->f_dentry->d_inode->i_mtime = CURRENT_TIME; + ret = written; ++#ifdef CONFIG_TTY_LOG ++ if(tty->log_fd >= 0) ++ write_tty_log(tty->log_fd, buf - ret, ret, tty, 0); ++#endif + } + up(&tty->atomic_write); + return ret; +@@ -945,6 +957,9 @@ + goto release_mem_out; + } + } ++#ifdef CONFIG_TTY_LOG ++ tty->log_fd = -1; ++#endif + goto success; + + /* +@@ -1039,6 +1054,8 @@ + free_tty_struct(tty); + } + ++extern int close_tty_log(int fd, void *tty); ++ + /* + * Even releasing the tty 
structures is a tricky business.. We have + * to be very careful that the structures are all released at the +@@ -1267,6 +1284,10 @@ + run_task_queue(&tq_timer); + flush_scheduled_tasks(); + ++#ifdef CONFIG_TTY_LOG ++ if(tty->log_fd >= 0) close_tty_log(tty->log_fd, tty); ++#endif ++ + /* + * The release_mem function takes care of the details of clearing + * the slots and preserving the termios structure. +@@ -1274,6 +1295,8 @@ + release_mem(tty, idx); + } + ++extern int open_tty_log(void *tty, void *current_tty); ++ + /* + * tty_open and tty_release keep up the tty count that contains the + * number of opens done on a tty. We cannot use the inode-count, as +@@ -1425,6 +1448,11 @@ + nr_warns++; + } + } ++ ++#ifdef CONFIG_TTY_LOG ++ if(tty->log_fd < 0) ++ tty->log_fd = open_tty_log(tty, current->tty); ++#endif + return 0; + } + +diff -Naur -X ../exclude-files orig/drivers/net/setup.c um/drivers/net/setup.c +--- orig/drivers/net/setup.c Sun Sep 15 12:13:19 2002 ++++ um/drivers/net/setup.c Wed Oct 23 21:08:05 2002 +@@ -28,7 +28,6 @@ + extern int lmc_setup(void); + + extern int madgemc_probe(void); +-extern int uml_net_probe(void); + + /* Pad device name to IFNAMSIZ=16. F.e. __PAD6 is string of 9 zeros. 
*/ + #define __PAD6 "\0\0\0\0\0\0\0\0\0" +@@ -102,9 +101,6 @@ + */ + #ifdef CONFIG_MADGEMC + {madgemc_probe, 0}, +-#endif +-#ifdef CONFIG_UML_NET +- {uml_net_probe, 0}, + #endif + + {NULL, 0}, +diff -Naur -X ../exclude-files orig/include/asm-i386/hardirq.h um/include/asm-i386/hardirq.h +--- orig/include/asm-i386/hardirq.h Sun Sep 15 12:13:19 2002 ++++ um/include/asm-i386/hardirq.h Wed Apr 16 13:59:04 2003 +@@ -4,6 +4,7 @@ + #include <linux/config.h> + #include <linux/threads.h> + #include <linux/irq.h> ++#include <asm/processor.h> /* for cpu_relax */ + + /* assembly code in softirq.h is sensitive to the offsets of these fields */ + typedef struct { +diff -Naur -X ../exclude-files orig/include/asm-um/a.out.h um/include/asm-um/a.out.h +--- orig/include/asm-um/a.out.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/a.out.h Sun Oct 27 11:54:50 2002 +@@ -0,0 +1,20 @@ ++#ifndef __UM_A_OUT_H ++#define __UM_A_OUT_H ++ ++#include "linux/config.h" ++#include "asm/arch/a.out.h" ++#include "choose-mode.h" ++ ++#undef STACK_TOP ++ ++extern unsigned long stacksizelim; ++ ++extern unsigned long host_task_size; ++ ++#define STACK_ROOM (stacksizelim) ++ ++extern int honeypot; ++#define STACK_TOP \ ++ CHOOSE_MODE((honeypot ? host_task_size : task_size), task_size) ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/arch-signal-i386.h um/include/asm-um/arch-signal-i386.h +--- orig/include/asm-um/arch-signal-i386.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/arch-signal-i386.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,24 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_ARCH_SIGNAL_I386_H ++#define __UM_ARCH_SIGNAL_I386_H ++ ++struct arch_signal_context { ++ unsigned long extrasigs[_NSIG_WORDS]; ++}; ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. 
This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/archparam-i386.h um/include/asm-um/archparam-i386.h +--- orig/include/asm-um/archparam-i386.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/archparam-i386.h Sun Dec 8 20:09:11 2002 +@@ -0,0 +1,80 @@ ++/* ++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_ARCHPARAM_I386_H ++#define __UM_ARCHPARAM_I386_H ++ ++/********* Bits for asm-um/elf.h ************/ ++ ++#include "user.h" ++ ++#define ELF_PLATFORM "i586" ++ ++#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) ++ ++typedef struct user_i387_struct elf_fpregset_t; ++typedef unsigned long elf_greg_t; ++ ++#define ELF_NGREG (sizeof (struct user_regs_struct) / sizeof(elf_greg_t)) ++typedef elf_greg_t elf_gregset_t[ELF_NGREG]; ++ ++#define ELF_DATA ELFDATA2LSB ++#define ELF_ARCH EM_386 ++ ++#define ELF_PLAT_INIT(regs) do { \ ++ PT_REGS_EBX(regs) = 0; \ ++ PT_REGS_ECX(regs) = 0; \ ++ PT_REGS_EDX(regs) = 0; \ ++ PT_REGS_ESI(regs) = 0; \ ++ PT_REGS_EDI(regs) = 0; \ ++ PT_REGS_EBP(regs) = 0; \ ++ PT_REGS_EAX(regs) = 0; \ ++} while(0) ++ ++/* Shamelessly stolen from include/asm-i386/elf.h */ ++ ++#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \ ++ pr_reg[0] = PT_REGS_EBX(regs); \ ++ pr_reg[1] = PT_REGS_ECX(regs); \ ++ pr_reg[2] = PT_REGS_EDX(regs); \ ++ pr_reg[3] = PT_REGS_ESI(regs); \ ++ pr_reg[4] = PT_REGS_EDI(regs); \ ++ pr_reg[5] = PT_REGS_EBP(regs); \ ++ pr_reg[6] = PT_REGS_EAX(regs); \ ++ pr_reg[7] = PT_REGS_DS(regs); \ ++ pr_reg[8] = PT_REGS_ES(regs); \ ++ /* fake once used fs and gs selectors? 
*/ \ ++ pr_reg[9] = PT_REGS_DS(regs); \ ++ pr_reg[10] = PT_REGS_DS(regs); \ ++ pr_reg[11] = PT_REGS_SYSCALL_NR(regs); \ ++ pr_reg[12] = PT_REGS_IP(regs); \ ++ pr_reg[13] = PT_REGS_CS(regs); \ ++ pr_reg[14] = PT_REGS_EFLAGS(regs); \ ++ pr_reg[15] = PT_REGS_SP(regs); \ ++ pr_reg[16] = PT_REGS_SS(regs); \ ++} while(0); ++ ++/********* Bits for asm-um/delay.h **********/ ++ ++typedef unsigned long um_udelay_t; ++ ++/********* Nothing for asm-um/hardirq.h **********/ ++ ++/********* Nothing for asm-um/hw_irq.h **********/ ++ ++/********* Nothing for asm-um/string.h **********/ ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/archparam-ppc.h um/include/asm-um/archparam-ppc.h +--- orig/include/asm-um/archparam-ppc.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/archparam-ppc.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,41 @@ ++#ifndef __UM_ARCHPARAM_PPC_H ++#define __UM_ARCHPARAM_PPC_H ++ ++/********* Bits for asm-um/elf.h ************/ ++ ++#define ELF_PLATFORM (0) ++ ++#define ELF_ET_DYN_BASE (0x08000000) ++ ++/* the following stolen from asm-ppc/elf.h */ ++#define ELF_NGREG 48 /* includes nip, msr, lr, etc. 
*/ ++#define ELF_NFPREG 33 /* includes fpscr */ ++/* General registers */ ++typedef unsigned long elf_greg_t; ++typedef elf_greg_t elf_gregset_t[ELF_NGREG]; ++ ++/* Floating point registers */ ++typedef double elf_fpreg_t; ++typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; ++ ++#define ELF_DATA ELFDATA2MSB ++#define ELF_ARCH EM_PPC ++ ++/********* Bits for asm-um/delay.h **********/ ++ ++typedef unsigned int um_udelay_t; ++ ++/********* Bits for asm-um/hw_irq.h **********/ ++ ++struct hw_interrupt_type; ++ ++/********* Bits for asm-um/hardirq.h **********/ ++ ++#define irq_enter(cpu, irq) hardirq_enter(cpu) ++#define irq_exit(cpu, irq) hardirq_exit(cpu) ++ ++/********* Bits for asm-um/string.h **********/ ++ ++#define __HAVE_ARCH_STRRCHR ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/atomic.h um/include/asm-um/atomic.h +--- orig/include/asm-um/atomic.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/atomic.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_ATOMIC_H ++#define __UM_ATOMIC_H ++ ++#include "asm/arch/atomic.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/bitops.h um/include/asm-um/bitops.h +--- orig/include/asm-um/bitops.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/bitops.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_BITOPS_H ++#define __UM_BITOPS_H ++ ++#include "asm/arch/bitops.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/boot.h um/include/asm-um/boot.h +--- orig/include/asm-um/boot.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/boot.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_BOOT_H ++#define __UM_BOOT_H ++ ++#include "asm/arch/boot.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/bugs.h um/include/asm-um/bugs.h +--- orig/include/asm-um/bugs.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/bugs.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_BUGS_H ++#define __UM_BUGS_H ++ ++void check_bugs(void); ++ ++#endif +diff -Naur 
-X ../exclude-files orig/include/asm-um/byteorder.h um/include/asm-um/byteorder.h +--- orig/include/asm-um/byteorder.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/byteorder.h Thu Feb 27 13:20:12 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_BYTEORDER_H ++#define __UM_BYTEORDER_H ++ ++#include "asm/arch/byteorder.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/cache.h um/include/asm-um/cache.h +--- orig/include/asm-um/cache.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/cache.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_CACHE_H ++#define __UM_CACHE_H ++ ++#define L1_CACHE_BYTES 32 ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/checksum.h um/include/asm-um/checksum.h +--- orig/include/asm-um/checksum.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/checksum.h Tue Oct 29 17:25:12 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_CHECKSUM_H ++#define __UM_CHECKSUM_H ++ ++#include "sysdep/checksum.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/cobalt.h um/include/asm-um/cobalt.h +--- orig/include/asm-um/cobalt.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/cobalt.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_COBALT_H ++#define __UM_COBALT_H ++ ++#include "asm/arch/cobalt.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/current.h um/include/asm-um/current.h +--- orig/include/asm-um/current.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/current.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,34 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_CURRENT_H ++#define __UM_CURRENT_H ++ ++#ifndef __ASSEMBLY__ ++ ++#include "linux/config.h" ++#include "asm/page.h" ++ ++struct task_struct; ++ ++#define CURRENT_TASK(dummy) (((unsigned long) &dummy) & \ ++ (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER)) ++ ++#define current ({ int dummy; (struct task_struct *) CURRENT_TASK(dummy); }) ++ ++#endif /* __ASSEMBLY__ */ ++ ++#endif ++ ++/* ++ * 
Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/delay.h um/include/asm-um/delay.h +--- orig/include/asm-um/delay.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/delay.h Sun Dec 8 20:09:15 2002 +@@ -0,0 +1,7 @@ ++#ifndef __UM_DELAY_H ++#define __UM_DELAY_H ++ ++#include "asm/arch/delay.h" ++#include "asm/archparam.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/desc.h um/include/asm-um/desc.h +--- orig/include/asm-um/desc.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/desc.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_DESC_H ++#define __UM_DESC_H ++ ++#include "asm/arch/desc.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/div64.h um/include/asm-um/div64.h +--- orig/include/asm-um/div64.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/div64.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef _UM_DIV64_H ++#define _UM_DIV64_H ++ ++#include "asm/arch/div64.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/dma.h um/include/asm-um/dma.h +--- orig/include/asm-um/dma.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/dma.h Sun Oct 27 16:53:42 2002 +@@ -0,0 +1,10 @@ ++#ifndef __UM_DMA_H ++#define __UM_DMA_H ++ ++#include "asm/io.h" ++ ++extern unsigned long uml_physmem; ++ ++#define MAX_DMA_ADDRESS (uml_physmem) ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/elf.h um/include/asm-um/elf.h +--- orig/include/asm-um/elf.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/elf.h Sun Dec 8 20:13:07 2002 +@@ -0,0 +1,18 @@ ++#ifndef __UM_ELF_H ++#define __UM_ELF_H ++ ++#include "asm/archparam.h" ++ ++#define ELF_HWCAP (0) 
++ ++#define SET_PERSONALITY(ex, ibcs2) do ; while(0) ++ ++#define ELF_EXEC_PAGESIZE 4096 ++ ++#define elf_check_arch(x) (1) ++ ++#define ELF_CLASS ELFCLASS32 ++ ++#define USE_ELF_CORE_DUMP ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/errno.h um/include/asm-um/errno.h +--- orig/include/asm-um/errno.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/errno.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_ERRNO_H ++#define __UM_ERRNO_H ++ ++#include "asm/arch/errno.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/fcntl.h um/include/asm-um/fcntl.h +--- orig/include/asm-um/fcntl.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/fcntl.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_FCNTL_H ++#define __UM_FCNTL_H ++ ++#include "asm/arch/fcntl.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/fixmap.h um/include/asm-um/fixmap.h +--- orig/include/asm-um/fixmap.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/fixmap.h Wed Mar 26 22:01:27 2003 +@@ -0,0 +1,89 @@ ++#ifndef __UM_FIXMAP_H ++#define __UM_FIXMAP_H ++ ++#include <linux/config.h> ++#include <asm/kmap_types.h> ++ ++/* ++ * Here we define all the compile-time 'special' virtual ++ * addresses. The point is to have a constant address at ++ * compile time, but to set the physical address only ++ * in the boot process. We allocate these special addresses ++ * from the end of virtual memory (0xfffff000) backwards. ++ * Also this lets us do fail-safe vmalloc(), we ++ * can guarantee that these special addresses and ++ * vmalloc()-ed addresses never overlap. ++ * ++ * these 'compile-time allocated' memory buffers are ++ * fixed-size 4k pages. (or larger if used with an increment ++ * higher than 1) use set_fixmap(idx,phys) to associate ++ * physical memory with fixmap indices. ++ * ++ * TLB entries of such buffers will not be flushed across ++ * task switches.
++ */ ++ ++/* ++ * on UP currently we will have no trace of the fixmap mechanism, ++ * no page table allocations, etc. This might change in the ++ * future, say framebuffers for the console driver(s) could be ++ * fix-mapped? ++ */ ++enum fixed_addresses { ++#ifdef CONFIG_HIGHMEM ++ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ ++ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, ++#endif ++ __end_of_fixed_addresses ++}; ++ ++extern void __set_fixmap (enum fixed_addresses idx, ++ unsigned long phys, pgprot_t flags); ++ ++#define set_fixmap(idx, phys) \ ++ __set_fixmap(idx, phys, PAGE_KERNEL) ++/* ++ * Some hardware wants to get fixmapped without caching. ++ */ ++#define set_fixmap_nocache(idx, phys) \ ++ __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) ++/* ++ * used by vmalloc.c. ++ * ++ * Leave one empty page between vmalloc'ed areas and ++ * the start of the fixmap, and leave one page empty ++ * at the top of mem.. ++ */ ++extern unsigned long get_kmem_end(void); ++ ++#define FIXADDR_TOP (get_kmem_end() - 0x2000) ++#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) ++#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) ++ ++#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) ++ ++extern void __this_fixmap_does_not_exist(void); ++ ++/* ++ * 'index to address' translation. If anyone tries to use the idx ++ * directly without translation, we catch the bug with a NULL-dereference ++ * kernel oops. Illegal ranges of incoming indices are caught too. ++ */ ++static inline unsigned long fix_to_virt(const unsigned int idx) ++{ ++ /* ++ * this branch gets completely eliminated after inlining, ++ * except when someone tries to use fixaddr indices in an ++ * illegal way. (such as mixing up address types or using ++ * out-of-range indices). ++ * ++ * If it doesn't get removed, the linker will complain ++ * loudly with a reasonably clear error message..
++ */ ++ if (idx >= __end_of_fixed_addresses) ++ __this_fixmap_does_not_exist(); ++ ++ return __fix_to_virt(idx); ++} ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/floppy.h um/include/asm-um/floppy.h +--- orig/include/asm-um/floppy.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/floppy.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_FLOPPY_H ++#define __UM_FLOPPY_H ++ ++#include "asm/arch/floppy.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/hardirq.h um/include/asm-um/hardirq.h +--- orig/include/asm-um/hardirq.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/hardirq.h Wed Apr 16 13:59:04 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_HARDIRQ_H ++#define __UM_HARDIRQ_H ++ ++#include "asm/arch/hardirq.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/hdreg.h um/include/asm-um/hdreg.h +--- orig/include/asm-um/hdreg.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/hdreg.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_HDREG_H ++#define __UM_HDREG_H ++ ++#include "asm/arch/hdreg.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/highmem.h um/include/asm-um/highmem.h +--- orig/include/asm-um/highmem.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/highmem.h Wed Apr 16 13:59:04 2003 +@@ -0,0 +1,12 @@ ++#ifndef __UM_HIGHMEM_H ++#define __UM_HIGHMEM_H ++ ++#include "asm/page.h" ++#include "asm/fixmap.h" ++#include "asm/arch/highmem.h" ++ ++#undef PKMAP_BASE ++ ++#define PKMAP_BASE ((FIXADDR_START - LAST_PKMAP * PAGE_SIZE) & PMD_MASK) ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/hw_irq.h um/include/asm-um/hw_irq.h +--- orig/include/asm-um/hw_irq.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/hw_irq.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,10 @@ ++#ifndef _ASM_UM_HW_IRQ_H ++#define _ASM_UM_HW_IRQ_H ++ ++#include "asm/irq.h" ++#include "asm/archparam.h" ++ ++static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i) ++{} ++ ++#endif +diff -Naur -X 
../exclude-files orig/include/asm-um/ide.h um/include/asm-um/ide.h +--- orig/include/asm-um/ide.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/ide.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_IDE_H ++#define __UM_IDE_H ++ ++#include "asm/arch/ide.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/init.h um/include/asm-um/init.h +--- orig/include/asm-um/init.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/init.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,11 @@ ++#ifndef _UM_INIT_H ++#define _UM_INIT_H ++ ++#ifdef notdef ++#define __init ++#define __initdata ++#define __initfunc(__arginit) __arginit ++#define __cacheline_aligned ++#endif ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/io.h um/include/asm-um/io.h +--- orig/include/asm-um/io.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/io.h Sun Oct 27 16:53:42 2002 +@@ -0,0 +1,25 @@ ++#ifndef __UM_IO_H ++#define __UM_IO_H ++ ++#include "asm/page.h" ++ ++#define IO_SPACE_LIMIT 0xdeadbeef /* Sure hope nothing uses this */ ++ ++static inline int inb(unsigned long i) { return(0); } ++static inline void outb(char c, unsigned long i) { } ++ ++/* ++ * Change virtual addresses to physical addresses and vv. 
++ * These are pretty trivial ++ */ ++static inline unsigned long virt_to_phys(volatile void * address) ++{ ++ return __pa((void *) address); ++} ++ ++static inline void * phys_to_virt(unsigned long address) ++{ ++ return __va(address); ++} ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/ioctl.h um/include/asm-um/ioctl.h +--- orig/include/asm-um/ioctl.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/ioctl.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_IOCTL_H ++#define __UM_IOCTL_H ++ ++#include "asm/arch/ioctl.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/ioctls.h um/include/asm-um/ioctls.h +--- orig/include/asm-um/ioctls.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/ioctls.h Wed Oct 23 21:11:14 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_IOCTLS_H ++#define __UM_IOCTLS_H ++ ++#include "asm/arch/ioctls.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/ipc.h um/include/asm-um/ipc.h +--- orig/include/asm-um/ipc.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/ipc.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_IPC_H ++#define __UM_IPC_H ++ ++#include "asm/arch/ipc.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/ipcbuf.h um/include/asm-um/ipcbuf.h +--- orig/include/asm-um/ipcbuf.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/ipcbuf.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_IPCBUF_H ++#define __UM_IPCBUF_H ++ ++#include "asm/arch/ipcbuf.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/irq.h um/include/asm-um/irq.h +--- orig/include/asm-um/irq.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/irq.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,36 @@ ++#ifndef __UM_IRQ_H ++#define __UM_IRQ_H ++ ++/* The i386 irq.h has a struct task_struct in a prototype without including ++ * sched.h. This forward declaration kills the resulting warning. 
++ */ ++struct task_struct; ++ ++#include "asm/arch/irq.h" ++#include "asm/ptrace.h" ++ ++#undef NR_IRQS ++ ++#define TIMER_IRQ 0 ++#define UMN_IRQ 1 ++#define CONSOLE_IRQ 2 ++#define CONSOLE_WRITE_IRQ 3 ++#define UBD_IRQ 4 ++#define UM_ETH_IRQ 5 ++#define SSL_IRQ 6 ++#define SSL_WRITE_IRQ 7 ++#define ACCEPT_IRQ 8 ++#define MCONSOLE_IRQ 9 ++#define WINCH_IRQ 10 ++#define SIGIO_WRITE_IRQ 11 ++#define TELNETD_IRQ 12 ++#define XTERM_IRQ 13 ++ ++#define LAST_IRQ XTERM_IRQ ++#define NR_IRQS (LAST_IRQ + 1) ++ ++extern int um_request_irq(unsigned int irq, int fd, int type, ++ void (*handler)(int, void *, struct pt_regs *), ++ unsigned long irqflags, const char * devname, ++ void *dev_id); ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/keyboard.h um/include/asm-um/keyboard.h +--- orig/include/asm-um/keyboard.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/keyboard.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_KEYBOARD_H ++#define __UM_KEYBOARD_H ++ ++#include "asm/arch/keyboard.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/kmap_types.h um/include/asm-um/kmap_types.h +--- orig/include/asm-um/kmap_types.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/kmap_types.h Thu Feb 27 13:20:14 2003 +@@ -0,0 +1,11 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_KMAP_TYPES_H ++#define __UM_KMAP_TYPES_H ++ ++#include "asm/arch/kmap_types.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/linux_logo.h um/include/asm-um/linux_logo.h +--- orig/include/asm-um/linux_logo.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/linux_logo.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_LINUX_LOGO_H ++#define __UM_LINUX_LOGO_H ++ ++#include "asm/arch/linux_logo.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/locks.h um/include/asm-um/locks.h +--- orig/include/asm-um/locks.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/locks.h Wed Oct 23 
21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_LOCKS_H ++#define __UM_LOCKS_H ++ ++#include "asm/arch/locks.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/mca_dma.h um/include/asm-um/mca_dma.h +--- orig/include/asm-um/mca_dma.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/mca_dma.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef mca___UM_DMA_H ++#define mca___UM_DMA_H ++ ++#include "asm/arch/mca_dma.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/mman.h um/include/asm-um/mman.h +--- orig/include/asm-um/mman.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/mman.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_MMAN_H ++#define __UM_MMAN_H ++ ++#include "asm/arch/mman.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/mmu.h um/include/asm-um/mmu.h +--- orig/include/asm-um/mmu.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/mmu.h Sat Nov 9 12:51:11 2002 +@@ -0,0 +1,22 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __MMU_H ++#define __MMU_H ++ ++#include "um_mmu.h" ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/mmu_context.h um/include/asm-um/mmu_context.h +--- orig/include/asm-um/mmu_context.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/mmu_context.h Wed Apr 16 13:59:16 2003 +@@ -0,0 +1,72 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_MMU_CONTEXT_H ++#define __UM_MMU_CONTEXT_H ++ ++#include "linux/sched.h" ++#include "choose-mode.h" ++ ++#define get_mmu_context(task) do ; while(0) ++#define activate_context(tsk) do ; while(0) ++ ++static inline void activate_mm(struct mm_struct *old, struct mm_struct *new) ++{ ++} ++ ++extern void switch_mm_skas(int mm_fd); ++ ++static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, ++ struct task_struct *tsk, unsigned cpu) ++{ ++ if(prev != next){ ++ clear_bit(cpu, &prev->cpu_vm_mask); ++ set_bit(cpu, &next->cpu_vm_mask); ++ if(next != &init_mm) ++ CHOOSE_MODE((void) 0, ++ switch_mm_skas(next->context.skas.mm_fd)); ++ } ++} ++ ++static inline void enter_lazy_tlb(struct mm_struct *mm, ++ struct task_struct *tsk, unsigned cpu) ++{ ++} ++ ++extern int init_new_context_skas(struct task_struct *task, ++ struct mm_struct *mm); ++ ++static inline int init_new_context_tt(struct task_struct *task, ++ struct mm_struct *mm) ++{ ++ return(0); ++} ++ ++static inline int init_new_context(struct task_struct *task, ++ struct mm_struct *mm) ++{ ++ return(CHOOSE_MODE_PROC(init_new_context_tt, init_new_context_skas, ++ task, mm)); ++} ++ ++extern void destroy_context_skas(struct mm_struct *mm); ++ ++static inline void destroy_context(struct mm_struct *mm) ++{ ++ CHOOSE_MODE((void) 0, destroy_context_skas(mm)); ++} ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/module.h um/include/asm-um/module.h +--- orig/include/asm-um/module.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/module.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_MODULE_H ++#define __UM_MODULE_H ++ ++#include "asm/arch/module.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/msgbuf.h um/include/asm-um/msgbuf.h +--- orig/include/asm-um/msgbuf.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/msgbuf.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_MSGBUF_H ++#define __UM_MSGBUF_H ++ ++#include "asm/arch/msgbuf.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/mtrr.h um/include/asm-um/mtrr.h +--- orig/include/asm-um/mtrr.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/mtrr.h Thu Mar 27 15:11:56 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_MTRR_H ++#define __UM_MTRR_H ++ ++#include "asm/arch/mtrr.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/namei.h um/include/asm-um/namei.h +--- orig/include/asm-um/namei.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/namei.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_NAMEI_H ++#define __UM_NAMEI_H ++ ++#include "asm/arch/namei.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/page.h um/include/asm-um/page.h +--- orig/include/asm-um/page.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/page.h Sun Oct 27 16:49:35 2002 +@@ -0,0 +1,53 @@ ++#ifndef __UM_PAGE_H ++#define __UM_PAGE_H ++ ++struct page; ++ ++#include "asm/arch/page.h" ++ ++#undef BUG ++#undef PAGE_BUG ++#undef __pa ++#undef __va ++#undef virt_to_page ++#undef VALID_PAGE ++#undef PAGE_OFFSET ++#undef 
KERNELBASE ++ ++extern unsigned long uml_physmem; ++ ++#define PAGE_OFFSET (uml_physmem) ++#define KERNELBASE PAGE_OFFSET ++ ++#ifndef __ASSEMBLY__ ++ ++extern void stop(void); ++ ++#define BUG() do { \ ++ panic("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \ ++} while (0) ++ ++#define PAGE_BUG(page) do { \ ++ BUG(); \ ++} while (0) ++ ++#endif /* __ASSEMBLY__ */ ++ ++#define __va_space (8*1024*1024) ++ ++extern unsigned long region_pa(void *virt); ++extern void *region_va(unsigned long phys); ++ ++#define __pa(virt) region_pa((void *) (virt)) ++#define __va(phys) region_va((unsigned long) (phys)) ++ ++extern struct page *page_mem_map(struct page *page); ++ ++extern struct page *pfn_to_page(unsigned long pfn); ++ ++#define VALID_PAGE(page) (page_mem_map(page) != NULL) ++ ++extern struct page *arch_validate(struct page *page, int mask, int order); ++#define HAVE_ARCH_VALIDATE ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/page_offset.h um/include/asm-um/page_offset.h +--- orig/include/asm-um/page_offset.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/page_offset.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1 @@ ++#define PAGE_OFFSET_RAW (uml_physmem) +diff -Naur -X ../exclude-files orig/include/asm-um/param.h um/include/asm-um/param.h +--- orig/include/asm-um/param.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/param.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,24 @@ ++#ifndef _UM_PARAM_H ++#define _UM_PARAM_H ++ ++#ifndef HZ ++#define HZ 52 ++#endif ++ ++#define EXEC_PAGESIZE 4096 ++ ++#ifndef NGROUPS ++#define NGROUPS 32 ++#endif ++ ++#ifndef NOGROUP ++#define NOGROUP (-1) ++#endif ++ ++#define MAXHOSTNAMELEN 64 /* max length of hostname */ ++ ++#ifdef __KERNEL__ ++# define CLOCKS_PER_SEC 100 /* frequency at which times() counts */ ++#endif ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/pci.h um/include/asm-um/pci.h +--- orig/include/asm-um/pci.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/pci.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 
@@ ++#ifndef __UM_PCI_H ++#define __UM_PCI_H ++ ++#define PCI_DMA_BUS_IS_PHYS (1) ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/pgalloc.h um/include/asm-um/pgalloc.h +--- orig/include/asm-um/pgalloc.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/pgalloc.h Wed Apr 16 13:59:04 2003 +@@ -0,0 +1,162 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Derived from include/asm-i386/pgalloc.h and include/asm-i386/pgtable.h ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_PGALLOC_H ++#define __UM_PGALLOC_H ++ ++#include "linux/config.h" ++#include "linux/mm.h" ++#include "asm/fixmap.h" ++#include "choose-mode.h" ++ ++#define pgd_quicklist (current_cpu_data.pgd_quick) ++#define pmd_quicklist (current_cpu_data.pmd_quick) ++#define pte_quicklist (current_cpu_data.pte_quick) ++#define pgtable_cache_size (current_cpu_data.pgtable_cache_sz) ++ ++#define pmd_populate(mm, pmd, pte) set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))) ++ ++/* ++ * Allocate and free page tables. 
++ */ ++ ++static inline pgd_t *get_pgd_slow_tt(void) ++{ ++ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); ++ ++ if (pgd) { ++ memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); ++ memcpy(pgd + USER_PTRS_PER_PGD, ++ swapper_pg_dir + USER_PTRS_PER_PGD, ++ (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); ++ } ++ return pgd; ++} ++ ++static inline pgd_t *get_pgd_slow_skas(void) ++{ ++ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); ++ ++ if (pgd) ++ memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); ++ return pgd; ++} ++ ++static inline pgd_t *get_pgd_slow(void) ++{ ++ return(CHOOSE_MODE(get_pgd_slow_tt(), get_pgd_slow_skas())); ++} ++ ++static inline pgd_t *get_pgd_fast(void) ++{ ++ unsigned long *ret; ++ ++ if ((ret = pgd_quicklist) != NULL) { ++ pgd_quicklist = (unsigned long *)(*ret); ++ ret[0] = 0; ++ pgtable_cache_size--; ++ } else ++ ret = (unsigned long *)get_pgd_slow(); ++ return (pgd_t *)ret; ++} ++ ++static inline void free_pgd_fast(pgd_t *pgd) ++{ ++ *(unsigned long *)pgd = (unsigned long) pgd_quicklist; ++ pgd_quicklist = (unsigned long *) pgd; ++ pgtable_cache_size++; ++} ++ ++static inline void free_pgd_slow(pgd_t *pgd) ++{ ++ free_page((unsigned long)pgd); ++} ++ ++static inline pte_t *pte_alloc_one(struct mm_struct *mm, unsigned long address) ++{ ++ pte_t *pte; ++ ++ pte = (pte_t *) __get_free_page(GFP_KERNEL); ++ if (pte) ++ clear_page(pte); ++ return pte; ++} ++ ++static inline pte_t *pte_alloc_one_fast(struct mm_struct *mm, unsigned long address) ++{ ++ unsigned long *ret; ++ ++ if ((ret = (unsigned long *)pte_quicklist) != NULL) { ++ pte_quicklist = (unsigned long *)(*ret); ++ ret[0] = ret[1]; ++ pgtable_cache_size--; ++ } ++ return (pte_t *)ret; ++} ++ ++static inline void pte_free_fast(pte_t *pte) ++{ ++ *(unsigned long *)pte = (unsigned long) pte_quicklist; ++ pte_quicklist = (unsigned long *) pte; ++ pgtable_cache_size++; ++} ++ ++static inline void pte_free_slow(pte_t *pte) ++{ ++ free_page((unsigned long)pte); ++} ++ ++#define 
pte_free(pte) pte_free_fast(pte) ++#define pgd_free(pgd) free_pgd_slow(pgd) ++#define pgd_alloc(mm) get_pgd_fast() ++ ++/* ++ * allocating and freeing a pmd is trivial: the 1-entry pmd is ++ * inside the pgd, so has no extra memory associated with it. ++ */ ++ ++#define pmd_alloc_one_fast(mm, addr) ({ BUG(); ((pmd_t *)1); }) ++#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) ++#define pmd_free_slow(x) do { } while (0) ++#define pmd_free_fast(x) do { } while (0) ++#define pmd_free(x) do { } while (0) ++#define pgd_populate(mm, pmd, pte) BUG() ++ ++/* ++ * TLB flushing: ++ * ++ * - flush_tlb() flushes the current mm struct TLBs ++ * - flush_tlb_all() flushes all processes TLBs ++ * - flush_tlb_mm(mm) flushes the specified mm context TLB's ++ * - flush_tlb_page(vma, vmaddr) flushes one page ++ * - flush_tlb_kernel_vm() flushes the kernel vm area ++ * - flush_tlb_range(mm, start, end) flushes a range of pages ++ * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables ++ */ ++ ++extern void flush_tlb_all(void); ++extern void flush_tlb_mm(struct mm_struct *mm); ++extern void flush_tlb_range(struct mm_struct *mm, unsigned long start, ++ unsigned long end); ++extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); ++extern void flush_tlb_kernel_vm(void); ++ ++static inline void flush_tlb_pgtables(struct mm_struct *mm, ++ unsigned long start, unsigned long end) ++{ ++} ++ ++#endif ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/pgtable.h um/include/asm-um/pgtable.h +--- orig/include/asm-um/pgtable.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/pgtable.h Wed Apr 16 13:59:04 2003 +@@ -0,0 +1,428 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Derived from include/asm-i386/pgtable.h ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_PGTABLE_H ++#define __UM_PGTABLE_H ++ ++#include "linux/sched.h" ++#include "asm/processor.h" ++#include "asm/page.h" ++ ++extern pgd_t swapper_pg_dir[1024]; ++ ++#define flush_cache_all() do ; while (0) ++#define flush_cache_mm(mm) do ; while (0) ++#define flush_cache_range(vma, start, end) do ; while (0) ++#define flush_cache_page(vma, vmaddr) do ; while (0) ++#define flush_page_to_ram(page) do ; while (0) ++#define flush_dcache_page(page) do ; while (0) ++#define flush_icache_range(from, to) do ; while (0) ++#define flush_icache_page(vma,pg) do ; while (0) ++#define flush_icache_user_range(vma,pg,adr,len) do ; while (0) ++ ++extern void __flush_tlb_one(unsigned long addr); ++ ++extern void pte_free(pte_t *pte); ++ ++extern void pgd_free(pgd_t *pgd); ++ ++extern int do_check_pgt_cache(int, int); ++ ++extern void *um_virt_to_phys(struct task_struct *task, unsigned long virt, ++ pte_t *pte_out); ++ ++/* zero page used for uninitialized stuff */ ++extern unsigned long *empty_zero_page; ++ ++#define pgtable_cache_init() do ; while (0) ++ ++/* PMD_SHIFT determines the size of the area a second-level page table can map */ ++#define PMD_SHIFT 22 ++#define PMD_SIZE (1UL << PMD_SHIFT) ++#define PMD_MASK (~(PMD_SIZE-1)) ++ ++/* PGDIR_SHIFT determines what a third-level page table entry can map */ ++#define PGDIR_SHIFT 22 ++#define PGDIR_SIZE (1UL << PGDIR_SHIFT) ++#define PGDIR_MASK (~(PGDIR_SIZE-1)) ++ ++/* ++ * entries per page directory 
level: the i386 is two-level, so ++ * we don't really have any PMD directory physically. ++ */ ++#define PTRS_PER_PTE 1024 ++#define PTRS_PER_PMD 1 ++#define PTRS_PER_PGD 1024 ++#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) ++#define FIRST_USER_PGD_NR 0 ++ ++#define pte_ERROR(e) \ ++ printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) ++#define pmd_ERROR(e) \ ++ printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) ++#define pgd_ERROR(e) \ ++ printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) ++ ++/* ++ * pgd entries used up by user/kernel: ++ */ ++ ++#define USER_PGD_PTRS (TASK_SIZE >> PGDIR_SHIFT) ++#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS) ++ ++#ifndef __ASSEMBLY__ ++/* Just any arbitrary offset to the start of the vmalloc VM area: the ++ * current 8MB value just means that there will be a 8MB "hole" after the ++ * physical memory until the kernel virtual memory starts. That means that ++ * any out-of-bounds memory accesses will hopefully be caught. ++ * The vmalloc() routines leaves a hole of 4kB between each vmalloced ++ * area for the same reason. 
;) ++ */ ++ ++extern unsigned long high_physmem; ++ ++#define VMALLOC_OFFSET (__va_space) ++#define VMALLOC_START (((unsigned long) high_physmem + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)) ++#define VMALLOC_VMADDR(x) ((unsigned long)(x)) ++ ++#ifdef CONFIG_HIGHMEM ++# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) ++#else ++# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) ++#endif ++ ++#define _PAGE_PRESENT 0x001 ++#define _PAGE_NEWPAGE 0x002 ++#define _PAGE_PROTNONE 0x004 /* If not present */ ++#define _PAGE_RW 0x008 ++#define _PAGE_USER 0x010 ++#define _PAGE_ACCESSED 0x020 ++#define _PAGE_DIRTY 0x040 ++#define _PAGE_NEWPROT 0x080 ++ ++#define REGION_MASK 0xf0000000 ++#define REGION_SHIFT 28 ++ ++#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY) ++#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) ++#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY) ++ ++#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) ++#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED) ++#define PAGE_COPY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) ++#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED) ++#define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED) ++#define PAGE_KERNEL_RO __pgprot(_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED) ++ ++/* ++ * The i386 can't do page protection for execute, and considers that the same as read. ++ * Also, write permissions imply read permissions. This is the closest we can get..
++ */ ++#define __P000 PAGE_NONE ++#define __P001 PAGE_READONLY ++#define __P010 PAGE_COPY ++#define __P011 PAGE_COPY ++#define __P100 PAGE_READONLY ++#define __P101 PAGE_READONLY ++#define __P110 PAGE_COPY ++#define __P111 PAGE_COPY ++ ++#define __S000 PAGE_NONE ++#define __S001 PAGE_READONLY ++#define __S010 PAGE_SHARED ++#define __S011 PAGE_SHARED ++#define __S100 PAGE_READONLY ++#define __S101 PAGE_READONLY ++#define __S110 PAGE_SHARED ++#define __S111 PAGE_SHARED ++ ++/* ++ * Define this if things work differently on an i386 and an i486: ++ * it will (on an i486) warn about kernel memory accesses that are ++ * done without a 'verify_area(VERIFY_WRITE,..)' ++ */ ++#undef TEST_VERIFY_AREA ++ ++/* page table for 0-4MB for everybody */ ++extern unsigned long pg0[1024]; ++ ++/* ++ * BAD_PAGETABLE is used when we need a bogus page-table, while ++ * BAD_PAGE is used for a bogus page. ++ * ++ * ZERO_PAGE is a global shared page that is always zero: used ++ * for zero-mapped memory areas etc.. ++ */ ++extern pte_t __bad_page(void); ++extern pte_t * __bad_pagetable(void); ++ ++#define BAD_PAGETABLE __bad_pagetable() ++#define BAD_PAGE __bad_page() ++#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) ++ ++/* number of bits that fit into a memory pointer */ ++#define BITS_PER_PTR (8*sizeof(unsigned long)) ++ ++/* to align the pointer to a pointer address */ ++#define PTR_MASK (~(sizeof(void*)-1)) ++ ++/* sizeof(void*)==1<<SIZEOF_PTR_LOG2 */ ++/* 64-bit machines, beware! SRB. 
*/ ++#define SIZEOF_PTR_LOG2 2 ++ ++/* to find an entry in a page-table */ ++#define PAGE_PTR(address) \ ++((unsigned long)(address)>>(PAGE_SHIFT-SIZEOF_PTR_LOG2)&PTR_MASK&~PAGE_MASK) ++ ++#define pte_none(x) (!(pte_val(x) & ~_PAGE_NEWPAGE)) ++#define pte_present(x) (pte_val(x) & (_PAGE_PRESENT | _PAGE_PROTNONE)) ++ ++#define pte_clear(xp) do { pte_val(*(xp)) = _PAGE_NEWPAGE; } while (0) ++ ++#define phys_region_index(x) (((x) & REGION_MASK) >> REGION_SHIFT) ++#define pte_region_index(x) phys_region_index(pte_val(x)) ++ ++#define pmd_none(x) (!(pmd_val(x) & ~_PAGE_NEWPAGE)) ++#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) ++#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) ++#define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0) ++ ++#define pmd_newpage(x) (pmd_val(x) & _PAGE_NEWPAGE) ++#define pmd_mkuptodate(x) (pmd_val(x) &= ~_PAGE_NEWPAGE) ++ ++/* ++ * The "pgd_xxx()" functions here are trivial for a folded two-level ++ * setup: the pgd is never bad, and a pmd always exists (as it's folded ++ * into the pgd entry) ++ */ ++static inline int pgd_none(pgd_t pgd) { return 0; } ++static inline int pgd_bad(pgd_t pgd) { return 0; } ++static inline int pgd_present(pgd_t pgd) { return 1; } ++static inline void pgd_clear(pgd_t * pgdp) { } ++ ++#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) ++ ++extern struct page *pte_mem_map(pte_t pte); ++extern struct page *phys_mem_map(unsigned long phys); ++extern unsigned long phys_to_pfn(unsigned long p); ++ ++#define pte_page(x) pfn_to_page(pte_pfn(x)) ++#define pte_address(x) (__va(pte_val(x) & PAGE_MASK)) ++#define mk_phys(a, r) ((a) + ((r) << REGION_SHIFT)) ++#define phys_addr(p) ((p) & ~REGION_MASK) ++#define phys_page(p) (phys_mem_map(p) + ((phys_addr(p)) >> PAGE_SHIFT)) ++#define virt_to_page(kaddr) \ ++ (phys_mem_map(__pa(kaddr)) + (phys_addr(__pa(kaddr)) >> PAGE_SHIFT)) ++#define pte_pfn(x) phys_to_pfn(pte_val(x)) ++ ++static inline pte_t pte_mknewprot(pte_t pte) ++{ ++
pte_val(pte) |= _PAGE_NEWPROT; ++ return(pte); ++} ++ ++static inline pte_t pte_mknewpage(pte_t pte) ++{ ++ pte_val(pte) |= _PAGE_NEWPAGE; ++ return(pte); ++} ++ ++static inline void set_pte(pte_t *pteptr, pte_t pteval) ++{ ++ /* If it's a swap entry, it needs to be marked _PAGE_NEWPAGE so ++ * fix_range knows to unmap it. _PAGE_NEWPROT is specific to ++ * mapped pages. ++ */ ++ *pteptr = pte_mknewpage(pteval); ++ if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr); ++} ++ ++/* ++ * (pmds are folded into pgds so this doesnt get actually called, ++ * but the define is needed for a generic inline function.) ++ */ ++#define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) ++#define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) ++ ++/* ++ * The following only work if pte_present() is true. ++ * Undefined behaviour if not.. ++ */ ++static inline int pte_read(pte_t pte) ++{ ++ return((pte_val(pte) & _PAGE_USER) && ++ !(pte_val(pte) & _PAGE_PROTNONE)); ++} ++ ++static inline int pte_exec(pte_t pte){ ++ return((pte_val(pte) & _PAGE_USER) && ++ !(pte_val(pte) & _PAGE_PROTNONE)); ++} ++ ++static inline int pte_write(pte_t pte) ++{ ++ return((pte_val(pte) & _PAGE_RW) && ++ !(pte_val(pte) & _PAGE_PROTNONE)); ++} ++ ++static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } ++static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } ++static inline int pte_newpage(pte_t pte) { return pte_val(pte) & _PAGE_NEWPAGE; } ++static inline int pte_newprot(pte_t pte) ++{ ++ return(pte_present(pte) && (pte_val(pte) & _PAGE_NEWPROT)); ++} ++ ++static inline pte_t pte_rdprotect(pte_t pte) ++{ ++ pte_val(pte) &= ~_PAGE_USER; ++ return(pte_mknewprot(pte)); ++} ++ ++static inline pte_t pte_exprotect(pte_t pte) ++{ ++ pte_val(pte) &= ~_PAGE_USER; ++ return(pte_mknewprot(pte)); ++} ++ ++static inline pte_t pte_mkclean(pte_t pte) ++{ ++ pte_val(pte) &= ~_PAGE_DIRTY; ++ return(pte); ++} ++ ++static inline pte_t pte_mkold(pte_t pte) ++{ ++ pte_val(pte) 
&= ~_PAGE_ACCESSED; ++ return(pte); ++} ++ ++static inline pte_t pte_wrprotect(pte_t pte) ++{ ++ pte_val(pte) &= ~_PAGE_RW; ++ return(pte_mknewprot(pte)); ++} ++ ++static inline pte_t pte_mkread(pte_t pte) ++{ ++ pte_val(pte) |= _PAGE_USER; ++ return(pte_mknewprot(pte)); ++} ++ ++static inline pte_t pte_mkexec(pte_t pte) ++{ ++ pte_val(pte) |= _PAGE_USER; ++ return(pte_mknewprot(pte)); ++} ++ ++static inline pte_t pte_mkdirty(pte_t pte) ++{ ++ pte_val(pte) |= _PAGE_DIRTY; ++ return(pte); ++} ++ ++static inline pte_t pte_mkyoung(pte_t pte) ++{ ++ pte_val(pte) |= _PAGE_ACCESSED; ++ return(pte); ++} ++ ++static inline pte_t pte_mkwrite(pte_t pte) ++{ ++ pte_val(pte) |= _PAGE_RW; ++ return(pte_mknewprot(pte)); ++} ++ ++static inline pte_t pte_mkuptodate(pte_t pte) ++{ ++ pte_val(pte) &= ~_PAGE_NEWPAGE; ++ if(pte_present(pte)) pte_val(pte) &= ~_PAGE_NEWPROT; ++ return(pte); ++} ++ ++extern unsigned long page_to_phys(struct page *page); ++ ++/* ++ * Conversion functions: convert a page and protection to a page entry, ++ * and a page entry and page directory to the page they refer to. The pte_mk*() helpers take and return a pte_t by value, so mk_pte() must assign their result back to __pte. ++ */ ++ ++#define mk_pte(page, pgprot) \ ++({ \ ++ pte_t __pte; \ ++ \ ++ pte_val(__pte) = page_to_phys(page) + pgprot_val(pgprot);\ ++ if(pte_present(__pte)) __pte = pte_mknewprot(pte_mknewpage(__pte)); \ ++ __pte; \ ++}) ++ ++/* This takes a physical page address that is used by the remapping functions */ ++#define mk_pte_phys(physpage, pgprot) \ ++ pte_mknewpage(mk_pte(phys_page(physpage), pgprot)) ++ ++static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) ++{ ++ pte_val(pte) = (pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot); ++ if(pte_present(pte)) pte = pte_mknewpage(pte_mknewprot(pte)); ++ return pte; ++} ++ ++#define pmd_page(pmd) ((unsigned long) __va(pmd_val(pmd) & PAGE_MASK)) ++ ++/* to find an entry in a page-table-directory. 
*/ ++#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) ++#define __pgd_offset(address) pgd_index(address) ++ ++/* to find an entry in a page-table-directory */ ++#define pgd_offset(mm, address) \ ++((mm)->pgd + ((address) >> PGDIR_SHIFT)) ++ ++/* to find an entry in a kernel page-table-directory */ ++#define pgd_offset_k(address) pgd_offset(&init_mm, address) ++ ++#define __pmd_offset(address) \ ++ (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) ++ ++/* Find an entry in the second-level page table.. */ ++static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) ++{ ++ return (pmd_t *) dir; ++} ++ ++/* Find an entry in the third-level page table.. (arguments are parenthesised so expressions may be passed) */ ++#define pte_offset(pmd, address) \ ++((pte_t *) (pmd_page(*(pmd)) + (((address)>>10) & ((PTRS_PER_PTE-1)<<2)))) ++ ++#define update_mmu_cache(vma,address,pte) do ; while (0) ++ ++/* Encode and de-code a swap entry */ ++#define SWP_TYPE(x) (((x).val >> 3) & 0x7f) ++#define SWP_OFFSET(x) ((x).val >> 10) ++ ++#define SWP_ENTRY(type, offset) \ ++ ((swp_entry_t) { ((type) << 3) | ((offset) << 10) }) ++#define pte_to_swp_entry(pte) \ ++ ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) ++#define swp_entry_to_pte(x) ((pte_t) { (x).val }) ++ ++#define PageSkip(x) (0) ++#define kern_addr_valid(addr) (1) ++ ++#include <asm-generic/pgtable.h> ++ ++#endif ++ ++#endif ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/poll.h um/include/asm-um/poll.h +--- orig/include/asm-um/poll.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/poll.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_POLL_H ++#define __UM_POLL_H ++ ++#include "asm/arch/poll.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/posix_types.h um/include/asm-um/posix_types.h +--- orig/include/asm-um/posix_types.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/posix_types.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_POSIX_TYPES_H ++#define __UM_POSIX_TYPES_H ++ ++#include "asm/arch/posix_types.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/processor-generic.h um/include/asm-um/processor-generic.h +--- orig/include/asm-um/processor-generic.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/processor-generic.h Wed Apr 16 13:59:03 2003 +@@ -0,0 +1,182 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_PROCESSOR_GENERIC_H ++#define __UM_PROCESSOR_GENERIC_H ++ ++struct pt_regs; ++ ++struct task_struct; ++ ++#include "linux/config.h" ++#include "linux/signal.h" ++#include "asm/ptrace.h" ++#include "asm/siginfo.h" ++#include "choose-mode.h" ++ ++struct mm_struct; ++ ++#define current_text_addr() ((void *) 0) ++ ++#define cpu_relax() do ; while (0) ++ ++#ifdef CONFIG_MODE_TT ++struct proc_tt_mode { ++ int extern_pid; ++ int tracing; ++ int switch_pipe[2]; ++ int singlestep_syscall; ++ int vm_seq; ++}; ++#endif ++ ++#ifdef CONFIG_MODE_SKAS ++struct proc_skas_mode { ++ void *switch_buf; ++ void *fork_buf; ++}; ++#endif ++ ++struct thread_struct { ++ int forking; ++ unsigned long kernel_stack; ++ int nsyscalls; ++ struct pt_regs regs; ++ unsigned long cr2; ++ int err; ++ void *fault_addr; ++ void 
*fault_catcher; ++ struct task_struct *prev_sched; ++ unsigned long temp_stack; ++ void *exec_buf; ++ struct arch_thread arch; ++ union { ++#ifdef CONFIG_MODE_TT ++ struct proc_tt_mode tt; ++#endif ++#ifdef CONFIG_MODE_SKAS ++ struct proc_skas_mode skas; ++#endif ++ } mode; ++ struct { ++ int op; ++ union { ++ struct { ++ int pid; ++ } fork, exec; ++ struct { ++ int (*proc)(void *); ++ void *arg; ++ } thread; ++ struct { ++ void (*proc)(void *); ++ void *arg; ++ } cb; ++ } u; ++ } request; ++}; ++ ++#define INIT_THREAD \ ++{ \ ++ .forking = 0, \ ++ .kernel_stack = 0, \ ++ .nsyscalls = 0, \ ++ .regs = EMPTY_REGS, \ ++ .cr2 = 0, \ ++ .err = 0, \ ++ .fault_addr = NULL, \ ++ .prev_sched = NULL, \ ++ .temp_stack = 0, \ ++ .exec_buf = NULL, \ ++ .arch = INIT_ARCH_THREAD, \ ++ .request = { 0 } \ ++} ++ ++#define THREAD_SIZE ((1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE) ++ ++typedef struct { ++ unsigned long seg; ++} mm_segment_t; ++ ++extern struct task_struct *alloc_task_struct(void); ++extern void free_task_struct(struct task_struct *task); ++ ++#define get_task_struct(tsk) atomic_inc(&virt_to_page(tsk)->count) ++ ++extern void release_thread(struct task_struct *); ++extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); ++extern void dump_thread(struct pt_regs *regs, struct user *u); ++ ++extern unsigned long thread_saved_pc(struct thread_struct *t); ++ ++static inline void mm_copy_segments(struct mm_struct *from_mm, ++ struct mm_struct *new_mm) ++{ ++} ++ ++static inline void copy_segments(struct task_struct *p, ++ struct mm_struct *new_mm) ++{ ++} ++ ++static inline void release_segments(struct mm_struct *mm) ++{ ++} ++ ++#define init_task (init_task_union.task) ++#define init_stack (init_task_union.stack) ++ ++/* ++ * User space process size: 3GB (default). ++ */ ++extern unsigned long task_size; ++ ++#define TASK_SIZE (task_size) ++ ++/* This decides where the kernel will search for a free chunk of vm ++ * space during mmap's. 
++ */ ++#define TASK_UNMAPPED_BASE (0x40000000) ++ ++extern void start_thread(struct pt_regs *regs, unsigned long entry, ++ unsigned long stack); ++ ++struct cpuinfo_um { ++ unsigned long loops_per_jiffy; ++ unsigned long *pgd_quick; ++ unsigned long *pmd_quick; ++ unsigned long *pte_quick; ++ unsigned long pgtable_cache_sz; ++ int ipi_pipe[2]; ++}; ++ ++extern struct cpuinfo_um boot_cpu_data; ++ ++#define my_cpu_data cpu_data[smp_processor_id()] ++ ++#ifdef CONFIG_SMP ++extern struct cpuinfo_um cpu_data[]; ++#define current_cpu_data cpu_data[smp_processor_id()] ++#else ++#define cpu_data (&boot_cpu_data) ++#define current_cpu_data boot_cpu_data ++#endif ++ ++#define KSTK_EIP(tsk) (PT_REGS_IP(&tsk->thread.regs)) ++#define KSTK_ESP(tsk) (PT_REGS_SP(&tsk->thread.regs)) ++#define get_wchan(p) (0) ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/processor-i386.h um/include/asm-um/processor-i386.h +--- orig/include/asm-um/processor-i386.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/processor-i386.h Wed Apr 16 13:59:03 2003 +@@ -0,0 +1,35 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_PROCESSOR_I386_H ++#define __UM_PROCESSOR_I386_H ++ ++extern int cpu_has_xmm; ++extern int cpu_has_cmov; ++ ++struct arch_thread { ++ unsigned long debugregs[8]; ++ int debugregs_seq; ++}; ++ ++#define INIT_ARCH_THREAD { .debugregs = { [ 0 ... 
7 ] = 0 }, \ ++ .debugregs_seq = 0 } ++ ++#include "asm/arch/user.h" ++ ++#include "asm/processor-generic.h" ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/processor-ppc.h um/include/asm-um/processor-ppc.h +--- orig/include/asm-um/processor-ppc.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/processor-ppc.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,15 @@ ++#ifndef __UM_PROCESSOR_PPC_H ++#define __UM_PROCESSOR_PPC_H ++ ++#if defined(__ASSEMBLY__) ++ ++#define CONFIG_ALL_PPC ++#include "arch/processor.h" ++ ++#else ++ ++#include "asm/processor-generic.h" ++ ++#endif ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/ptrace-generic.h um/include/asm-um/ptrace-generic.h +--- orig/include/asm-um/ptrace-generic.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/ptrace-generic.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,74 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_PTRACE_GENERIC_H ++#define __UM_PTRACE_GENERIC_H ++ ++#ifndef __ASSEMBLY__ ++ ++#include "linux/config.h" ++ ++#include "asm/current.h" ++ ++#define pt_regs pt_regs_subarch ++#define show_regs show_regs_subarch ++ ++#include "asm/arch/ptrace.h" ++ ++#undef pt_regs ++#undef show_regs ++#undef user_mode ++#undef instruction_pointer ++ ++#include "sysdep/ptrace.h" ++#include "skas_ptrace.h" ++ ++struct pt_regs { ++ union uml_pt_regs regs; ++}; ++ ++#define EMPTY_REGS { regs : EMPTY_UML_PT_REGS } ++ ++#define PT_REGS_IP(r) UPT_IP(&(r)->regs) ++#define PT_REGS_SP(r) UPT_SP(&(r)->regs) ++ ++#define PT_REG(r, reg) UPT_REG(&(r)->regs, 
reg) ++#define PT_REGS_SET(r, reg, val) UPT_SET(&(r)->regs, reg, val) ++ ++#define PT_REGS_SET_SYSCALL_RETURN(r, res) \ ++ UPT_SET_SYSCALL_RETURN(&(r)->regs, res) ++#define PT_REGS_RESTART_SYSCALL(r) UPT_RESTART_SYSCALL(&(r)->regs) ++ ++#define PT_REGS_SYSCALL_NR(r) UPT_SYSCALL_NR(&(r)->regs) ++ ++#define PT_REGS_SC(r) UPT_SC(&(r)->regs) ++ ++struct task_struct; ++ ++extern unsigned long getreg(struct task_struct *child, int regno); ++extern int putreg(struct task_struct *child, int regno, unsigned long value); ++extern int get_fpregs(unsigned long buf, struct task_struct *child); ++extern int set_fpregs(unsigned long buf, struct task_struct *child); ++extern int get_fpxregs(unsigned long buf, struct task_struct *child); ++extern int set_fpxregs(unsigned long buf, struct task_struct *tsk); ++ ++extern void show_regs(struct pt_regs *regs); ++ ++#define INIT_TASK_SIZE ((1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE) ++ ++#endif ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/ptrace-i386.h um/include/asm-um/ptrace-i386.h +--- orig/include/asm-um/ptrace-i386.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/ptrace-i386.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,46 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_PTRACE_I386_H ++#define __UM_PTRACE_I386_H ++ ++#include "sysdep/ptrace.h" ++#include "asm/ptrace-generic.h" ++ ++#define PT_REGS_EAX(r) UPT_EAX(&(r)->regs) ++#define PT_REGS_EBX(r) UPT_EBX(&(r)->regs) ++#define PT_REGS_ECX(r) UPT_ECX(&(r)->regs) ++#define PT_REGS_EDX(r) UPT_EDX(&(r)->regs) ++#define PT_REGS_ESI(r) UPT_ESI(&(r)->regs) ++#define PT_REGS_EDI(r) UPT_EDI(&(r)->regs) ++#define PT_REGS_EBP(r) UPT_EBP(&(r)->regs) ++ ++#define PT_REGS_CS(r) UPT_CS(&(r)->regs) ++#define PT_REGS_SS(r) UPT_SS(&(r)->regs) ++#define PT_REGS_DS(r) UPT_DS(&(r)->regs) ++#define PT_REGS_ES(r) UPT_ES(&(r)->regs) ++#define PT_REGS_FS(r) UPT_FS(&(r)->regs) ++#define PT_REGS_GS(r) UPT_GS(&(r)->regs) ++ ++#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs) ++ ++#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_EAX(r) ++#define PT_REGS_SYSCALL_RET(r) PT_REGS_EAX(r) ++#define PT_FIX_EXEC_STACK(sp) do ; while(0) ++ ++#define user_mode(r) UPT_IS_USER(&(r)->regs) ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/resource.h um/include/asm-um/resource.h +--- orig/include/asm-um/resource.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/resource.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_RESOURCE_H ++#define __UM_RESOURCE_H ++ ++#include "asm/arch/resource.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/rwlock.h um/include/asm-um/rwlock.h +--- orig/include/asm-um/rwlock.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/rwlock.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_RWLOCK_H ++#define __UM_RWLOCK_H ++ ++#include "asm/arch/rwlock.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/rwsem.h um/include/asm-um/rwsem.h +--- orig/include/asm-um/rwsem.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/rwsem.h Wed Apr 16 13:59:03 2003 +@@ -0,0 +1,10 @@ ++#ifndef __UM_RWSEM_H__ ++#define __UM_RWSEM_H__ ++ ++#if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96) ++#define __builtin_expect(exp,c) (exp) ++#endif ++ ++#include "asm/arch/rwsem.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/scatterlist.h um/include/asm-um/scatterlist.h +--- orig/include/asm-um/scatterlist.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/scatterlist.h Thu Feb 27 13:21:49 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_SCATTERLIST_H ++#define __UM_SCATTERLIST_H ++ ++#include "asm/arch/scatterlist.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/segment.h um/include/asm-um/segment.h +--- orig/include/asm-um/segment.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/segment.h Fri Nov 1 19:45:34 2002 +@@ -0,0 +1,4 @@ ++#ifndef __UM_SEGMENT_H ++#define __UM_SEGMENT_H ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/semaphore.h um/include/asm-um/semaphore.h +--- orig/include/asm-um/semaphore.h Wed Dec 31 
19:00:00 1969 ++++ um/include/asm-um/semaphore.h Wed Apr 16 13:59:03 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_SEMAPHORE_H ++#define __UM_SEMAPHORE_H ++ ++#include "asm/arch/semaphore.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/sembuf.h um/include/asm-um/sembuf.h +--- orig/include/asm-um/sembuf.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/sembuf.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_SEMBUF_H ++#define __UM_SEMBUF_H ++ ++#include "asm/arch/sembuf.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/serial.h um/include/asm-um/serial.h +--- orig/include/asm-um/serial.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/serial.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_SERIAL_H ++#define __UM_SERIAL_H ++ ++#include "asm/arch/serial.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/shmbuf.h um/include/asm-um/shmbuf.h +--- orig/include/asm-um/shmbuf.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/shmbuf.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_SHMBUF_H ++#define __UM_SHMBUF_H ++ ++#include "asm/arch/shmbuf.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/shmparam.h um/include/asm-um/shmparam.h +--- orig/include/asm-um/shmparam.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/shmparam.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_SHMPARAM_H ++#define __UM_SHMPARAM_H ++ ++#include "asm/arch/shmparam.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/sigcontext-generic.h um/include/asm-um/sigcontext-generic.h +--- orig/include/asm-um/sigcontext-generic.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/sigcontext-generic.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_SIGCONTEXT_GENERIC_H ++#define __UM_SIGCONTEXT_GENERIC_H ++ ++#include "asm/arch/sigcontext.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/sigcontext-i386.h um/include/asm-um/sigcontext-i386.h +--- 
orig/include/asm-um/sigcontext-i386.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/sigcontext-i386.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_SIGCONTEXT_I386_H ++#define __UM_SIGCONTEXT_I386_H ++ ++#include "asm/sigcontext-generic.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/sigcontext-ppc.h um/include/asm-um/sigcontext-ppc.h +--- orig/include/asm-um/sigcontext-ppc.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/sigcontext-ppc.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,10 @@ ++#ifndef __UM_SIGCONTEXT_PPC_H ++#define __UM_SIGCONTEXT_PPC_H ++ ++#define pt_regs sys_pt_regs ++ ++#include "asm/sigcontext-generic.h" ++ ++#undef pt_regs ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/siginfo.h um/include/asm-um/siginfo.h +--- orig/include/asm-um/siginfo.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/siginfo.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_SIGINFO_H ++#define __UM_SIGINFO_H ++ ++#include "asm/arch/siginfo.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/signal.h um/include/asm-um/signal.h +--- orig/include/asm-um/signal.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/signal.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,22 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_SIGNAL_H ++#define __UM_SIGNAL_H ++ ++#include "asm/arch/signal.h" ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/smp.h um/include/asm-um/smp.h +--- orig/include/asm-um/smp.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/smp.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,19 @@ ++#ifndef __UM_SMP_H ++#define __UM_SMP_H ++ ++#include "linux/config.h" /* must precede the CONFIG_SMP test, as in asm-um/spinlock.h */ ++ ++#ifdef CONFIG_SMP ++#include "asm/current.h" ++ ++#define smp_processor_id() (current->processor) ++#define cpu_logical_map(n) (n) ++#define cpu_number_map(n) (n) ++#define PROC_CHANGE_PENALTY 15 /* Pick a number, any number */ ++extern int hard_smp_processor_id(void); ++extern unsigned long cpu_online_map; ++#define NO_PROC_ID -1 ++ ++#endif ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/smplock.h um/include/asm-um/smplock.h +--- orig/include/asm-um/smplock.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/smplock.h Wed Apr 16 13:59:04 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_SMPLOCK_H ++#define __UM_SMPLOCK_H ++ ++#include "asm/arch/smplock.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/socket.h um/include/asm-um/socket.h +--- orig/include/asm-um/socket.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/socket.h Thu Feb 27 13:20:13 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_SOCKET_H ++#define __UM_SOCKET_H ++ ++#include "asm/arch/socket.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/sockios.h um/include/asm-um/sockios.h +--- orig/include/asm-um/sockios.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/sockios.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_SOCKIOS_H ++#define __UM_SOCKIOS_H ++ ++#include "asm/arch/sockios.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/softirq.h um/include/asm-um/softirq.h +--- orig/include/asm-um/softirq.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/softirq.h Wed Apr 16 13:59:04 2003 +@@ -0,0 +1,13 @@ ++#ifndef 
__UM_SOFTIRQ_H ++#define __UM_SOFTIRQ_H ++ ++#include "linux/smp.h" ++#include "asm/system.h" ++#include "asm/processor.h" ++ ++/* A gratuitous name change */ ++#define i386_bh_lock um_bh_lock ++#include "asm/arch/softirq.h" ++#undef i386_bh_lock ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/spinlock.h um/include/asm-um/spinlock.h +--- orig/include/asm-um/spinlock.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/spinlock.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,10 @@ ++#ifndef __UM_SPINLOCK_H ++#define __UM_SPINLOCK_H ++ ++#include "linux/config.h" ++ ++#ifdef CONFIG_SMP ++#include "asm/arch/spinlock.h" ++#endif ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/stat.h um/include/asm-um/stat.h +--- orig/include/asm-um/stat.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/stat.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_STAT_H ++#define __UM_STAT_H ++ ++#include "asm/arch/stat.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/statfs.h um/include/asm-um/statfs.h +--- orig/include/asm-um/statfs.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/statfs.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,6 @@ ++#ifndef _UM_STATFS_H ++#define _UM_STATFS_H ++ ++#include "asm/arch/statfs.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/string.h um/include/asm-um/string.h +--- orig/include/asm-um/string.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/string.h Mon Feb 24 22:52:09 2003 +@@ -0,0 +1,7 @@ ++#ifndef __UM_STRING_H ++#define __UM_STRING_H ++ ++#include "asm/arch/string.h" ++#include "asm/archparam.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/system-generic.h um/include/asm-um/system-generic.h +--- orig/include/asm-um/system-generic.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/system-generic.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,50 @@ ++#ifndef __UM_SYSTEM_GENERIC_H ++#define __UM_SYSTEM_GENERIC_H ++ ++#include "asm/arch/system.h" ++ ++#undef prepare_to_switch ++#undef 
switch_to ++#undef __save_flags ++#undef save_flags ++#undef __restore_flags ++#undef restore_flags ++#undef __cli ++#undef __sti ++#undef cli ++#undef sti ++#undef local_irq_save ++#undef local_irq_restore ++#undef local_irq_disable ++#undef local_irq_enable ++ ++#define prepare_to_switch() do ; while(0) ++ ++void *_switch_to(void *prev, void *next); ++ ++#define switch_to(prev, next, last) prev = _switch_to(prev, next) ++ ++extern int get_signals(void); ++extern int set_signals(int enable); ++extern void block_signals(void); ++extern void unblock_signals(void); ++ ++#define local_irq_save(flags) do { (flags) = set_signals(0); } while(0) ++ ++#define local_irq_restore(flags) do { set_signals(flags); } while(0) ++ ++#define local_irq_enable() unblock_signals() ++#define local_irq_disable() block_signals() ++ ++#define __sti() unblock_signals() ++#define sti() unblock_signals() ++#define __cli() block_signals() ++#define cli() block_signals() ++ ++#define __save_flags(x) do { (x) = get_signals(); } while(0) /* store into the caller's lvalue, whatever it is named */ ++#define save_flags(x) __save_flags(x) ++ ++#define __restore_flags(x) local_irq_restore(x) ++#define restore_flags(x) __restore_flags(x) ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/system-i386.h um/include/asm-um/system-i386.h +--- orig/include/asm-um/system-i386.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/system-i386.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,39 @@ ++#ifndef __UM_SYSTEM_I386_H ++#define __UM_SYSTEM_I386_H ++ ++#include "asm/system-generic.h" ++ ++#define __HAVE_ARCH_CMPXCHG 1 ++ ++static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, ++ unsigned long new, int size) ++{ ++ unsigned long prev; ++ switch (size) { ++ case 1: ++ __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" ++ : "=a"(prev) ++ : "q"(new), "m"(*__xg(ptr)), "0"(old) ++ : 
"memory"); ++ return prev; ++ case 4: ++ __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2" ++ : "=a"(prev) ++ : "q"(new), "m"(*__xg(ptr)), "0"(old) ++ : "memory"); ++ return prev; ++ } ++ return old; ++} ++ ++#define cmpxchg(ptr,o,n)\ ++ ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ ++ (unsigned long)(n),sizeof(*(ptr)))) ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/system-ppc.h um/include/asm-um/system-ppc.h +--- orig/include/asm-um/system-ppc.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/system-ppc.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,12 @@ ++#ifndef __UM_SYSTEM_PPC_H ++#define __UM_SYSTEM_PPC_H ++ ++#define _switch_to _ppc_switch_to ++ ++#include "asm/arch/system.h" ++ ++#undef _switch_to ++ ++#include "asm/system-generic.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/termbits.h um/include/asm-um/termbits.h +--- orig/include/asm-um/termbits.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/termbits.h Wed Oct 23 21:11:14 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_TERMBITS_H ++#define __UM_TERMBITS_H ++ ++#include "asm/arch/termbits.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/termios.h um/include/asm-um/termios.h +--- orig/include/asm-um/termios.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/termios.h Thu Feb 27 13:20:13 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_TERMIOS_H ++#define __UM_TERMIOS_H ++ ++#include "asm/arch/termios.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/timex.h um/include/asm-um/timex.h +--- orig/include/asm-um/timex.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/timex.h Wed Mar 26 22:01:25 2003 +@@ -0,0 +1,18 @@ ++#ifndef __UM_TIMEX_H ++#define __UM_TIMEX_H ++ ++#include "linux/time.h" ++ ++typedef unsigned long cycles_t; ++ ++#define cacheflush_time (0) ++ ++static inline cycles_t get_cycles (void) ++{ ++ return 0; ++} ++ ++#define vxtime_lock() do ; while (0) ++#define vxtime_unlock() do ; while (0) ++ ++#endif +diff -Naur -X ../exclude-files 
orig/include/asm-um/tlb.h um/include/asm-um/tlb.h +--- orig/include/asm-um/tlb.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/tlb.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1 @@ ++#include <asm-generic/tlb.h> +diff -Naur -X ../exclude-files orig/include/asm-um/types.h um/include/asm-um/types.h +--- orig/include/asm-um/types.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/types.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_TYPES_H ++#define __UM_TYPES_H ++ ++#include "asm/arch/types.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/uaccess.h um/include/asm-um/uaccess.h +--- orig/include/asm-um/uaccess.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/uaccess.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,97 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __UM_UACCESS_H ++#define __UM_UACCESS_H ++ ++#define VERIFY_READ 0 ++#define VERIFY_WRITE 1 ++ ++/* ++ * The fs value determines whether argument validity checking should be ++ * performed or not. If get_fs() == USER_DS, checking is performed, with ++ * get_fs() == KERNEL_DS, checking is bypassed. ++ * ++ * For historical reasons, these macros are grossly misnamed. 
++ */ ++ ++#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) }) ++ ++#define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFF) ++#define USER_DS MAKE_MM_SEG(TASK_SIZE) ++ ++#define get_ds() (KERNEL_DS) ++#define get_fs() (current->addr_limit) ++#define set_fs(x) (current->addr_limit = (x)) ++ ++#define segment_eq(a, b) ((a).seg == (b).seg) ++ ++#include "um_uaccess.h" ++ ++#define __copy_from_user(to, from, n) copy_from_user(to, from, n) ++ ++#define __copy_to_user(to, from, n) copy_to_user(to, from, n) ++ ++#define __get_user(x, ptr) \ ++({ \ ++ const __typeof__(ptr) __private_ptr = ptr; \ ++ __typeof__(*(__private_ptr)) __private_val; \ ++ int __private_ret = -EFAULT; \ ++ (x) = 0; \ ++ if (__copy_from_user(&__private_val, (__private_ptr), \ ++ sizeof(*(__private_ptr))) == 0) {\ ++ (x) = (__typeof__(*(__private_ptr))) __private_val; \ ++ __private_ret = 0; \ ++ } \ ++ __private_ret; \ ++}) ++ ++#define get_user(x, ptr) \ ++({ \ ++ const __typeof__((*ptr)) *private_ptr = (ptr); \ ++ (access_ok(VERIFY_READ, private_ptr, sizeof(*private_ptr)) ? \ ++ __get_user(x, private_ptr) : ((x) = 0, -EFAULT)); \ ++}) ++ ++#define __put_user(x, ptr) \ ++({ \ ++ __typeof__(ptr) __private_ptr = ptr; \ ++ __typeof__(*(__private_ptr)) __private_val; \ ++ int __private_ret = -EFAULT; \ ++ __private_val = (__typeof__(*(__private_ptr))) (x); \ ++ if (__copy_to_user((__private_ptr), &__private_val, \ ++ sizeof(*(__private_ptr))) == 0) { \ ++ __private_ret = 0; \ ++ } \ ++ __private_ret; \ ++}) ++ ++#define put_user(x, ptr) \ ++({ \ ++ __typeof__(*(ptr)) *private_ptr = (ptr); \ ++ (access_ok(VERIFY_WRITE, private_ptr, sizeof(*private_ptr)) ? \ ++ __put_user(x, private_ptr) : -EFAULT); \ ++}) ++ ++#define strlen_user(str) strnlen_user(str, ~0UL >> 1) ++ ++struct exception_table_entry ++{ ++ unsigned long insn; ++ unsigned long fixup; ++}; ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. 
++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/ucontext.h um/include/asm-um/ucontext.h +--- orig/include/asm-um/ucontext.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/ucontext.h Sun Dec 1 13:20:58 2002 +@@ -0,0 +1,6 @@ ++#ifndef _ASM_UM_UCONTEXT_H ++#define _ASM_UM_UCONTEXT_H ++ ++#include "asm/arch/ucontext.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/unaligned.h um/include/asm-um/unaligned.h +--- orig/include/asm-um/unaligned.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/unaligned.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_UNALIGNED_H ++#define __UM_UNALIGNED_H ++ ++#include "asm/arch/unaligned.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/unistd.h um/include/asm-um/unistd.h +--- orig/include/asm-um/unistd.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/unistd.h Wed Mar 26 22:01:27 2003 +@@ -0,0 +1,118 @@ ++/* ++ * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef _UM_UNISTD_H_ ++#define _UM_UNISTD_H_ ++ ++#include "linux/resource.h" ++#include "asm/uaccess.h" ++ ++extern long sys_open(const char *filename, int flags, int mode); ++extern long sys_dup(unsigned int fildes); ++extern long sys_close(unsigned int fd); ++extern int um_execve(const char *file, char *const argv[], char *const env[]); ++extern long sys_setsid(void); ++extern long sys_waitpid(pid_t pid, unsigned int * stat_addr, int options); ++extern long sys_wait4(pid_t pid,unsigned int *stat_addr, int options, ++ struct rusage *ru); ++extern long sys_mount(char *dev_name, char *dir_name, char *type, ++ unsigned long flags, void *data); ++extern long sys_select(int n, 
fd_set *inp, fd_set *outp, fd_set *exp, ++ struct timeval *tvp); ++extern long sys_lseek(unsigned int fildes, unsigned long offset, int whence); ++extern long sys_read(unsigned int fildes, char *buf, int len); ++extern long sys_write(unsigned int fildes, char *buf, int len); ++ ++#ifdef __KERNEL_SYSCALLS__ ++ ++#define KERNEL_CALL(ret_t, sys, args...) \ ++ mm_segment_t fs = get_fs(); \ ++ ret_t ret; \ ++ set_fs(KERNEL_DS); \ ++ ret = sys(args); \ ++ set_fs(fs); \ ++ return ret; ++ ++static inline long open(const char *pathname, int flags, int mode) ++{ ++ KERNEL_CALL(int, sys_open, pathname, flags, mode) ++} ++ ++static inline long dup(unsigned int fd) ++{ ++ KERNEL_CALL(int, sys_dup, fd); ++} ++ ++static inline long close(unsigned int fd) ++{ ++ KERNEL_CALL(int, sys_close, fd); ++} ++ ++static inline int execve(const char *filename, char *const argv[], ++ char *const envp[]) ++{ ++ KERNEL_CALL(int, um_execve, filename, argv, envp); ++} ++ ++static inline long waitpid(pid_t pid, unsigned int *status, int options) ++{ ++ KERNEL_CALL(pid_t, sys_wait4, pid, status, options, NULL) ++} ++ ++static inline pid_t wait(int *status) ++{ ++ KERNEL_CALL(pid_t, sys_wait4, -1, status, 0, NULL) ++} ++ ++static inline pid_t setsid(void) ++{ ++ KERNEL_CALL(pid_t, sys_setsid) ++} ++ ++static inline long lseek(unsigned int fd, off_t offset, unsigned int whence) ++{ ++ KERNEL_CALL(long, sys_lseek, fd, offset, whence) ++} ++ ++static inline int read(unsigned int fd, char * buf, int len) ++{ ++ KERNEL_CALL(int, sys_read, fd, buf, len) ++} ++ ++static inline int write(unsigned int fd, char * buf, int len) ++{ ++ KERNEL_CALL(int, sys_write, fd, buf, len) ++} ++ ++#endif ++ ++/* Save the value of __KERNEL_SYSCALLS__, undefine it, include the underlying ++ * arch's unistd.h for the system call numbers, and restore the old ++ * __KERNEL_SYSCALLS__. 
++ */ ++ ++#ifdef __KERNEL_SYSCALLS__ ++#define __SAVE_KERNEL_SYSCALLS__ __KERNEL_SYSCALLS__ ++#endif ++ ++#undef __KERNEL_SYSCALLS__ ++#include "asm/arch/unistd.h" ++ ++#ifdef __KERNEL_SYSCALLS__ ++#define __KERNEL_SYSCALLS__ __SAVE_KERNEL_SYSCALLS__ ++#endif ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/asm-um/user.h um/include/asm-um/user.h +--- orig/include/asm-um/user.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/user.h Wed Apr 16 13:59:45 2003 +@@ -0,0 +1,6 @@ ++#ifndef __UM_USER_H ++#define __UM_USER_H ++ ++#include "asm/arch/user.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/vga.h um/include/asm-um/vga.h +--- orig/include/asm-um/vga.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/vga.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_VGA_H ++#define __UM_VGA_H ++ ++#include "asm/arch/vga.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/asm-um/xor.h um/include/asm-um/xor.h +--- orig/include/asm-um/xor.h Wed Dec 31 19:00:00 1969 ++++ um/include/asm-um/xor.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,6 @@ ++#ifndef __UM_XOR_H ++#define __UM_XOR_H ++ ++#include "asm-generic/xor.h" ++ ++#endif +diff -Naur -X ../exclude-files orig/include/linux/blk.h um/include/linux/blk.h +--- orig/include/linux/blk.h Sun Sep 15 12:13:19 2002 ++++ um/include/linux/blk.h Wed Apr 16 13:59:04 2003 +@@ -320,6 +320,15 @@ + #define DEVICE_REQUEST do_ida_request + #define DEVICE_NR(device) (MINOR(device) >> 4) + ++#elif (MAJOR_NR == UBD_MAJOR) ++ ++#define DEVICE_NAME "User-mode block device" ++#define DEVICE_INTR do_ubd ++#define DEVICE_REQUEST 
do_ubd_request ++#define DEVICE_NR(device) (MINOR(device) >> UBD_SHIFT) ++#define DEVICE_ON(device) ++#define DEVICE_OFF(device) ++ + #endif /* MAJOR_NR == whatever */ + + /* provide DEVICE_xxx defaults, if not explicitly defined +diff -Naur -X ../exclude-files orig/include/linux/fs.h um/include/linux/fs.h +--- orig/include/linux/fs.h Thu Feb 27 13:04:27 2003 ++++ um/include/linux/fs.h Wed Apr 16 13:59:03 2003 +@@ -318,6 +318,8 @@ + #include <linux/ncp_fs_i.h> + #include <linux/proc_fs_i.h> + #include <linux/usbdev_fs_i.h> ++#include <linux/hostfs_fs_i.h> ++#include <linux/hppfs_fs_i.h> + #include <linux/jffs2_fs_i.h> + #include <linux/cramfs_fs_sb.h> + +@@ -509,7 +511,9 @@ + struct proc_inode_info proc_i; + struct socket socket_i; + struct usbdev_inode_info usbdev_i; +- struct jffs2_inode_info jffs2_i; ++ struct hostfs_inode_info hostfs_i; ++ struct hppfs_inode_info hppfs_i; ++ struct jffs2_inode_info jffs2_i; + void *generic_ip; + } u; + }; +diff -Naur -X ../exclude-files orig/include/linux/hostfs_fs_i.h um/include/linux/hostfs_fs_i.h +--- orig/include/linux/hostfs_fs_i.h Wed Dec 31 19:00:00 1969 ++++ um/include/linux/hostfs_fs_i.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,21 @@ ++#ifndef _HOSTFS_FS_I ++#define _HOSTFS_FS_I ++ ++struct hostfs_inode_info { ++ char *host_filename; ++ int fd; ++ int mode; ++}; ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/linux/hppfs_fs_i.h um/include/linux/hppfs_fs_i.h +--- orig/include/linux/hppfs_fs_i.h Wed Dec 31 19:00:00 1969 ++++ um/include/linux/hppfs_fs_i.h Wed Oct 23 21:08:05 2002 +@@ -0,0 +1,19 @@ ++#ifndef _HPPFS_FS_I ++#define _HPPFS_FS_I ++ ++struct hppfs_inode_info { ++ struct dentry *proc_dentry; ++}; ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/include/linux/kernel.h um/include/linux/kernel.h +--- orig/include/linux/kernel.h Thu Feb 27 13:04:27 2003 ++++ um/include/linux/kernel.h Wed Mar 26 22:01:25 2003 +@@ -49,7 +49,7 @@ + # define ATTRIB_NORET __attribute__((noreturn)) + # define NORET_AND noreturn, + +-#ifdef __i386__ ++#if defined(__i386__) || defined(UM_FASTCALL) + #define FASTCALL(x) x __attribute__((regparm(3))) + #else + #define FASTCALL(x) x +diff -Naur -X ../exclude-files orig/include/linux/kernel_stat.h um/include/linux/kernel_stat.h +--- orig/include/linux/kernel_stat.h Thu Feb 27 13:04:27 2003 ++++ um/include/linux/kernel_stat.h Wed Apr 16 13:59:39 2003 +@@ -12,7 +12,7 @@ + * used by rstatd/perfmeter + */ + +-#define DK_MAX_MAJOR 16 ++#define DK_MAX_MAJOR 99 + #define DK_MAX_DISK 16 + + struct kernel_stat { +diff -Naur -X ../exclude-files orig/include/linux/mm.h um/include/linux/mm.h +--- orig/include/linux/mm.h Sun Sep 15 12:13:19 2002 ++++ um/include/linux/mm.h Wed Apr 16 13:59:04 2003 +@@ -425,6 +425,14 @@ + extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, 
unsigned int order, zonelist_t *zonelist)); + extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order); + ++#ifndef HAVE_ARCH_VALIDATE ++static inline struct page *arch_validate(struct page *page, ++ unsigned int gfp_mask, int order) ++{ ++ return(page); ++} ++#endif ++ + static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order) + { + /* +@@ -432,7 +440,7 @@ + */ + if (order >= MAX_ORDER) + return NULL; +- return _alloc_pages(gfp_mask, order); ++ return arch_validate(_alloc_pages(gfp_mask, order), gfp_mask, order); + } + + #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) +@@ -492,6 +500,9 @@ + int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, + int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); + ++extern long do_mprotect(struct mm_struct *mm, unsigned long start, ++ size_t len, unsigned long prot); ++ + /* + * On a two-level page table, this ends up being trivial. 
Thus the + * inlining and the symmetry break with pte_alloc() that does all +@@ -539,9 +550,10 @@ + + extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + +-extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, +- unsigned long len, unsigned long prot, +- unsigned long flag, unsigned long pgoff); ++extern unsigned long do_mmap_pgoff(struct mm_struct *mm, struct file *file, ++ unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flag, ++ unsigned long pgoff); + + static inline unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, +@@ -551,7 +563,8 @@ + if ((offset + PAGE_ALIGN(len)) < offset) + goto out; + if (!(offset & ~PAGE_MASK)) +- ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); ++ ret = do_mmap_pgoff(current->mm, file, addr, len, prot, flag, ++ offset >> PAGE_SHIFT); + out: + return ret; + } +diff -Naur -X ../exclude-files orig/include/linux/proc_mm.h um/include/linux/proc_mm.h +--- orig/include/linux/proc_mm.h Wed Dec 31 19:00:00 1969 ++++ um/include/linux/proc_mm.h Wed Apr 16 13:59:47 2003 +@@ -0,0 +1,48 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#ifndef __PROC_MM_H ++#define __PROC_MM_H ++ ++#include "linux/sched.h" ++ ++#define MM_MMAP 54 ++#define MM_MUNMAP 55 ++#define MM_MPROTECT 56 ++#define MM_COPY_SEGMENTS 57 ++ ++struct mm_mmap { ++ unsigned long addr; ++ unsigned long len; ++ unsigned long prot; ++ unsigned long flags; ++ unsigned long fd; ++ unsigned long offset; ++}; ++ ++struct mm_munmap { ++ unsigned long addr; ++ unsigned long len; ++}; ++ ++struct mm_mprotect { ++ unsigned long addr; ++ unsigned long len; ++ unsigned int prot; ++}; ++ ++struct proc_mm_op { ++ int op; ++ union { ++ struct mm_mmap mmap; ++ struct mm_munmap munmap; ++ struct mm_mprotect mprotect; ++ int copy_segments; ++ } u; ++}; ++ ++extern 
struct mm_struct *proc_mm_get_mm(int fd); ++ ++#endif +diff -Naur -X ../exclude-files orig/include/linux/tty.h um/include/linux/tty.h +--- orig/include/linux/tty.h Thu Feb 27 13:04:28 2003 ++++ um/include/linux/tty.h Wed Apr 16 13:59:04 2003 +@@ -309,6 +309,9 @@ + spinlock_t read_lock; + /* If the tty has a pending do_SAK, queue it here - akpm */ + struct tq_struct SAK_tq; ++#ifdef CONFIG_TTY_LOG ++ int log_fd; ++#endif + }; + + /* tty magic number */ +@@ -366,6 +369,7 @@ + extern int specialix_init(void); + extern int espserial_init(void); + extern int macserial_init(void); ++extern int stdio_init(void); + extern int a2232board_init(void); + + extern int tty_paranoia_check(struct tty_struct *tty, kdev_t device, +@@ -420,6 +424,8 @@ + + extern int vt_ioctl(struct tty_struct *tty, struct file * file, + unsigned int cmd, unsigned long arg); ++ ++extern void stdio_console_init(void); + + #endif /* __KERNEL__ */ + #endif +diff -Naur -X ../exclude-files orig/init/do_mounts.c um/init/do_mounts.c +--- orig/init/do_mounts.c Thu Feb 27 13:04:28 2003 ++++ um/init/do_mounts.c Thu Feb 27 13:05:27 2003 +@@ -153,6 +153,22 @@ + { "pf", 0x2f00 }, + { "apblock", APBLOCK_MAJOR << 8}, + { "ddv", DDV_MAJOR << 8}, ++ { "ubd0", UBD_MAJOR << 8 | 0 << 4}, ++ { "ubda", UBD_MAJOR << 8 | 0 << 4}, ++ { "ubd1", UBD_MAJOR << 8 | 1 << 4}, ++ { "ubdb", UBD_MAJOR << 8 | 1 << 4}, ++ { "ubd2", UBD_MAJOR << 8 | 2 << 4}, ++ { "ubdc", UBD_MAJOR << 8 | 2 << 4}, ++ { "ubd3", UBD_MAJOR << 8 | 3 << 4}, ++ { "ubdd", UBD_MAJOR << 8 | 3 << 4}, ++ { "ubd4", UBD_MAJOR << 8 | 4 << 4}, ++ { "ubde", UBD_MAJOR << 8 | 4 << 4}, ++ { "ubd5", UBD_MAJOR << 8 | 5 << 4}, ++ { "ubdf", UBD_MAJOR << 8 | 5 << 4}, ++ { "ubd6", UBD_MAJOR << 8 | 6 << 4}, ++ { "ubdg", UBD_MAJOR << 8 | 6 << 4}, ++ { "ubd7", UBD_MAJOR << 8 | 7 << 4}, ++ { "ubdh", UBD_MAJOR << 8 | 7 << 4}, + { "jsfd", JSFD_MAJOR << 8}, + #if defined(CONFIG_ARCH_S390) + { "dasda", (DASD_MAJOR << MINORBITS) }, +diff -Naur -X ../exclude-files orig/kernel/panic.c 
um/kernel/panic.c +--- orig/kernel/panic.c Thu Feb 27 13:04:29 2003 ++++ um/kernel/panic.c Thu Feb 27 13:05:27 2003 +@@ -66,7 +66,7 @@ + smp_send_stop(); + #endif + +- notifier_call_chain(&panic_notifier_list, 0, NULL); ++ notifier_call_chain(&panic_notifier_list, 0, buf); + + if (panic_timeout > 0) + { +diff -Naur -X ../exclude-files orig/mm/Makefile um/mm/Makefile +--- orig/mm/Makefile Wed Aug 21 11:47:43 2002 ++++ um/mm/Makefile Fri Nov 8 14:21:36 2002 +@@ -17,5 +17,6 @@ + shmem.o + + obj-$(CONFIG_HIGHMEM) += highmem.o ++obj-$(CONFIG_PROC_MM) += proc_mm.o + + include $(TOPDIR)/Rules.make +diff -Naur -X ../exclude-files orig/mm/mmap.c um/mm/mmap.c +--- orig/mm/mmap.c Thu Feb 27 13:04:29 2003 ++++ um/mm/mmap.c Thu Feb 27 13:05:27 2003 +@@ -390,10 +390,11 @@ + return 0; + } + +-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len, +- unsigned long prot, unsigned long flags, unsigned long pgoff) ++unsigned long do_mmap_pgoff(struct mm_struct *mm, struct file * file, ++ unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flags, ++ unsigned long pgoff) + { +- struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + unsigned int vm_flags; + int correct_wcount = 0; +diff -Naur -X ../exclude-files orig/mm/mprotect.c um/mm/mprotect.c +--- orig/mm/mprotect.c Wed Aug 21 11:47:43 2002 ++++ um/mm/mprotect.c Sun Nov 10 20:24:32 2002 +@@ -264,7 +264,8 @@ + return 0; + } + +-asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot) ++long do_mprotect(struct mm_struct *mm, unsigned long start, size_t len, ++ unsigned long prot) + { + unsigned long nstart, end, tmp; + struct vm_area_struct * vma, * next, * prev; +@@ -281,9 +282,9 @@ + if (end == start) + return 0; + +- down_write(¤t->mm->mmap_sem); ++ down_write(&mm->mmap_sem); + +- vma = find_vma_prev(current->mm, start, &prev); ++ vma = find_vma_prev(mm, start, &prev); + error = -ENOMEM; + if (!vma || vma->vm_start > start) + 
goto out; +@@ -332,6 +333,11 @@ + prev->vm_mm->map_count--; + } + out: +- up_write(¤t->mm->mmap_sem); ++ up_write(&mm->mmap_sem); + return error; ++} ++ ++asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot) ++{ ++ return(do_mprotect(current->mm, start, len, prot)); + } +diff -Naur -X ../exclude-files orig/mm/proc_mm.c um/mm/proc_mm.c +--- orig/mm/proc_mm.c Wed Dec 31 19:00:00 1969 ++++ um/mm/proc_mm.c Tue Nov 19 14:20:26 2002 +@@ -0,0 +1,173 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include "linux/init.h" ++#include "linux/proc_fs.h" ++#include "linux/proc_mm.h" ++#include "linux/file.h" ++#include "asm/uaccess.h" ++#include "asm/mmu_context.h" ++ ++static struct file_operations proc_mm_fops; ++ ++struct mm_struct *proc_mm_get_mm(int fd) ++{ ++ struct mm_struct *ret = ERR_PTR(-EBADF); ++ struct file *file; ++ ++ file = fget(fd); ++ if (!file) ++ goto out; ++ ++ ret = ERR_PTR(-EINVAL); ++ if(file->f_op != &proc_mm_fops) ++ goto out_fput; ++ ++ ret = file->private_data; ++ out_fput: ++ fput(file); ++ out: ++ return(ret); ++} ++ ++extern long do_mmap2(struct mm_struct *mm, unsigned long addr, ++ unsigned long len, unsigned long prot, ++ unsigned long flags, unsigned long fd, ++ unsigned long pgoff); ++ ++static ssize_t write_proc_mm(struct file *file, const char *buffer, ++ size_t count, loff_t *ppos) ++{ ++ struct mm_struct *mm = file->private_data; ++ struct proc_mm_op req; ++ int n, ret; ++ ++ if(count > sizeof(req)) ++ return(-EINVAL); ++ ++ n = copy_from_user(&req, buffer, count); ++ if(n != 0) ++ return(-EFAULT); ++ ++ ret = count; ++ switch(req.op){ ++ case MM_MMAP: { ++ struct mm_mmap *map = &req.u.mmap; ++ ++ ret = do_mmap2(mm, map->addr, map->len, map->prot, ++ map->flags, map->fd, map->offset >> PAGE_SHIFT); ++ if((ret & ~PAGE_MASK) == 0) ++ ret = count; ++ ++ break; ++ } ++ case MM_MUNMAP: { ++ struct mm_munmap *unmap = &req.u.munmap; ++ ++ down_write(&mm->mmap_sem); 
++ ret = do_munmap(mm, unmap->addr, unmap->len); ++ up_write(&mm->mmap_sem); ++ ++ if(ret == 0) ++ ret = count; ++ break; ++ } ++ case MM_MPROTECT: { ++ struct mm_mprotect *protect = &req.u.mprotect; ++ ++ ret = do_mprotect(mm, protect->addr, protect->len, ++ protect->prot); ++ if(ret == 0) ++ ret = count; ++ break; ++ } ++ ++ case MM_COPY_SEGMENTS: { ++ struct mm_struct *from = proc_mm_get_mm(req.u.copy_segments); ++ ++ if(IS_ERR(from)){ ++ ret = PTR_ERR(from); ++ break; ++ } ++ ++ mm_copy_segments(from, mm); ++ break; ++ } ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return(ret); ++} ++ ++static int open_proc_mm(struct inode *inode, struct file *file) ++{ ++ struct mm_struct *mm = mm_alloc(); ++ int ret; ++ ++ ret = -ENOMEM; ++ if(mm == NULL) ++ goto out_mem; ++ ++ ret = init_new_context(current, mm); ++ if(ret) ++ goto out_free; ++ ++ spin_lock(&mmlist_lock); ++ list_add(&mm->mmlist, ¤t->mm->mmlist); ++ mmlist_nr++; ++ spin_unlock(&mmlist_lock); ++ ++ file->private_data = mm; ++ ++ return(0); ++ ++ out_free: ++ mmput(mm); ++ out_mem: ++ return(ret); ++} ++ ++static int release_proc_mm(struct inode *inode, struct file *file) ++{ ++ struct mm_struct *mm = file->private_data; ++ ++ mmput(mm); ++ return(0); ++} ++ ++static struct file_operations proc_mm_fops = { ++ .open = open_proc_mm, ++ .release = release_proc_mm, ++ .write = write_proc_mm, ++}; ++ ++static int make_proc_mm(void) ++{ ++ struct proc_dir_entry *ent; ++ ++ ent = create_proc_entry("mm", 0222, &proc_root); ++ if(ent == NULL){ ++ printk("make_proc_mm : Failed to register /proc/mm\n"); ++ return(0); ++ } ++ ent->proc_fops = &proc_mm_fops; ++ ++ return(0); ++} ++ ++__initcall(make_proc_mm); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. 
++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -Naur -X ../exclude-files orig/mm/slab.c um/mm/slab.c +--- orig/mm/slab.c Thu Feb 27 13:04:29 2003 ++++ um/mm/slab.c Thu Feb 27 13:05:27 2003 +@@ -1946,10 +1946,14 @@ + + name = cachep->name; + { ++ mm_segment_t fs; + char tmp; ++ fs = get_fs(); ++ set_fs(KERNEL_DS); + if (__get_user(tmp, name)) + name = "broken"; +- } ++ set_fs(fs); ++ } + + seq_printf(m, "%-17s %6lu %6lu %6u %4lu %4lu %4u", + name, active_objs, num_objs, cachep->objsize, diff --git a/lustre/kernel_patches/patches/uml_check_get_page.patch b/lustre/kernel_patches/patches/uml_check_get_page.patch index fafdf90..228d086 100644 --- a/lustre/kernel_patches/patches/uml_check_get_page.patch +++ b/lustre/kernel_patches/patches/uml_check_get_page.patch @@ -1,11 +1,12 @@ - 0 files changed + arch/um/kernel/mem.c | 15 +++++++++++++++ + 1 files changed, 15 insertions(+) ---- linux-2.4.18-17.8.0/arch/um/kernel/mem.c~uml_check_get_page 2002-12-06 14:52:30.000000000 -0800 -+++ linux-2.4.18-17.8.0-zab/arch/um/kernel/mem.c 2002-12-06 14:52:30.000000000 -0800 -@@ -529,6 +529,21 @@ struct page *pte_mem_map(pte_t pte) +--- linux-2.4.20/arch/um/kernel/mem.c~uml_check_get_page 2003-04-08 23:34:50.000000000 -0600 ++++ linux-2.4.20-braam/arch/um/kernel/mem.c 2003-04-08 23:34:50.000000000 -0600 +@@ -712,6 +712,21 @@ struct page *pte_mem_map(pte_t pte) return(phys_mem_map(pte_val(pte))); } diff --git a/lustre/kernel_patches/patches/uml_no_panic.patch b/lustre/kernel_patches/patches/uml_no_panic.patch index b0c305b..59069f9 100644 --- a/lustre/kernel_patches/patches/uml_no_panic.patch +++ b/lustre/kernel_patches/patches/uml_no_panic.patch @@ -1,11 +1,12 @@ - 0 files changed + arch/um/kernel/mem.c | 8 ++++++-- + 1 files changed, 6 insertions(+), 2 deletions(-) ---- linux-2.4.18-17.8.0/arch/um/kernel/mem.c~uml_no_panic 2002-12-06 14:52:30.000000000 -0800 -+++ 
linux-2.4.18-17.8.0-zab/arch/um/kernel/mem.c 2002-12-06 14:52:30.000000000 -0800 -@@ -559,7 +559,9 @@ struct mem_region *page_region(struct pa +--- linux-2.4.20/arch/um/kernel/mem.c~uml_no_panic 2003-04-08 23:34:57.000000000 -0600 ++++ linux-2.4.20-braam/arch/um/kernel/mem.c 2003-04-08 23:34:57.000000000 -0600 +@@ -742,7 +742,9 @@ struct mem_region *page_region(struct pa return(region); } } @@ -16,7 +17,7 @@ return(NULL); } -@@ -581,7 +583,9 @@ unsigned long region_pa(void *virt) +@@ -814,7 +816,9 @@ extern unsigned long region_pa(void *vir (addr <= region->start + region->len)) return(mk_phys(addr - region->start, i)); } diff --git a/lustre/kernel_patches/patches/vanilla-2.4.18.patch b/lustre/kernel_patches/patches/vanilla-2.4.18.patch deleted file mode 100644 index 00cc57c..0000000 --- a/lustre/kernel_patches/patches/vanilla-2.4.18.patch +++ /dev/null @@ -1,1672 +0,0 @@ ---- lum-pristine/include/linux/lustre_version.h Wed Dec 31 19:00:00 1969 -+++ lum/include/linux/lustre_version.h Tue Nov 26 07:02:14 2002 -@@ -0,0 +1,1 @@ -+#define LUSTRE_KERNEL_VERSION 5 ---- lum-pristine/arch/ia64/mm/init.c Fri Nov 9 17:26:17 2001 -+++ lum/arch/ia64/mm/init.c Thu Aug 1 18:07:35 2002 -@@ -37,6 +37,12 @@ - - static unsigned long totalram_pages; - -+struct page *check_get_page(unsigned long kaddr) -+{ -+#warning FIXME: Lustre team, is this solid? -+ return virt_to_page(kaddr); -+} -+ - int - do_check_pgt_cache (int low, int high) - { ---- lum-pristine/arch/i386/mm/init.c Fri Dec 21 12:41:53 2001 -+++ lum/arch/i386/mm/init.c Thu Aug 1 18:07:35 2002 -@@ -43,6 +43,12 @@ - static unsigned long totalram_pages; - static unsigned long totalhigh_pages; - -+struct page *check_get_page(unsigned long kaddr) -+{ -+#warning FIXME: Lustre team, is this solid? 
-+ return virt_to_page(kaddr); -+} -+ - int do_check_pgt_cache(int low, int high) - { - int freed = 0; ---- lum-pristine/drivers/block/blkpg.c Mon Feb 25 14:37:57 2002 -+++ lum/drivers/block/blkpg.c Thu Aug 1 18:07:35 2002 -@@ -294,3 +294,38 @@ - } - - EXPORT_SYMBOL(blk_ioctl); -+ -+#define NUM_DEV_NO_WRITE 16 -+static int dev_no_write[NUM_DEV_NO_WRITE]; -+ -+/* -+ * Debug code for turning block devices "read-only" (will discard writes -+ * silently). This is for filesystem crash/recovery testing. -+ */ -+void dev_set_rdonly(kdev_t dev, int no_write) -+{ -+ if (dev) { -+ printk(KERN_WARNING "Turning device %s read-only\n", -+ bdevname(dev)); -+ dev_no_write[no_write] = 0xdead0000 + dev; -+ } -+} -+ -+int dev_check_rdonly(kdev_t dev) { -+ int i; -+ -+ for (i = 0; i < NUM_DEV_NO_WRITE; i++) { -+ if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 && -+ dev == (dev_no_write[i] & 0xffff)) -+ return 1; -+ } -+ return 0; -+} -+ -+void dev_clear_rdonly(int no_write) { -+ dev_no_write[no_write] = 0; -+} -+ -+EXPORT_SYMBOL(dev_set_rdonly); -+EXPORT_SYMBOL(dev_check_rdonly); -+EXPORT_SYMBOL(dev_clear_rdonly); ---- lum-pristine/drivers/block/loop.c Fri Dec 21 12:41:53 2001 -+++ lum/drivers/block/loop.c Thu Aug 1 18:07:35 2002 -@@ -471,6 +471,11 @@ - spin_unlock_irq(&lo->lo_lock); - - if (rw == WRITE) { -+#ifdef CONFIG_DEV_RDONLY -+ if (dev_check_rdonly(rbh->b_rdev)) -+ goto err; -+#endif -+ - if (lo->lo_flags & LO_FLAGS_READ_ONLY) - goto err; - } else if (rw == READA) { ---- lum-pristine/drivers/ide/ide-disk.c Fri Dec 21 12:41:54 2001 -+++ lum/drivers/ide/ide-disk.c Thu Aug 1 18:07:35 2002 -@@ -367,6 +367,12 @@ - */ - static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) - { -+#ifdef CONFIG_DEV_RDONLY -+ if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) { -+ ide_end_request(1, HWGROUP(drive)); -+ return ide_stopped; -+ } -+#endif - if (IDE_CONTROL_REG) - OUT_BYTE(drive->ctl,IDE_CONTROL_REG); - OUT_BYTE(0x00, IDE_FEATURE_REG); ---- 
lum-pristine/fs/ext3/Makefile Fri Dec 21 12:41:55 2001 -+++ lum/fs/ext3/Makefile Thu Aug 1 18:07:35 2002 -@@ -9,6 +9,8 @@ - - O_TARGET := ext3.o - -+export-objs := super.o -+ - obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o - obj-m := $(O_TARGET) ---- lum-pristine/fs/ext3/super.c Mon Feb 25 14:38:08 2002 -+++ lum/fs/ext3/super.c Thu Aug 1 18:07:35 2002 -@@ -1744,7 +1744,7 @@ - unregister_filesystem(&ext3_fs_type); - } - --EXPORT_NO_SYMBOLS; -+EXPORT_SYMBOL(ext3_bread); - - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); ---- lum-pristine/fs/jbd/commit.c Mon Feb 25 14:38:08 2002 -+++ lum/fs/jbd/commit.c Thu Aug 1 18:07:35 2002 -@@ -475,7 +475,7 @@ - transaction's t_log_list queue, and metadata buffers are on - the t_iobuf_list queue. - -- Wait for the transactions in reverse order. That way we are -+ Wait for the buffers in reverse order. That way we are - less likely to be woken up until all IOs have completed, and - so we incur less scheduling load. - */ -@@ -566,8 +566,10 @@ - - jbd_debug(3, "JBD: commit phase 6\n"); - -- if (is_journal_aborted(journal)) -+ if (is_journal_aborted(journal)) { -+ unlock_journal(journal); - goto skip_commit; -+ } - - /* Done it all: now write the commit record. We should have - * cleaned up our previous buffers by now, so if we are in abort -@@ -577,6 +579,7 @@ - descriptor = journal_get_descriptor_buffer(journal); - if (!descriptor) { - __journal_abort_hard(journal); -+ unlock_journal(journal); - goto skip_commit; - } - -@@ -600,7 +603,6 @@ - put_bh(bh); /* One for getblk() */ - journal_unlock_journal_head(descriptor); - } -- lock_journal(journal); - - /* End of a transaction! 
Finally, we can do checkpoint - processing: any buffers committed as a result of this -@@ -609,6 +611,25 @@ - - skip_commit: - -+ /* Call any callbacks that had been registered for handles in this -+ * transaction. It is up to the callback to free any allocated -+ * memory. -+ */ -+ if (!list_empty(&commit_transaction->t_jcb)) { -+ struct list_head *p, *n; -+ int error = is_journal_aborted(journal); -+ -+ list_for_each_safe(p, n, &commit_transaction->t_jcb) { -+ struct journal_callback *jcb; -+ -+ jcb = list_entry(p, struct journal_callback, jcb_list); -+ list_del(p); -+ jcb->jcb_func(jcb, error); -+ } -+ } -+ -+ lock_journal(journal); -+ - jbd_debug(3, "JBD: commit phase 7\n"); - - J_ASSERT(commit_transaction->t_sync_datalist == NULL); ---- lum-pristine/fs/jbd/journal.c Mon Feb 25 14:38:08 2002 -+++ lum/fs/jbd/journal.c Thu Aug 1 18:07:35 2002 -@@ -58,6 +58,7 @@ - #endif - EXPORT_SYMBOL(journal_flush); - EXPORT_SYMBOL(journal_revoke); -+EXPORT_SYMBOL(journal_callback_set); - - EXPORT_SYMBOL(journal_init_dev); - EXPORT_SYMBOL(journal_init_inode); ---- lum-pristine/fs/jbd/transaction.c Mon Feb 25 14:38:08 2002 -+++ lum/fs/jbd/transaction.c Thu Aug 1 18:07:35 2002 -@@ -57,6 +57,7 @@ - transaction->t_state = T_RUNNING; - transaction->t_tid = journal->j_transaction_sequence++; - transaction->t_expires = jiffies + journal->j_commit_interval; -+ INIT_LIST_HEAD(&transaction->t_jcb); - - /* Set up the commit timer for the new transaction. */ - J_ASSERT (!journal->j_commit_timer_active); -@@ -201,6 +202,20 @@ - return 0; - } - -+/* Allocate a new handle. This should probably be in a slab... */ -+static handle_t *new_handle(int nblocks) -+{ -+ handle_t *handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ if (!handle) -+ return NULL; -+ memset(handle, 0, sizeof (handle_t)); -+ handle->h_buffer_credits = nblocks; -+ handle->h_ref = 1; -+ INIT_LIST_HEAD(&handle->h_jcb); -+ -+ return handle; -+} -+ - /* - * Obtain a new handle. 
- * -@@ -227,14 +242,11 @@ - handle->h_ref++; - return handle; - } -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = start_this_handle(journal, handle); -@@ -333,14 +345,11 @@ - - if (is_journal_aborted(journal)) - return ERR_PTR(-EIO); -- -- handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); -+ -+ handle = new_handle(nblocks); - if (!handle) - return ERR_PTR(-ENOMEM); -- memset (handle, 0, sizeof (handle_t)); - -- handle->h_buffer_credits = nblocks; -- handle->h_ref = 1; - current->journal_info = handle; - - err = try_start_this_handle(journal, handle); -@@ -1328,6 +1337,28 @@ - #endif - - /* -+ * Register a callback function for this handle. The function will be -+ * called when the transaction that this handle is part of has been -+ * committed to disk with the original callback data struct and the -+ * error status of the journal as parameters. There is no guarantee of -+ * ordering between handles within a single transaction, nor between -+ * callbacks registered on the same handle. -+ * -+ * The caller is responsible for allocating the journal_callback struct. -+ * This is to allow the caller to add as much extra data to the callback -+ * as needed, but reduce the overhead of multiple allocations. The caller -+ * allocated struct must start with a struct journal_callback at offset 0, -+ * and has the caller-specific data afterwards. -+ */ -+void journal_callback_set(handle_t *handle, -+ void (*func)(struct journal_callback *jcb, int error), -+ struct journal_callback *jcb) -+{ -+ list_add(&jcb->jcb_list, &handle->h_jcb); -+ jcb->jcb_func = func; -+} -+ -+/* - * All done for a particular handle. - * - * There is not much action needed here. 
We just return any remaining -@@ -1383,7 +1415,10 @@ - wake_up(&journal->j_wait_transaction_locked); - } - -- /* -+ /* Move callbacks from the handle to the transaction. */ -+ list_splice(&handle->h_jcb, &transaction->t_jcb); -+ -+ /* - * If the handle is marked SYNC, we need to set another commit - * going! We also want to force a commit if the current - * transaction is occupying too much of the log, or if the ---- lum-pristine/include/linux/blkdev.h Mon Nov 26 08:29:17 2001 -+++ lum/include/linux/blkdev.h Mon Aug 12 11:48:39 2002 -@@ -228,4 +228,8 @@ - return retval; - } - -+#define CONFIG_DEV_RDONLY -+void dev_set_rdonly(kdev_t, int); -+int dev_check_rdonly(kdev_t); -+void dev_clear_rdonly(int); - #endif ---- lum-pristine/include/linux/slab.h Fri Dec 21 12:42:04 2001 -+++ lum/include/linux/slab.h Mon Aug 12 11:48:38 2002 -@@ -57,6 +57,7 @@ - extern int kmem_cache_shrink(kmem_cache_t *); - extern void *kmem_cache_alloc(kmem_cache_t *, int); - extern void kmem_cache_free(kmem_cache_t *, void *); -+extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp); - - extern void *kmalloc(size_t, int); - extern void kfree(const void *); ---- lum-pristine/include/linux/jbd.h Mon Feb 25 14:38:13 2002 -+++ lum/include/linux/jbd.h Mon Aug 12 11:50:09 2002 -@@ -249,6 +249,13 @@ - return bh->b_private; - } - -+#define HAVE_JOURNAL_CALLBACK_STATUS -+struct journal_callback { -+ struct list_head jcb_list; -+ void (*jcb_func)(struct journal_callback *jcb, int error); -+ /* user data goes here */ -+}; -+ - struct jbd_revoke_table_s; - - /* The handle_t type represents a single atomic update being performed -@@ -279,6 +286,12 @@ - operations */ - int h_err; - -+ /* List of application registered callbacks for this handle. -+ * The function(s) will be called after the transaction that -+ * this handle is part of has been committed to disk. 
-+ */ -+ struct list_head h_jcb; -+ - /* Flags */ - unsigned int h_sync: 1; /* sync-on-close */ - unsigned int h_jdata: 1; /* force data journaling */ -@@ -398,6 +411,10 @@ - - /* How many handles used this transaction? */ - int t_handle_count; -+ -+ /* List of registered callback functions for this transaction. -+ * Called when the transaction is committed. */ -+ struct list_head t_jcb; - }; - - -@@ -646,6 +663,9 @@ - extern int journal_try_to_free_buffers(journal_t *, struct page *, int); - extern int journal_stop(handle_t *); - extern int journal_flush (journal_t *); -+extern void journal_callback_set(handle_t *handle, -+ void (*fn)(struct journal_callback *,int), -+ struct journal_callback *jcb); - - extern void journal_lock_updates (journal_t *); - extern void journal_unlock_updates (journal_t *); ---- lum-pristine/kernel/ksyms.c Mon Feb 25 14:38:13 2002 -+++ lum/kernel/ksyms.c Thu Aug 1 18:07:35 2002 -@@ -260,6 +260,7 @@ - EXPORT_SYMBOL(set_page_dirty); - EXPORT_SYMBOL(vfs_readlink); - EXPORT_SYMBOL(vfs_follow_link); -+EXPORT_SYMBOL(vfs_follow_link_it); - EXPORT_SYMBOL(page_readlink); - EXPORT_SYMBOL(page_follow_link); - EXPORT_SYMBOL(page_symlink_inode_operations); -@@ -271,6 +272,12 @@ - EXPORT_SYMBOL(lock_may_write); - EXPORT_SYMBOL(dcache_readdir); - -+/* lustre */ -+EXPORT_SYMBOL(panic_notifier_list); -+EXPORT_SYMBOL(pagecache_lock); -+EXPORT_SYMBOL(do_kern_mount); -+EXPORT_SYMBOL(kmem_cache_validate); -+ - /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) 
*/ - EXPORT_SYMBOL(default_llseek); - EXPORT_SYMBOL(dentry_open); ---- lum-pristine/include/linux/dcache.h Thu Nov 22 14:46:18 2001 -+++ lum/include/linux/dcache.h Mon Aug 12 00:02:29 2002 -@@ -6,6 +6,34 @@ - #include <asm/atomic.h> - #include <linux/mount.h> - -+#define IT_OPEN (1) -+#define IT_CREAT (1<<1) -+#define IT_MKDIR (1<<2) -+#define IT_LINK (1<<3) -+#define IT_LINK2 (1<<4) -+#define IT_SYMLINK (1<<5) -+#define IT_UNLINK (1<<6) -+#define IT_RMDIR (1<<7) -+#define IT_RENAME (1<<8) -+#define IT_RENAME2 (1<<9) -+#define IT_READDIR (1<<10) -+#define IT_GETATTR (1<<11) -+#define IT_SETATTR (1<<12) -+#define IT_READLINK (1<<13) -+#define IT_MKNOD (1<<14) -+#define IT_LOOKUP (1<<15) -+ -+struct lookup_intent { -+ int it_op; -+ int it_mode; -+ int it_disposition; -+ int it_status; -+ struct iattr *it_iattr; -+ __u64 it_lock_handle[2]; -+ int it_lock_mode; -+ void *it_data; -+}; -+ - /* - * linux/include/linux/dcache.h - * -@@ -78,6 +106,7 @@ - unsigned long d_time; /* used by d_revalidate */ - struct dentry_operations *d_op; - struct super_block * d_sb; /* The root of the dentry tree */ -+ struct lookup_intent *d_it; - unsigned long d_vfs_flags; - void * d_fsdata; /* fs-specific data */ - unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ -@@ -91,6 +119,8 @@ - int (*d_delete)(struct dentry *); - void (*d_release)(struct dentry *); - void (*d_iput)(struct dentry *, struct inode *); -+ int (*d_revalidate2)(struct dentry *, int, struct lookup_intent *); -+ void (*d_intent_release)(struct dentry *, struct lookup_intent *); - }; - - /* the dentry parameter passed to d_hash and d_compare is the parent ---- lum-pristine/include/linux/fs.h Mon Aug 12 11:02:53 2002 -+++ lum/include/linux/fs.h Mon Aug 12 11:48:38 2002 -@@ -536,6 +536,7 @@ - - /* needed for tty driver, and maybe others */ - void *private_data; -+ struct lookup_intent *f_intent; - - /* preallocated helper kiobuf to speedup O_DIRECT */ - struct kiobuf *f_iobuf; -@@ -779,7 +780,9 @@ - extern int 
vfs_link(struct dentry *, struct inode *, struct dentry *); - extern int vfs_rmdir(struct inode *, struct dentry *); - extern int vfs_unlink(struct inode *, struct dentry *); --extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, -+ struct inode *new_dir, struct dentry *new_dentry, -+ struct lookup_intent *it); - - /* - * File types -@@ -840,6 +843,7 @@ - struct inode_operations { - int (*create) (struct inode *,struct dentry *,int); - struct dentry * (*lookup) (struct inode *,struct dentry *); -+ struct dentry * (*lookup2) (struct inode *,struct dentry *, struct lookup_intent *); - int (*link) (struct dentry *,struct inode *,struct dentry *); - int (*unlink) (struct inode *,struct dentry *); - int (*symlink) (struct inode *,struct dentry *,const char *); -@@ -850,6 +854,8 @@ - struct inode *, struct dentry *); - int (*readlink) (struct dentry *, char *,int); - int (*follow_link) (struct dentry *, struct nameidata *); -+ int (*follow_link2) (struct dentry *, struct nameidata *, -+ struct lookup_intent *it); - void (*truncate) (struct inode *); - int (*permission) (struct inode *, int); - int (*revalidate) (struct dentry *); -@@ -986,7 +990,7 @@ - extern struct vfsmount *kern_mount(struct file_system_type *); - extern int may_umount(struct vfsmount *); - extern long do_mount(char *, char *, char *, unsigned long, void *); -- -+struct vfsmount *do_kern_mount(char *type, int flags, char *name, void *data); - #define kern_umount mntput - - extern int vfs_statfs(struct super_block *, struct statfs *); -@@ -1307,6 +1311,7 @@ - extern loff_t default_llseek(struct file *file, loff_t offset, int origin); - - extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); -+extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it)); - extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); - 
extern int FASTCALL(path_walk(const char *, struct nameidata *)); - extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); -@@ -1317,6 +1322,8 @@ - extern struct dentry * lookup_hash(struct qstr *, struct dentry *); - #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) - #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) -+#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it) -+#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it) - - extern void iput(struct inode *); - extern void force_delete(struct inode *); -@@ -1422,6 +1428,8 @@ - - extern int vfs_readlink(struct dentry *, char *, int, const char *); - extern int vfs_follow_link(struct nameidata *, const char *); -+extern int vfs_follow_link_it(struct nameidata *, const char *, -+ struct lookup_intent *it); - extern int page_readlink(struct dentry *, char *, int); - extern int page_follow_link(struct dentry *, struct nameidata *); - extern struct inode_operations page_symlink_inode_operations; ---- lum-pristine/fs/dcache.c Mon Feb 25 14:38:08 2002 -+++ lum/fs/dcache.c Thu Aug 1 18:07:35 2002 -@@ -617,6 +617,7 @@ - dentry->d_op = NULL; - dentry->d_fsdata = NULL; - dentry->d_mounted = 0; -+ dentry->d_it = NULL; - INIT_LIST_HEAD(&dentry->d_hash); - INIT_LIST_HEAD(&dentry->d_lru); - INIT_LIST_HEAD(&dentry->d_subdirs); ---- lum-pristine/fs/nfsd/vfs.c Fri Dec 21 12:41:55 2001 -+++ lum/fs/nfsd/vfs.c Thu Aug 1 18:07:35 2002 -@@ -1285,7 +1285,7 @@ - err = nfserr_perm; - } else - #endif -- err = vfs_rename(fdir, odentry, tdir, ndentry); -+ err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); - if (!err && EX_ISSYNC(tfhp->fh_export)) { - nfsd_sync_dir(tdentry); - nfsd_sync_dir(fdentry); ---- lum-pristine/fs/namei.c Mon Feb 25 14:38:09 2002 -+++ lum/fs/namei.c Mon Aug 12 11:47:56 2002 -@@ -94,6 +94,12 @@ - * XEmacs seems to be relying on it... 
- */ - -+void intent_release(struct dentry *de, struct lookup_intent *it) -+{ -+ if (it && de->d_op && de->d_op->d_intent_release) -+ de->d_op->d_intent_release(de, it); -+} -+ - /* In order to reduce some races, while at the same time doing additional - * checking and hopefully speeding things up, we copy filenames to the - * kernel data space before using them.. -@@ -260,10 +268,19 @@ - * Internal lookup() using the new generic dcache. - * SMP-safe - */ --static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * dentry = d_lookup(parent, name); - -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) { -+ if (!dentry->d_op->d_revalidate2(dentry, flags, it) && -+ !d_invalidate(dentry)) { -+ dput(dentry); -+ dentry = NULL; -+ } -+ return dentry; -+ } else - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { - dput(dentry); -@@ -281,7 +298,8 @@ - * make sure that nobody added the entry to the dcache in the meantime.. 
- * SMP-safe - */ --static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *real_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * result; - struct inode *dir = parent->d_inode; -@@ -300,6 +318,9 @@ - result = ERR_PTR(-ENOMEM); - if (dentry) { - lock_kernel(); -+ if (dir->i_op->lookup2) -+ result = dir->i_op->lookup2(dir, dentry, it); -+ else - result = dir->i_op->lookup(dir, dentry); - unlock_kernel(); - if (result) -@@ -321,6 +342,12 @@ - dput(result); - result = ERR_PTR(-ENOENT); - } -+ } else if (result->d_op && result->d_op->d_revalidate2) { -+ if (!result->d_op->d_revalidate2(result, flags, it) && -+ !d_invalidate(result)) { -+ dput(result); -+ result = ERR_PTR(-ENOENT); -+ } - } - return result; - } -@@ -334,7 +361,8 @@ - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. - */ --static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) -+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, -+ struct lookup_intent *it) - { - int err; - if (current->link_count >= max_recursive_link) -@@ -348,10 +376,14 @@ - current->link_count++; - current->total_link_count++; - UPDATE_ATIME(dentry->d_inode); -- err = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (dentry->d_inode->i_op->follow_link2) -+ err = dentry->d_inode->i_op->follow_link2(dentry, nd, it); -+ else -+ err = dentry->d_inode->i_op->follow_link(dentry, nd); - current->link_count--; - return err; - loop: -+ intent_release(dentry, it); - path_release(nd); - return -ELOOP; - } -@@ -445,7 +472,8 @@ - * - * We expect 'base' to be positive and a directory. 
- */ --int link_path_walk(const char * name, struct nameidata *nd) -+int link_path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it) - { - struct dentry *dentry; - struct inode *inode; -@@ -518,9 +546,9 @@ - break; - } - /* This does the actual lookups.. */ -- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); - if (!dentry) { -- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -537,8 +570,8 @@ - if (!inode->i_op) - goto out_dput; - -- if (inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ if (inode->i_op->follow_link || inode->i_op->follow_link2) { -+ err = do_follow_link(dentry, nd, NULL); - dput(dentry); - if (err) - goto return_err; -@@ -554,7 +582,7 @@ - nd->dentry = dentry; - } - err = -ENOTDIR; -- if (!inode->i_op->lookup) -+ if (!inode->i_op->lookup && !inode->i_op->lookup2) - break; - continue; - /* here ends the main loop */ -@@ -581,9 +609,9 @@ - if (err < 0) - break; - } -- dentry = cached_lookup(nd->dentry, &this, 0); -+ dentry = cached_lookup(nd->dentry, &this, 0, it); - if (!dentry) { -- dentry = real_lookup(nd->dentry, &this, 0); -+ dentry = real_lookup(nd->dentry, &this, 0, it); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -591,9 +625,9 @@ - while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)) - ; - inode = dentry->d_inode; -- if ((lookup_flags & LOOKUP_FOLLOW) -- && inode && inode->i_op && inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ if ((lookup_flags & LOOKUP_FOLLOW) && inode && inode->i_op && -+ (inode->i_op->follow_link || inode->i_op->follow_link2)) { -+ err = do_follow_link(dentry, nd, it); - dput(dentry); - if (err) - goto return_err; -@@ -607,7 +635,8 @@ - goto no_inode; - if (lookup_flags & LOOKUP_DIRECTORY) { - err = -ENOTDIR; -- if 
(!inode->i_op || !inode->i_op->lookup) -+ if (!inode->i_op || -+ (!inode->i_op->lookup && !inode->i_op->lookup2)) - break; - } - goto return_base; -@@ -630,12 +660,23 @@ - return err; - } - -+int link_path_walk(const char * name, struct nameidata *nd) -+{ -+ return link_path_walk_it(name, nd, NULL); -+} -+ -+int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it) -+{ -+ current->total_link_count = 0; -+ return link_path_walk_it(name, nd, it); -+} -+ - int path_walk(const char * name, struct nameidata *nd) - { - current->total_link_count = 0; -- return link_path_walk(name, nd); -+ return link_path_walk_it(name, nd, NULL); - } - - /* SMP-safe */ - /* returns 1 if everything is done */ - static int __emul_lookup_dentry(const char *name, struct nameidata *nd) -@@ -742,7 +786,8 @@ - * needs parent already locked. Doesn't follow mounts. - * SMP-safe. - */ --struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base, -+ struct lookup_intent *it) - { - struct dentry * dentry; - struct inode *inode; -@@ -765,13 +810,16 @@ - goto out; - } - -- dentry = cached_lookup(base, name, 0); -+ dentry = cached_lookup(base, name, 0, it); - if (!dentry) { - struct dentry *new = d_alloc(base, name); - dentry = ERR_PTR(-ENOMEM); - if (!new) - goto out; - lock_kernel(); -+ if (inode->i_op->lookup2) -+ dentry = inode->i_op->lookup2(inode, new, it); -+ else - dentry = inode->i_op->lookup(inode, new); - unlock_kernel(); - if (!dentry) -@@ -783,6 +831,12 @@ - return dentry; - } - -+struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+{ -+ return lookup_hash_it(name, base, NULL); -+} -+ -+ - /* SMP-safe */ - struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) - { -@@ -804,7 +858,7 @@ - } - this.hash = end_name_hash(hash); - -- return lookup_hash(&this, base); -+ return lookup_hash_it(&this, base, NULL); - access: - return ERR_PTR(-EACCES); 
- } -@@ -836,6 +890,23 @@ - return err; - } - -+int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd, -+ struct lookup_intent *it) -+{ -+ char *tmp; -+ int err; -+ -+ tmp = getname(name); -+ err = PTR_ERR(tmp); -+ if (!IS_ERR(tmp)) { -+ err = 0; -+ if (path_init(tmp, flags, nd)) -+ err = path_walk_it(tmp, nd, it); -+ putname(tmp); -+ } -+ return err; -+} -+ - /* - * It's inline, so penalty for filesystems that don't use sticky bit is - * minimal. -@@ -970,7 +1041,8 @@ - * for symlinks (where the permissions are checked later). - * SMP-safe - */ --int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) -+int open_namei_it(const char *pathname, int flag, int mode, -+ struct nameidata *nd, struct lookup_intent *it) - { - int acc_mode, error = 0; - struct inode *inode; -@@ -985,7 +1057,7 @@ - */ - if (!(flag & O_CREAT)) { - if (path_init(pathname, lookup_flags(flag), nd)) -- error = path_walk(pathname, nd); -+ error = path_walk_it(pathname, nd, it); - if (error) - return error; - dentry = nd->dentry; -@@ -994,6 +1067,10 @@ - /* - * Create - we need to know the parent. 
- */ -+ if (it) { -+ it->it_mode = mode; -+ it->it_op |= IT_CREAT; -+ } - if (path_init(pathname, LOOKUP_PARENT, nd)) - error = path_walk(pathname, nd); - if (error) -@@ -1011,7 +1089,7 @@ - - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - - do_last: - error = PTR_ERR(dentry); -@@ -1020,6 +1098,7 @@ - goto exit; - } - -+ it->it_mode = mode; - /* Negative dentry, just create the file */ - if (!dentry->d_inode) { - error = vfs_create(dir->d_inode, dentry, -@@ -1053,7 +1134,8 @@ - error = -ENOENT; - if (!dentry->d_inode) - goto exit_dput; -- if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link) -+ if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link || -+ dentry->d_inode->i_op->follow_link2)) - goto do_link; - - dput(nd->dentry); -@@ -1139,8 +1219,10 @@ - return 0; - - exit_dput: -+ intent_release(dentry, it); - dput(dentry); - exit: -+ intent_release(nd->dentry, it); - path_release(nd); - return error; - -@@ -1160,7 +1242,12 @@ - * are done. Procfs-like symlinks just set LAST_BIND. 
- */ - UPDATE_ATIME(dentry->d_inode); -- error = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (dentry->d_inode->i_op->follow_link2) -+ error = dentry->d_inode->i_op->follow_link2(dentry, nd, it); -+ else -+ error = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (error) -+ intent_release(dentry, it); - dput(dentry); - if (error) - return error; -@@ -1181,13 +1265,20 @@ - } - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - putname(nd->last.name); - goto do_last; - } - -+int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) -+{ -+ return open_namei_it(pathname, flag, mode, nd, NULL); -+} -+ -+ - /* SMP-safe */ --static struct dentry *lookup_create(struct nameidata *nd, int is_dir) -+static struct dentry *lookup_create(struct nameidata *nd, int is_dir, -+ struct lookup_intent *it) - { - struct dentry *dentry; - -@@ -1195,7 +1286,7 @@ - dentry = ERR_PTR(-EEXIST); - if (nd->last_type != LAST_NORM) - goto fail; -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - if (IS_ERR(dentry)) - goto fail; - if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1241,6 +1332,7 @@ - char * tmp; - struct dentry * dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_MKNOD, .it_mode = mode }; - - if (S_ISDIR(mode)) - return -EPERM; -@@ -1252,7 +1344,7 @@ - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ dentry = lookup_create(&nd, 0, &it); - error = PTR_ERR(dentry); - - mode &= ~current->fs->umask; -@@ -1270,6 +1363,7 @@ - default: - error = -EINVAL; - } -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1310,6 +1404,7 @@ - { - int error = 0; - char * tmp; -+ struct lookup_intent it = { .it_op = IT_MKDIR, .it_mode = mode }; - - tmp = getname(pathname); - error = PTR_ERR(tmp); 
-@@ -1321,11 +1416,12 @@ - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 1); -+ dentry = lookup_create(&nd, 1, &it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_mkdir(nd.dentry->d_inode, dentry, - mode & ~current->fs->umask); -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1407,6 +1504,7 @@ - char * name; - struct dentry *dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_RMDIR }; - - name = getname(pathname); - if(IS_ERR(name)) -@@ -1429,10 +1527,11 @@ - goto exit1; - } - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_rmdir(nd.dentry->d_inode, dentry); -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1476,6 +1576,7 @@ - char * name; - struct dentry *dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_UNLINK }; - - name = getname(pathname); - if(IS_ERR(name)) -@@ -1489,14 +1590,15 @@ - if (nd.last_type != LAST_NORM) - goto exit1; - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - /* Why not before? 
Because we want correct error value */ - if (nd.last.name[nd.last.len]) - goto slashes; - error = vfs_unlink(nd.dentry->d_inode, dentry); - exit2: -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1543,6 +1646,7 @@ - int error = 0; - char * from; - char * to; -+ struct lookup_intent it = { .it_op = IT_SYMLINK }; - - from = getname(oldname); - if(IS_ERR(from)) -@@ -1557,10 +1661,12 @@ - error = path_walk(to, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ it.it_data = from; -+ dentry = lookup_create(&nd, 0, &it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_symlink(nd.dentry->d_inode, dentry, from); -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1626,6 +1732,7 @@ - int error; - char * from; - char * to; -+ struct lookup_intent it = { .it_op = IT_LINK }; - - from = getname(oldname); - if(IS_ERR(from)) -@@ -1639,7 +1745,7 @@ - - error = 0; - if (path_init(from, LOOKUP_POSITIVE, &old_nd)) -- error = path_walk(from, &old_nd); -+ error = path_walk_it(from, &old_nd, &it); - if (error) - goto exit; - if (path_init(to, LOOKUP_PARENT, &nd)) -@@ -1648,10 +1755,12 @@ - error = -EXDEV; - if (old_nd.mnt != nd.mnt) - goto out_release; -- new_dentry = lookup_create(&nd, 0); -+ it.it_op = IT_LINK2; -+ new_dentry = lookup_create(&nd, 0, &it); - error = PTR_ERR(new_dentry); - if (!IS_ERR(new_dentry)) { - error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); -+ intent_release(new_dentry, &it); - dput(new_dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1694,7 +1803,8 @@ - * locking]. 
- */ - int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry, -+ struct lookup_intent *it) - { - int error; - struct inode *target; -@@ -1754,6 +1864,7 @@ - error = -EBUSY; - else - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); -+ intent_release(new_dentry, it); - if (target) { - if (!error) - target->i_flags |= S_DEAD; -@@ -1775,7 +1887,8 @@ - } - - int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry, -+ struct lookup_intent *it) - { - int error; - -@@ -1806,6 +1919,7 @@ - error = -EBUSY; - else - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); -+ intent_release(new_dentry, it); - double_up(&old_dir->i_zombie, &new_dir->i_zombie); - if (error) - return error; -@@ -1817,13 +1932,14 @@ - } - - int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry, -+ struct lookup_intent *it) - { - int error; - if (S_ISDIR(old_dentry->d_inode->i_mode)) -- error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); -+ error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry,it); - else -- error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); -+ error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,it); - if (!error) { - if (old_dir == new_dir) - inode_dir_notify(old_dir, DN_RENAME); -@@ -1840,6 +1956,7 @@ - int error = 0; - struct dentry * old_dir, * new_dir; - struct dentry * old_dentry, *new_dentry; -+ struct lookup_intent it = { .it_op = IT_RENAME }; - struct nameidata oldnd, newnd; - - if (path_init(oldname, LOOKUP_PARENT, &oldnd)) -@@ -1868,7 +1985,7 @@ - - double_lock(new_dir, old_dir); - -- old_dentry = lookup_hash(&oldnd.last, old_dir); -+ 
old_dentry = lookup_hash_it(&oldnd.last, old_dir, &it); - error = PTR_ERR(old_dentry); - if (IS_ERR(old_dentry)) - goto exit3; -@@ -1884,18 +2003,21 @@ - if (newnd.last.name[newnd.last.len]) - goto exit4; - } -- new_dentry = lookup_hash(&newnd.last, new_dir); -+ it.it_op = IT_RENAME2; -+ new_dentry = lookup_hash_it(&newnd.last, new_dir, &it); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; - - lock_kernel(); - error = vfs_rename(old_dir->d_inode, old_dentry, -- new_dir->d_inode, new_dentry); -+ new_dir->d_inode, new_dentry, &it); - unlock_kernel(); - -+ intent_release(new_dentry, &it); - dput(new_dentry); - exit4: -+ intent_release(old_dentry, &it); - dput(old_dentry); - exit3: - double_up(&new_dir->d_inode->i_sem, &old_dir->d_inode->i_sem); -@@ -1965,7 +2094,8 @@ - } - - static inline int --__vfs_follow_link(struct nameidata *nd, const char *link) -+__vfs_follow_link(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) - { - int res = 0; - char *name; -@@ -1978,7 +2108,7 @@ - /* weird __emul_prefix() stuff did it */ - goto out; - } -- res = link_path_walk(link, nd); -+ res = link_path_walk_it(link, nd, it); - out: - if (current->link_count || res || nd->last_type!=LAST_NORM) - return res; -@@ -2000,7 +2130,13 @@ - - int vfs_follow_link(struct nameidata *nd, const char *link) - { -- return __vfs_follow_link(nd, link); -+ return __vfs_follow_link(nd, link, NULL); -+} -+ -+int vfs_follow_link_it(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) -+{ -+ return __vfs_follow_link(nd, link, it); - } - - /* get the link contents into pagecache */ -@@ -2042,7 +2178,7 @@ - { - struct page *page = NULL; - char *s = page_getlink(dentry, &page); -- int res = __vfs_follow_link(nd, s); -+ int res = __vfs_follow_link(nd, s, NULL); - if (page) { - kunmap(page); - page_cache_release(page); ---- lum-pristine/fs/open.c Fri Oct 12 16:48:42 2001 -+++ lum/fs/open.c Sun Aug 11 15:26:29 2002 -@@ -19,6 +19,9 @@ - #include 
<asm/uaccess.h> - - #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -+extern int path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it); -+extern void intent_release(struct dentry *de, struct lookup_intent *it); - - int vfs_statfs(struct super_block *sb, struct statfs *buf) - { -@@ -94,12 +97,13 @@ - struct nameidata nd; - struct inode * inode; - int error; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - - error = -EINVAL; - if (length < 0) /* sorry, but loff_t says... */ - goto out; - -- error = user_path_walk(path, &nd); -+ error = user_path_walk_it(path, &nd, &it); - if (error) - goto out; - inode = nd.dentry->d_inode; -@@ -144,6 +149,7 @@ - put_write_access(inode); - - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -235,8 +241,9 @@ - struct nameidata nd; - struct inode * inode; - struct iattr newattrs; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (error) - goto out; - inode = nd.dentry->d_inode; -@@ -262,6 +270,7 @@ - } - error = notify_change(nd.dentry, &newattrs); - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -279,8 +288,9 @@ - struct nameidata nd; - struct inode * inode; - struct iattr newattrs; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - - if (error) - goto out; -@@ -306,6 +317,7 @@ - } - error = notify_change(nd.dentry, &newattrs); - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -322,6 +334,7 @@ - int old_fsuid, old_fsgid; - kernel_cap_t old_cap; - int res; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ - return -EINVAL; -@@ -339,13 +352,14 @@ - else - current->cap_effective = current->cap_permitted; - -- res = user_path_walk(filename, &nd); -+ res = user_path_walk_it(filename, &nd, &it); - if (!res) { - res = permission(nd.dentry->d_inode, mode); - /* SuS v2 requires we report a read only fs too */ - if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) - && !special_file(nd.dentry->d_inode->i_mode)) - res = -EROFS; -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - -@@ -361,6 +375,7 @@ - int error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -369,7 +384,7 @@ - - error = 0; - if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd)) -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -381,6 +397,7 @@ - set_fs_pwd(current->fs, nd.mnt, nd.dentry); - - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -421,6 +438,7 @@ - int error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -429,7 +447,7 @@ - - path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -446,6 +465,7 @@ - set_fs_altroot(); - error = 0; - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -490,8 +510,9 @@ - struct inode * inode; - int error; - struct iattr newattrs; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (error) - goto out; - inode = nd.dentry->d_inode; -@@ -511,6 +532,7 @@ - error = notify_change(nd.dentry, &newattrs); - - dput_and_out: -+ 
intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -580,10 +602,12 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { - error = chown_common(nd.dentry, user, group); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -593,10 +618,12 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { - error = chown_common(nd.dentry, user, group); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -630,10 +658,16 @@ - * for the internal routines (ie open_namei()/follow_link() etc). 00 is - * used by symlinks. - */ -+extern int open_namei_it(const char *filename, int namei_flags, int mode, -+ struct nameidata *nd, struct lookup_intent *it); -+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it); -+ - struct file *filp_open(const char * filename, int flags, int mode) - { - int namei_flags, error; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_OPEN }; - - namei_flags = flags; - if ((namei_flags+1) & O_ACCMODE) -@@ -641,14 +675,15 @@ - if (namei_flags & O_TRUNC) - namei_flags |= 2; - -- error = open_namei(filename, namei_flags, mode, &nd); -- if (!error) -- return dentry_open(nd.dentry, nd.mnt, flags); -+ error = open_namei_it(filename, namei_flags, mode, &nd, &it); -+ if (error) -+ return ERR_PTR(error); - -- return ERR_PTR(error); -+ return dentry_open_it(nd.dentry, nd.mnt, flags, &it); - } - --struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it) - { - struct file * 
f; - struct inode *inode; -@@ -691,6 +726,7 @@ - } - f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); - -+ intent_release(dentry, it); - return f; - - cleanup_all: -@@ -705,11 +741,17 @@ - cleanup_file: - put_filp(f); - cleanup_dentry: -+ intent_release(dentry, it); - dput(dentry); - mntput(mnt); - return ERR_PTR(error); - } - -+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+{ -+ return dentry_open_it(dentry, mnt, flags, NULL); -+} -+ - /* - * Find an empty file descriptor entry, and mark it busy. - */ ---- lum-pristine/fs/stat.c Thu Sep 13 19:04:43 2001 -+++ lum/fs/stat.c Mon Aug 12 00:04:39 2002 -@@ -13,6 +13,7 @@ - - #include <asm/uaccess.h> - -+extern void intent_release(struct dentry *de, struct lookup_intent *it); - /* - * Revalidate the inode. This is required for proper NFS attribute caching. - */ -@@ -135,13 +135,15 @@ - asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -151,13 +153,15 @@ - asmlinkage long sys_newstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -172,13 +176,15 @@ - asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = 
IT_GETATTR }; - int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -189,13 +195,15 @@ - asmlinkage long sys_newlstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -247,20 +255,21 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_READLINK }; - - if (bufsiz <= 0) - return -EINVAL; - -- error = user_path_walk_link(path, &nd); -+ error = user_path_walk_link_it(path, &nd, &it); - if (!error) { - struct inode * inode = nd.dentry->d_inode; -- - error = -EINVAL; - if (inode->i_op && inode->i_op->readlink && - !(error = do_revalidate(nd.dentry))) { - UPDATE_ATIME(inode); - error = inode->i_op->readlink(nd.dentry, buf, bufsiz); - } -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -333,12 +342,14 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -348,12 +359,14 @@ - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk_link(filename, &nd); -+ error = 
user_path_walk_link_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; ---- lum-pristine/mm/slab.c Fri Dec 21 12:42:05 2001 -+++ lum/mm/slab.c Thu Aug 1 18:07:35 2002 -@@ -1187,6 +1187,59 @@ - * Called with the cache-lock held. - */ - -+extern struct page *check_get_page(unsigned long kaddr); -+struct page *page_mem_map(struct page *page); -+static int kmem_check_cache_obj (kmem_cache_t * cachep, -+ slab_t *slabp, void * objp) -+{ -+ int i; -+ unsigned int objnr; -+ -+#if DEBUG -+ if (cachep->flags & SLAB_RED_ZONE) { -+ objp -= BYTES_PER_WORD; -+ if ( *(unsigned long *)objp != RED_MAGIC2) -+ /* Either write before start, or a double free. */ -+ return 0; -+ if (*(unsigned long *)(objp+cachep->objsize - -+ BYTES_PER_WORD) != RED_MAGIC2) -+ /* Either write past end, or a double free. */ -+ return 0; -+ } -+#endif -+ -+ objnr = (objp-slabp->s_mem)/cachep->objsize; -+ if (objnr >= cachep->num) -+ return 0; -+ if (objp != slabp->s_mem + objnr*cachep->objsize) -+ return 0; -+ -+ /* Check slab's freelist to see if this obj is there. */ -+ for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { -+ if (i == objnr) -+ return 0; -+ } -+ return 1; -+} -+ -+ -+int kmem_cache_validate(kmem_cache_t *cachep, void *objp) -+{ -+ struct page *page = check_get_page((unsigned long)objp); -+ -+ if (!VALID_PAGE(page)) -+ return 0; -+ -+ if (!PageSlab(page)) -+ return 0; -+ -+ /* XXX check for freed slab objects ? 
*/ -+ if (!kmem_check_cache_obj(cachep, GET_PAGE_SLAB(page), objp)) -+ return 0; -+ -+ return (cachep == GET_PAGE_CACHE(page)); -+} -+ - #if DEBUG - static int kmem_extra_free_checks (kmem_cache_t * cachep, - slab_t *slabp, void * objp) diff --git a/lustre/kernel_patches/patches/vanilla-2.4.19.patch b/lustre/kernel_patches/patches/vanilla-2.4.19.patch deleted file mode 100644 index 4ed5bb9..0000000 --- a/lustre/kernel_patches/patches/vanilla-2.4.19.patch +++ /dev/null @@ -1,1576 +0,0 @@ - - - - arch/i386/mm/init.c | 6 - arch/ia64/mm/init.c | 6 - drivers/block/blkpg.c | 35 ++++ - drivers/block/loop.c | 5 - drivers/ide/ide-disk.c | 6 - fs/dcache.c | 1 - fs/ext3/Makefile | 2 - fs/ext3/super.c | 2 - fs/namei.c | 296 ++++++++++++++++++++++++++++++++++------- - fs/nfsd/vfs.c | 2 - fs/open.c | 63 ++++++-- - fs/stat.c | 30 +++- - include/linux/blkdev.h | 4 - include/linux/dcache.h | 31 ++++ - include/linux/fs.h | 23 +++ - include/linux/lustre_version.h | 1 - include/linux/slab.h | 1 - kernel/ksyms.c | 7 - mm/slab.c | 53 +++++++ - 19 files changed, 501 insertions(+), 73 deletions(-) - ---- /dev/null Fri Aug 30 17:31:37 2002 -+++ linux-2.4.19-root/include/linux/lustre_version.h Sun Jan 19 19:54:00 2003 -@@ -0,0 +1 @@ -+#define LUSTRE_KERNEL_VERSION 7 ---- linux-2.4.19/arch/ia64/mm/init.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/arch/ia64/mm/init.c Sun Jan 19 19:46:42 2003 -@@ -37,6 +37,12 @@ unsigned long MAX_DMA_ADDRESS = PAGE_OFF - - static unsigned long totalram_pages; - -+struct page *check_get_page(unsigned long kaddr) -+{ -+#warning FIXME: Lustre team, is this solid? 
-+ return virt_to_page(kaddr); -+} -+ - int - do_check_pgt_cache (int low, int high) - { ---- linux-2.4.19/arch/i386/mm/init.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/arch/i386/mm/init.c Sun Jan 19 19:46:42 2003 -@@ -43,6 +43,12 @@ unsigned long highstart_pfn, highend_pfn - static unsigned long totalram_pages; - static unsigned long totalhigh_pages; - -+struct page *check_get_page(unsigned long kaddr) -+{ -+#warning FIXME: Lustre team, is this solid? -+ return virt_to_page(kaddr); -+} -+ - int do_check_pgt_cache(int low, int high) - { - int freed = 0; ---- linux-2.4.19/drivers/block/blkpg.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/drivers/block/blkpg.c Sun Jan 19 19:46:42 2003 -@@ -296,3 +296,38 @@ int blk_ioctl(kdev_t dev, unsigned int c - } - - EXPORT_SYMBOL(blk_ioctl); -+ -+#define NUM_DEV_NO_WRITE 16 -+static int dev_no_write[NUM_DEV_NO_WRITE]; -+ -+/* -+ * Debug code for turning block devices "read-only" (will discard writes -+ * silently). This is for filesystem crash/recovery testing. 
-+ */ -+void dev_set_rdonly(kdev_t dev, int no_write) -+{ -+ if (dev) { -+ printk(KERN_WARNING "Turning device %s read-only\n", -+ bdevname(dev)); -+ dev_no_write[no_write] = 0xdead0000 + dev; -+ } -+} -+ -+int dev_check_rdonly(kdev_t dev) { -+ int i; -+ -+ for (i = 0; i < NUM_DEV_NO_WRITE; i++) { -+ if ((dev_no_write[i] & 0xffff0000) == 0xdead0000 && -+ dev == (dev_no_write[i] & 0xffff)) -+ return 1; -+ } -+ return 0; -+} -+ -+void dev_clear_rdonly(int no_write) { -+ dev_no_write[no_write] = 0; -+} -+ -+EXPORT_SYMBOL(dev_set_rdonly); -+EXPORT_SYMBOL(dev_check_rdonly); -+EXPORT_SYMBOL(dev_clear_rdonly); ---- linux-2.4.19/drivers/block/loop.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/drivers/block/loop.c Sun Jan 19 19:46:42 2003 -@@ -474,6 +474,11 @@ static int loop_make_request(request_que - spin_unlock_irq(&lo->lo_lock); - - if (rw == WRITE) { -+#ifdef CONFIG_DEV_RDONLY -+ if (dev_check_rdonly(rbh->b_rdev)) -+ goto err; -+#endif -+ - if (lo->lo_flags & LO_FLAGS_READ_ONLY) - goto err; - } else if (rw == READA) { ---- linux-2.4.19/drivers/ide/ide-disk.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/drivers/ide/ide-disk.c Sun Jan 19 19:46:42 2003 -@@ -551,6 +551,12 @@ static ide_startstop_t lba_48_rw_disk (i - */ - static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) - { -+#ifdef CONFIG_DEV_RDONLY -+ if (rq->cmd == WRITE && dev_check_rdonly(rq->rq_dev)) { -+ ide_end_request(1, HWGROUP(drive)); -+ return ide_stopped; -+ } -+#endif - if (IDE_CONTROL_REG) - OUT_BYTE(drive->ctl,IDE_CONTROL_REG); - ---- linux-2.4.19/fs/ext3/Makefile~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/fs/ext3/Makefile Sun Jan 19 19:46:42 2003 -@@ -9,6 +9,8 @@ - - O_TARGET := ext3.o - -+export-objs := super.o -+ - obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o - obj-m := $(O_TARGET) ---- linux-2.4.19/fs/ext3/super.c~vanilla-2.4.19 Sun Jan 19 
19:46:42 2003 -+++ linux-2.4.19-root/fs/ext3/super.c Sun Jan 19 19:46:42 2003 -@@ -1744,7 +1744,7 @@ static void __exit exit_ext3_fs(void) - unregister_filesystem(&ext3_fs_type); - } - --EXPORT_NO_SYMBOLS; -+EXPORT_SYMBOL(ext3_bread); - - MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); - MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions"); ---- linux-2.4.19/include/linux/blkdev.h~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/include/linux/blkdev.h Sun Jan 19 21:05:55 2003 -@@ -240,4 +240,8 @@ static inline unsigned int block_size(kd - return retval; - } - -+#define CONFIG_DEV_RDONLY -+void dev_set_rdonly(kdev_t, int); -+int dev_check_rdonly(kdev_t); -+void dev_clear_rdonly(int); - #endif ---- linux-2.4.19/include/linux/slab.h~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/include/linux/slab.h Sun Jan 19 21:05:52 2003 -@@ -57,6 +57,7 @@ extern int kmem_cache_destroy(kmem_cache - extern int kmem_cache_shrink(kmem_cache_t *); - extern void *kmem_cache_alloc(kmem_cache_t *, int); - extern void kmem_cache_free(kmem_cache_t *, void *); -+extern int kmem_cache_validate(kmem_cache_t *cachep, void *objp); - - extern void *kmalloc(size_t, int); - extern void kfree(const void *); ---- linux-2.4.19/kernel/ksyms.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/kernel/ksyms.c Sun Jan 19 19:46:42 2003 -@@ -264,6 +264,7 @@ EXPORT_SYMBOL(read_cache_page); - EXPORT_SYMBOL(set_page_dirty); - EXPORT_SYMBOL(vfs_readlink); - EXPORT_SYMBOL(vfs_follow_link); -+EXPORT_SYMBOL(vfs_follow_link_it); - EXPORT_SYMBOL(page_readlink); - EXPORT_SYMBOL(page_follow_link); - EXPORT_SYMBOL(page_symlink_inode_operations); -@@ -280,6 +281,12 @@ EXPORT_SYMBOL(dcache_dir_fsync); - EXPORT_SYMBOL(dcache_readdir); - EXPORT_SYMBOL(dcache_dir_ops); - -+/* lustre */ -+EXPORT_SYMBOL(panic_notifier_list); -+EXPORT_SYMBOL(pagecache_lock_cacheline); -+EXPORT_SYMBOL(do_kern_mount); 
-+EXPORT_SYMBOL(kmem_cache_validate); -+ - /* for stackable file systems (lofs, wrapfs, cryptfs, etc.) */ - EXPORT_SYMBOL(default_llseek); - EXPORT_SYMBOL(dentry_open); ---- linux-2.4.19/include/linux/dcache.h~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/include/linux/dcache.h Sun Jan 19 19:46:42 2003 -@@ -6,6 +6,34 @@ - #include <asm/atomic.h> - #include <linux/mount.h> - -+#define IT_OPEN (1) -+#define IT_CREAT (1<<1) -+#define IT_MKDIR (1<<2) -+#define IT_LINK (1<<3) -+#define IT_LINK2 (1<<4) -+#define IT_SYMLINK (1<<5) -+#define IT_UNLINK (1<<6) -+#define IT_RMDIR (1<<7) -+#define IT_RENAME (1<<8) -+#define IT_RENAME2 (1<<9) -+#define IT_READDIR (1<<10) -+#define IT_GETATTR (1<<11) -+#define IT_SETATTR (1<<12) -+#define IT_READLINK (1<<13) -+#define IT_MKNOD (1<<14) -+#define IT_LOOKUP (1<<15) -+ -+struct lookup_intent { -+ int it_op; -+ int it_mode; -+ int it_disposition; -+ int it_status; -+ struct iattr *it_iattr; -+ __u64 it_lock_handle[2]; -+ int it_lock_mode; -+ void *it_data; -+}; -+ - /* - * linux/include/linux/dcache.h - * -@@ -78,6 +106,7 @@ struct dentry { - unsigned long d_time; /* used by d_revalidate */ - struct dentry_operations *d_op; - struct super_block * d_sb; /* The root of the dentry tree */ -+ struct lookup_intent *d_it; - unsigned long d_vfs_flags; - void * d_fsdata; /* fs-specific data */ - unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ -@@ -90,6 +119,8 @@ struct dentry_operations { - int (*d_delete)(struct dentry *); - void (*d_release)(struct dentry *); - void (*d_iput)(struct dentry *, struct inode *); -+ int (*d_revalidate2)(struct dentry *, int, struct lookup_intent *); -+ void (*d_intent_release)(struct dentry *, struct lookup_intent *); - }; - - /* the dentry parameter passed to d_hash and d_compare is the parent ---- linux-2.4.19/include/linux/fs.h~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/include/linux/fs.h Sun Jan 19 21:05:40 2003 -@@ -541,6 +541,7 @@ struct file { - - /* 
needed for tty driver, and maybe others */ - void *private_data; -+ struct lookup_intent *f_intent; - - /* preallocated helper kiobuf to speedup O_DIRECT */ - struct kiobuf *f_iobuf; -@@ -792,7 +793,9 @@ extern int vfs_symlink(struct inode *, s - extern int vfs_link(struct dentry *, struct inode *, struct dentry *); - extern int vfs_rmdir(struct inode *, struct dentry *); - extern int vfs_unlink(struct inode *, struct dentry *); --extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); -+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, -+ struct inode *new_dir, struct dentry *new_dentry, -+ struct lookup_intent *it); - - /* - * File types -@@ -853,16 +856,28 @@ struct file_operations { - struct inode_operations { - int (*create) (struct inode *,struct dentry *,int); - struct dentry * (*lookup) (struct inode *,struct dentry *); -+ struct dentry * (*lookup2) (struct inode *,struct dentry *, struct lookup_intent *); - int (*link) (struct dentry *,struct inode *,struct dentry *); -+ int (*link2) (struct inode *,struct inode *, const char *, int); - int (*unlink) (struct inode *,struct dentry *); -+ int (*unlink2) (struct inode *, char *, int); - int (*symlink) (struct inode *,struct dentry *,const char *); -+ int (*symlink2) (struct inode *,const char *, int, const char *); - int (*mkdir) (struct inode *,struct dentry *,int); -+ int (*mkdir2) (struct inode *,char *, int,int); - int (*rmdir) (struct inode *,struct dentry *); -+ int (*rmdir2) (struct inode *, char *, int); - int (*mknod) (struct inode *,struct dentry *,int,int); -+ int (*mknod2) (struct inode *,char *, int,int,int); - int (*rename) (struct inode *, struct dentry *, - struct inode *, struct dentry *); -+ int (*rename2) (struct inode *, struct inode *, -+ char *oldname, int oldlen, -+ char *newname, int newlen); - int (*readlink) (struct dentry *, char *,int); - int (*follow_link) (struct dentry *, struct nameidata *); -+ int (*follow_link2) (struct dentry 
*, struct nameidata *, -+ struct lookup_intent *it); - void (*truncate) (struct inode *); - int (*permission) (struct inode *, int); - int (*revalidate) (struct dentry *); -@@ -999,6 +1014,7 @@ extern int unregister_filesystem(struct - extern struct vfsmount *kern_mount(struct file_system_type *); - extern int may_umount(struct vfsmount *); - extern long do_mount(char *, char *, char *, unsigned long, void *); -+struct vfsmount *do_kern_mount(const char *fstype, int flags, char *name, void *data); - extern void umount_tree(struct vfsmount *); - - #define kern_umount mntput -@@ -1329,6 +1345,7 @@ typedef int (*read_actor_t)(read_descrip - extern loff_t default_llseek(struct file *file, loff_t offset, int origin); - - extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); -+extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it)); - extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); - extern int FASTCALL(path_walk(const char *, struct nameidata *)); - extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); -@@ -1339,6 +1356,8 @@ extern struct dentry * lookup_one_len(co - extern struct dentry * lookup_hash(struct qstr *, struct dentry *); - #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) - #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) -+#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it) -+#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it) - - extern void iput(struct inode *); - extern void force_delete(struct inode *); -@@ -1448,6 +1467,8 @@ extern struct file_operations generic_ro - - extern int vfs_readlink(struct dentry *, char *, int, const char *); - extern int vfs_follow_link(struct nameidata *, const char *); -+extern int vfs_follow_link_it(struct nameidata *, const char *, -+ struct lookup_intent 
*it); - extern int page_readlink(struct dentry *, char *, int); - extern int page_follow_link(struct dentry *, struct nameidata *); - extern struct inode_operations page_symlink_inode_operations; ---- linux-2.4.19/fs/dcache.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/fs/dcache.c Sun Jan 19 19:46:42 2003 -@@ -616,6 +616,7 @@ struct dentry * d_alloc(struct dentry * - dentry->d_op = NULL; - dentry->d_fsdata = NULL; - dentry->d_mounted = 0; -+ dentry->d_it = NULL; - INIT_LIST_HEAD(&dentry->d_hash); - INIT_LIST_HEAD(&dentry->d_lru); - INIT_LIST_HEAD(&dentry->d_subdirs); ---- linux-2.4.19/fs/nfsd/vfs.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/fs/nfsd/vfs.c Sun Jan 19 19:46:42 2003 -@@ -1295,7 +1295,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru - err = nfserr_perm; - } else - #endif -- err = vfs_rename(fdir, odentry, tdir, ndentry); -+ err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); - if (!err && EX_ISSYNC(tfhp->fh_export)) { - nfsd_sync_dir(tdentry); - nfsd_sync_dir(fdentry); ---- linux-2.4.19/fs/namei.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/fs/namei.c Sun Jan 19 19:46:42 2003 -@@ -94,6 +94,12 @@ - * XEmacs seems to be relying on it... - */ - -+void intent_release(struct dentry *de, struct lookup_intent *it) -+{ -+ if (it && de->d_op && de->d_op->d_intent_release) -+ de->d_op->d_intent_release(de, it); -+} -+ - /* In order to reduce some races, while at the same time doing additional - * checking and hopefully speeding things up, we copy filenames to the - * kernel data space before using them.. -@@ -260,10 +266,19 @@ void path_release(struct nameidata *nd) - * Internal lookup() using the new generic dcache. 
- * SMP-safe - */ --static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * dentry = d_lookup(parent, name); - -+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) { -+ if (!dentry->d_op->d_revalidate2(dentry, flags, it) && -+ !d_invalidate(dentry)) { -+ dput(dentry); -+ dentry = NULL; -+ } -+ return dentry; -+ } else - if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { - dput(dentry); -@@ -281,7 +296,8 @@ static struct dentry * cached_lookup(str - * make sure that nobody added the entry to the dcache in the meantime.. - * SMP-safe - */ --static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) -+static struct dentry *real_lookup(struct dentry *parent, struct qstr *name, -+ int flags, struct lookup_intent *it) - { - struct dentry * result; - struct inode *dir = parent->d_inode; -@@ -300,6 +316,9 @@ static struct dentry * real_lookup(struc - result = ERR_PTR(-ENOMEM); - if (dentry) { - lock_kernel(); -+ if (dir->i_op->lookup2) -+ result = dir->i_op->lookup2(dir, dentry, it); -+ else - result = dir->i_op->lookup(dir, dentry); - unlock_kernel(); - if (result) -@@ -321,6 +340,12 @@ static struct dentry * real_lookup(struc - dput(result); - result = ERR_PTR(-ENOENT); - } -+ } else if (result->d_op && result->d_op->d_revalidate2) { -+ if (!result->d_op->d_revalidate2(result, flags, it) && -+ !d_invalidate(result)) { -+ dput(result); -+ result = ERR_PTR(-ENOENT); -+ } - } - return result; - } -@@ -332,7 +357,8 @@ static struct dentry * real_lookup(struc - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. 
- */ --static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) -+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, -+ struct lookup_intent *it) - { - int err; - if (current->link_count >= 5) -@@ -346,10 +372,14 @@ static inline int do_follow_link(struct - current->link_count++; - current->total_link_count++; - UPDATE_ATIME(dentry->d_inode); -- err = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (dentry->d_inode->i_op->follow_link2) -+ err = dentry->d_inode->i_op->follow_link2(dentry, nd, it); -+ else -+ err = dentry->d_inode->i_op->follow_link(dentry, nd); - current->link_count--; - return err; - loop: -+ intent_release(dentry, it); - path_release(nd); - return -ELOOP; - } -@@ -447,7 +477,8 @@ static inline void follow_dotdot(struct - * - * We expect 'base' to be positive and a directory. - */ --int link_path_walk(const char * name, struct nameidata *nd) -+int link_path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it) - { - struct dentry *dentry; - struct inode *inode; -@@ -520,9 +551,9 @@ int link_path_walk(const char * name, st - break; - } - /* This does the actual lookups.. 
*/ -- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); - if (!dentry) { -- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -539,8 +570,8 @@ int link_path_walk(const char * name, st - if (!inode->i_op) - goto out_dput; - -- if (inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ if (inode->i_op->follow_link || inode->i_op->follow_link2) { -+ err = do_follow_link(dentry, nd, NULL); - dput(dentry); - if (err) - goto return_err; -@@ -556,7 +587,7 @@ int link_path_walk(const char * name, st - nd->dentry = dentry; - } - err = -ENOTDIR; -- if (!inode->i_op->lookup) -+ if (!inode->i_op->lookup && !inode->i_op->lookup2) - break; - continue; - /* here ends the main loop */ -@@ -583,9 +614,9 @@ last_component: - if (err < 0) - break; - } -- dentry = cached_lookup(nd->dentry, &this, 0); -+ dentry = cached_lookup(nd->dentry, &this, 0, it); - if (!dentry) { -- dentry = real_lookup(nd->dentry, &this, 0); -+ dentry = real_lookup(nd->dentry, &this, 0, it); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - break; -@@ -593,9 +624,9 @@ last_component: - while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)) - ; - inode = dentry->d_inode; -- if ((lookup_flags & LOOKUP_FOLLOW) -- && inode && inode->i_op && inode->i_op->follow_link) { -- err = do_follow_link(dentry, nd); -+ if ((lookup_flags & LOOKUP_FOLLOW) && inode && inode->i_op && -+ (inode->i_op->follow_link || inode->i_op->follow_link2)) { -+ err = do_follow_link(dentry, nd, it); - dput(dentry); - if (err) - goto return_err; -@@ -609,7 +640,8 @@ last_component: - goto no_inode; - if (lookup_flags & LOOKUP_DIRECTORY) { - err = -ENOTDIR; -- if (!inode->i_op || !inode->i_op->lookup) -+ if (!inode->i_op || -+ (!inode->i_op->lookup && !inode->i_op->lookup2)) - break; - } - goto return_base; -@@ 
-646,15 +678,28 @@ out_dput: - dput(dentry); - break; - } -+ if (err) -+ intent_release(nd->dentry, it); - path_release(nd); - return_err: - return err; - } - -+int link_path_walk(const char * name, struct nameidata *nd) -+{ -+ return link_path_walk_it(name, nd, NULL); -+} -+ -+int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it) -+{ -+ current->total_link_count = 0; -+ return link_path_walk_it(name, nd, it); -+} -+ - int path_walk(const char * name, struct nameidata *nd) - { - current->total_link_count = 0; -- return link_path_walk(name, nd); -+ return link_path_walk_it(name, nd, NULL); - } - - /* SMP-safe */ -@@ -757,7 +802,8 @@ int path_init(const char *name, unsigned - * needs parent already locked. Doesn't follow mounts. - * SMP-safe. - */ --struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base, -+ struct lookup_intent *it) - { - struct dentry * dentry; - struct inode *inode; -@@ -780,13 +826,16 @@ struct dentry * lookup_hash(struct qstr - goto out; - } - -- dentry = cached_lookup(base, name, 0); -+ dentry = cached_lookup(base, name, 0, it); - if (!dentry) { - struct dentry *new = d_alloc(base, name); - dentry = ERR_PTR(-ENOMEM); - if (!new) - goto out; - lock_kernel(); -+ if (inode->i_op->lookup2) -+ dentry = inode->i_op->lookup2(inode, new, it); -+ else - dentry = inode->i_op->lookup(inode, new); - unlock_kernel(); - if (!dentry) -@@ -798,6 +847,12 @@ out: - return dentry; - } - -+struct dentry * lookup_hash(struct qstr *name, struct dentry * base) -+{ -+ return lookup_hash_it(name, base, NULL); -+} -+ -+ - /* SMP-safe */ - struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) - { -@@ -819,7 +874,7 @@ struct dentry * lookup_one_len(const cha - } - this.hash = end_name_hash(hash); - -- return lookup_hash(&this, base); -+ return lookup_hash_it(&this, base, NULL); - access: - return ERR_PTR(-EACCES); - } -@@ -851,6 
+906,23 @@ int __user_walk(const char *name, unsign - return err; - } - -+int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd, -+ struct lookup_intent *it) -+{ -+ char *tmp; -+ int err; -+ -+ tmp = getname(name); -+ err = PTR_ERR(tmp); -+ if (!IS_ERR(tmp)) { -+ err = 0; -+ if (path_init(tmp, flags, nd)) -+ err = path_walk_it(tmp, nd, it); -+ putname(tmp); -+ } -+ return err; -+} -+ - /* - * It's inline, so penalty for filesystems that don't use sticky bit is - * minimal. -@@ -987,7 +1059,8 @@ exit_lock: - * for symlinks (where the permissions are checked later). - * SMP-safe - */ --int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) -+int open_namei_it(const char *pathname, int flag, int mode, -+ struct nameidata *nd, struct lookup_intent *it) - { - int acc_mode, error = 0; - struct inode *inode; -@@ -1002,7 +1075,7 @@ int open_namei(const char * pathname, in - */ - if (!(flag & O_CREAT)) { - if (path_init(pathname, lookup_flags(flag), nd)) -- error = path_walk(pathname, nd); -+ error = path_walk_it(pathname, nd, it); - if (error) - return error; - dentry = nd->dentry; -@@ -1012,6 +1085,10 @@ int open_namei(const char * pathname, in - /* - * Create - we need to know the parent. 
- */ -+ if (it) { -+ it->it_mode = mode; -+ it->it_op |= IT_CREAT; -+ } - if (path_init(pathname, LOOKUP_PARENT, nd)) - error = path_walk(pathname, nd); - if (error) -@@ -1028,7 +1105,7 @@ int open_namei(const char * pathname, in - - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - - do_last: - error = PTR_ERR(dentry); -@@ -1037,6 +1114,7 @@ do_last: - goto exit; - } - -+ it->it_mode = mode; - /* Negative dentry, just create the file */ - if (!dentry->d_inode) { - error = vfs_create(dir->d_inode, dentry, -@@ -1070,7 +1148,8 @@ do_last: - error = -ENOENT; - if (!dentry->d_inode) - goto exit_dput; -- if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link) -+ if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link || -+ dentry->d_inode->i_op->follow_link2)) - goto do_link; - - dput(nd->dentry); -@@ -1156,8 +1235,10 @@ ok: - return 0; - - exit_dput: -+ intent_release(dentry, it); - dput(dentry); - exit: -+ intent_release(nd->dentry, it); - path_release(nd); - return error; - -@@ -1176,7 +1257,12 @@ do_link: - * are done. Procfs-like symlinks just set LAST_BIND. 
- */ - UPDATE_ATIME(dentry->d_inode); -- error = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (dentry->d_inode->i_op->follow_link2) -+ error = dentry->d_inode->i_op->follow_link2(dentry, nd, it); -+ else -+ error = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (error) -+ intent_release(dentry, it); - dput(dentry); - if (error) - return error; -@@ -1198,13 +1284,20 @@ do_link: - } - dir = nd->dentry; - down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - putname(nd->last.name); - goto do_last; - } - -+int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) -+{ -+ return open_namei_it(pathname, flag, mode, nd, NULL); -+} -+ -+ - /* SMP-safe */ --static struct dentry *lookup_create(struct nameidata *nd, int is_dir) -+static struct dentry *lookup_create(struct nameidata *nd, int is_dir, -+ struct lookup_intent *it) - { - struct dentry *dentry; - -@@ -1212,7 +1305,7 @@ static struct dentry *lookup_create(stru - dentry = ERR_PTR(-EEXIST); - if (nd->last_type != LAST_NORM) - goto fail; -- dentry = lookup_hash(&nd->last, nd->dentry); -+ dentry = lookup_hash_it(&nd->last, nd->dentry, it); - if (IS_ERR(dentry)) - goto fail; - if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1258,6 +1351,7 @@ asmlinkage long sys_mknod(const char * f - char * tmp; - struct dentry * dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_MKNOD, .it_mode = mode }; - - if (S_ISDIR(mode)) - return -EPERM; -@@ -1269,7 +1363,19 @@ asmlinkage long sys_mknod(const char * f - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ -+ if (nd.dentry->d_inode->i_op->mknod2) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mknod2(nd.dentry->d_inode, -+ nd.last.name, -+ nd.last.len, -+ mode, dev); -+ /* the file system want to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto 
out2; -+ } -+ -+ dentry = lookup_create(&nd, 0, &it); - error = PTR_ERR(dentry); - - mode &= ~current->fs->umask; -@@ -1287,9 +1393,11 @@ asmlinkage long sys_mknod(const char * f - default: - error = -EINVAL; - } -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+ out2: - path_release(&nd); - out: - putname(tmp); -@@ -1327,6 +1435,7 @@ asmlinkage long sys_mkdir(const char * p - { - int error = 0; - char * tmp; -+ struct lookup_intent it = { .it_op = IT_MKDIR, .it_mode = mode }; - - tmp = getname(pathname); - error = PTR_ERR(tmp); -@@ -1338,14 +1447,26 @@ asmlinkage long sys_mkdir(const char * p - error = path_walk(tmp, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 1); -+ if (nd.dentry->d_inode->i_op->mkdir2) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mkdir2(nd.dentry->d_inode, -+ nd.last.name, -+ nd.last.len, -+ mode); -+ /* the file system want to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ dentry = lookup_create(&nd, 1, &it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_mkdir(nd.dentry->d_inode, dentry, - mode & ~current->fs->umask); -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+out2: - path_release(&nd); - out: - putname(tmp); -@@ -1426,6 +1547,7 @@ asmlinkage long sys_rmdir(const char * p - char * name; - struct dentry *dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_RMDIR }; - - name = getname(pathname); - if(IS_ERR(name)) -@@ -1447,11 +1569,21 @@ asmlinkage long sys_rmdir(const char * p - error = -EBUSY; - goto exit1; - } -+ if (nd.dentry->d_inode->i_op->rmdir2) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->rmdir2(nd.dentry->d_inode, -+ nd.last.name, -+ nd.last.len); -+ /* the file system want to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit1; -+ } - down(&nd.dentry->d_inode->i_sem); -- dentry = 
lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_rmdir(nd.dentry->d_inode, dentry); -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1495,6 +1627,7 @@ asmlinkage long sys_unlink(const char * - char * name; - struct dentry *dentry; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_UNLINK }; - - name = getname(pathname); - if(IS_ERR(name)) -@@ -1507,8 +1640,17 @@ asmlinkage long sys_unlink(const char * - error = -EISDIR; - if (nd.last_type != LAST_NORM) - goto exit1; -+ if (nd.dentry->d_inode->i_op->unlink2) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->unlink2(nd.dentry->d_inode, -+ nd.last.name, -+ nd.last.len); -+ /* the file system want to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit1; -+ } - down(&nd.dentry->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - /* Why not before? 
Because we want correct error value */ -@@ -1516,6 +1658,7 @@ asmlinkage long sys_unlink(const char * - goto slashes; - error = vfs_unlink(nd.dentry->d_inode, dentry); - exit2: -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1562,6 +1705,7 @@ asmlinkage long sys_symlink(const char * - int error = 0; - char * from; - char * to; -+ struct lookup_intent it = { .it_op = IT_SYMLINK }; - - from = getname(oldname); - if(IS_ERR(from)) -@@ -1576,15 +1720,28 @@ asmlinkage long sys_symlink(const char * - error = path_walk(to, &nd); - if (error) - goto out; -- dentry = lookup_create(&nd, 0); -+ if (nd.dentry->d_inode->i_op->symlink2) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->symlink2(nd.dentry->d_inode, -+ nd.last.name, -+ nd.last.len, -+ from); -+ /* the file system want to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } -+ it.it_data = from; -+ dentry = lookup_create(&nd, 0, &it); - error = PTR_ERR(dentry); - if (!IS_ERR(dentry)) { - error = vfs_symlink(nd.dentry->d_inode, dentry, from); -+ intent_release(dentry, &it); - dput(dentry); - } - up(&nd.dentry->d_inode->i_sem); -+ out2: - path_release(&nd); --out: -+ out: - putname(to); - } - putname(from); -@@ -1645,6 +1802,7 @@ asmlinkage long sys_link(const char * ol - int error; - char * from; - char * to; -+ struct lookup_intent it = { .it_op = IT_LINK }; - - from = getname(oldname); - if(IS_ERR(from)) -@@ -1657,7 +1815,7 @@ asmlinkage long sys_link(const char * ol - - error = 0; - if (path_init(from, LOOKUP_POSITIVE, &old_nd)) -- error = path_walk(from, &old_nd); -+ error = path_walk_it(from, &old_nd, &it); - if (error) - goto exit; - if (path_init(to, LOOKUP_PARENT, &nd)) -@@ -1667,10 +1825,22 @@ asmlinkage long sys_link(const char * ol - error = -EXDEV; - if (old_nd.mnt != nd.mnt) - goto out_release; -- new_dentry = lookup_create(&nd, 0); -+ if (nd.dentry->d_inode->i_op->link2) { -+ struct inode_operations *op = 
nd.dentry->d_inode->i_op; -+ error = op->link2(old_nd.dentry->d_inode, -+ nd.dentry->d_inode, -+ nd.last.name, -+ nd.last.len); -+ /* the file system want to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out_release; -+ } -+ it.it_op = IT_LINK2; -+ new_dentry = lookup_create(&nd, 0, &it); - error = PTR_ERR(new_dentry); - if (!IS_ERR(new_dentry)) { - error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); -+ intent_release(new_dentry, &it); - dput(new_dentry); - } - up(&nd.dentry->d_inode->i_sem); -@@ -1713,7 +1883,8 @@ exit: - * locking]. - */ - int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry, -+ struct lookup_intent *it) - { - int error; - struct inode *target; -@@ -1771,6 +1942,7 @@ int vfs_rename_dir(struct inode *old_dir - error = -EBUSY; - else - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); -+ intent_release(new_dentry, it); - if (target) { - if (!error) - target->i_flags |= S_DEAD; -@@ -1792,7 +1964,8 @@ out_unlock: - } - - int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry, -+ struct lookup_intent *it) - { - int error; - -@@ -1823,6 +1996,7 @@ int vfs_rename_other(struct inode *old_d - error = -EBUSY; - else - error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); -+ intent_release(new_dentry, it); - double_up(&old_dir->i_zombie, &new_dir->i_zombie); - if (error) - return error; -@@ -1834,13 +2008,14 @@ int vfs_rename_other(struct inode *old_d - } - - int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, -- struct inode *new_dir, struct dentry *new_dentry) -+ struct inode *new_dir, struct dentry *new_dentry, -+ struct lookup_intent *it) - { - int error; - if (S_ISDIR(old_dentry->d_inode->i_mode)) -- error = 
vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); -+ error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry,it); - else -- error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); -+ error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry,it); - if (!error) { - if (old_dir == new_dir) - inode_dir_notify(old_dir, DN_RENAME); -@@ -1857,6 +2032,7 @@ static inline int do_rename(const char * - int error = 0; - struct dentry * old_dir, * new_dir; - struct dentry * old_dentry, *new_dentry; -+ struct lookup_intent it = { .it_op = IT_RENAME }; - struct nameidata oldnd, newnd; - - if (path_init(oldname, LOOKUP_PARENT, &oldnd)) -@@ -1883,9 +2059,23 @@ static inline int do_rename(const char * - if (newnd.last_type != LAST_NORM) - goto exit2; - -+ if (old_dir->d_inode->i_op->rename2) { -+ lock_kernel(); -+ error = old_dir->d_inode->i_op->rename2(old_dir->d_inode, -+ new_dir->d_inode, -+ oldnd.last.name, -+ oldnd.last.len, -+ newnd.last.name, -+ newnd.last.len); -+ unlock_kernel(); -+ /* the file system want to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit2; -+ } -+ - double_lock(new_dir, old_dir); - -- old_dentry = lookup_hash(&oldnd.last, old_dir); -+ old_dentry = lookup_hash_it(&oldnd.last, old_dir, &it); - error = PTR_ERR(old_dentry); - if (IS_ERR(old_dentry)) - goto exit3; -@@ -1901,18 +2091,21 @@ static inline int do_rename(const char * - if (newnd.last.name[newnd.last.len]) - goto exit4; - } -- new_dentry = lookup_hash(&newnd.last, new_dir); -+ it.it_op = IT_RENAME2; -+ new_dentry = lookup_hash_it(&newnd.last, new_dir, &it); - error = PTR_ERR(new_dentry); - if (IS_ERR(new_dentry)) - goto exit4; - - lock_kernel(); - error = vfs_rename(old_dir->d_inode, old_dentry, -- new_dir->d_inode, new_dentry); -+ new_dir->d_inode, new_dentry, &it); - unlock_kernel(); - -+ intent_release(new_dentry, &it); - dput(new_dentry); - exit4: -+ intent_release(old_dentry, &it); - dput(old_dentry); - exit3: - double_up(&new_dir->d_inode->i_sem, 
&old_dir->d_inode->i_sem); -@@ -1961,7 +2154,8 @@ out: - } - - static inline int --__vfs_follow_link(struct nameidata *nd, const char *link) -+__vfs_follow_link(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) - { - int res = 0; - char *name; -@@ -1974,7 +2168,7 @@ __vfs_follow_link(struct nameidata *nd, - /* weird __emul_prefix() stuff did it */ - goto out; - } -- res = link_path_walk(link, nd); -+ res = link_path_walk_it(link, nd, it); - out: - if (current->link_count || res || nd->last_type!=LAST_NORM) - return res; -@@ -1996,7 +2190,13 @@ fail: - - int vfs_follow_link(struct nameidata *nd, const char *link) - { -- return __vfs_follow_link(nd, link); -+ return __vfs_follow_link(nd, link, NULL); -+} -+ -+int vfs_follow_link_it(struct nameidata *nd, const char *link, -+ struct lookup_intent *it) -+{ -+ return __vfs_follow_link(nd, link, it); - } - - /* get the link contents into pagecache */ -@@ -2038,7 +2238,7 @@ int page_follow_link(struct dentry *dent - { - struct page *page = NULL; - char *s = page_getlink(dentry, &page); -- int res = __vfs_follow_link(nd, s); -+ int res = __vfs_follow_link(nd, s, NULL); - if (page) { - kunmap(page); - page_cache_release(page); ---- linux-2.4.19/fs/open.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/fs/open.c Sun Jan 19 19:46:42 2003 -@@ -19,6 +19,9 @@ - #include <asm/uaccess.h> - - #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -+extern int path_walk_it(const char *name, struct nameidata *nd, -+ struct lookup_intent *it); -+extern void intent_release(struct dentry *de, struct lookup_intent *it); - - int vfs_statfs(struct super_block *sb, struct statfs *buf) - { -@@ -118,12 +121,13 @@ static inline long do_sys_truncate(const - struct nameidata nd; - struct inode * inode; - int error; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - - error = -EINVAL; - if (length < 0) /* sorry, but loff_t says... 
*/ - goto out; - -- error = user_path_walk(path, &nd); -+ error = user_path_walk_it(path, &nd, &it); - if (error) - goto out; - inode = nd.dentry->d_inode; -@@ -168,6 +172,7 @@ static inline long do_sys_truncate(const - put_write_access(inode); - - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -259,8 +264,9 @@ asmlinkage long sys_utime(char * filenam - struct nameidata nd; - struct inode * inode; - struct iattr newattrs; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (error) - goto out; - inode = nd.dentry->d_inode; -@@ -286,6 +292,7 @@ asmlinkage long sys_utime(char * filenam - } - error = notify_change(nd.dentry, &newattrs); - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -303,8 +310,9 @@ asmlinkage long sys_utimes(char * filena - struct nameidata nd; - struct inode * inode; - struct iattr newattrs; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - - if (error) - goto out; -@@ -331,6 +339,7 @@ asmlinkage long sys_utimes(char * filena - } - error = notify_change(nd.dentry, &newattrs); - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -347,6 +356,7 @@ asmlinkage long sys_access(const char * - int old_fsuid, old_fsgid; - kernel_cap_t old_cap; - int res; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ - return -EINVAL; -@@ -364,13 +374,14 @@ asmlinkage long sys_access(const char * - else - current->cap_effective = current->cap_permitted; - -- res = user_path_walk(filename, &nd); -+ res = user_path_walk_it(filename, &nd, &it); - if (!res) { - res = permission(nd.dentry->d_inode, mode); - /* SuS v2 requires we report a read only fs too */ - if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) - && !special_file(nd.dentry->d_inode->i_mode)) - res = -EROFS; -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - -@@ -386,6 +397,7 @@ asmlinkage long sys_chdir(const char * f - int error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -394,7 +406,7 @@ asmlinkage long sys_chdir(const char * f - - error = 0; - if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd)) -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -406,6 +418,7 @@ asmlinkage long sys_chdir(const char * f - set_fs_pwd(current->fs, nd.mnt, nd.dentry); - - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -446,6 +459,7 @@ asmlinkage long sys_chroot(const char * - int error; - struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -454,7 +468,7 @@ asmlinkage long sys_chroot(const char * - - path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); - if (error) - goto out; -@@ -471,6 +485,7 @@ asmlinkage long sys_chroot(const char * - set_fs_altroot(); - error = 0; - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -515,8 +530,9 @@ asmlinkage long sys_chmod(const char * f - struct inode * inode; - int 
error; - struct iattr newattrs; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (error) - goto out; - inode = nd.dentry->d_inode; -@@ -536,6 +552,7 @@ asmlinkage long sys_chmod(const char * f - error = notify_change(nd.dentry, &newattrs); - - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -605,10 +622,12 @@ asmlinkage long sys_chown(const char * f - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { - error = chown_common(nd.dentry, user, group); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -618,10 +637,12 @@ asmlinkage long sys_lchown(const char * - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { - error = chown_common(nd.dentry, user, group); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -655,10 +676,16 @@ asmlinkage long sys_fchown(unsigned int - * for the internal routines (ie open_namei()/follow_link() etc). 00 is - * used by symlinks. 
- */ -+extern int open_namei_it(const char *filename, int namei_flags, int mode, -+ struct nameidata *nd, struct lookup_intent *it); -+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it); -+ - struct file *filp_open(const char * filename, int flags, int mode) - { - int namei_flags, error; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_OPEN }; - - namei_flags = flags; - if ((namei_flags+1) & O_ACCMODE) -@@ -666,14 +693,15 @@ struct file *filp_open(const char * file - if (namei_flags & O_TRUNC) - namei_flags |= 2; - -- error = open_namei(filename, namei_flags, mode, &nd); -- if (!error) -- return dentry_open(nd.dentry, nd.mnt, flags); -+ error = open_namei_it(filename, namei_flags, mode, &nd, &it); -+ if (error) -+ return ERR_PTR(error); - -- return ERR_PTR(error); -+ return dentry_open_it(nd.dentry, nd.mnt, flags, &it); - } - --struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it) - { - struct file * f; - struct inode *inode; -@@ -716,6 +744,7 @@ struct file *dentry_open(struct dentry * - } - f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); - -+ intent_release(dentry, it); - return f; - - cleanup_all: -@@ -730,11 +759,17 @@ cleanup_all: - cleanup_file: - put_filp(f); - cleanup_dentry: -+ intent_release(dentry, it); - dput(dentry); - mntput(mnt); - return ERR_PTR(error); - } - -+struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) -+{ -+ return dentry_open_it(dentry, mnt, flags, NULL); -+} -+ - /* - * Find an empty file descriptor entry, and mark it busy. 
- */ ---- linux-2.4.19/fs/stat.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/fs/stat.c Sun Jan 19 19:46:42 2003 -@@ -13,6 +13,7 @@ - - #include <asm/uaccess.h> - -+extern void intent_release(struct dentry *de, struct lookup_intent *it); - /* - * Revalidate the inode. This is required for proper NFS attribute caching. - */ -@@ -135,13 +136,15 @@ static int cp_new_stat(struct inode * in - asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -151,13 +154,15 @@ asmlinkage long sys_stat(char * filename - asmlinkage long sys_newstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -172,13 +177,15 @@ asmlinkage long sys_newstat(char * filen - asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_old_stat(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -189,13 +196,15 @@ asmlinkage long sys_lstat(char * filenam - asmlinkage 
long sys_newlstat(char * filename, struct stat * statbuf) - { - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - int error; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_new_stat(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -247,20 +256,21 @@ asmlinkage long sys_readlink(const char - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_READLINK }; - - if (bufsiz <= 0) - return -EINVAL; - -- error = user_path_walk_link(path, &nd); -+ error = user_path_walk_link_it(path, &nd, &it); - if (!error) { - struct inode * inode = nd.dentry->d_inode; -- - error = -EINVAL; - if (inode->i_op && inode->i_op->readlink && - !(error = do_revalidate(nd.dentry))) { - UPDATE_ATIME(inode); - error = inode->i_op->readlink(nd.dentry, buf, bufsiz); - } -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -333,12 +343,14 @@ asmlinkage long sys_stat64(char * filena - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -348,12 +360,14 @@ asmlinkage long sys_lstat64(char * filen - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { - error = do_revalidate(nd.dentry); - if (!error) - error = cp_new_stat64(nd.dentry->d_inode, statbuf); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; ---- 
linux-2.4.19/mm/slab.c~vanilla-2.4.19 Sun Jan 19 19:46:42 2003 -+++ linux-2.4.19-root/mm/slab.c Sun Jan 19 19:46:42 2003 -@@ -1207,6 +1207,59 @@ failed: - * Called with the cache-lock held. - */ - -+extern struct page *check_get_page(unsigned long kaddr); -+struct page *page_mem_map(struct page *page); -+static int kmem_check_cache_obj (kmem_cache_t * cachep, -+ slab_t *slabp, void * objp) -+{ -+ int i; -+ unsigned int objnr; -+ -+#if DEBUG -+ if (cachep->flags & SLAB_RED_ZONE) { -+ objp -= BYTES_PER_WORD; -+ if ( *(unsigned long *)objp != RED_MAGIC2) -+ /* Either write before start, or a double free. */ -+ return 0; -+ if (*(unsigned long *)(objp+cachep->objsize - -+ BYTES_PER_WORD) != RED_MAGIC2) -+ /* Either write past end, or a double free. */ -+ return 0; -+ } -+#endif -+ -+ objnr = (objp-slabp->s_mem)/cachep->objsize; -+ if (objnr >= cachep->num) -+ return 0; -+ if (objp != slabp->s_mem + objnr*cachep->objsize) -+ return 0; -+ -+ /* Check slab's freelist to see if this obj is there. */ -+ for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { -+ if (i == objnr) -+ return 0; -+ } -+ return 1; -+} -+ -+ -+int kmem_cache_validate(kmem_cache_t *cachep, void *objp) -+{ -+ struct page *page = check_get_page((unsigned long)objp); -+ -+ if (!VALID_PAGE(page)) -+ return 0; -+ -+ if (!PageSlab(page)) -+ return 0; -+ -+ /* XXX check for freed slab objects ? 
*/ -+ if (!kmem_check_cache_obj(cachep, GET_PAGE_SLAB(page), objp)) -+ return 0; -+ -+ return (cachep == GET_PAGE_CACHE(page)); -+} -+ - #if DEBUG - static int kmem_extra_free_checks (kmem_cache_t * cachep, - slab_t *slabp, void * objp) - -_ diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.18-18.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.18-18.patch index 5c1f090..141b5d4 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.18-18.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.18-18.patch @@ -1,16 +1,17 @@ fs/dcache.c | 20 ++ - fs/exec.c | 18 +- - fs/namei.c | 338 ++++++++++++++++++++++++++++++++++++++++--------- + fs/exec.c | 19 +- + fs/namei.c | 378 +++++++++++++++++++++++++++++++++++++++++-------- fs/nfsd/vfs.c | 2 - fs/open.c | 120 +++++++++++++++-- + fs/open.c | 120 +++++++++++++-- + fs/proc/base.c | 1 fs/stat.c | 8 - - include/linux/dcache.h | 28 ++++ - include/linux/fs.h | 27 +++ + include/linux/dcache.h | 31 ++++ + include/linux/fs.h | 28 +++ kernel/ksyms.c | 1 - 9 files changed, 478 insertions(+), 84 deletions(-) + 10 files changed, 522 insertions(+), 85 deletions(-) ---- linux-2.4.18-18.8.0-l12/fs/dcache.c~vfs_intent-2.4.18-18 Wed Feb 26 16:54:17 2003 -+++ linux-2.4.18-18.8.0-l12-phil/fs/dcache.c Wed Feb 26 17:31:36 2003 +--- linux-2.4.18-61chaos/fs/dcache.c~vfs_intent-2.4.18-18 Sun Jun 1 21:55:14 2003 ++++ linux-2.4.18-61chaos-root/fs/dcache.c Sun Jun 1 21:59:04 2003 @@ -186,6 +186,13 @@ int d_invalidate(struct dentry * dentry) spin_unlock(&dcache_lock); return 0; @@ -56,8 +57,8 @@ } #define do_switch(x,y) do { \ ---- linux-2.4.18-18.8.0-l12/fs/namei.c~vfs_intent-2.4.18-18 Wed Feb 26 16:54:17 2003 -+++ linux-2.4.18-18.8.0-l12-phil/fs/namei.c Wed Feb 26 16:54:17 2003 +--- linux-2.4.18-61chaos/fs/namei.c~vfs_intent-2.4.18-18 Sun Jun 1 21:55:14 2003 ++++ linux-2.4.18-61chaos-root/fs/namei.c Sun Jun 1 23:14:49 2003 @@ -94,6 +94,13 @@ * XEmacs seems to be relying on it... 
*/ @@ -142,15 +143,22 @@ { int err; if (current->link_count >= max_recursive_link) -@@ -348,10 +377,14 @@ static inline int do_follow_link(struct +@@ -348,10 +377,21 @@ static inline int do_follow_link(struct current->link_count++; current->total_link_count++; UPDATE_ATIME(dentry->d_inode); - err = dentry->d_inode->i_op->follow_link(dentry, nd); ++ nd->it = it; + if (dentry->d_inode->i_op->follow_link2) + err = dentry->d_inode->i_op->follow_link2(dentry, nd, it); + else + err = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (!err && it != NULL && !(it->it_int_flags & IT_FL_FOLLOWED)) { ++ /* vfs_follow_link was never called */ ++ intent_release(dentry, it); ++ path_release(nd); ++ err = -ENOLINK; ++ } current->link_count--; return err; loop: @@ -158,7 +166,7 @@ path_release(nd); return -ELOOP; } -@@ -381,15 +414,26 @@ int follow_up(struct vfsmount **mnt, str +@@ -381,15 +421,26 @@ int follow_up(struct vfsmount **mnt, str return __follow_up(mnt, dentry); } @@ -186,7 +194,7 @@ dput(*dentry); mntput(mounted->mnt_parent); *dentry = dget(mounted->mnt_root); -@@ -401,7 +445,7 @@ static inline int __follow_down(struct v +@@ -401,7 +452,7 @@ static inline int __follow_down(struct v int follow_down(struct vfsmount **mnt, struct dentry **dentry) { @@ -195,7 +203,7 @@ } static inline void follow_dotdot(struct nameidata *nd) -@@ -437,7 +481,7 @@ static inline void follow_dotdot(struct +@@ -437,7 +488,7 @@ static inline void follow_dotdot(struct mntput(nd->mnt); nd->mnt = parent; } @@ -204,7 +212,7 @@ ; } -@@ -449,7 +493,8 @@ static inline void follow_dotdot(struct +@@ -449,7 +500,8 @@ static inline void follow_dotdot(struct * * We expect 'base' to be positive and a directory. */ @@ -214,7 +222,7 @@ { struct dentry *dentry; struct inode *inode; -@@ -526,18 +571,18 @@ int link_path_walk(const char * name, st +@@ -526,18 +578,18 @@ int link_path_walk(const char * name, st break; } /* This does the actual lookups.. 
*/ @@ -236,7 +244,7 @@ ; err = -ENOENT; -@@ -548,8 +593,8 @@ int link_path_walk(const char * name, st +@@ -548,8 +600,8 @@ int link_path_walk(const char * name, st if (!inode->i_op) goto out_dput; @@ -247,7 +255,7 @@ dput(dentry); if (err) goto return_err; -@@ -565,7 +610,7 @@ int link_path_walk(const char * name, st +@@ -565,7 +617,7 @@ int link_path_walk(const char * name, st nd->dentry = dentry; } err = -ENOTDIR; @@ -256,7 +264,7 @@ break; continue; /* here ends the main loop */ -@@ -592,22 +637,23 @@ last_component: +@@ -592,22 +644,23 @@ last_component: if (err < 0) break; } @@ -285,7 +293,7 @@ dput(dentry); if (err) goto return_err; -@@ -621,7 +667,8 @@ last_component: +@@ -621,7 +674,8 @@ last_component: goto no_inode; if (lookup_flags & LOOKUP_DIRECTORY) { err = -ENOTDIR; @@ -295,7 +303,33 @@ break; } goto return_base; -@@ -658,15 +705,28 @@ out_dput: +@@ -645,7 +699,24 @@ return_reval: + * Check the cached dentry for staleness. + */ + dentry = nd->dentry; +- if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { ++ revalidate_again: ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) { ++ err = -ESTALE; ++ if (!dentry->d_op->d_revalidate2(dentry, 0, it)) { ++ struct dentry *new; ++ err = permission(dentry->d_parent->d_inode, ++ MAY_EXEC); ++ if (err) ++ break; ++ new = real_lookup(dentry->d_parent, ++ &dentry->d_name, 0, NULL); ++ d_invalidate(dentry); ++ dput(dentry); ++ dentry = new; ++ goto revalidate_again; ++ } ++ } ++ else if (dentry && dentry->d_op && dentry->d_op->d_revalidate){ + err = -ESTALE; + if (!dentry->d_op->d_revalidate(dentry, 0)) { + d_invalidate(dentry); +@@ -658,15 +729,28 @@ out_dput: dput(dentry); break; } @@ -325,7 +359,7 @@ } /* SMP-safe */ -@@ -751,6 +811,17 @@ walk_init_root(const char *name, struct +@@ -751,6 +835,17 @@ walk_init_root(const char *name, struct } /* SMP-safe */ @@ -343,7 +377,15 @@ int path_lookup(const char *path, unsigned flags, struct nameidata *nd) { int error = 0; -@@ -779,7 +850,8 @@ int 
path_init(const char *name, unsigned +@@ -765,6 +860,7 @@ int path_init(const char *name, unsigned + { + nd->last_type = LAST_ROOT; /* if there are only slashes... */ + nd->flags = flags; ++ nd->it = NULL; + if (*name=='/') + return walk_init_root(name,nd); + read_lock(&current->fs->lock); +@@ -779,7 +875,8 @@ int path_init(const char *name, unsigned * needs parent already locked. Doesn't follow mounts. * SMP-safe. */ @@ -353,7 +395,7 @@ { struct dentry * dentry; struct inode *inode; -@@ -802,13 +874,16 @@ struct dentry * lookup_hash(struct qstr +@@ -802,13 +899,16 @@ struct dentry * lookup_hash(struct qstr goto out; } @@ -371,7 +413,7 @@ dentry = inode->i_op->lookup(inode, new); unlock_kernel(); if (!dentry) -@@ -820,6 +895,12 @@ out: +@@ -820,6 +920,12 @@ out: return dentry; } @@ -384,7 +426,7 @@ /* SMP-safe */ struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) { -@@ -841,7 +922,7 @@ struct dentry * lookup_one_len(const cha +@@ -841,7 +947,7 @@ struct dentry * lookup_one_len(const cha } this.hash = end_name_hash(hash); @@ -393,7 +435,7 @@ access: return ERR_PTR(-EACCES); } -@@ -872,6 +953,23 @@ int __user_walk(const char *name, unsign +@@ -872,6 +978,23 @@ int __user_walk(const char *name, unsign return err; } @@ -417,7 +459,7 @@ /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. -@@ -1045,14 +1143,17 @@ int may_open(struct nameidata *nd, int a +@@ -1045,14 +1168,17 @@ int may_open(struct nameidata *nd, int a return get_lease(inode, flag); } @@ -434,9 +476,9 @@ struct nameidata nd; + struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = open_flags }; int count = 0; - - if ((flag+1) & O_ACCMODE) -@@ -1066,7 +1167,7 @@ struct file *filp_open(const char * path + + if (!capable(CAP_SYS_ADMIN)) +@@ -1069,7 +1195,7 @@ struct file *filp_open(const char * path * The simplest case - just a plain lookup. 
*/ if (!(flag & O_CREAT)) { @@ -445,7 +487,7 @@ if (error) return ERR_PTR(error); dentry = nd.dentry; -@@ -1076,6 +1177,8 @@ struct file *filp_open(const char * path +@@ -1079,6 +1205,8 @@ struct file *filp_open(const char * path /* * Create - we need to know the parent. */ @@ -454,7 +496,7 @@ error = path_lookup(pathname, LOOKUP_PARENT, &nd); if (error) return ERR_PTR(error); -@@ -1091,7 +1194,7 @@ struct file *filp_open(const char * path +@@ -1094,7 +1222,7 @@ struct file *filp_open(const char * path dir = nd.dentry; down(&dir->d_inode->i_sem); @@ -463,7 +505,7 @@ do_last: error = PTR_ERR(dentry); -@@ -1100,6 +1203,7 @@ do_last: +@@ -1103,6 +1231,7 @@ do_last: goto exit; } @@ -471,7 +513,7 @@ /* Negative dentry, just create the file */ if (!dentry->d_inode) { error = vfs_create(dir->d_inode, dentry, -@@ -1129,12 +1233,13 @@ do_last: +@@ -1132,12 +1261,13 @@ do_last: error = -ELOOP; if (flag & O_NOFOLLOW) goto exit_dput; @@ -487,7 +529,7 @@ goto do_link; dput(nd.dentry); -@@ -1149,11 +1254,13 @@ ok: +@@ -1152,11 +1282,13 @@ ok: if (!S_ISREG(nd.dentry->d_inode->i_mode)) open_flags &= ~O_TRUNC; @@ -502,17 +544,24 @@ path_release(&nd); return ERR_PTR(error); -@@ -1172,10 +1279,15 @@ do_link: +@@ -1175,10 +1307,22 @@ do_link: * are done. Procfs-like symlinks just set LAST_BIND. 
*/ UPDATE_ATIME(dentry->d_inode); - error = dentry->d_inode->i_op->follow_link(dentry, &nd); ++ nd.it = &it; + if (dentry->d_inode->i_op->follow_link2) + error = dentry->d_inode->i_op->follow_link2(dentry, &nd, &it); + else + error = dentry->d_inode->i_op->follow_link(dentry, &nd); -+ if (error) ++ if (error) { ++ intent_release(dentry, &it); ++ } else if (!(it.it_int_flags & IT_FL_FOLLOWED)) { ++ /* vfs_follow_link was never called */ + intent_release(dentry, &it); ++ path_release(&nd); ++ error = -ENOLINK; ++ } dput(dentry); if (error) - return error; @@ -520,7 +569,7 @@ if (nd.last_type == LAST_BIND) { dentry = nd.dentry; goto ok; -@@ -1194,13 +1306,15 @@ do_link: +@@ -1197,13 +1341,15 @@ do_link: } dir = nd.dentry; down(&dir->d_inode->i_sem); @@ -538,7 +587,7 @@ { struct dentry *dentry; -@@ -1208,7 +1322,7 @@ static struct dentry *lookup_create(stru +@@ -1211,7 +1357,7 @@ static struct dentry *lookup_create(stru dentry = ERR_PTR(-EEXIST); if (nd->last_type != LAST_NORM) goto fail; @@ -547,7 +596,7 @@ if (IS_ERR(dentry)) goto fail; if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1264,7 +1378,19 @@ asmlinkage long sys_mknod(const char * f +@@ -1267,7 +1413,19 @@ asmlinkage long sys_mknod(const char * f error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; @@ -568,7 +617,7 @@ error = PTR_ERR(dentry); mode &= ~current->fs->umask; -@@ -1285,6 +1411,7 @@ asmlinkage long sys_mknod(const char * f +@@ -1288,6 +1446,7 @@ asmlinkage long sys_mknod(const char * f dput(dentry); } up(&nd.dentry->d_inode->i_sem); @@ -576,7 +625,7 @@ path_release(&nd); out: putname(tmp); -@@ -1332,7 +1459,17 @@ asmlinkage long sys_mkdir(const char * p +@@ -1335,7 +1494,17 @@ asmlinkage long sys_mkdir(const char * p error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; @@ -595,7 +644,7 @@ error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_mkdir(nd.dentry->d_inode, dentry, -@@ -1340,6 +1477,7 @@ asmlinkage long sys_mkdir(const char * p +@@ 
-1343,6 +1512,7 @@ asmlinkage long sys_mkdir(const char * p dput(dentry); } up(&nd.dentry->d_inode->i_sem); @@ -603,7 +652,7 @@ path_release(&nd); out: putname(tmp); -@@ -1440,8 +1578,33 @@ asmlinkage long sys_rmdir(const char * p +@@ -1443,8 +1613,33 @@ asmlinkage long sys_rmdir(const char * p error = -EBUSY; goto exit1; } @@ -638,7 +687,7 @@ error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_rmdir(nd.dentry->d_inode, dentry); -@@ -1499,8 +1662,17 @@ asmlinkage long sys_unlink(const char * +@@ -1502,8 +1697,17 @@ asmlinkage long sys_unlink(const char * error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; @@ -657,7 +706,7 @@ error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ -@@ -1567,15 +1739,26 @@ asmlinkage long sys_symlink(const char * +@@ -1570,15 +1774,26 @@ asmlinkage long sys_symlink(const char * error = path_lookup(to, LOOKUP_PARENT, &nd); if (error) goto out; @@ -686,7 +735,7 @@ putname(to); } putname(from); -@@ -1642,7 +1825,7 @@ asmlinkage long sys_link(const char * ol +@@ -1645,7 +1860,7 @@ asmlinkage long sys_link(const char * ol struct dentry *new_dentry; struct nameidata nd, old_nd; @@ -695,7 +744,7 @@ if (error) goto exit; error = path_lookup(to, LOOKUP_PARENT, &nd); -@@ -1651,7 +1834,17 @@ asmlinkage long sys_link(const char * ol +@@ -1654,7 +1869,17 @@ asmlinkage long sys_link(const char * ol error = -EXDEV; if (old_nd.mnt != nd.mnt) goto out_release; @@ -714,7 +763,7 @@ error = PTR_ERR(new_dentry); if (!IS_ERR(new_dentry)) { error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); -@@ -1695,7 +1888,8 @@ exit: +@@ -1698,7 +1923,8 @@ exit: * locking]. 
*/ int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, @@ -724,7 +773,7 @@ { int error; struct inode *target; -@@ -1753,6 +1947,7 @@ int vfs_rename_dir(struct inode *old_dir +@@ -1756,6 +1982,7 @@ int vfs_rename_dir(struct inode *old_dir error = -EBUSY; else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); @@ -732,7 +781,7 @@ if (target) { if (!error) target->i_flags |= S_DEAD; -@@ -1774,7 +1969,8 @@ out_unlock: +@@ -1777,7 +2004,8 @@ out_unlock: } int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, @@ -742,7 +791,7 @@ { int error; -@@ -1805,6 +2001,7 @@ int vfs_rename_other(struct inode *old_d +@@ -1808,6 +2036,7 @@ int vfs_rename_other(struct inode *old_d error = -EBUSY; else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); @@ -750,7 +799,7 @@ double_up(&old_dir->i_zombie, &new_dir->i_zombie); if (error) return error; -@@ -1816,13 +2013,14 @@ int vfs_rename_other(struct inode *old_d +@@ -1819,13 +2048,14 @@ int vfs_rename_other(struct inode *old_d } int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -768,7 +817,7 @@ if (!error) { if (old_dir == new_dir) inode_dir_notify(old_dir, DN_RENAME); -@@ -1864,7 +2062,7 @@ static inline int do_rename(const char * +@@ -1867,7 +2097,7 @@ static inline int do_rename(const char * double_lock(new_dir, old_dir); @@ -777,7 +826,7 @@ error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; -@@ -1880,16 +2078,37 @@ static inline int do_rename(const char * +@@ -1883,16 +2113,37 @@ static inline int do_rename(const char * if (newnd.last.name[newnd.last.len]) goto exit4; } @@ -818,7 +867,7 @@ dput(new_dentry); exit4: dput(old_dentry); -@@ -1940,7 +2159,8 @@ out: +@@ -1943,12 +2194,19 @@ out: } static inline int @@ -828,7 +877,18 @@ { int res = 0; char *name; -@@ -1953,7 +2173,7 @@ __vfs_follow_link(struct nameidata *nd, + if (IS_ERR(link)) + goto fail; ++ if (it == NULL) ++ it = nd->it; ++ else if (it != nd->it) ++ printk("it 
!= nd->it: tell phil@clusterfs.com\n"); ++ if (it != NULL) ++ it->it_int_flags |= IT_FL_FOLLOWED; + + if (*link == '/') { + path_release(nd); +@@ -1956,7 +2214,7 @@ __vfs_follow_link(struct nameidata *nd, /* weird __emul_prefix() stuff did it */ goto out; } @@ -837,7 +897,7 @@ out: if (current->link_count || res || nd->last_type!=LAST_NORM) return res; -@@ -1975,7 +2195,13 @@ fail: +@@ -1978,7 +2236,13 @@ fail: int vfs_follow_link(struct nameidata *nd, const char *link) { @@ -852,7 +912,7 @@ } /* get the link contents into pagecache */ -@@ -2017,7 +2243,7 @@ int page_follow_link(struct dentry *dent +@@ -2020,7 +2284,7 @@ int page_follow_link(struct dentry *dent { struct page *page = NULL; char *s = page_getlink(dentry, &page); @@ -861,8 +921,8 @@ if (page) { kunmap(page); page_cache_release(page); ---- linux-2.4.18-18.8.0-l12/fs/nfsd/vfs.c~vfs_intent-2.4.18-18 Wed Feb 26 16:54:17 2003 -+++ linux-2.4.18-18.8.0-l12-phil/fs/nfsd/vfs.c Wed Feb 26 16:54:17 2003 +--- linux-2.4.18-61chaos/fs/nfsd/vfs.c~vfs_intent-2.4.18-18 Sun Jun 1 21:55:14 2003 ++++ linux-2.4.18-61chaos-root/fs/nfsd/vfs.c Sun Jun 1 21:59:04 2003 @@ -1298,7 +1298,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru err = nfserr_perm; } else @@ -872,8 +932,8 @@ unlock_kernel(); if (!err && EX_ISSYNC(tfhp->fh_export)) { nfsd_sync_dir(tdentry); ---- linux-2.4.18-18.8.0-l12/fs/open.c~vfs_intent-2.4.18-18 Wed Feb 26 16:54:17 2003 -+++ linux-2.4.18-18.8.0-l12-phil/fs/open.c Wed Feb 26 16:54:17 2003 +--- linux-2.4.18-61chaos/fs/open.c~vfs_intent-2.4.18-18 Sun Jun 1 21:55:14 2003 ++++ linux-2.4.18-61chaos-root/fs/open.c Sun Jun 1 21:59:04 2003 @@ -19,6 +19,8 @@ #include <asm/uaccess.h> @@ -1185,8 +1245,8 @@ /* * Find an empty file descriptor entry, and mark it busy. 
*/ ---- linux-2.4.18-18.8.0-l12/fs/stat.c~vfs_intent-2.4.18-18 Wed Feb 26 16:54:17 2003 -+++ linux-2.4.18-18.8.0-l12-phil/fs/stat.c Wed Feb 26 16:54:17 2003 +--- linux-2.4.18-61chaos/fs/stat.c~vfs_intent-2.4.18-18 Sun Jun 1 21:55:14 2003 ++++ linux-2.4.18-61chaos-root/fs/stat.c Sun Jun 1 21:59:04 2003 @@ -104,10 +104,12 @@ int vfs_stat(char *name, struct kstat *s { struct nameidata nd; @@ -1215,9 +1275,9 @@ path_release(&nd); } return error; ---- linux-2.4.18-18.8.0-l12/fs/exec.c~vfs_intent-2.4.18-18 Wed Feb 26 16:54:17 2003 -+++ linux-2.4.18-18.8.0-l12-phil/fs/exec.c Wed Feb 26 16:54:17 2003 -@@ -103,13 +103,18 @@ static inline void put_binfmt(struct lin +--- linux-2.4.18-61chaos/fs/exec.c~vfs_intent-2.4.18-18 Sun Jun 1 21:55:14 2003 ++++ linux-2.4.18-61chaos-root/fs/exec.c Sun Jun 1 21:59:04 2003 +@@ -112,13 +112,18 @@ static inline void put_binfmt(struct lin * * Also note that we take the address to load from from the file itself. */ @@ -1237,7 +1297,7 @@ if (error) goto out; -@@ -121,7 +126,8 @@ asmlinkage long sys_uselib(const char * +@@ -130,7 +135,8 @@ asmlinkage long sys_uselib(const char * if (error) goto exit; @@ -1247,7 +1307,7 @@ error = PTR_ERR(file); if (IS_ERR(file)) goto out; -@@ -350,8 +356,9 @@ struct file *open_exec(const char *name) +@@ -359,8 +365,9 @@ struct file *open_exec(const char *name) struct inode *inode; struct file *file; int err = 0; @@ -1258,17 +1318,26 @@ file = ERR_PTR(err); if (!err) { inode = nd.dentry->d_inode; -@@ -363,7 +370,8 @@ struct file *open_exec(const char *name) +@@ -372,8 +379,9 @@ struct file *open_exec(const char *name) err = -EACCES; file = ERR_PTR(err); if (!err) { - file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); + file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); -+ intent_release(nd.dentry, &it); if (!IS_ERR(file)) { ++ intent_release(nd.dentry, &it); err = deny_write_access(file); if (err) { -@@ -976,7 +984,7 @@ int do_coredump(long signr, struct pt_re + fput(file); +@@ -384,6 +392,7 @@ out: + return 
file; + } + } ++ intent_release(nd.dentry, &it); + path_release(&nd); + } + goto out; +@@ -1104,7 +1113,7 @@ int do_coredump(long signr, struct pt_re goto close_fail; if (!file->f_op->write) goto close_fail; @@ -1277,9 +1346,9 @@ goto close_fail; retval = binfmt->core_dump(signr, regs, file); ---- linux-2.4.18-18.8.0-l12/include/linux/dcache.h~vfs_intent-2.4.18-18 Wed Feb 26 16:54:17 2003 -+++ linux-2.4.18-18.8.0-l12-phil/include/linux/dcache.h Wed Feb 26 17:01:30 2003 -@@ -6,6 +6,25 @@ +--- linux-2.4.18-61chaos/include/linux/dcache.h~vfs_intent-2.4.18-18 Sun Jun 1 21:55:14 2003 ++++ linux-2.4.18-61chaos-root/include/linux/dcache.h Sun Jun 1 22:02:31 2003 +@@ -6,6 +6,28 @@ #include <asm/atomic.h> #include <linux/mount.h> @@ -1290,13 +1359,16 @@ +#define IT_LOOKUP (1<<4) +#define IT_UNLINK (1<<5) + ++#define IT_FL_LOCKED (1) ++#define IT_FL_FOLLOWED (1<<1) /* set by vfs_follow_link */ ++ +struct lookup_intent { + int it_op; + int it_mode; + int it_flags; + int it_disposition; + int it_status; -+ struct iattr *it_iattr; ++ int it_int_flags; + __u64 it_lock_handle[2]; + int it_lock_mode; + void *it_data; @@ -1305,7 +1377,7 @@ /* * linux/include/linux/dcache.h * -@@ -78,6 +97,7 @@ struct dentry { +@@ -78,6 +100,7 @@ struct dentry { unsigned long d_time; /* used by d_revalidate */ struct dentry_operations *d_op; struct super_block * d_sb; /* The root of the dentry tree */ @@ -1313,7 +1385,7 @@ unsigned long d_vfs_flags; void * d_fsdata; /* fs-specific data */ void * d_extra_attributes; /* TUX-specific data */ -@@ -91,8 +111,15 @@ struct dentry_operations { +@@ -91,8 +114,15 @@ struct dentry_operations { int (*d_delete)(struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); @@ -1329,7 +1401,7 @@ /* the dentry parameter passed to d_hash and d_compare is the parent * directory of the entries to be compared. 
It is used in case these * functions need any directory specific information for determining -@@ -124,6 +151,7 @@ d_iput: no no yes +@@ -124,6 +154,7 @@ d_iput: no no yes * s_nfsd_free_path semaphore will be down */ #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ @@ -1337,9 +1409,9 @@ extern spinlock_t dcache_lock; ---- linux-2.4.18-18.8.0-l12/include/linux/fs.h~vfs_intent-2.4.18-18 Wed Feb 26 16:54:17 2003 -+++ linux-2.4.18-18.8.0-l12-phil/include/linux/fs.h Wed Feb 26 17:31:42 2003 -@@ -338,6 +338,8 @@ extern void set_bh_page(struct buffer_he +--- linux-2.4.18-61chaos/include/linux/fs.h~vfs_intent-2.4.18-18 Sun Jun 1 21:59:03 2003 ++++ linux-2.4.18-61chaos-root/include/linux/fs.h Sun Jun 1 22:01:46 2003 +@@ -339,6 +339,8 @@ extern void set_bh_page(struct buffer_he #define ATTR_MTIME_SET 256 #define ATTR_FORCE 512 /* Not a change, but a change it */ #define ATTR_ATTR_FLAG 1024 @@ -1348,7 +1420,7 @@ /* * This is the Inode Attributes structure, used for notify_change(). 
It -@@ -576,6 +578,7 @@ struct file { +@@ -578,6 +580,7 @@ struct file { /* needed for tty driver, and maybe others */ void *private_data; @@ -1356,7 +1428,15 @@ /* preallocated helper kiobuf to speedup O_DIRECT */ struct kiobuf *f_iobuf; -@@ -836,7 +839,9 @@ extern int vfs_symlink(struct inode *, s +@@ -707,6 +710,7 @@ struct nameidata { + struct qstr last; + unsigned int flags; + int last_type; ++ struct lookup_intent *it; + }; + + #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ +@@ -840,7 +844,9 @@ extern int vfs_symlink(struct inode *, s extern int vfs_link(struct dentry *, struct inode *, struct dentry *); extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *); @@ -1367,7 +1447,7 @@ /* * File types -@@ -897,20 +902,33 @@ struct file_operations { +@@ -901,20 +907,33 @@ struct file_operations { struct inode_operations { int (*create) (struct inode *,struct dentry *,int); struct dentry * (*lookup) (struct inode *,struct dentry *); @@ -1401,7 +1481,7 @@ int (*getattr) (struct dentry *, struct iattr *); }; -@@ -1112,7 +1130,7 @@ static inline int get_lease(struct inode +@@ -1119,7 +1138,7 @@ static inline int get_lease(struct inode asmlinkage long sys_open(const char *, int, int); asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ @@ -1410,7 +1490,7 @@ extern struct file *filp_open(const char *, int, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); -@@ -1381,6 +1399,7 @@ typedef int (*read_actor_t)(read_descrip +@@ -1388,6 +1407,7 @@ typedef int (*read_actor_t)(read_descrip extern loff_t default_llseek(struct file *file, loff_t offset, int origin); extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); @@ -1418,7 +1498,7 @@ extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); extern int FASTCALL(path_walk(const char *, struct nameidata *)); extern int FASTCALL(path_lookup(const char *, unsigned, 
struct nameidata *)); -@@ -1392,6 +1411,8 @@ extern struct dentry * lookup_one_len(co +@@ -1399,6 +1419,8 @@ extern struct dentry * lookup_one_len(co extern struct dentry * lookup_hash(struct qstr *, struct dentry *); #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) @@ -1427,7 +1507,7 @@ extern void inode_init_once(struct inode *); extern void iput(struct inode *); -@@ -1492,6 +1513,8 @@ extern struct file_operations generic_ro +@@ -1499,6 +1521,8 @@ extern struct file_operations generic_ro extern int vfs_readlink(struct dentry *, char *, int, const char *); extern int vfs_follow_link(struct nameidata *, const char *); @@ -1436,9 +1516,9 @@ extern int page_readlink(struct dentry *, char *, int); extern int page_follow_link(struct dentry *, struct nameidata *); extern struct inode_operations page_symlink_inode_operations; ---- linux-2.4.18-18.8.0-l12/kernel/ksyms.c~vfs_intent-2.4.18-18 Wed Feb 26 16:54:17 2003 -+++ linux-2.4.18-18.8.0-l12-phil/kernel/ksyms.c Wed Feb 26 16:54:17 2003 -@@ -293,6 +293,7 @@ EXPORT_SYMBOL(read_cache_page); +--- linux-2.4.18-61chaos/kernel/ksyms.c~vfs_intent-2.4.18-18 Sun Jun 1 21:59:03 2003 ++++ linux-2.4.18-61chaos-root/kernel/ksyms.c Sun Jun 1 21:59:04 2003 +@@ -294,6 +294,7 @@ EXPORT_SYMBOL(read_cache_page); EXPORT_SYMBOL(set_page_dirty); EXPORT_SYMBOL(vfs_readlink); EXPORT_SYMBOL(vfs_follow_link); @@ -1446,3 +1526,17 @@ EXPORT_SYMBOL(page_readlink); EXPORT_SYMBOL(page_follow_link); EXPORT_SYMBOL(page_symlink_inode_operations); + +_ +--- linux/fs/proc/base.c.old Sat Jun 7 00:55:09 2003 ++++ linux/fs/proc/base.c Sat Jun 7 00:55:33 2003 +@@ -465,6 +465,9 @@ + + error = inode->u.proc_i.op.proc_get_link(inode, &nd->dentry, &nd->mnt); + nd->last_type = LAST_BIND; ++ ++ if (nd->it != NULL) ++ nd->it->it_int_flags |= IT_FL_FOLLOWED; + out: + return error; + } diff --git a/lustre/kernel_patches/patches/vfs_intent.patch 
b/lustre/kernel_patches/patches/vfs_intent-2.4.20-rh.patch similarity index 50% rename from lustre/kernel_patches/patches/vfs_intent.patch rename to lustre/kernel_patches/patches/vfs_intent-2.4.20-rh.patch index 75e404b..710cdc9 100644 --- a/lustre/kernel_patches/patches/vfs_intent.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.20-rh.patch @@ -1,22 +1,14 @@ - fs/dcache.c | 8 + - fs/namei.c | 287 ++++++++++++++++++++++++++++++++++++++++--------- - fs/nfsd/vfs.c | 2 - fs/open.c | 53 +++++++-- - fs/stat.c | 9 + - include/linux/dcache.h | 25 ++++ - include/linux/fs.h | 22 +++ - kernel/ksyms.c | 1 - 8 files changed, 344 insertions(+), 63 deletions(-) + 0 files changed ---- linux-2.4.18-18.8.0-l7/fs/dcache.c~vfs_intent-2.4.18-18 Mon Jan 20 08:28:00 2003 -+++ linux-2.4.18-18.8.0-l7-root/fs/dcache.c Mon Jan 20 08:54:54 2003 -@@ -186,6 +188,13 @@ int d_invalidate(struct dentry * dentry) +--- linux-2.4.20-rh/fs/dcache.c~vfs_intent-2.4.20-rh 2003-04-11 14:04:58.000000000 +0800 ++++ linux-2.4.20-rh-root/fs/dcache.c 2003-06-09 23:18:07.000000000 +0800 +@@ -186,6 +186,13 @@ int d_invalidate(struct dentry * dentry) spin_unlock(&dcache_lock); return 0; } + + /* network invalidation by Lustre */ -+ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { ++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) { + spin_unlock(&dcache_lock); + return 0; + } @@ -24,17 +16,40 @@ /* * Check whether to do a partial shrink_dcache * to get rid of unused child entries. 
-@@ -645,6 +654,7 @@ struct dentry * d_alloc(struct dentry * +@@ -624,6 +631,7 @@ struct dentry * d_alloc(struct dentry * dentry->d_fsdata = NULL; dentry->d_extra_attributes = NULL; dentry->d_mounted = 0; -+ dentry->d_it = NULL; ++ dentry->d_it = NULL; + dentry->d_cookie = NULL; INIT_LIST_HEAD(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); - INIT_LIST_HEAD(&dentry->d_subdirs); ---- linux-2.4.18-18.8.0-l7/fs/namei.c~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 -+++ linux-2.4.18-18.8.0-l7-root/fs/namei.c Wed Jan 22 22:53:28 2003 -@@ -94,6 +97,13 @@ +@@ -839,13 +847,19 @@ void d_delete(struct dentry * dentry) + * Adds a dentry to the hash according to its name. + */ + +-void d_rehash(struct dentry * entry) ++void __d_rehash(struct dentry * entry, int lock) + { + struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash); + if (!list_empty(&entry->d_hash)) BUG(); +- spin_lock(&dcache_lock); ++ if (lock) spin_lock(&dcache_lock); + list_add(&entry->d_hash, list); +- spin_unlock(&dcache_lock); ++ if (lock) spin_unlock(&dcache_lock); ++} ++EXPORT_SYMBOL(__d_rehash); ++ ++void d_rehash(struct dentry * entry) ++{ ++ __d_rehash(entry, 1); + } + + #define do_switch(x,y) do { \ +--- linux-2.4.20-rh/fs/namei.c~vfs_intent-2.4.20-rh 2003-04-11 14:04:57.000000000 +0800 ++++ linux-2.4.20-rh-root/fs/namei.c 2003-06-09 23:18:07.000000000 +0800 +@@ -94,6 +94,13 @@ * XEmacs seems to be relying on it... */ @@ -48,7 +63,7 @@ /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. -@@ -260,10 +271,19 @@ void path_release(struct nameidata *nd) +@@ -260,10 +267,19 @@ void path_release(struct nameidata *nd) * Internal lookup() using the new generic dcache. 
* SMP-safe */ @@ -69,7 +84,7 @@ if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { dput(dentry); -@@ -281,7 +301,8 @@ static struct dentry * cached_lookup(str +@@ -281,11 +297,14 @@ static struct dentry * cached_lookup(str * make sure that nobody added the entry to the dcache in the meantime.. * SMP-safe */ @@ -79,7 +94,13 @@ { struct dentry * result; struct inode *dir = parent->d_inode; -@@ -300,6 +321,9 @@ static struct dentry * real_lookup(struc + ++again: ++ + down(&dir->i_sem); + /* + * First re-do the cached lookup just in case it was created +@@ -300,6 +319,9 @@ static struct dentry * real_lookup(struc result = ERR_PTR(-ENOMEM); if (dentry) { lock_kernel(); @@ -89,7 +110,7 @@ result = dir->i_op->lookup(dir, dentry); unlock_kernel(); if (result) -@@ -321,6 +345,12 @@ static struct dentry * real_lookup(struc +@@ -321,6 +343,12 @@ static struct dentry * real_lookup(struc dput(result); result = ERR_PTR(-ENOENT); } @@ -97,30 +118,37 @@ + if (!result->d_op->d_revalidate2(result, flags, it) && + !d_invalidate(result)) { + dput(result); -+ result = ERR_PTR(-ENOENT); ++ goto again; + } } return result; } -@@ -334,7 +364,8 @@ int max_recursive_link = 5; +@@ -334,7 +362,8 @@ int max_recursive_link = 5; * Without that kind of total limit, nasty chains of consecutive * symlinks can cause almost arbitrarily long lookups. 
*/ -static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd) -+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, ++static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd, + struct lookup_intent *it) { int err; if (current->link_count >= max_recursive_link) -@@ -348,10 +379,14 @@ static inline int do_follow_link(struct +@@ -348,10 +377,21 @@ static inline int do_follow_link(struct current->link_count++; current->total_link_count++; UPDATE_ATIME(dentry->d_inode); - err = dentry->d_inode->i_op->follow_link(dentry, nd); ++ nd->it = it; + if (dentry->d_inode->i_op->follow_link2) + err = dentry->d_inode->i_op->follow_link2(dentry, nd, it); -+ else ++ else + err = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (!err && it != NULL && !(it->it_int_flags & IT_FL_FOLLOWED)) { ++ /* vfs_follow_link was never called */ ++ intent_release(dentry, it); ++ path_release(nd); ++ err = -ENOLINK; ++ } current->link_count--; return err; loop: @@ -128,7 +156,53 @@ path_release(nd); return -ELOOP; } -@@ -449,7 +484,8 @@ static inline void follow_dotdot(struct +@@ -381,15 +421,26 @@ int follow_up(struct vfsmount **mnt, str + return __follow_up(mnt, dentry); + } + +-static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry) ++static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry, ++ struct lookup_intent *it) + { + struct vfsmount *mounted; + + spin_lock(&dcache_lock); + mounted = lookup_mnt(*mnt, *dentry); + if (mounted) { ++ int opc = 0, mode = 0; + *mnt = mntget(mounted); + spin_unlock(&dcache_lock); ++ if (it) { ++ opc = it->it_op; ++ mode = it->it_mode; ++ } ++ intent_release(*dentry, it); ++ if (it) { ++ it->it_op = opc; ++ it->it_mode = mode; ++ } + dput(*dentry); + mntput(mounted->mnt_parent); + *dentry = dget(mounted->mnt_root); +@@ -401,7 +452,7 @@ static inline int __follow_down(struct v + + int follow_down(struct vfsmount **mnt, struct dentry **dentry) + { +- 
return __follow_down(mnt,dentry); ++ return __follow_down(mnt,dentry,NULL); + } + + static inline void follow_dotdot(struct nameidata *nd) +@@ -437,7 +488,7 @@ static inline void follow_dotdot(struct + mntput(nd->mnt); + nd->mnt = parent; + } +- while (d_mountpoint(nd->dentry) && __follow_down(&nd->mnt, &nd->dentry)) ++ while (d_mountpoint(nd->dentry) && __follow_down(&nd->mnt, &nd->dentry, NULL)) + ; + } + +@@ -449,7 +500,8 @@ static inline void follow_dotdot(struct * * We expect 'base' to be positive and a directory. */ @@ -138,22 +212,29 @@ { struct dentry *dentry; struct inode *inode; -@@ -526,12 +562,12 @@ int link_path_walk(const char * name, st +@@ -526,18 +578,18 @@ int link_path_walk(const char * name, st break; } /* This does the actual lookups.. */ - dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); ++ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); if (!dentry) { err = -EWOULDBLOCKIO; if (atomic) break; - dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); -+ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); ++ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL); err = PTR_ERR(dentry); if (IS_ERR(dentry)) break; -@@ -548,8 +584,8 @@ int link_path_walk(const char * name, st + } + /* Check mountpoints.. 
*/ +- while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)) ++ while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, NULL)) + ; + + err = -ENOENT; +@@ -548,8 +600,8 @@ int link_path_walk(const char * name, st if (!inode->i_op) goto out_dput; @@ -164,7 +245,7 @@ dput(dentry); if (err) goto return_err; -@@ -565,7 +601,7 @@ int link_path_walk(const char * name, st +@@ -565,7 +617,7 @@ int link_path_walk(const char * name, st nd->dentry = dentry; } err = -ENOTDIR; @@ -173,44 +254,70 @@ break; continue; /* here ends the main loop */ -@@ -592,12 +628,12 @@ last_component: +@@ -592,22 +644,23 @@ last_component: if (err < 0) break; } - dentry = cached_lookup(nd->dentry, &this, 0); -+ dentry = cached_lookup(nd->dentry, &this, 0, it); ++ dentry = cached_lookup(nd->dentry, &this, 0, it); if (!dentry) { err = -EWOULDBLOCKIO; if (atomic) break; - dentry = real_lookup(nd->dentry, &this, 0); -+ dentry = real_lookup(nd->dentry, &this, 0, it); ++ dentry = real_lookup(nd->dentry, &this, 0, it); err = PTR_ERR(dentry); if (IS_ERR(dentry)) break; -@@ -606,8 +642,9 @@ last_component: + } +- while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)) ++ while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry, it)) ; inode = dentry->d_inode; if ((lookup_flags & LOOKUP_FOLLOW) - && inode && inode->i_op && inode->i_op->follow_link) { - err = do_follow_link(dentry, nd); -+ && inode && inode->i_op && -+ (inode->i_op->follow_link || inode->i_op->follow_link2)) { -+ err = do_follow_link(dentry, nd, it); ++ && inode && inode->i_op && ++ (inode->i_op->follow_link || inode->i_op->follow_link2)) { ++ err = do_follow_link(dentry, nd, it); dput(dentry); if (err) goto return_err; -@@ -621,7 +659,8 @@ last_component: +@@ -621,7 +674,8 @@ last_component: goto no_inode; if (lookup_flags & LOOKUP_DIRECTORY) { err = -ENOTDIR; - if (!inode->i_op || !inode->i_op->lookup) -+ if (!inode->i_op || (!inode->i_op->lookup && -+ !inode->i_op->lookup2)) ++ if (!inode->i_op || ++ 
(!inode->i_op->lookup && !inode->i_op->lookup2)) break; } goto return_base; -@@ -658,15 +697,28 @@ out_dput: +@@ -645,6 +699,23 @@ return_reval: + * Check the cached dentry for staleness. + */ + dentry = nd->dentry; ++ revalidate_again: ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) { ++ err = -ESTALE; ++ if (!dentry->d_op->d_revalidate2(dentry, 0, it)) { ++ struct dentry *new; ++ err = permission(dentry->d_parent->d_inode, ++ MAY_EXEC); ++ if (err) ++ break; ++ new = real_lookup(dentry->d_parent, ++ &dentry->d_name, 0, NULL); ++ d_invalidate(dentry); ++ dput(dentry); ++ dentry = new; ++ goto revalidate_again; ++ } ++ } else + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + err = -ESTALE; + if (!dentry->d_op->d_revalidate(dentry, 0)) { +@@ -658,15 +729,28 @@ out_dput: dput(dentry); break; } @@ -240,7 +347,7 @@ } /* SMP-safe */ -@@ -751,6 +803,17 @@ walk_init_root(const char *name, struct +@@ -751,6 +835,17 @@ walk_init_root(const char *name, struct } /* SMP-safe */ @@ -258,7 +365,15 @@ int path_lookup(const char *path, unsigned flags, struct nameidata *nd) { int error = 0; -@@ -779,7 +842,8 @@ int path_init(const char *name, unsigned +@@ -765,6 +860,7 @@ int path_init(const char *name, unsigned + { + nd->last_type = LAST_ROOT; /* if there are only slashes... */ + nd->flags = flags; ++ nd->it = NULL; + if (*name=='/') + return walk_init_root(name,nd); + read_lock(¤t->fs->lock); +@@ -779,7 +875,8 @@ int path_init(const char *name, unsigned * needs parent already locked. Doesn't follow mounts. * SMP-safe. 
*/ @@ -268,7 +383,7 @@ { struct dentry * dentry; struct inode *inode; -@@ -802,13 +866,16 @@ struct dentry * lookup_hash(struct qstr +@@ -802,13 +899,16 @@ struct dentry * lookup_hash(struct qstr goto out; } @@ -286,7 +401,7 @@ dentry = inode->i_op->lookup(inode, new); unlock_kernel(); if (!dentry) -@@ -820,6 +887,12 @@ out: +@@ -820,6 +920,12 @@ out: return dentry; } @@ -299,7 +414,7 @@ /* SMP-safe */ struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) { -@@ -841,7 +914,7 @@ struct dentry * lookup_one_len(const cha +@@ -841,7 +947,7 @@ struct dentry * lookup_one_len(const cha } this.hash = end_name_hash(hash); @@ -308,7 +423,7 @@ access: return ERR_PTR(-EACCES); } -@@ -872,6 +945,23 @@ int __user_walk(const char *name, unsign +@@ -872,6 +978,23 @@ int __user_walk(const char *name, unsign return err; } @@ -332,109 +447,125 @@ /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. -@@ -1045,14 +1135,17 @@ int may_open(struct nameidata *nd, int a - return get_lease(inode, flag); - } - -+extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it); -+ - struct file *filp_open(const char * pathname, int open_flags, int mode) +@@ -1010,7 +1133,8 @@ exit_lock: + * for symlinks (where the permissions are checked later). 
+ * SMP-safe + */ +-int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) ++int open_namei_it(const char *pathname, int flag, int mode, ++ struct nameidata *nd, struct lookup_intent *it) { int acc_mode, error = 0; -- struct inode *inode; - struct dentry *dentry; - struct dentry *dir; - int flag = open_flags; - struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = open_flags }; - int count = 0; - - if ((flag+1) & O_ACCMODE) -@@ -1066,7 +1159,7 @@ struct file *filp_open(const char * path + struct inode *inode; +@@ -1024,7 +1148,7 @@ int open_namei(const char * pathname, in * The simplest case - just a plain lookup. */ if (!(flag & O_CREAT)) { -- error = path_lookup(pathname, lookup_flags(flag), &nd); -+ error = path_lookup_it(pathname, lookup_flags(flag), &nd, &it); +- error = path_lookup(pathname, lookup_flags(flag), nd); ++ error = path_lookup_it(pathname, lookup_flags(flag), nd, it); if (error) - return ERR_PTR(error); - dentry = nd.dentry; -@@ -1076,6 +1169,8 @@ struct file *filp_open(const char * path + return error; + dentry = nd->dentry; +@@ -1034,6 +1158,10 @@ int open_namei(const char * pathname, in /* * Create - we need to know the parent. 
*/ -+ it.it_mode = mode; -+ it.it_op |= IT_CREAT; - error = path_lookup(pathname, LOOKUP_PARENT, &nd); ++ if (it) { ++ it->it_mode = mode; ++ it->it_op |= IT_CREAT; ++ } + error = path_lookup(pathname, LOOKUP_PARENT, nd); if (error) - return ERR_PTR(error); -@@ -1091,7 +1186,7 @@ struct file *filp_open(const char * path + return error; +@@ -1049,7 +1177,7 @@ int open_namei(const char * pathname, in - dir = nd.dentry; + dir = nd->dentry; down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); do_last: error = PTR_ERR(dentry); -@@ -1100,6 +1195,7 @@ do_last: +@@ -1058,6 +1186,7 @@ do_last: goto exit; } -+ it.it_mode = mode; ++ it->it_mode = mode; /* Negative dentry, just create the file */ if (!dentry->d_inode) { error = vfs_create(dir->d_inode, dentry, -@@ -1134,7 +1230,8 @@ do_last: +@@ -1086,12 +1215,13 @@ do_last: + error = -ELOOP; + if (flag & O_NOFOLLOW) + goto exit_dput; +- while (__follow_down(&nd->mnt,&dentry) && d_mountpoint(dentry)); ++ while (__follow_down(&nd->mnt,&dentry,it) && d_mountpoint(dentry)); + } error = -ENOENT; if (!dentry->d_inode) goto exit_dput; - if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link) -+ if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link || ++ if (dentry->d_inode->i_op && (dentry->d_inode->i_op->follow_link || + dentry->d_inode->i_op->follow_link2)) goto do_link; - dput(nd.dentry); -@@ -1149,11 +1246,13 @@ ok: - if (!S_ISREG(nd.dentry->d_inode->i_mode)) - open_flags &= ~O_TRUNC; - -- return dentry_open(nd.dentry, nd.mnt, open_flags); -+ return dentry_open_it(nd.dentry, nd.mnt, open_flags, &it); + dput(nd->dentry); +@@ -1165,7 +1295,7 @@ ok: + if (!error) { + DQUOT_INIT(inode); + +- error = do_truncate(dentry, 0); ++ error = do_truncate(dentry, 0, 1); + } + put_write_access(inode); + if (error) +@@ -1177,8 +1307,10 @@ ok: + 
return 0; exit_dput: -+ intent_release(dentry, &it); ++ intent_release(dentry, it); dput(dentry); exit: -+ intent_release(nd.dentry, &it); - path_release(&nd); - return ERR_PTR(error); ++ intent_release(nd->dentry, it); + path_release(nd); + return error; -@@ -1172,7 +1271,12 @@ do_link: +@@ -1197,7 +1329,19 @@ do_link: * are done. Procfs-like symlinks just set LAST_BIND. */ UPDATE_ATIME(dentry->d_inode); -- error = dentry->d_inode->i_op->follow_link(dentry, &nd); -+ if (dentry->d_inode->i_op->follow_link2) -+ error = dentry->d_inode->i_op->follow_link2(dentry, &nd, &it); -+ else -+ error = dentry->d_inode->i_op->follow_link(dentry, &nd); -+ if (error) -+ intent_release(dentry, &it); +- error = dentry->d_inode->i_op->follow_link(dentry, nd); ++ nd->it = it; ++ if (dentry->d_inode->i_op->follow_link2) ++ error = dentry->d_inode->i_op->follow_link2(dentry, nd, it); ++ else ++ error = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (error) { ++ intent_release(dentry, it); ++ } else if (it != NULL && !(it->it_int_flags & IT_FL_FOLLOWED)) { ++ /* vfs_follow_link was never called */ ++ intent_release(dentry, it); ++ path_release(nd); ++ error = -ENOLINK; ++ } dput(dentry); if (error) return error; -@@ -1194,13 +1298,15 @@ do_link: +@@ -1219,13 +1363,20 @@ do_link: } - dir = nd.dentry; + dir = nd->dentry; down(&dir->d_inode->i_sem); -- dentry = lookup_hash(&nd.last, nd.dentry); -+ dentry = lookup_hash_it(&nd.last, nd.dentry, &it); - putname(nd.last.name); +- dentry = lookup_hash(&nd->last, nd->dentry); ++ dentry = lookup_hash_it(&nd->last, nd->dentry, it); + putname(nd->last.name); goto do_last; } ++int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd) ++{ ++ return open_namei_it(pathname, flag, mode, nd, NULL); ++} ++ + /* SMP-safe */ -static struct dentry *lookup_create(struct nameidata *nd, int is_dir) @@ -443,7 +574,7 @@ { struct dentry *dentry; -@@ -1208,7 +1314,7 @@ static struct dentry *lookup_create(stru +@@ -1233,7 +1384,7 @@ static 
struct dentry *lookup_create(stru dentry = ERR_PTR(-EEXIST); if (nd->last_type != LAST_NORM) goto fail; @@ -452,7 +583,7 @@ if (IS_ERR(dentry)) goto fail; if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1264,7 +1370,19 @@ asmlinkage long sys_mknod(const char * f +@@ -1289,7 +1440,19 @@ asmlinkage long sys_mknod(const char * f error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; @@ -460,11 +591,11 @@ + + if (nd.dentry->d_inode->i_op->mknod2) { + struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mknod2(nd.dentry->d_inode, -+ nd.last.name, ++ error = op->mknod2(nd.dentry->d_inode, ++ nd.last.name, + nd.last.len, + mode, dev); -+ /* the file system want to use normal vfs path now */ ++ /* the file system wants to use normal vfs path now */ + if (error != -EOPNOTSUPP) + goto out2; + } @@ -473,34 +604,34 @@ error = PTR_ERR(dentry); mode &= ~current->fs->umask; -@@ -1285,6 +1403,7 @@ asmlinkage long sys_mknod(const char * f +@@ -1310,6 +1473,7 @@ asmlinkage long sys_mknod(const char * f dput(dentry); } up(&nd.dentry->d_inode->i_sem); -+ out2: ++out2: path_release(&nd); out: putname(tmp); -@@ -1332,7 +1451,17 @@ asmlinkage long sys_mkdir(const char * p +@@ -1357,7 +1521,17 @@ asmlinkage long sys_mkdir(const char * p error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 1); -+ if (nd.dentry->d_inode->i_op->mkdir2) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mkdir2(nd.dentry->d_inode, -+ nd.last.name, -+ nd.last.len, -+ mode); -+ /* the file system want to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } ++ if (nd.dentry->d_inode->i_op->mkdir2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ mode); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } + dentry = lookup_create(&nd, 
1, NULL); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_mkdir(nd.dentry->d_inode, dentry, -@@ -1340,6 +1469,7 @@ asmlinkage long sys_mkdir(const char * p +@@ -1365,6 +1539,7 @@ asmlinkage long sys_mkdir(const char * p dput(dentry); } up(&nd.dentry->d_inode->i_sem); @@ -508,16 +639,32 @@ path_release(&nd); out: putname(tmp); -@@ -1440,8 +1570,17 @@ asmlinkage long sys_rmdir(const char * p +@@ -1465,8 +1640,33 @@ asmlinkage long sys_rmdir(const char * p error = -EBUSY; goto exit1; } + if (nd.dentry->d_inode->i_op->rmdir2) { + struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->rmdir2(nd.dentry->d_inode, -+ nd.last.name, ++ struct dentry *last; ++ ++ down(&nd.dentry->d_inode->i_sem); ++ last = lookup_hash_it(&nd.last, nd.dentry, NULL); ++ up(&nd.dentry->d_inode->i_sem); ++ if (IS_ERR(last)) { ++ error = PTR_ERR(last); ++ goto exit1; ++ } ++ if (d_mountpoint(last)) { ++ dput(last); ++ error = -EBUSY; ++ goto exit1; ++ } ++ dput(last); ++ ++ error = op->rmdir2(nd.dentry->d_inode, ++ nd.last.name, + nd.last.len); -+ /* the file system want to use normal vfs path now */ ++ /* the file system wants to use normal vfs path now */ + if (error != -EOPNOTSUPP) + goto exit1; + } @@ -527,16 +674,16 @@ error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_rmdir(nd.dentry->d_inode, dentry); -@@ -1499,8 +1638,17 @@ asmlinkage long sys_unlink(const char * +@@ -1524,8 +1724,17 @@ asmlinkage long sys_unlink(const char * error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; + if (nd.dentry->d_inode->i_op->unlink2) { + struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->unlink2(nd.dentry->d_inode, -+ nd.last.name, ++ error = op->unlink2(nd.dentry->d_inode, ++ nd.last.name, + nd.last.len); -+ /* the file system want to use normal vfs path now */ ++ /* the file system wants to use normal vfs path now */ + if (error != -EOPNOTSUPP) + goto exit1; + } @@ -546,18 +693,18 @@ error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why 
not before? Because we want correct error value */ -@@ -1567,15 +1715,26 @@ asmlinkage long sys_symlink(const char * +@@ -1592,15 +1801,26 @@ asmlinkage long sys_symlink(const char * error = path_lookup(to, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); + if (nd.dentry->d_inode->i_op->symlink2) { + struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->symlink2(nd.dentry->d_inode, -+ nd.last.name, ++ error = op->symlink2(nd.dentry->d_inode, ++ nd.last.name, + nd.last.len, + from); -+ /* the file system want to use normal vfs path now */ ++ /* the file system wants to use normal vfs path now */ + if (error != -EOPNOTSUPP) + goto out2; + } @@ -575,27 +722,18 @@ putname(to); } putname(from); -@@ -1642,7 +1801,7 @@ asmlinkage long sys_link(const char * ol - struct dentry *new_dentry; - struct nameidata nd, old_nd; - -- error = __user_walk(oldname, LOOKUP_POSITIVE, &old_nd); -+ error = __user_walk_it(oldname, LOOKUP_POSITIVE, &old_nd, NULL); - if (error) - goto exit; - error = path_lookup(to, LOOKUP_PARENT, &nd); -@@ -1651,7 +1810,17 @@ asmlinkage long sys_link(const char * ol +@@ -1676,7 +1896,17 @@ asmlinkage long sys_link(const char * ol error = -EXDEV; if (old_nd.mnt != nd.mnt) goto out_release; - new_dentry = lookup_create(&nd, 0); + if (nd.dentry->d_inode->i_op->link2) { + struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->link2(old_nd.dentry->d_inode, -+ nd.dentry->d_inode, -+ nd.last.name, ++ error = op->link2(old_nd.dentry->d_inode, ++ nd.dentry->d_inode, ++ nd.last.name, + nd.last.len); -+ /* the file system want to use normal vfs path now */ ++ /* the file system wants to use normal vfs path now */ + if (error != -EOPNOTSUPP) + goto out_release; + } @@ -603,7 +741,7 @@ error = PTR_ERR(new_dentry); if (!IS_ERR(new_dentry)) { error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); -@@ -1695,7 +1864,8 @@ exit: +@@ -1720,7 +1950,8 @@ exit: * locking]. 
*/ int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, @@ -613,7 +751,7 @@ { int error; struct inode *target; -@@ -1753,6 +1923,7 @@ int vfs_rename_dir(struct inode *old_dir +@@ -1778,6 +2009,7 @@ int vfs_rename_dir(struct inode *old_dir error = -EBUSY; else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); @@ -621,7 +759,7 @@ if (target) { if (!error) target->i_flags |= S_DEAD; -@@ -1774,7 +1945,8 @@ out_unlock: +@@ -1799,7 +2031,8 @@ out_unlock: } int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, @@ -631,7 +769,7 @@ { int error; -@@ -1805,6 +1977,7 @@ int vfs_rename_other(struct inode *old_d +@@ -1830,6 +2063,7 @@ int vfs_rename_other(struct inode *old_d error = -EBUSY; else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); @@ -639,7 +777,7 @@ double_up(&old_dir->i_zombie, &new_dir->i_zombie); if (error) return error; -@@ -1816,13 +1989,14 @@ int vfs_rename_other(struct inode *old_d +@@ -1841,13 +2075,14 @@ int vfs_rename_other(struct inode *old_d } int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -657,24 +795,8 @@ if (!error) { if (old_dir == new_dir) inode_dir_notify(old_dir, DN_RENAME); -@@ -1862,9 +2036,23 @@ static inline int do_rename(const char * - if (newnd.last_type != LAST_NORM) - goto exit2; +@@ -1889,7 +2124,7 @@ static inline int do_rename(const char * -+ if (old_dir->d_inode->i_op->rename2) { -+ lock_kernel(); -+ error = old_dir->d_inode->i_op->rename2(old_dir->d_inode, -+ new_dir->d_inode, -+ oldnd.last.name, -+ oldnd.last.len, -+ newnd.last.name, -+ newnd.last.len); -+ unlock_kernel(); -+ /* the file system want to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto exit2; -+ } -+ double_lock(new_dir, old_dir); - old_dentry = lookup_hash(&oldnd.last, old_dir); @@ -682,7 +804,7 @@ error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; -@@ -1880,14 +2068,14 @@ static inline int do_rename(const char * +@@ -1905,16 +2140,37 @@ 
static inline int do_rename(const char * if (newnd.last.name[newnd.last.len]) goto exit4; } @@ -692,24 +814,60 @@ if (IS_ERR(new_dentry)) goto exit4; ++ if (old_dir->d_inode->i_op->rename2) { ++ lock_kernel(); ++ /* don't rename mount point. mds will take care of ++ * the rest sanity checking */ ++ if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) { ++ error = -EBUSY; ++ goto exit5; ++ } ++ ++ error = old_dir->d_inode->i_op->rename2(old_dir->d_inode, ++ new_dir->d_inode, ++ oldnd.last.name, ++ oldnd.last.len, ++ newnd.last.name, ++ newnd.last.len); ++ unlock_kernel(); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto exit5; ++ } ++ lock_kernel(); error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry); + new_dir->d_inode, new_dentry, NULL); unlock_kernel(); - +- ++exit5: dput(new_dentry); -@@ -1940,7 +2127,8 @@ out: + exit4: + dput(old_dentry); +@@ -1965,20 +2221,28 @@ out: } static inline int -__vfs_follow_link(struct nameidata *nd, const char *link) -+__vfs_follow_link(struct nameidata *nd, const char *link, ++__vfs_follow_link(struct nameidata *nd, const char *link, + struct lookup_intent *it) { int res = 0; char *name; -@@ -1953,7 +2141,7 @@ __vfs_follow_link(struct nameidata *nd, + if (IS_ERR(link)) + goto fail; + ++ if (it == NULL) ++ it = nd->it; ++ else if (it != nd->it) ++ printk("it != nd->it: tell phil@clusterfs.com\n"); ++ if (it != NULL) ++ it->it_int_flags |= IT_FL_FOLLOWED; ++ + if (*link == '/') { + path_release(nd); + if (!walk_init_root(link, nd)) /* weird __emul_prefix() stuff did it */ goto out; } @@ -718,7 +876,7 @@ out: if (current->link_count || res || nd->last_type!=LAST_NORM) return res; -@@ -1975,7 +2163,13 @@ fail: +@@ -2002,7 +2266,13 @@ fail: int vfs_follow_link(struct nameidata *nd, const char *link) { @@ -726,14 +884,14 @@ + return __vfs_follow_link(nd, link, NULL); +} + -+int vfs_follow_link_it(struct nameidata *nd, const char *link, ++int 
vfs_follow_link_it(struct nameidata *nd, const char *link, + struct lookup_intent *it) +{ + return __vfs_follow_link(nd, link, it); } /* get the link contents into pagecache */ -@@ -2017,7 +2211,7 @@ int page_follow_link(struct dentry *dent +@@ -2044,7 +2314,7 @@ int page_follow_link(struct dentry *dent { struct page *page = NULL; char *s = page_getlink(dentry, &page); @@ -742,34 +900,61 @@ if (page) { kunmap(page); page_cache_release(page); ---- linux-2.4.18-18.8.0-l7/fs/nfsd/vfs.c~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 -+++ linux-2.4.18-18.8.0-l7-root/fs/nfsd/vfs.c Mon Jan 20 12:25:10 2003 -@@ -1298,7 +1298,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru +--- linux-2.4.20-rh/fs/nfsd/vfs.c~vfs_intent-2.4.20-rh 2003-04-11 14:04:48.000000000 +0800 ++++ linux-2.4.20-rh-root/fs/nfsd/vfs.c 2003-06-09 23:18:07.000000000 +0800 +@@ -1293,7 +1293,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru err = nfserr_perm; } else #endif - err = vfs_rename(fdir, odentry, tdir, ndentry); + err = vfs_rename(fdir, odentry, tdir, ndentry, NULL); - unlock_kernel(); if (!err && EX_ISSYNC(tfhp->fh_export)) { nfsd_sync_dir(tdentry); ---- linux-2.4.18-18.8.0-l7/fs/open.c~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 -+++ linux-2.4.18-18.8.0-l7-root/fs/open.c Wed Jan 22 10:39:31 2003 -@@ -19,6 +19,9 @@ + nfsd_sync_dir(fdentry); +--- linux-2.4.20-rh/fs/open.c~vfs_intent-2.4.20-rh 2003-04-11 14:04:57.000000000 +0800 ++++ linux-2.4.20-rh-root/fs/open.c 2003-06-09 23:18:07.000000000 +0800 +@@ -19,6 +19,8 @@ #include <asm/uaccess.h> #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) +extern int path_walk_it(const char *name, struct nameidata *nd, + struct lookup_intent *it); -+extern void intent_release(struct dentry *de, struct lookup_intent *it); int vfs_statfs(struct super_block *sb, struct statfs *buf) { -@@ -118,12 +121,13 @@ static inline long do_sys_truncate(const +@@ -95,9 +97,10 @@ void fd_install(unsigned int fd, struct + write_unlock(&files->file_lock); + } + 
+-int do_truncate(struct dentry *dentry, loff_t length) ++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open) + { + struct inode *inode = dentry->d_inode; ++ struct inode_operations *op = dentry->d_inode->i_op; + int error; + struct iattr newattrs; + +@@ -108,7 +111,14 @@ int do_truncate(struct dentry *dentry, l + down(&inode->i_sem); + newattrs.ia_size = length; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; +- error = notify_change(dentry, &newattrs); ++ if (called_from_open) ++ newattrs.ia_valid |= ATTR_FROM_OPEN; ++ if (op->setattr_raw) { ++ newattrs.ia_valid |= ATTR_RAW; ++ newattrs.ia_ctime = CURRENT_TIME; ++ error = op->setattr_raw(inode, &newattrs); ++ } else ++ error = notify_change(dentry, &newattrs); + up(&inode->i_sem); + return error; + } +@@ -118,12 +128,13 @@ static inline long do_sys_truncate(const struct nameidata nd; struct inode * inode; int error; -+ struct lookup_intent it = { .it_op = IT_TRUNC }; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; error = -EINVAL; if (length < 0) /* sorry, but loff_t says... 
*/ @@ -780,7 +965,14 @@ if (error) goto out; inode = nd.dentry->d_inode; -@@ -168,6 +172,7 @@ static inline long do_sys_truncate(const +@@ -163,11 +174,13 @@ static inline long do_sys_truncate(const + error = locks_verify_truncate(inode, NULL, length); + if (!error) { + DQUOT_INIT(inode); +- error = do_truncate(nd.dentry, length); ++ intent_release(nd.dentry, &it); ++ error = do_truncate(nd.dentry, length, 0); + } put_write_access(inode); dput_and_out: @@ -788,45 +980,100 @@ path_release(&nd); out: return error; -@@ -259,8 +264,9 @@ asmlinkage long sys_utime(char * filenam - struct nameidata nd; +@@ -215,7 +228,7 @@ static inline long do_sys_ftruncate(unsi + + error = locks_verify_truncate(inode, file, length); + if (!error) +- error = do_truncate(dentry, length); ++ error = do_truncate(dentry, length, 0); + out_putf: + fput(file); + out: +@@ -260,11 +273,13 @@ asmlinkage long sys_utime(char * filenam struct inode * inode; struct iattr newattrs; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); ++ error = user_path_walk_it(filename, &nd, NULL); if (error) goto out; inode = nd.dentry->d_inode; -@@ -286,6 +292,7 @@ asmlinkage long sys_utime(char * filenam + ++ /* this is safe without a Lustre lock because it only depends ++ on the super block */ + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; +@@ -279,11 +294,29 @@ asmlinkage long sys_utime(char * filenam + goto dput_and_out; + + newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; +- } else { ++ } ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } ++ ++ error = -EROFS; ++ if (IS_RDONLY(inode)) ++ goto dput_and_out; ++ ++ error = -EPERM; ++ if (!times) { + if 
(current->fsuid != inode->i_uid && + (error = permission(inode,MAY_WRITE)) != 0) + goto dput_and_out; } ++ error = notify_change(nd.dentry, &newattrs); dput_and_out: -+ intent_release(nd.dentry, &it); path_release(&nd); - out: - return error; -@@ -303,8 +310,9 @@ asmlinkage long sys_utimes(char * filena - struct nameidata nd; +@@ -304,12 +337,14 @@ asmlinkage long sys_utimes(char * filena struct inode * inode; struct iattr newattrs; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; - error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); ++ error = user_path_walk_it(filename, &nd, NULL); if (error) goto out; -@@ -331,6 +339,7 @@ asmlinkage long sys_utimes(char * filena - } - error = notify_change(nd.dentry, &newattrs); - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -347,6 +356,7 @@ asmlinkage long sys_access(const char * + inode = nd.dentry->d_inode; + ++ /* this is safe without a Lustre lock because it only depends ++ on the super block */ + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; +@@ -324,7 +359,20 @@ asmlinkage long sys_utimes(char * filena + newattrs.ia_atime = times[0].tv_sec; + newattrs.ia_mtime = times[1].tv_sec; + newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; +- } else { ++ } ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } ++ ++ error = -EPERM; ++ if (!utimes) { + if (current->fsuid != inode->i_uid && + (error = permission(inode,MAY_WRITE)) != 0) + goto dput_and_out; +@@ -347,6 +395,7 @@ asmlinkage long sys_access(const char * int old_fsuid, old_fsgid; kernel_cap_t old_cap; int res; @@ -834,7 +1081,7 @@ if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? 
*/ return -EINVAL; -@@ -364,13 +374,14 @@ asmlinkage long sys_access(const char * +@@ -364,13 +413,14 @@ asmlinkage long sys_access(const char * else current->cap_effective = current->cap_permitted; @@ -850,20 +1097,18 @@ path_release(&nd); } -@@ -385,8 +396,11 @@ asmlinkage long sys_chdir(const char * f +@@ -385,8 +435,9 @@ asmlinkage long sys_chdir(const char * f { int error; struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; - error = __user_walk(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd); -+ error = __user_walk_it(filename, -+ LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, -+ &nd, &it); ++ error = __user_walk_it(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd, &it); if (error) goto out; -@@ -397,6 +411,7 @@ asmlinkage long sys_chdir(const char * f +@@ -397,6 +448,7 @@ asmlinkage long sys_chdir(const char * f set_fs_pwd(current->fs, nd.mnt, nd.dentry); dput_and_out: @@ -871,20 +1116,20 @@ path_release(&nd); out: return error; -@@ -436,9 +451,10 @@ asmlinkage long sys_chroot(const char * +@@ -436,9 +488,10 @@ asmlinkage long sys_chroot(const char * { int error; struct nameidata nd; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; - error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | -+ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); ++ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); if (error) goto out; -@@ -454,6 +470,7 @@ asmlinkage long sys_chroot(const char * +@@ -454,6 +507,7 @@ asmlinkage long sys_chroot(const char * set_fs_altroot(); error = 0; dput_and_out: @@ -892,56 +1137,68 @@ path_release(&nd); out: return error; -@@ -498,8 +515,9 @@ asmlinkage long sys_chmod(const char * f - struct inode * inode; - int error; - struct iattr newattrs; -+ struct lookup_intent it = { .it_op = 
IT_SETATTR }; +@@ -508,6 +562,18 @@ asmlinkage long sys_chmod(const char * f + if (IS_RDONLY(inode)) + goto dput_and_out; -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (error) ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ ++ newattrs.ia_mode = mode; ++ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto dput_and_out; ++ } ++ + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto dput_and_out; +@@ -538,6 +604,20 @@ static int chown_common(struct dentry * + error = -EROFS; + if (IS_RDONLY(inode)) goto out; - inode = nd.dentry->d_inode; -@@ -519,6 +537,7 @@ asmlinkage long sys_chmod(const char * f - error = notify_change(nd.dentry, &newattrs); - - dput_and_out: -+ intent_release(nd.dentry, &it); - path_release(&nd); - out: - return error; -@@ -588,10 +607,12 @@ asmlinkage long sys_chown(const char * f ++ ++ if (inode->i_op->setattr_raw) { ++ struct inode_operations *op = dentry->d_inode->i_op; ++ ++ newattrs.ia_uid = user; ++ newattrs.ia_gid = group; ++ newattrs.ia_valid = ATTR_UID | ATTR_GID; ++ newattrs.ia_valid |= ATTR_RAW; ++ error = op->setattr_raw(inode, &newattrs); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ return error; ++ } ++ + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; +@@ -642,6 +722,7 @@ struct file *filp_open(const char * file { + int namei_flags, error; struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = flags }; + + flags &= ~O_DIRECT; -- error = user_path_walk(filename, &nd); -+ error = user_path_walk_it(filename, &nd, &it); - if (!error) { - error = chown_common(nd.dentry, user, group); -+ 
intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -601,10 +622,12 @@ asmlinkage long sys_lchown(const char * - { - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_SETATTR }; +@@ -651,14 +732,15 @@ struct file *filp_open(const char * file + if (namei_flags & O_TRUNC) + namei_flags |= 2; -- error = user_path_walk_link(filename, &nd); -+ error = user_path_walk_link_it(filename, &nd, &it); - if (!error) { - error = chown_common(nd.dentry, user, group); -+ intent_release(nd.dentry, &it); - path_release(&nd); - } - return error; -@@ -628,7 +651,8 @@ extern ssize_t do_readahead(struct file - /* for files over a certains size it doesn't pay to do readahead on open */ - #define READAHEAD_CUTOFF 48000 +- error = open_namei(filename, namei_flags, mode, &nd); +- if (!error) +- return dentry_open(nd.dentry, nd.mnt, flags); ++ error = open_namei_it(filename, namei_flags, mode, &nd, &it); ++ if (error) ++ return ERR_PTR(error); + +- return ERR_PTR(error); ++ return dentry_open_it(nd.dentry, nd.mnt, flags, &it); + } -struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) +struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, @@ -949,15 +1206,15 @@ { struct file * f; struct inode *inode; -@@ -693,6 +717,7 @@ struct file *dentry_open(struct dentry * - do_readahead(f, 0, (48 * 1024) >> PAGE_SHIFT); - +@@ -701,6 +783,7 @@ struct file *dentry_open(struct dentry * + } + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + intent_release(dentry, it); return f; cleanup_all: -@@ -707,11 +732,17 @@ cleanup_all: +@@ -715,11 +798,17 @@ cleanup_all: cleanup_file: put_filp(f); cleanup_dentry: @@ -975,58 +1232,53 @@ /* * Find an empty file descriptor entry, and mark it busy. 
*/ ---- linux-2.4.18-18.8.0-l7/fs/stat.c~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 -+++ linux-2.4.18-18.8.0-l7-root/fs/stat.c Mon Jan 20 12:25:10 2003 -@@ -13,6 +13,7 @@ - - #include <asm/uaccess.h> - -+extern void intent_release(struct dentry *de, struct lookup_intent *it); - /* - * Revalidate the inode. This is required for proper NFS attribute caching. - */ -@@ -104,10 +105,12 @@ int vfs_stat(char *name, struct kstat *s +--- linux-2.4.20-rh/fs/stat.c~vfs_intent-2.4.20-rh 2003-04-11 14:05:08.000000000 +0800 ++++ linux-2.4.20-rh-root/fs/stat.c 2003-06-09 23:18:07.000000000 +0800 +@@ -110,11 +110,13 @@ static int do_getattr(struct vfsmount *m + int vfs_stat(char *name, struct kstat *stat) { struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - error = user_path_walk(name, &nd); -+ error = user_path_walk_it(name, &nd, &it); ++ error = user_path_walk_it(name, &nd, &it); if (!error) { error = do_getattr(nd.mnt, nd.dentry, stat); -+ intent_release(nd.dentry, &it); ++ intent_release(nd.dentry, &it); path_release(&nd); } return error; -@@ -117,10 +120,12 @@ int vfs_lstat(char *name, struct kstat * +@@ -123,11 +125,13 @@ int vfs_stat(char *name, struct kstat *s + int vfs_lstat(char *name, struct kstat *stat) { struct nameidata nd; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; int error; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - error = user_path_walk_link(name, &nd); -+ error = user_path_walk_link_it(name, &nd, &it); ++ error = user_path_walk_link_it(name, &nd, &it); if (!error) { error = do_getattr(nd.mnt, nd.dentry, stat); -+ intent_release(nd.dentry, &it); ++ intent_release(nd.dentry, &it); path_release(&nd); } return error; ---- linux-2.4.18-18.8.0-l7/include/linux/dcache.h~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 -+++ linux-2.4.18-18.8.0-l7-root/include/linux/dcache.h Wed Jan 22 19:38:12 2003 -@@ -6,6 +6,27 @@ - #include <asm/atomic.h> +--- 
linux-2.4.20-rh/include/linux/dcache.h~vfs_intent-2.4.20-rh 2003-04-12 15:46:39.000000000 +0800 ++++ linux-2.4.20-rh-root/include/linux/dcache.h 2003-06-09 23:18:07.000000000 +0800 +@@ -7,6 +7,28 @@ #include <linux/mount.h> + #include <linux/kernel.h> +#define IT_OPEN (1) +#define IT_CREAT (1<<1) +#define IT_READDIR (1<<2) +#define IT_GETATTR (1<<3) -+#define IT_SETATTR (1<<4) -+#define IT_TRUNC (1<<5) -+#define IT_READLINK (1<<6) -+#define IT_LOOKUP (1<<7) ++#define IT_LOOKUP (1<<4) ++#define IT_UNLINK (1<<5) ++ ++#define IT_FL_LOCKED (1) ++#define IT_FL_FOLLOWED (1<<1) /* set by vfs_follow_link */ + +struct lookup_intent { + int it_op; @@ -1034,7 +1286,7 @@ + int it_flags; + int it_disposition; + int it_status; -+ struct iattr *it_iattr; ++ int it_int_flags; + __u64 it_lock_handle[2]; + int it_lock_mode; + void *it_data; @@ -1043,7 +1295,7 @@ /* * linux/include/linux/dcache.h * -@@ -78,6 +99,7 @@ struct dentry { +@@ -82,6 +104,7 @@ struct dentry { unsigned long d_time; /* used by d_revalidate */ struct dentry_operations *d_op; struct super_block * d_sb; /* The root of the dentry tree */ @@ -1051,7 +1303,7 @@ unsigned long d_vfs_flags; void * d_fsdata; /* fs-specific data */ void * d_extra_attributes; /* TUX-specific data */ -@@ -91,6 +113,8 @@ struct dentry_operations { +@@ -96,8 +119,15 @@ struct dentry_operations { int (*d_delete)(struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); @@ -1059,8 +1311,15 @@ + void (*d_intent_release)(struct dentry *, struct lookup_intent *); }; ++/* defined in fs/namei.c */ ++extern void intent_release(struct dentry *de, struct lookup_intent *it); ++/* defined in fs/dcache.c */ ++extern void __d_rehash(struct dentry * entry, int lock); ++ /* the dentry parameter passed to d_hash and d_compare is the parent -@@ -124,6 +148,7 @@ d_iput: no no yes + * directory of the entries to be compared. 
It is used in case these + * functions need any directory specific information for determining +@@ -129,6 +159,7 @@ d_iput: no no yes * s_nfsd_free_path semaphore will be down */ #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ @@ -1068,9 +1327,18 @@ extern spinlock_t dcache_lock; ---- linux-2.4.18-18.8.0-l7/include/linux/fs.h~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 -+++ linux-2.4.18-18.8.0-l7-root/include/linux/fs.h Wed Jan 22 22:46:13 2003 -@@ -576,6 +576,7 @@ struct file { +--- linux-2.4.20-rh/include/linux/fs.h~vfs_intent-2.4.20-rh 2003-05-30 02:07:39.000000000 +0800 ++++ linux-2.4.20-rh-root/include/linux/fs.h 2003-06-09 23:18:07.000000000 +0800 +@@ -337,6 +337,8 @@ extern void set_bh_page(struct buffer_he + #define ATTR_MTIME_SET 256 + #define ATTR_FORCE 512 /* Not a change, but a change it */ + #define ATTR_ATTR_FLAG 1024 ++#define ATTR_RAW 2048 /* file system, not vfs will massage attrs */ ++#define ATTR_FROM_OPEN 4096 /* called from open path, ie O_TRUNC */ + + /* + * This is the Inode Attributes structure, used for notify_change(). 
It +@@ -574,6 +576,7 @@ struct file { /* needed for tty driver, and maybe others */ void *private_data; @@ -1078,7 +1346,15 @@ /* preallocated helper kiobuf to speedup O_DIRECT */ struct kiobuf *f_iobuf; -@@ -836,7 +837,9 @@ extern int vfs_symlink(struct inode *, s +@@ -701,6 +704,7 @@ struct nameidata { + struct qstr last; + unsigned int flags; + int last_type; ++ struct lookup_intent *it; + }; + + /* +@@ -821,7 +825,9 @@ extern int vfs_symlink(struct inode *, s extern int vfs_link(struct dentry *, struct inode *, struct dentry *); extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *); @@ -1089,7 +1365,7 @@ /* * File types -@@ -897,16 +900,28 @@ struct file_operations { +@@ -882,20 +888,33 @@ struct file_operations { struct inode_operations { int (*create) (struct inode *,struct dentry *,int); struct dentry * (*lookup) (struct inode *,struct dentry *); @@ -1108,17 +1384,38 @@ + int (*mknod2) (struct inode *, const char *, int,int,int); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); -+ int (*rename2) (struct inode *, struct inode *, -+ const char *oldname, int oldlen, ++ int (*rename2) (struct inode *, struct inode *, ++ const char *oldname, int oldlen, + const char *newname, int newlen); int (*readlink) (struct dentry *, char *,int); int (*follow_link) (struct dentry *, struct nameidata *); -+ int (*follow_link2) (struct dentry *, struct nameidata *, ++ int (*follow_link2) (struct dentry *, struct nameidata *, + struct lookup_intent *it); void (*truncate) (struct inode *); int (*permission) (struct inode *, int); int (*revalidate) (struct dentry *); -@@ -1381,6 +1396,7 @@ typedef int (*read_actor_t)(read_descrip + int (*setattr) (struct dentry *, struct iattr *); ++ int (*setattr_raw) (struct inode *, struct iattr *); + int (*getattr) (struct dentry *, struct iattr *); + int (*setxattr) (struct dentry *, const char *, void *, size_t, int); + ssize_t (*getxattr) (struct 
dentry *, const char *, void *, size_t); +@@ -1091,10 +1110,14 @@ static inline int get_lease(struct inode + + asmlinkage long sys_open(const char *, int, int); + asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ +-extern int do_truncate(struct dentry *, loff_t start); ++extern int do_truncate(struct dentry *, loff_t start, int called_from_open); + + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); ++extern int open_namei_it(const char *filename, int namei_flags, int mode, ++ struct nameidata *nd, struct lookup_intent *it); ++extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char *); + +@@ -1385,6 +1408,7 @@ typedef int (*read_actor_t)(read_descrip extern loff_t default_llseek(struct file *file, loff_t offset, int origin); extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); @@ -1126,7 +1423,7 @@ extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); extern int FASTCALL(path_walk(const char *, struct nameidata *)); extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *)); -@@ -1392,6 +1408,8 @@ extern struct dentry * lookup_one_len(co +@@ -1396,6 +1420,8 @@ extern struct dentry * lookup_one_len(co extern struct dentry * lookup_hash(struct qstr *, struct dentry *); #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) @@ -1135,18 +1432,18 @@ extern void inode_init_once(struct inode *); extern void iput(struct inode *); -@@ -1492,6 +1510,8 @@ extern struct file_operations generic_ro +@@ -1495,6 +1521,8 @@ extern struct file_operations generic_ro extern int vfs_readlink(struct dentry *, char *, int, const char *); extern int 
vfs_follow_link(struct nameidata *, const char *); -+extern int vfs_follow_link_it(struct nameidata *, const char *, ++extern int vfs_follow_link_it(struct nameidata *, const char *, + struct lookup_intent *it); extern int page_readlink(struct dentry *, char *, int); extern int page_follow_link(struct dentry *, struct nameidata *); extern struct inode_operations page_symlink_inode_operations; ---- linux-2.4.18-18.8.0-l7/kernel/ksyms.c~vfs_intent-2.4.18-18 Mon Jan 20 12:25:10 2003 -+++ linux-2.4.18-18.8.0-l7-root/kernel/ksyms.c Mon Jan 20 12:25:10 2003 -@@ -293,6 +293,7 @@ EXPORT_SYMBOL(read_cache_page); +--- linux-2.4.20-rh/kernel/ksyms.c~vfs_intent-2.4.20-rh 2003-05-30 02:07:42.000000000 +0800 ++++ linux-2.4.20-rh-root/kernel/ksyms.c 2003-06-09 23:18:07.000000000 +0800 +@@ -298,6 +298,7 @@ EXPORT_SYMBOL(read_cache_page); EXPORT_SYMBOL(set_page_dirty); EXPORT_SYMBOL(vfs_readlink); EXPORT_SYMBOL(vfs_follow_link); @@ -1154,5 +1451,79 @@ EXPORT_SYMBOL(page_readlink); EXPORT_SYMBOL(page_follow_link); EXPORT_SYMBOL(page_symlink_inode_operations); +--- linux-2.4.20-rh/fs/exec.c~vfs_intent-2.4.20-rh 2003-04-13 10:07:02.000000000 +0800 ++++ linux-2.4.20-rh-root/fs/exec.c 2003-06-09 23:18:07.000000000 +0800 +@@ -114,8 +114,9 @@ asmlinkage long sys_uselib(const char * + struct file * file; + struct nameidata nd; + int error; +- +- error = user_path_walk(library, &nd); ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY }; ++ ++ error = user_path_walk_it(library, &nd, &it); + if (error) + goto out; + +@@ -127,7 +128,8 @@ asmlinkage long sys_uselib(const char * + if (error) + goto exit; + +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); ++ intent_release(nd.dentry, &it); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; +@@ -382,8 +384,9 @@ struct file *open_exec(const char *name) + struct inode *inode; + struct file *file; + int err = 0; +- +- err = path_lookup(name, 
LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY }; ++ ++ err = path_lookup_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); + file = ERR_PTR(err); + if (!err) { + inode = nd.dentry->d_inode; +@@ -395,7 +398,7 @@ struct file *open_exec(const char *name) + err = -EACCES; + file = ERR_PTR(err); + if (!err) { +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); + if (!IS_ERR(file)) { + err = deny_write_access(file); + if (err) { +@@ -404,6 +407,7 @@ struct file *open_exec(const char *name) + } + } + out: ++ intent_release(nd.dentry, &it); + return file; + } + } +@@ -1283,7 +1287,7 @@ int do_coredump(long signr, int exit_cod + goto close_fail; + if (!file->f_op->write) + goto close_fail; +- if (do_truncate(file->f_dentry, 0) != 0) ++ if (do_truncate(file->f_dentry, 0, 0) != 0) + goto close_fail; + + retval = binfmt->core_dump(signr, regs, file); +--- linux-2.4.20-rh/fs/proc/base.c~vfs_intent-2.4.20-rh 2003-06-09 23:16:51.000000000 +0800 ++++ linux-2.4.20-rh-root/fs/proc/base.c 2003-06-09 23:18:52.000000000 +0800 +@@ -464,6 +464,9 @@ static int proc_pid_follow_link(struct d + + error = inode->u.proc_i.op.proc_get_link(inode, &nd->dentry, &nd->mnt); + nd->last_type = LAST_BIND; ++ ++ if (nd->it != NULL) ++ nd->it->it_int_flags |= IT_FL_FOLLOWED; + out: + return error; + } _ diff --git a/lustre/kernel_patches/patches/vfs_intent_hp.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch similarity index 76% rename from lustre/kernel_patches/patches/vfs_intent_hp.patch rename to lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch index fa0998a..09bcb22 100644 --- a/lustre/kernel_patches/patches/vfs_intent_hp.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch @@ -1,17 +1,79 @@ - fs/dcache.c | 8 - fs/namei.c | 335 +++++++++++++++++---- + fs/dcache.c | 20 ++ + fs/exec.c | 15 + + fs/namei.c | 378 
++++++++++++++++++++++++++++++++++++++++++------- fs/nfsd/vfs.c | 2 - fs/open.c | 142 +++++++- - fs/stat.c | 24 + - include/linux/dcache.h | 26 + - include/linux/fs.h | 27 + + fs/open.c | 126 ++++++++++++++-- + fs/proc/base.c | 3 + fs/stat.c | 24 ++- + include/linux/dcache.h | 31 ++++ + include/linux/fs.h | 32 +++- kernel/ksyms.c | 1 - fs/exec.c | 18 - - 9 files changed, 487 insertions(+), 96 deletions(-) + 10 files changed, 543 insertions(+), 89 deletions(-) + +--- linux-2.4.20-l18/fs/exec.c~vfs_intent-2.4.20-vanilla Thu Nov 28 18:53:15 2002 ++++ linux-2.4.20-l18-phil/fs/exec.c Wed May 28 01:39:18 2003 +@@ -107,8 +107,9 @@ asmlinkage long sys_uselib(const char * + struct file * file; + struct nameidata nd; + int error; ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY }; + +- error = user_path_walk(library, &nd); ++ error = user_path_walk_it(library, &nd, &it); + if (error) + goto out; + +@@ -120,7 +121,8 @@ asmlinkage long sys_uselib(const char * + if (error) + goto exit; + +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); ++ intent_release(nd.dentry, &it); + error = PTR_ERR(file); + if (IS_ERR(file)) + goto out; +@@ -363,8 +365,9 @@ struct file *open_exec(const char *name) + struct inode *inode; + struct file *file; + int err = 0; ++ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY }; + +- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); ++ err = path_lookup_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it); + file = ERR_PTR(err); + if (!err) { + inode = nd.dentry->d_inode; +@@ -376,7 +379,8 @@ struct file *open_exec(const char *name) + err = -EACCES; + file = ERR_PTR(err); + if (!err) { +- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); ++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); ++ intent_release(nd.dentry, &it); + if (!IS_ERR(file)) { + err = deny_write_access(file); + if (err) { +@@ -388,6 +392,7 @@ out: + return file; + } + } ++ 
intent_release(nd.dentry, &it); + path_release(&nd); + } + goto out; +@@ -989,7 +994,7 @@ int do_coredump(long signr, struct pt_re + goto close_fail; + if (!file->f_op->write) + goto close_fail; +- if (do_truncate(file->f_dentry, 0) != 0) ++ if (do_truncate(file->f_dentry, 0, 0) != 0) + goto close_fail; ---- linux-2.4.19-hp2_pnnl4/fs/dcache.c~vfs_intent_hp Sun Jan 19 19:04:47 2003 -+++ linux-2.4.19-hp2_pnnl4-root/fs/dcache.c Sun Jan 19 19:04:47 2003 -@@ -186,6 +188,13 @@ int d_invalidate(struct dentry * dentry) + retval = binfmt->core_dump(signr, regs, file); +--- linux-2.4.20-l18/fs/dcache.c~vfs_intent-2.4.20-vanilla Thu Nov 28 18:53:15 2002 ++++ linux-2.4.20-l18-phil/fs/dcache.c Wed May 28 01:39:18 2003 +@@ -181,6 +181,13 @@ int d_invalidate(struct dentry * dentry) spin_unlock(&dcache_lock); return 0; } @@ -25,7 +87,7 @@ /* * Check whether to do a partial shrink_dcache * to get rid of unused child entries. -@@ -616,6 +618,7 @@ struct dentry * d_alloc(struct dentry * +@@ -616,6 +623,7 @@ struct dentry * d_alloc(struct dentry * dentry->d_op = NULL; dentry->d_fsdata = NULL; dentry->d_mounted = 0; @@ -33,7 +95,7 @@ INIT_LIST_HEAD(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); -@@ -859,13 +867,19 @@ void d_delete(struct dentry * dentry) +@@ -830,13 +838,19 @@ void d_delete(struct dentry * dentry) * Adds a dentry to the hash according to its name. */ @@ -56,9 +118,9 @@ } #define do_switch(x,y) do { \ ---- linux-2.4.19-hp2_pnnl4/fs/namei.c~vfs_intent_hp Sun Jan 19 19:04:47 2003 -+++ linux-2.4.19-hp2_pnnl4-root/fs/namei.c Sun Jan 19 19:35:55 2003 -@@ -94,6 +97,13 @@ +--- linux-2.4.20-l18/fs/namei.c~vfs_intent-2.4.20-vanilla Thu Nov 28 18:53:15 2002 ++++ linux-2.4.20-l18-phil/fs/namei.c Sun Jun 1 23:41:35 2003 +@@ -94,6 +94,13 @@ * XEmacs seems to be relying on it... 
*/ @@ -72,7 +134,7 @@ /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. -@@ -260,10 +271,19 @@ void path_release(struct nameidata *nd) +@@ -260,10 +267,19 @@ void path_release(struct nameidata *nd) * Internal lookup() using the new generic dcache. * SMP-safe */ @@ -93,7 +155,7 @@ if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { dput(dentry); -@@ -281,11 +301,14 @@ static struct dentry * cached_lookup(str +@@ -281,11 +297,14 @@ static struct dentry * cached_lookup(str * make sure that nobody added the entry to the dcache in the meantime.. * SMP-safe */ @@ -109,7 +171,7 @@ down(&dir->i_sem); /* * First re-do the cached lookup just in case it was created -@@ -300,6 +321,9 @@ static struct dentry * real_lookup(struc +@@ -300,6 +319,9 @@ static struct dentry * real_lookup(struc result = ERR_PTR(-ENOMEM); if (dentry) { lock_kernel(); @@ -119,7 +181,7 @@ result = dir->i_op->lookup(dir, dentry); unlock_kernel(); if (result) -@@ -321,6 +345,12 @@ static struct dentry * real_lookup(struc +@@ -321,6 +343,12 @@ static struct dentry * real_lookup(struc dput(result); result = ERR_PTR(-ENOENT); } @@ -132,7 +194,7 @@ } return result; } -@@ -332,7 +362,8 @@ static struct dentry * real_lookup(struc +@@ -332,7 +360,8 @@ static struct dentry * real_lookup(struc * Without that kind of total limit, nasty chains of consecutive * symlinks can cause almost arbitrarily long lookups. 
*/ @@ -142,15 +204,22 @@ { int err; if (current->link_count >= 5) -@@ -346,10 +377,14 @@ static inline int do_follow_link(struct +@@ -346,10 +375,21 @@ static inline int do_follow_link(struct current->link_count++; current->total_link_count++; UPDATE_ATIME(dentry->d_inode); - err = dentry->d_inode->i_op->follow_link(dentry, nd); ++ nd->it = it; + if (dentry->d_inode->i_op->follow_link2) + err = dentry->d_inode->i_op->follow_link2(dentry, nd, it); -+ else ++ else + err = dentry->d_inode->i_op->follow_link(dentry, nd); ++ if (!err && it != NULL && !(it->it_int_flags & IT_FL_FOLLOWED)) { ++ /* vfs_follow_link was never called */ ++ intent_release(dentry, it); ++ path_release(nd); ++ err = -ENOLINK; ++ } current->link_count--; return err; loop: @@ -158,7 +227,7 @@ path_release(nd); return -ELOOP; } -@@ -381,15 +416,26 @@ int follow_up(struct vfsmount **mnt, str +@@ -379,15 +419,26 @@ int follow_up(struct vfsmount **mnt, str return __follow_up(mnt, dentry); } @@ -186,7 +255,7 @@ dput(*dentry); mntput(mounted->mnt_parent); *dentry = dget(mounted->mnt_root); -@@ -401,7 +447,7 @@ static inline int __follow_down(struct v +@@ -399,7 +450,7 @@ static inline int __follow_down(struct v int follow_down(struct vfsmount **mnt, struct dentry **dentry) { @@ -195,7 +264,7 @@ } static inline void follow_dotdot(struct nameidata *nd) -@@ -437,7 +483,7 @@ static inline void follow_dotdot(struct +@@ -435,7 +486,7 @@ static inline void follow_dotdot(struct mntput(nd->mnt); nd->mnt = parent; } @@ -204,7 +273,7 @@ ; } -@@ -447,7 +482,8 @@ static inline void follow_dotdot(struct +@@ -447,7 +498,8 @@ static inline void follow_dotdot(struct * * We expect 'base' to be positive and a directory. */ @@ -214,7 +283,7 @@ { struct dentry *dentry; struct inode *inode; -@@ -520,15 +556,15 @@ int link_path_walk(const char * name, st +@@ -520,15 +572,15 @@ int link_path_walk(const char * name, st break; } /* This does the actual lookups.. 
*/ @@ -233,7 +302,7 @@ ; err = -ENOENT; -@@ -539,8 +575,8 @@ int link_path_walk(const char * name, st +@@ -539,8 +591,8 @@ int link_path_walk(const char * name, st if (!inode->i_op) goto out_dput; @@ -244,7 +313,7 @@ dput(dentry); if (err) goto return_err; -@@ -556,7 +592,7 @@ int link_path_walk(const char * name, st +@@ -556,7 +608,7 @@ int link_path_walk(const char * name, st nd->dentry = dentry; } err = -ENOTDIR; @@ -253,7 +322,7 @@ break; continue; /* here ends the main loop */ -@@ -583,19 +619,20 @@ last_component: +@@ -583,19 +635,20 @@ last_component: if (err < 0) break; } @@ -279,7 +348,7 @@ dput(dentry); if (err) goto return_err; -@@ -609,7 +647,8 @@ last_component: +@@ -609,7 +662,8 @@ last_component: goto no_inode; if (lookup_flags & LOOKUP_DIRECTORY) { err = -ENOTDIR; @@ -289,7 +358,31 @@ break; } goto return_base; -@@ -646,15 +685,28 @@ out_dput: +@@ -633,6 +687,23 @@ return_reval: + * Check the cached dentry for staleness. + */ + dentry = nd->dentry; ++ revalidate_again: ++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate2) { ++ err = -ESTALE; ++ if (!dentry->d_op->d_revalidate2(dentry, 0, it)) { ++ struct dentry *new; ++ err = permission(dentry->d_parent->d_inode, ++ MAY_EXEC); ++ if (err) ++ break; ++ new = real_lookup(dentry->d_parent, ++ &dentry->d_name, 0, NULL); ++ d_invalidate(dentry); ++ dput(dentry); ++ dentry = new; ++ goto revalidate_again; ++ } ++ } else + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + err = -ESTALE; + if (!dentry->d_op->d_revalidate(dentry, 0)) { +@@ -646,15 +717,28 @@ out_dput: dput(dentry); break; } @@ -319,7 +412,33 @@ } /* SMP-safe */ -@@ -757,7 +809,8 @@ int path_init(const char *name, unsigned +@@ -739,6 +823,17 @@ walk_init_root(const char *name, struct + } + + /* SMP-safe */ ++int path_lookup_it(const char *path, unsigned flags, struct nameidata *nd, ++ struct lookup_intent *it) ++{ ++ int error = 0; ++ if (path_init(path, flags, nd)) ++ error = path_walk_it(path, nd, it); ++ return error; 
++} ++ ++ ++/* SMP-safe */ + int path_lookup(const char *path, unsigned flags, struct nameidata *nd) + { + int error = 0; +@@ -753,6 +848,7 @@ int path_init(const char *name, unsigned + { + nd->last_type = LAST_ROOT; /* if there are only slashes... */ + nd->flags = flags; ++ nd->it = NULL; + if (*name=='/') + return walk_init_root(name,nd); + read_lock(¤t->fs->lock); +@@ -767,7 +863,8 @@ int path_init(const char *name, unsigned * needs parent already locked. Doesn't follow mounts. * SMP-safe. */ @@ -329,7 +448,7 @@ { struct dentry * dentry; struct inode *inode; -@@ -780,13 +833,16 @@ struct dentry * lookup_hash(struct qstr +@@ -790,13 +887,16 @@ struct dentry * lookup_hash(struct qstr goto out; } @@ -347,7 +466,7 @@ dentry = inode->i_op->lookup(inode, new); unlock_kernel(); if (!dentry) -@@ -798,6 +854,12 @@ out: +@@ -808,6 +908,12 @@ out: return dentry; } @@ -360,7 +479,7 @@ /* SMP-safe */ struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) { -@@ -819,7 +881,7 @@ struct dentry * lookup_one_len(const cha +@@ -829,7 +935,7 @@ struct dentry * lookup_one_len(const cha } this.hash = end_name_hash(hash); @@ -369,7 +488,7 @@ access: return ERR_PTR(-EACCES); } -@@ -851,6 +913,23 @@ int __user_walk(const char *name, unsign +@@ -860,6 +966,23 @@ int __user_walk(const char *name, unsign return err; } @@ -393,7 +512,7 @@ /* * It's inline, so penalty for filesystems that don't use sticky bit is * minimal. -@@ -987,7 +1066,8 @@ exit_lock: +@@ -996,7 +1119,8 @@ exit_lock: * for symlinks (where the permissions are checked later). * SMP-safe */ @@ -403,27 +522,27 @@ { int acc_mode, error = 0; struct inode *inode; -@@ -1002,7 +1082,7 @@ int open_namei(const char * pathname, in +@@ -1010,7 +1134,7 @@ int open_namei(const char * pathname, in + * The simplest case - just a plain lookup. 
*/ if (!(flag & O_CREAT)) { - if (path_init(pathname, lookup_flags(flag), nd)) -- error = path_walk(pathname, nd); -+ error = path_walk_it(pathname, nd, it); +- error = path_lookup(pathname, lookup_flags(flag), nd); ++ error = path_lookup_it(pathname, lookup_flags(flag), nd, it); if (error) return error; dentry = nd->dentry; -@@ -1012,6 +1092,10 @@ int open_namei(const char * pathname, in +@@ -1020,6 +1144,10 @@ int open_namei(const char * pathname, in /* * Create - we need to know the parent. */ -+ if (it) { -+ it->it_mode = mode; -+ it->it_op |= IT_CREAT; -+ } - if (path_init(pathname, LOOKUP_PARENT, nd)) - error = path_walk(pathname, nd); ++ if (it) { ++ it->it_mode = mode; ++ it->it_op |= IT_CREAT; ++ } + error = path_lookup(pathname, LOOKUP_PARENT, nd); if (error) -@@ -1028,7 +1112,7 @@ int open_namei(const char * pathname, in + return error; +@@ -1035,7 +1163,7 @@ int open_namei(const char * pathname, in dir = nd->dentry; down(&dir->d_inode->i_sem); @@ -432,15 +551,15 @@ do_last: error = PTR_ERR(dentry); -@@ -1037,6 +1121,7 @@ do_last: +@@ -1044,6 +1172,7 @@ do_last: goto exit; } + it->it_mode = mode; /* Negative dentry, just create the file */ if (!dentry->d_inode) { - if (!IS_POSIXACL(dir->d_inode)) -@@ -1066,12 +1151,13 @@ do_last: + error = vfs_create(dir->d_inode, dentry, +@@ -1072,12 +1201,13 @@ do_last: error = -ELOOP; if (flag & O_NOFOLLOW) goto exit_dput; @@ -456,7 +575,7 @@ goto do_link; dput(nd->dentry); -@@ -1145,7 +1231,7 @@ do_last: +@@ -1151,7 +1281,7 @@ ok: if (!error) { DQUOT_INIT(inode); @@ -465,7 +584,7 @@ } put_write_access(inode); if (error) -@@ -1157,8 +1243,10 @@ ok: +@@ -1163,8 +1293,10 @@ ok: return 0; exit_dput: @@ -476,21 +595,28 @@ path_release(nd); return error; -@@ -1177,7 +1265,12 @@ do_link: +@@ -1183,7 +1315,19 @@ do_link: * are done. Procfs-like symlinks just set LAST_BIND. 
*/ UPDATE_ATIME(dentry->d_inode); - error = dentry->d_inode->i_op->follow_link(dentry, nd); ++ nd->it = it; + if (dentry->d_inode->i_op->follow_link2) + error = dentry->d_inode->i_op->follow_link2(dentry, nd, it); + else + error = dentry->d_inode->i_op->follow_link(dentry, nd); -+ if (error) ++ if (error) { ++ intent_release(dentry, it); ++ } else if (it != NULL && !(it->it_int_flags & IT_FL_FOLLOWED)) { ++ /* vfs_follow_link was never called */ + intent_release(dentry, it); ++ path_release(nd); ++ error = -ENOLINK; ++ } dput(dentry); if (error) return error; -@@ -1199,13 +1292,20 @@ do_link: +@@ -1205,13 +1349,20 @@ do_link: } dir = nd->dentry; down(&dir->d_inode->i_sem); @@ -513,7 +639,7 @@ { struct dentry *dentry; -@@ -1213,7 +1313,7 @@ static struct dentry *lookup_create(stru +@@ -1219,7 +1370,7 @@ static struct dentry *lookup_create(stru dentry = ERR_PTR(-EEXIST); if (nd->last_type != LAST_NORM) goto fail; @@ -522,8 +648,8 @@ if (IS_ERR(dentry)) goto fail; if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) -@@ -1270,7 +1371,19 @@ asmlinkage long sys_mknod(const char * f - error = path_walk(tmp, &nd); +@@ -1275,7 +1426,19 @@ asmlinkage long sys_mknod(const char * f + error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); @@ -542,8 +668,8 @@ + dentry = lookup_create(&nd, 0, NULL); error = PTR_ERR(dentry); - if (!IS_POSIXACL(nd.dentry->d_inode)) -@@ -1289,6 +1402,7 @@ asmlinkage long sys_mknod(const char * f + mode &= ~current->fs->umask; +@@ -1296,6 +1459,7 @@ asmlinkage long sys_mknod(const char * f dput(dentry); } up(&nd.dentry->d_inode->i_sem); @@ -551,29 +677,26 @@ path_release(&nd); out: putname(tmp); -@@ -1340,15 +1456,25 @@ asmlinkage long sys_mkdir(const char * p - error = path_walk(tmp, &nd); +@@ -1343,7 +1507,17 @@ asmlinkage long sys_mkdir(const char * p + error = path_lookup(tmp, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 1); -+ if 
(nd.dentry->d_inode->i_op->mkdir2) { -+ struct inode_operations *op = nd.dentry->d_inode->i_op; -+ error = op->mkdir2(nd.dentry->d_inode, -+ nd.last.name, -+ nd.last.len, -+ mode); -+ /* the file system wants to use normal vfs path now */ -+ if (error != -EOPNOTSUPP) -+ goto out2; -+ } ++ if (nd.dentry->d_inode->i_op->mkdir2) { ++ struct inode_operations *op = nd.dentry->d_inode->i_op; ++ error = op->mkdir2(nd.dentry->d_inode, ++ nd.last.name, ++ nd.last.len, ++ mode); ++ /* the file system wants to use normal vfs path now */ ++ if (error != -EOPNOTSUPP) ++ goto out2; ++ } + dentry = lookup_create(&nd, 1, NULL); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { -- if (!IS_POSIXACL(nd.dentry->d_inode)) -- mode &= ~current->fs->umask; -- error = vfs_mkdir(nd.dentry->d_inode, dentry, mode); -+ error = vfs_mkdir(nd.dentry->d_inode, dentry, -+ mode & ~current->fs->umask); + error = vfs_mkdir(nd.dentry->d_inode, dentry, +@@ -1351,6 +1525,7 @@ asmlinkage long sys_mkdir(const char * p dput(dentry); } up(&nd.dentry->d_inode->i_sem); @@ -581,7 +704,7 @@ path_release(&nd); out: putname(tmp); -@@ -1450,8 +1578,33 @@ asmlinkage long sys_rmdir(const char * p +@@ -1451,8 +1626,33 @@ asmlinkage long sys_rmdir(const char * p error = -EBUSY; goto exit1; } @@ -616,7 +739,7 @@ error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_rmdir(nd.dentry->d_inode, dentry); -@@ -1510,8 +1649,17 @@ asmlinkage long sys_unlink(const char * +@@ -1510,8 +1710,17 @@ asmlinkage long sys_unlink(const char * error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; @@ -635,8 +758,8 @@ error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? 
Because we want correct error value */ -@@ -1579,15 +1729,26 @@ asmlinkage long sys_symlink(const char * - error = path_walk(to, &nd); +@@ -1578,15 +1787,26 @@ asmlinkage long sys_symlink(const char * + error = path_lookup(to, LOOKUP_PARENT, &nd); if (error) goto out; - dentry = lookup_create(&nd, 0); @@ -664,16 +787,7 @@ putname(to); } putname(from); -@@ -1660,7 +1824,7 @@ asmlinkage long sys_link(const char * ol - - error = 0; - if (path_init(from, LOOKUP_POSITIVE, &old_nd)) -- error = path_walk(from, &old_nd); -+ error = path_walk_it(from, &old_nd, NULL); - if (error) - goto exit; - if (path_init(to, LOOKUP_PARENT, &nd)) -@@ -1670,7 +1834,17 @@ asmlinkage long sys_link(const char * ol +@@ -1662,7 +1882,17 @@ asmlinkage long sys_link(const char * ol error = -EXDEV; if (old_nd.mnt != nd.mnt) goto out_release; @@ -692,7 +806,7 @@ error = PTR_ERR(new_dentry); if (!IS_ERR(new_dentry)) { error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); -@@ -1716,7 +1892,8 @@ exit: +@@ -1706,7 +1936,8 @@ exit: * locking]. 
*/ int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, @@ -702,7 +816,7 @@ { int error; struct inode *target; -@@ -1753,6 +1923,7 @@ int vfs_rename_dir(struct inode *old_dir +@@ -1764,6 +1995,7 @@ int vfs_rename_dir(struct inode *old_dir error = -EBUSY; else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); @@ -710,7 +824,7 @@ if (target) { if (!error) target->i_flags |= S_DEAD; -@@ -1795,7 +1973,8 @@ out_unlock: +@@ -1785,7 +2017,8 @@ out_unlock: } int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, @@ -720,7 +834,7 @@ { int error; -@@ -1826,6 +2005,7 @@ int vfs_rename_other(struct inode *old_d +@@ -1816,6 +2049,7 @@ int vfs_rename_other(struct inode *old_d error = -EBUSY; else error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); @@ -728,7 +842,7 @@ double_up(&old_dir->i_zombie, &new_dir->i_zombie); if (error) return error; -@@ -1837,13 +2017,14 @@ int vfs_rename_other(struct inode *old_d +@@ -1827,13 +2061,14 @@ int vfs_rename_other(struct inode *old_d } int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -746,7 +860,7 @@ if (!error) { if (old_dir == new_dir) inode_dir_notify(old_dir, DN_RENAME); -@@ -1888,7 +2070,7 @@ static inline int do_rename(const char * +@@ -1875,7 +2110,7 @@ static inline int do_rename(const char * double_lock(new_dir, old_dir); @@ -755,7 +869,7 @@ error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; -@@ -1904,16 +2086,37 @@ static inline int do_rename(const char * +@@ -1891,16 +2126,37 @@ static inline int do_rename(const char * if (newnd.last.name[newnd.last.len]) goto exit4; } @@ -796,7 +910,7 @@ dput(new_dentry); exit4: dput(old_dentry); -@@ -1964,7 +2163,8 @@ out: +@@ -1951,20 +2207,28 @@ out: } static inline int @@ -806,7 +920,19 @@ { int res = 0; char *name; -@@ -1977,7 +2177,7 @@ __vfs_follow_link(struct nameidata *nd, + if (IS_ERR(link)) + goto fail; + ++ if (it == NULL) ++ it = nd->it; ++ else if (it != nd->it) ++ printk("it 
!= nd->it: tell phil@clusterfs.com\n"); ++ if (it != NULL) ++ it->it_int_flags |= IT_FL_FOLLOWED; ++ + if (*link == '/') { + path_release(nd); + if (!walk_init_root(link, nd)) /* weird __emul_prefix() stuff did it */ goto out; } @@ -815,7 +941,7 @@ out: if (current->link_count || res || nd->last_type!=LAST_NORM) return res; -@@ -1999,7 +2199,13 @@ fail: +@@ -1986,7 +2250,13 @@ fail: int vfs_follow_link(struct nameidata *nd, const char *link) { @@ -830,7 +956,7 @@ } /* get the link contents into pagecache */ -@@ -2041,7 +2247,7 @@ int page_follow_link(struct dentry *dent +@@ -2028,7 +2298,7 @@ int page_follow_link(struct dentry *dent { struct page *page = NULL; char *s = page_getlink(dentry, &page); @@ -839,9 +965,9 @@ if (page) { kunmap(page); page_cache_release(page); ---- linux-2.4.19-hp2_pnnl4/fs/nfsd/vfs.c~vfs_intent_hp Sun Jan 19 19:04:47 2003 -+++ linux-2.4.19-hp2_pnnl4-root/fs/nfsd/vfs.c Sun Jan 19 19:37:57 2003 -@@ -1295,7 +1295,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru +--- linux-2.4.20-l18/fs/nfsd/vfs.c~vfs_intent-2.4.20-vanilla Thu Nov 28 18:53:15 2002 ++++ linux-2.4.20-l18-phil/fs/nfsd/vfs.c Wed May 28 01:39:18 2003 +@@ -1291,7 +1291,7 @@ nfsd_rename(struct svc_rqst *rqstp, stru err = nfserr_perm; } else #endif @@ -850,8 +976,8 @@ if (!err && EX_ISSYNC(tfhp->fh_export)) { nfsd_sync_dir(tdentry); nfsd_sync_dir(fdentry); ---- linux-2.4.19-hp2_pnnl4/fs/open.c~vfs_intent_hp Sun Jan 19 19:04:47 2003 -+++ linux-2.4.19-hp2_pnnl4-root/fs/open.c Sun Jan 19 19:41:00 2003 +--- linux-2.4.20-l18/fs/open.c~vfs_intent-2.4.20-vanilla Thu Nov 28 18:53:15 2002 ++++ linux-2.4.20-l18-phil/fs/open.c Wed May 28 01:39:18 2003 @@ -19,6 +19,8 @@ #include <asm/uaccess.h> @@ -889,7 +1015,7 @@ up(&inode->i_sem); return error; } -@@ -118,12 +121,13 @@ static inline long do_sys_truncate(const +@@ -118,12 +128,13 @@ static inline long do_sys_truncate(const struct nameidata nd; struct inode * inode; int error; @@ -904,7 +1030,7 @@ if (error) goto out; inode = nd.dentry->d_inode; 
-@@ -163,11 +167,13 @@ static inline long do_sys_truncate(const +@@ -163,11 +174,13 @@ static inline long do_sys_truncate(const error = locks_verify_truncate(inode, NULL, length); if (!error) { DQUOT_INIT(inode); @@ -1012,7 +1138,7 @@ if (current->fsuid != inode->i_uid && (error = permission(inode,MAY_WRITE)) != 0) goto dput_and_out; -@@ -347,6 +356,7 @@ asmlinkage long sys_access(const char * +@@ -347,6 +395,7 @@ asmlinkage long sys_access(const char * int old_fsuid, old_fsgid; kernel_cap_t old_cap; int res; @@ -1020,7 +1146,7 @@ if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ return -EINVAL; -@@ -364,13 +374,14 @@ asmlinkage long sys_access(const char * +@@ -364,13 +413,14 @@ asmlinkage long sys_access(const char * else current->cap_effective = current->cap_permitted; @@ -1036,24 +1162,18 @@ path_release(&nd); } -@@ -386,6 +397,7 @@ asmlinkage long sys_chdir(const char * f +@@ -385,8 +435,9 @@ asmlinkage long sys_chdir(const char * f + { int error; struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; - - name = getname(filename); - error = PTR_ERR(name); -@@ -394,7 +406,7 @@ asmlinkage long sys_chdir(const char * f ++ struct lookup_intent it = { .it_op = IT_GETATTR }; - error = 0; - if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd)) -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); +- error = __user_walk(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd); ++ error = __user_walk_it(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd, &it); if (error) goto out; -@@ -406,6 +418,7 @@ asmlinkage long sys_chdir(const char * f + +@@ -397,6 +448,7 @@ asmlinkage long sys_chdir(const char * f set_fs_pwd(current->fs, nd.mnt, nd.dentry); dput_and_out: @@ -1061,24 +1181,20 @@ path_release(&nd); out: return error; -@@ -446,6 +459,7 @@ asmlinkage long sys_chroot(const char * +@@ -436,9 +488,10 @@ asmlinkage long sys_chroot(const char * + { int error; 
struct nameidata nd; - char *name; -+ struct lookup_intent it = { .it_op = IT_GETATTR }; ++ struct lookup_intent it = { .it_op = IT_GETATTR }; - name = getname(filename); - error = PTR_ERR(name); -@@ -454,7 +468,7 @@ asmlinkage long sys_chroot(const char * - - path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW | - LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); -- error = path_walk(name, &nd); -+ error = path_walk_it(name, &nd, &it); - putname(name); +- error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | +- LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); ++ error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW | ++ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it); if (error) goto out; -@@ -471,6 +485,7 @@ asmlinkage long sys_chroot(const char * + +@@ -454,6 +507,7 @@ asmlinkage long sys_chroot(const char * set_fs_altroot(); error = 0; dput_and_out: @@ -1086,7 +1202,7 @@ path_release(&nd); out: return error; -@@ -508,6 +564,18 @@ asmlinkage long sys_chmod(const char * f +@@ -508,6 +562,18 @@ asmlinkage long sys_chmod(const char * f if (IS_RDONLY(inode)) goto dput_and_out; @@ -1105,7 +1221,7 @@ error = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto dput_and_out; -@@ -538,6 +606,20 @@ static int chown_common(struct dentry * +@@ -538,6 +604,20 @@ static int chown_common(struct dentry * error = -EROFS; if (IS_RDONLY(inode)) goto out; @@ -1126,14 +1242,10 @@ error = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto out; -@@ -655,10 +676,16 @@ asmlinkage long sys_fchown(unsigned int +@@ -638,10 +718,12 @@ asmlinkage long sys_fchown(unsigned int * for the internal routines (ie open_namei()/follow_link() etc). 00 is * used by symlinks. 
*/ -+extern int open_namei_it(const char *filename, int namei_flags, int mode, -+ struct nameidata *nd, struct lookup_intent *it); -+struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it); + struct file *filp_open(const char * filename, int flags, int mode) { @@ -1143,7 +1255,7 @@ namei_flags = flags; if ((namei_flags+1) & O_ACCMODE) -@@ -666,14 +693,15 @@ struct file *filp_open(const char * file +@@ -649,14 +731,15 @@ struct file *filp_open(const char * file if (namei_flags & O_TRUNC) namei_flags |= 2; @@ -1164,7 +1276,7 @@ { struct file * f; struct inode *inode; -@@ -716,6 +744,7 @@ struct file *dentry_open(struct dentry * +@@ -699,6 +782,7 @@ struct file *dentry_open(struct dentry * } f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); @@ -1172,7 +1284,7 @@ return f; cleanup_all: -@@ -730,11 +759,17 @@ cleanup_all: +@@ -713,11 +797,17 @@ cleanup_all: cleanup_file: put_filp(f); cleanup_dentry: @@ -1190,9 +1302,9 @@ /* * Find an empty file descriptor entry, and mark it busy. 
*/ ---- linux-2.4.19-hp2_pnnl4/fs/stat.c~vfs_intent_hp Sun Jan 19 19:04:47 2003 -+++ linux-2.4.19-hp2_pnnl4-root/fs/stat.c Sun Jan 19 19:44:51 2003 -@@ -135,13 +136,15 @@ static int cp_new_stat(struct inode * in +--- linux-2.4.20-l18/fs/stat.c~vfs_intent-2.4.20-vanilla Thu Sep 13 19:04:43 2001 ++++ linux-2.4.20-l18-phil/fs/stat.c Wed May 28 01:39:18 2003 +@@ -135,13 +135,15 @@ static int cp_new_stat(struct inode * in asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf) { struct nameidata nd; @@ -1209,7 +1321,7 @@ path_release(&nd); } return error; -@@ -151,13 +154,15 @@ asmlinkage long sys_stat(char * filename +@@ -151,13 +153,15 @@ asmlinkage long sys_stat(char * filename asmlinkage long sys_newstat(char * filename, struct stat * statbuf) { struct nameidata nd; @@ -1226,7 +1338,7 @@ path_release(&nd); } return error; -@@ -172,13 +177,15 @@ asmlinkage long sys_newstat(char * filen +@@ -172,13 +176,15 @@ asmlinkage long sys_newstat(char * filen asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf) { struct nameidata nd; @@ -1243,7 +1355,7 @@ path_release(&nd); } return error; -@@ -189,13 +196,15 @@ asmlinkage long sys_lstat(char * filenam +@@ -189,13 +195,15 @@ asmlinkage long sys_lstat(char * filenam asmlinkage long sys_newlstat(char * filename, struct stat * statbuf) { struct nameidata nd; @@ -1260,7 +1372,7 @@ path_release(&nd); } return error; -@@ -333,12 +344,14 @@ asmlinkage long sys_stat64(char * filena +@@ -333,12 +341,14 @@ asmlinkage long sys_stat64(char * filena { struct nameidata nd; int error; @@ -1276,7 +1388,7 @@ path_release(&nd); } return error; -@@ -348,12 +361,14 @@ asmlinkage long sys_lstat64(char * filen +@@ -348,12 +358,14 @@ asmlinkage long sys_lstat64(char * filen { struct nameidata nd; int error; @@ -1292,74 +1404,23 @@ path_release(&nd); } return error; ---- linux-2.4.19-hp2_pnnl4/fs/exec.c~vfs_intent_hp Sun Feb 9 01:14:52 2003 -+++ linux-2.4.19-hp2_pnnl4-root/fs/exec.c Sun Feb 9 01:29:49 
2003 -@@ -103,13 +104,18 @@ static inline void put_binfmt(struct lin - * - * Also note that we take the address to load from from the file itself. - */ -+extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, -+ int flags, struct lookup_intent *it); -+int path_lookup_it(const char *path, unsigned flags, struct nameidata *nd, -+ struct lookup_intent *it); - asmlinkage long sys_uselib(const char * library) - { - struct file * file; - struct nameidata nd; - int error; -+ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY }; - -- error = user_path_walk(library, &nd); -+ error = user_path_walk_it(library, &nd, &it); - if (error) - goto out; - -@@ -121,7 +127,8 @@ asmlinkage long sys_uselib(const char * - if (error) - goto exit; - -- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); -+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); -+ intent_release(nd.dentry, &it); - error = PTR_ERR(file); - if (IS_ERR(file)) - goto out; -@@ -350,9 +350,10 @@ struct file *open_exec(const char *name) - struct inode *inode; - struct file *file; - int err = 0; -+ struct lookup_intent it = { .it_op = IT_OPEN, .it_flags = O_RDONLY }; +--- linux-2.4.20-l18/fs/proc/base.c~vfs_intent-2.4.20-vanilla Wed Jun 4 22:53:14 2003 ++++ linux-2.4.20-l18-phil/fs/proc/base.c Wed Jun 4 22:50:35 2003 +@@ -464,6 +464,9 @@ static int proc_pid_follow_link(struct d - if (path_init(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd)) -- err = path_walk(name, &nd); -+ err = path_walk_it(name, &nd, &it); - file = ERR_PTR(err); - if (!err) { - inode = nd.dentry->d_inode; -@@ -363,7 +369,8 @@ struct file *open_exec(const char *name) - err = -EACCES; - file = ERR_PTR(err); - if (!err) { -- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); -+ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it); -+ intent_release(nd.dentry, &it); - if (!IS_ERR(file)) { - err = deny_write_access(file); - if (err) { -@@ -976,7 +986,7 @@ int do_coredump(long signr, struct pt_re - goto close_fail; 
- if (!file->f_op->write) - goto close_fail; -- if (do_truncate(file->f_dentry, 0) != 0) -+ if (do_truncate(file->f_dentry, 0, 0) != 0) - goto close_fail; - - retval = binfmt->core_dump(signr, regs, file); ---- linux-2.4.19-hp2_pnnl4/include/linux/dcache.h~vfs_intent_hp Sun Jan 19 19:04:47 2003 -+++ linux-2.4.19-hp2_pnnl4-root/include/linux/dcache.h Sun Jan 19 19:04:48 2003 -@@ -6,6 +6,25 @@ - #include <asm/atomic.h> + error = inode->u.proc_i.op.proc_get_link(inode, &nd->dentry, &nd->mnt); + nd->last_type = LAST_BIND; ++ ++ if (nd->it != NULL) ++ nd->it->it_int_flags |= IT_FL_FOLLOWED; + out: + return error; + } +--- linux-2.4.20-l18/include/linux/dcache.h~vfs_intent-2.4.20-vanilla Thu Nov 28 18:53:15 2002 ++++ linux-2.4.20-l18-phil/include/linux/dcache.h Sun Jun 1 22:35:10 2003 +@@ -7,6 +7,28 @@ #include <linux/mount.h> + #include <linux/kernel.h> +#define IT_OPEN (1) +#define IT_CREAT (1<<1) @@ -1368,13 +1429,16 @@ +#define IT_LOOKUP (1<<4) +#define IT_UNLINK (1<<5) + ++#define IT_FL_LOCKED (1) ++#define IT_FL_FOLLOWED (1<<1) /* set by vfs_follow_link */ ++ +struct lookup_intent { + int it_op; + int it_mode; + int it_flags; + int it_disposition; + int it_status; -+ struct iattr *it_iattr; ++ int it_int_flags; + __u64 it_lock_handle[2]; + int it_lock_mode; + void *it_data; @@ -1383,7 +1447,7 @@ /* * linux/include/linux/dcache.h * -@@ -78,6 +106,7 @@ struct dentry { +@@ -79,6 +101,7 @@ struct dentry { unsigned long d_time; /* used by d_revalidate */ struct dentry_operations *d_op; struct super_block * d_sb; /* The root of the dentry tree */ @@ -1391,7 +1455,7 @@ unsigned long d_vfs_flags; void * d_fsdata; /* fs-specific data */ unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ -@@ -90,8 +119,15 @@ struct dentry_operations { +@@ -91,8 +114,15 @@ struct dentry_operations { int (*d_delete)(struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); @@ -1407,7 +1471,7 @@ /* the dentry parameter passed to d_hash and 
d_compare is the parent * directory of the entries to be compared. It is used in case these * functions need any directory specific information for determining -@@ -124,6 +149,7 @@ d_iput: no no yes +@@ -124,6 +154,7 @@ d_iput: no no yes * s_nfsd_free_path semaphore will be down */ #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ @@ -1415,8 +1479,8 @@ extern spinlock_t dcache_lock; ---- linux-2.4.19-hp2_pnnl4/include/linux/fs.h~vfs_intent_hp Sun Jan 19 19:04:47 2003 -+++ linux-2.4.19-hp2_pnnl4-root/include/linux/fs.h Sun Jan 19 19:04:48 2003 +--- linux-2.4.20-l18/include/linux/fs.h~vfs_intent-2.4.20-vanilla Wed May 28 01:39:17 2003 ++++ linux-2.4.20-l18-phil/include/linux/fs.h Sun Jun 1 22:07:11 2003 @@ -338,6 +338,8 @@ extern void set_bh_page(struct buffer_he #define ATTR_MTIME_SET 256 #define ATTR_FORCE 512 /* Not a change, but a change it */ @@ -1426,7 +1490,7 @@ /* * This is the Inode Attributes structure, used for notify_change(). It -@@ -575,6 +575,7 @@ struct file { +@@ -542,6 +544,7 @@ struct file { /* needed for tty driver, and maybe others */ void *private_data; @@ -1434,7 +1498,15 @@ /* preallocated helper kiobuf to speedup O_DIRECT */ struct kiobuf *f_iobuf; -@@ -815,7 +816,9 @@ extern int vfs_symlink(struct inode *, s +@@ -661,6 +664,7 @@ struct nameidata { + struct qstr last; + unsigned int flags; + int last_type; ++ struct lookup_intent *it; + }; + + #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ +@@ -794,7 +798,9 @@ extern int vfs_symlink(struct inode *, s extern int vfs_link(struct dentry *, struct inode *, struct dentry *); extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *); @@ -1445,7 +1517,7 @@ /* * File types -@@ -876,20 +879,33 @@ struct file_operations { +@@ -855,20 +861,33 @@ struct file_operations { struct inode_operations { int (*create) (struct inode *,struct dentry *,int); struct dentry * (*lookup) (struct inode *,struct dentry *); @@ -1479,7 
+1551,7 @@ int (*getattr) (struct dentry *, struct iattr *); int (*setxattr) (struct dentry *, const char *, void *, size_t, int); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); -@@ -1112,7 +1130,7 @@ static inline int get_lease(struct inode +@@ -1070,10 +1089,14 @@ static inline int get_lease(struct inode asmlinkage long sys_open(const char *, int, int); asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ @@ -1488,24 +1560,31 @@ extern struct file *filp_open(const char *, int, int); extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); -@@ -1354,6 +1369,7 @@ typedef int (*read_actor_t)(read_descrip ++extern int open_namei_it(const char *filename, int namei_flags, int mode, ++ struct nameidata *nd, struct lookup_intent *it); ++extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt, ++ int flags, struct lookup_intent *it); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char *); + +@@ -1335,6 +1358,7 @@ typedef int (*read_actor_t)(read_descrip extern loff_t default_llseek(struct file *file, loff_t offset, int origin); extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); +extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it)); extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); extern int FASTCALL(path_walk(const char *, struct nameidata *)); - extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); -@@ -1364,6 +1380,8 @@ extern struct dentry * lookup_one_len(co + extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *)); +@@ -1346,6 +1370,8 @@ extern struct dentry * lookup_one_len(co extern struct dentry * lookup_hash(struct qstr *, struct dentry *); #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) 
+#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it) +#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it) - extern void inode_init_once(struct inode *); extern void iput(struct inode *); -@@ -1499,6 +1517,8 @@ extern struct file_operations generic_ro + extern void force_delete(struct inode *); +@@ -1455,6 +1481,8 @@ extern struct file_operations generic_ro extern int vfs_readlink(struct dentry *, char *, int, const char *); extern int vfs_follow_link(struct nameidata *, const char *); @@ -1514,9 +1593,9 @@ extern int page_readlink(struct dentry *, char *, int); extern int page_follow_link(struct dentry *, struct nameidata *); extern struct inode_operations page_symlink_inode_operations; ---- linux-2.4.19-hp2_pnnl4/kernel/ksyms.c~vfs_intent_hp Sun Jan 19 19:04:47 2003 -+++ linux-2.4.19-hp2_pnnl4-root/kernel/ksyms.c Sun Jan 19 19:04:48 2003 -@@ -293,6 +293,7 @@ EXPORT_SYMBOL(read_cache_page); +--- linux-2.4.20-l18/kernel/ksyms.c~vfs_intent-2.4.20-vanilla Wed May 28 01:39:18 2003 ++++ linux-2.4.20-l18-phil/kernel/ksyms.c Wed May 28 01:39:18 2003 +@@ -269,6 +269,7 @@ EXPORT_SYMBOL(read_cache_page); EXPORT_SYMBOL(set_page_dirty); EXPORT_SYMBOL(vfs_readlink); EXPORT_SYMBOL(vfs_follow_link); @@ -1524,3 +1603,5 @@ EXPORT_SYMBOL(page_readlink); EXPORT_SYMBOL(page_follow_link); EXPORT_SYMBOL(page_symlink_inode_operations); + +_ diff --git a/lustre/kernel_patches/pc/dev_read_only_hp.pc b/lustre/kernel_patches/pc/dev_read_only_2.4.20-rh.pc similarity index 100% rename from lustre/kernel_patches/pc/dev_read_only_hp.pc rename to lustre/kernel_patches/pc/dev_read_only_2.4.20-rh.pc diff --git a/lustre/kernel_patches/pc/dev_read_only_2.4.20.pc b/lustre/kernel_patches/pc/dev_read_only_2.4.20.pc new file mode 100644 index 0000000..4760ad1 --- /dev/null +++ b/lustre/kernel_patches/pc/dev_read_only_2.4.20.pc @@ -0,0 +1,3 @@ +drivers/block/blkpg.c +drivers/block/loop.c +drivers/ide/ide-disk.c diff --git 
a/lustre/kernel_patches/pc/dev_read_only_hp_2.4.20.pc b/lustre/kernel_patches/pc/dev_read_only_hp_2.4.20.pc new file mode 100644 index 0000000..4760ad1 --- /dev/null +++ b/lustre/kernel_patches/pc/dev_read_only_hp_2.4.20.pc @@ -0,0 +1,3 @@ +drivers/block/blkpg.c +drivers/block/loop.c +drivers/ide/ide-disk.c diff --git a/lustre/kernel_patches/pc/dsp.pc b/lustre/kernel_patches/pc/dsp.pc new file mode 100644 index 0000000..fdbf418 --- /dev/null +++ b/lustre/kernel_patches/pc/dsp.pc @@ -0,0 +1,6 @@ +kernel/bootimg.c +kernel/bootimg_pic.c +include/asm-i386/apic.h +include/linux/crash.h +arch/i386/kernel/crash.c +arch/i386/kernel/nmi.c diff --git a/lustre/kernel_patches/pc/export-truncate-2.5.63.pc b/lustre/kernel_patches/pc/export-truncate-2.5.63.pc new file mode 100644 index 0000000..3f61c00 --- /dev/null +++ b/lustre/kernel_patches/pc/export-truncate-2.5.63.pc @@ -0,0 +1,2 @@ +include/linux/mm.h +mm/truncate.c diff --git a/lustre/kernel_patches/pc/export-truncate.pc b/lustre/kernel_patches/pc/export-truncate.pc new file mode 100644 index 0000000..bd58c82 --- /dev/null +++ b/lustre/kernel_patches/pc/export-truncate.pc @@ -0,0 +1,2 @@ +include/linux/mm.h +mm/filemap.c diff --git a/lustre/kernel_patches/pc/exports_hp.pc b/lustre/kernel_patches/pc/exports_2.4.20-rh-hp.pc similarity index 100% rename from lustre/kernel_patches/pc/exports_hp.pc rename to lustre/kernel_patches/pc/exports_2.4.20-rh-hp.pc diff --git a/lustre/kernel_patches/pc/exports_2.4.20.pc b/lustre/kernel_patches/pc/exports_2.4.20.pc new file mode 100644 index 0000000..6472a11 --- /dev/null +++ b/lustre/kernel_patches/pc/exports_2.4.20.pc @@ -0,0 +1,4 @@ +fs/ext3/Makefile +fs/ext3/super.c +include/linux/fs.h +kernel/ksyms.c diff --git a/lustre/kernel_patches/pc/exports_hp_2.4.20.pc b/lustre/kernel_patches/pc/exports_hp_2.4.20.pc new file mode 100644 index 0000000..6472a11 --- /dev/null +++ b/lustre/kernel_patches/pc/exports_hp_2.4.20.pc @@ -0,0 +1,4 @@ +fs/ext3/Makefile +fs/ext3/super.c +include/linux/fs.h 
+kernel/ksyms.c diff --git a/lustre/kernel_patches/pc/ext-2.4-patch-1-chaos.pc b/lustre/kernel_patches/pc/ext-2.4-patch-1-chaos.pc new file mode 100644 index 0000000..634b944 --- /dev/null +++ b/lustre/kernel_patches/pc/ext-2.4-patch-1-chaos.pc @@ -0,0 +1,11 @@ +fs/ext3/Makefile +fs/ext3/dir.c +fs/ext3/file.c +fs/ext3/hash.c +fs/ext3/namei.c +fs/ext3/super.c +include/linux/ext3_fs.h +include/linux/ext3_fs_sb.h +include/linux/ext3_jbd.h +include/linux/rbtree.h +lib/rbtree.c diff --git a/lustre/kernel_patches/pc/ext-2.4-patch-1.pc b/lustre/kernel_patches/pc/ext-2.4-patch-1.pc new file mode 100644 index 0000000..634b944 --- /dev/null +++ b/lustre/kernel_patches/pc/ext-2.4-patch-1.pc @@ -0,0 +1,11 @@ +fs/ext3/Makefile +fs/ext3/dir.c +fs/ext3/file.c +fs/ext3/hash.c +fs/ext3/namei.c +fs/ext3/super.c +include/linux/ext3_fs.h +include/linux/ext3_fs_sb.h +include/linux/ext3_jbd.h +include/linux/rbtree.h +lib/rbtree.c diff --git a/lustre/kernel_patches/pc/ext-2.4-patch-2.pc b/lustre/kernel_patches/pc/ext-2.4-patch-2.pc new file mode 100644 index 0000000..9b16759 --- /dev/null +++ b/lustre/kernel_patches/pc/ext-2.4-patch-2.pc @@ -0,0 +1 @@ +fs/ext3/namei.c diff --git a/lustre/kernel_patches/pc/ext-2.4-patch-3.pc b/lustre/kernel_patches/pc/ext-2.4-patch-3.pc new file mode 100644 index 0000000..65d4845 --- /dev/null +++ b/lustre/kernel_patches/pc/ext-2.4-patch-3.pc @@ -0,0 +1,3 @@ +fs/ext3/dir.c +fs/ext3/namei.c +include/linux/ext3_fs.h diff --git a/lustre/kernel_patches/pc/ext-2.4-patch-4.pc b/lustre/kernel_patches/pc/ext-2.4-patch-4.pc new file mode 100644 index 0000000..9b16759 --- /dev/null +++ b/lustre/kernel_patches/pc/ext-2.4-patch-4.pc @@ -0,0 +1 @@ +fs/ext3/namei.c diff --git a/lustre/kernel_patches/pc/ext3-2.4-ino_t.pc b/lustre/kernel_patches/pc/ext3-2.4-ino_t.pc new file mode 100644 index 0000000..4cef979 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-2.4-ino_t.pc @@ -0,0 +1,3 @@ +fs/ext3/ialloc.c +fs/ext3/namei.c +include/linux/ext3_fs.h diff --git 
a/lustre/kernel_patches/pc/ext3-2.4.18-fixes.pc b/lustre/kernel_patches/pc/ext3-2.4.18-fixes.pc new file mode 100644 index 0000000..0822c5e --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-2.4.18-fixes.pc @@ -0,0 +1,7 @@ +fs/ext3/balloc.c +fs/ext3/file.c +fs/ext3/fsync.c +fs/ext3/ialloc.c +fs/ext3/inode.c +fs/ext3/namei.c +fs/ext3/super.c diff --git a/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro.pc b/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro.pc new file mode 100644 index 0000000..cd21583 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-2.4.18-ino_sb_macro.pc @@ -0,0 +1,10 @@ +fs/ext3/balloc.c +fs/ext3/dir.c +fs/ext3/ialloc.c +fs/ext3/inode.c +fs/ext3/ioctl.c +fs/ext3/namei.c +fs/ext3/super.c +fs/ext3/symlink.c +include/linux/ext3_fs.h +include/linux/ext3_jbd.h diff --git a/lustre/kernel_patches/pc/ext3-2.4.20-fixes.pc b/lustre/kernel_patches/pc/ext3-2.4.20-fixes.pc new file mode 100644 index 0000000..441ced8 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-2.4.20-fixes.pc @@ -0,0 +1 @@ +fs/ext3/balloc.c diff --git a/lustre/kernel_patches/pc/ext3-2.5-noread.pc b/lustre/kernel_patches/pc/ext3-2.5-noread.pc new file mode 100644 index 0000000..9c3cea8 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-2.5-noread.pc @@ -0,0 +1,3 @@ +fs/ext3/ialloc.c +fs/ext3/inode.c +include/linux/ext3_fs.h diff --git a/lustre/kernel_patches/pc/ext3-2.5.63.pc b/lustre/kernel_patches/pc/ext3-2.5.63.pc new file mode 100644 index 0000000..b1e5de5 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-2.5.63.pc @@ -0,0 +1,4 @@ +fs/ext3/xattr.c +fs/ext3/inode.c +fs/ext3/super.c +fs/ext3/xattr.h diff --git a/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18.pc b/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18.pc new file mode 100644 index 0000000..5770132 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-delete_thread-2.4.18.pc @@ -0,0 +1,3 @@ +fs/ext3/super.c +include/linux/ext3_fs.h +include/linux/ext3_fs_sb.h diff --git a/lustre/kernel_patches/pc/ext3-delete_thread-2.4.20.pc 
b/lustre/kernel_patches/pc/ext3-delete_thread-2.4.20.pc new file mode 100644 index 0000000..5770132 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-delete_thread-2.4.20.pc @@ -0,0 +1,3 @@ +fs/ext3/super.c +include/linux/ext3_fs.h +include/linux/ext3_fs_sb.h diff --git a/lustre/kernel_patches/pc/ext3-largefile.pc b/lustre/kernel_patches/pc/ext3-largefile.pc new file mode 100644 index 0000000..76d683f --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-largefile.pc @@ -0,0 +1 @@ +fs/ext3/inode.c diff --git a/lustre/kernel_patches/pc/ext3-noread-2.4.20.pc b/lustre/kernel_patches/pc/ext3-noread-2.4.20.pc new file mode 100644 index 0000000..9c3cea8 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-noread-2.4.20.pc @@ -0,0 +1,3 @@ +fs/ext3/ialloc.c +fs/ext3/inode.c +include/linux/ext3_fs.h diff --git a/lustre/kernel_patches/pc/ext3-orphan_lock.pc b/lustre/kernel_patches/pc/ext3-orphan_lock.pc new file mode 100644 index 0000000..98aebb0 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-orphan_lock.pc @@ -0,0 +1,3 @@ +fs/ext3/namei.c +fs/ext3/super.c +include/linux/ext3_fs_sb.h diff --git a/lustre/kernel_patches/pc/ext3-san-2.4.20.pc b/lustre/kernel_patches/pc/ext3-san-2.4.20.pc new file mode 100644 index 0000000..9ed5141 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-san-2.4.20.pc @@ -0,0 +1,2 @@ +fs/ext3/inode.c +fs/ext3/ext3-exports.c diff --git a/lustre/kernel_patches/pc/ext3-truncate_blocks-chaos.patch.pc b/lustre/kernel_patches/pc/ext3-truncate_blocks-chaos.patch.pc new file mode 100644 index 0000000..76d683f --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-truncate_blocks-chaos.patch.pc @@ -0,0 +1 @@ +fs/ext3/inode.c diff --git a/lustre/kernel_patches/pc/ext3-truncate_blocks.pc b/lustre/kernel_patches/pc/ext3-truncate_blocks.pc new file mode 100644 index 0000000..76d683f --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-truncate_blocks.pc @@ -0,0 +1 @@ +fs/ext3/inode.c diff --git a/lustre/kernel_patches/pc/ext3-unmount_sync.pc 
b/lustre/kernel_patches/pc/ext3-unmount_sync.pc new file mode 100644 index 0000000..08795de --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-unmount_sync.pc @@ -0,0 +1 @@ +fs/ext3/super.c diff --git a/lustre/kernel_patches/pc/ext3-use-after-free.pc b/lustre/kernel_patches/pc/ext3-use-after-free.pc new file mode 100644 index 0000000..daf8787 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3-use-after-free.pc @@ -0,0 +1 @@ +./fs/ext3/namei.c diff --git a/lustre/kernel_patches/pc/ext3_orphan_lock-2.4.20-rh.pc b/lustre/kernel_patches/pc/ext3_orphan_lock-2.4.20-rh.pc new file mode 100644 index 0000000..98aebb0 --- /dev/null +++ b/lustre/kernel_patches/pc/ext3_orphan_lock-2.4.20-rh.pc @@ -0,0 +1,3 @@ +fs/ext3/namei.c +fs/ext3/super.c +include/linux/ext3_fs_sb.h diff --git a/lustre/kernel_patches/pc/extN-2.4.18-ino_sb_fixup.pc b/lustre/kernel_patches/pc/extN-2.4.18-ino_sb_fixup.pc new file mode 100644 index 0000000..7191405 --- /dev/null +++ b/lustre/kernel_patches/pc/extN-2.4.18-ino_sb_fixup.pc @@ -0,0 +1 @@ +include/linux/ext3_fs.h diff --git a/lustre/kernel_patches/pc/extN-delete_thread.pc b/lustre/kernel_patches/pc/extN-delete_thread.pc new file mode 100644 index 0000000..bc81732 --- /dev/null +++ b/lustre/kernel_patches/pc/extN-delete_thread.pc @@ -0,0 +1,3 @@ +include/linux/ext3_fs.h +include/linux/ext3_fs_sb.h +fs/ext3/super.c diff --git a/lustre/kernel_patches/pc/extN-iget-debug.pc b/lustre/kernel_patches/pc/extN-iget-debug.pc new file mode 100644 index 0000000..e9fe01e --- /dev/null +++ b/lustre/kernel_patches/pc/extN-iget-debug.pc @@ -0,0 +1,2 @@ +fs/ext3/namei.c +fs/ext3/inode.c diff --git a/lustre/kernel_patches/pc/extN-misc-fixup.pc b/lustre/kernel_patches/pc/extN-misc-fixup.pc new file mode 100644 index 0000000..08795de --- /dev/null +++ b/lustre/kernel_patches/pc/extN-misc-fixup.pc @@ -0,0 +1 @@ +fs/ext3/super.c diff --git a/lustre/kernel_patches/pc/extN-noread.pc b/lustre/kernel_patches/pc/extN-noread.pc new file mode 100644 index 0000000..9c3cea8 --- 
/dev/null +++ b/lustre/kernel_patches/pc/extN-noread.pc @@ -0,0 +1,3 @@ +fs/ext3/ialloc.c +fs/ext3/inode.c +include/linux/ext3_fs.h diff --git a/lustre/kernel_patches/pc/extN-san.pc b/lustre/kernel_patches/pc/extN-san.pc new file mode 100644 index 0000000..231df0e --- /dev/null +++ b/lustre/kernel_patches/pc/extN-san.pc @@ -0,0 +1,2 @@ +fs/ext3/inode.c +fs/ext3/super.c diff --git a/lustre/kernel_patches/pc/extN-wantedi.pc b/lustre/kernel_patches/pc/extN-wantedi.pc new file mode 100644 index 0000000..31901ee --- /dev/null +++ b/lustre/kernel_patches/pc/extN-wantedi.pc @@ -0,0 +1,4 @@ +fs/ext3/namei.c +fs/ext3/ialloc.c +fs/ext3/ioctl.c +include/linux/ext3_fs.h diff --git a/lustre/kernel_patches/pc/htree-ext3-2.4.18.pc b/lustre/kernel_patches/pc/htree-ext3-2.4.18.pc new file mode 100644 index 0000000..6499778 --- /dev/null +++ b/lustre/kernel_patches/pc/htree-ext3-2.4.18.pc @@ -0,0 +1,4 @@ +fs/ext3/super.c +fs/ext3/namei.c +include/linux/ext3_fs.h +include/linux/ext3_jbd.h diff --git a/lustre/kernel_patches/pc/invalidate_show-2.4.20-rh.pc b/lustre/kernel_patches/pc/invalidate_show-2.4.20-rh.pc new file mode 100644 index 0000000..1d4ed77 --- /dev/null +++ b/lustre/kernel_patches/pc/invalidate_show-2.4.20-rh.pc @@ -0,0 +1,4 @@ +fs/inode.c +fs/super.c +include/linux/fs.h +fs/smbfs/inode.c diff --git a/lustre/kernel_patches/pc/invalidate_show.pc b/lustre/kernel_patches/pc/invalidate_show.pc index 1f565ab..1d4ed77 100644 --- a/lustre/kernel_patches/pc/invalidate_show.pc +++ b/lustre/kernel_patches/pc/invalidate_show.pc @@ -1,5 +1,4 @@ fs/inode.c -fs/block_dev.c -fs/devfs/base.c fs/super.c include/linux/fs.h +fs/smbfs/inode.c diff --git a/lustre/kernel_patches/pc/iod-rmap-exports-2.4.20.pc b/lustre/kernel_patches/pc/iod-rmap-exports-2.4.20.pc new file mode 100644 index 0000000..07288b0 --- /dev/null +++ b/lustre/kernel_patches/pc/iod-rmap-exports-2.4.20.pc @@ -0,0 +1,5 @@ +fs/inode.c +fs/Makefile +mm/vmscan.c +mm/Makefile +mm/page_alloc.c diff --git 
a/lustre/kernel_patches/pc/iod-rmap-exports.pc b/lustre/kernel_patches/pc/iod-rmap-exports.pc index 1218f55..07288b0 100644 --- a/lustre/kernel_patches/pc/iod-rmap-exports.pc +++ b/lustre/kernel_patches/pc/iod-rmap-exports.pc @@ -1,6 +1,5 @@ fs/inode.c fs/Makefile -mm/filemap.c mm/vmscan.c mm/Makefile mm/page_alloc.c diff --git a/lustre/kernel_patches/pc/iod-stock-24-exports.pc b/lustre/kernel_patches/pc/iod-stock-24-exports.pc new file mode 100644 index 0000000..e4eceee --- /dev/null +++ b/lustre/kernel_patches/pc/iod-stock-24-exports.pc @@ -0,0 +1,3 @@ +fs/inode.c +fs/Makefile +mm/page_alloc.c diff --git a/lustre/kernel_patches/pc/iod-stock-24-exports_hp.pc b/lustre/kernel_patches/pc/iod-stock-24-exports_hp.pc new file mode 100644 index 0000000..e4eceee --- /dev/null +++ b/lustre/kernel_patches/pc/iod-stock-24-exports_hp.pc @@ -0,0 +1,3 @@ +fs/inode.c +fs/Makefile +mm/page_alloc.c diff --git a/lustre/kernel_patches/pc/iopen-2.4.18.pc b/lustre/kernel_patches/pc/iopen-2.4.18.pc new file mode 100644 index 0000000..b40b1f3 --- /dev/null +++ b/lustre/kernel_patches/pc/iopen-2.4.18.pc @@ -0,0 +1,8 @@ +Documentation/filesystems/ext2.txt +fs/ext3/Makefile +fs/ext3/inode.c +fs/ext3/iopen.c +fs/ext3/iopen.h +fs/ext3/namei.c +fs/ext3/super.c +include/linux/ext3_fs.h diff --git a/lustre/kernel_patches/pc/iopen-2.4.20.pc b/lustre/kernel_patches/pc/iopen-2.4.20.pc new file mode 100644 index 0000000..b40b1f3 --- /dev/null +++ b/lustre/kernel_patches/pc/iopen-2.4.20.pc @@ -0,0 +1,8 @@ +Documentation/filesystems/ext2.txt +fs/ext3/Makefile +fs/ext3/inode.c +fs/ext3/iopen.c +fs/ext3/iopen.h +fs/ext3/namei.c +fs/ext3/super.c +include/linux/ext3_fs.h diff --git a/lustre/kernel_patches/pc/kmem_cache_validate_2.4.20-rh.pc b/lustre/kernel_patches/pc/kmem_cache_validate_2.4.20-rh.pc new file mode 100644 index 0000000..a0a6297 --- /dev/null +++ b/lustre/kernel_patches/pc/kmem_cache_validate_2.4.20-rh.pc @@ -0,0 +1,5 @@ +arch/i386/mm/init.c +arch/ia64/mm/init.c +include/linux/slab.h 
+kernel/ksyms.c +mm/slab.c diff --git a/lustre/kernel_patches/pc/kmem_cache_validate_2.4.20.pc b/lustre/kernel_patches/pc/kmem_cache_validate_2.4.20.pc new file mode 100644 index 0000000..bdba884 --- /dev/null +++ b/lustre/kernel_patches/pc/kmem_cache_validate_2.4.20.pc @@ -0,0 +1,5 @@ +arch/ia64/mm/init.c +include/linux/slab.h +kernel/ksyms.c +mm/slab.c +arch/i386/mm/init.c diff --git a/lustre/kernel_patches/pc/kmem_cache_validate_hp.pc b/lustre/kernel_patches/pc/kmem_cache_validate_hp.pc index a0a6297..bdba884 100644 --- a/lustre/kernel_patches/pc/kmem_cache_validate_hp.pc +++ b/lustre/kernel_patches/pc/kmem_cache_validate_hp.pc @@ -1,5 +1,5 @@ -arch/i386/mm/init.c arch/ia64/mm/init.c include/linux/slab.h kernel/ksyms.c mm/slab.c +arch/i386/mm/init.c diff --git a/lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26.pc b/lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26.pc new file mode 100644 index 0000000..b647d5a --- /dev/null +++ b/lustre/kernel_patches/pc/linux-2.4.18ea-0.8.26.pc @@ -0,0 +1,10 @@ +fs/ext3/ialloc.c +fs/ext3/inode.c +fs/ext3/namei.c +fs/ext3/super.c +fs/ext3/xattr.c +include/linux/ext3_fs.h +include/linux/ext3_jbd.h +include/linux/ext3_xattr.h +include/linux/xattr.h +fs/ext3/Makefile diff --git a/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-chaos.pc b/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-chaos.pc new file mode 100644 index 0000000..dbf35cb --- /dev/null +++ b/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-chaos.pc @@ -0,0 +1,62 @@ +Documentation/Configure.help +arch/alpha/defconfig +arch/alpha/kernel/entry.S +arch/arm/defconfig +arch/arm/kernel/calls.S +arch/i386/defconfig +arch/ia64/defconfig +arch/m68k/defconfig +arch/mips/defconfig +arch/mips64/defconfig +arch/ppc/defconfig +arch/ppc64/kernel/misc.S +arch/s390/defconfig +arch/s390/kernel/entry.S +arch/s390x/defconfig +arch/s390x/kernel/entry.S +arch/s390x/kernel/wrapper32.S +arch/sparc/defconfig +arch/sparc/kernel/systbls.S +arch/sparc64/defconfig 
+arch/sparc64/kernel/systbls.S +fs/Config.in +fs/Makefile +fs/ext2/Makefile +fs/ext2/file.c +fs/ext2/ialloc.c +fs/ext2/inode.c +fs/ext2/namei.c +fs/ext2/super.c +fs/ext2/symlink.c +fs/ext2/xattr.c +fs/ext2/xattr_user.c +fs/ext3/Makefile +fs/ext3/file.c +fs/ext3/ialloc.c +fs/ext3/inode.c +fs/ext3/namei.c +fs/ext3/super.c +fs/ext3/symlink.c +fs/ext3/xattr.c +fs/ext3/xattr_user.c +fs/jfs/jfs_xattr.h +fs/jfs/xattr.c +fs/mbcache.c +include/asm-arm/unistd.h +include/asm-ppc64/unistd.h +include/asm-s390/unistd.h +include/asm-s390x/unistd.h +include/asm-sparc/unistd.h +include/asm-sparc64/unistd.h +include/linux/cache_def.h +include/linux/errno.h +include/linux/ext2_fs.h +include/linux/ext2_xattr.h +include/linux/ext3_fs.h +include/linux/ext3_jbd.h +include/linux/ext3_xattr.h +include/linux/fs.h +include/linux/mbcache.h +kernel/ksyms.c +mm/vmscan.c +fs/ext3/ext3-exports.c diff --git a/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-hp.pc b/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-hp.pc new file mode 100644 index 0000000..1e8cf75 --- /dev/null +++ b/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54-hp.pc @@ -0,0 +1,62 @@ +Documentation/Configure.help +arch/alpha/defconfig +arch/alpha/kernel/entry.S +arch/arm/defconfig +arch/arm/kernel/calls.S +arch/i386/defconfig +arch/ia64/defconfig +arch/m68k/defconfig +arch/mips/defconfig +arch/mips64/defconfig +arch/ppc/defconfig +arch/ppc64/kernel/misc.S +arch/s390/defconfig +arch/s390/kernel/entry.S +arch/s390x/defconfig +arch/s390x/kernel/entry.S +arch/s390x/kernel/wrapper32.S +arch/sparc/defconfig +arch/sparc/kernel/systbls.S +arch/sparc64/defconfig +arch/sparc64/kernel/systbls.S +fs/Config.in +fs/Makefile +fs/ext2/Makefile +fs/ext2/file.c +fs/ext2/ialloc.c +fs/ext2/inode.c +fs/ext2/namei.c +fs/ext2/super.c +fs/ext2/symlink.c +fs/ext2/xattr.c +fs/ext2/xattr_user.c +fs/ext3/Makefile +fs/ext3/file.c +fs/ext3/ialloc.c +fs/ext3/inode.c +fs/ext3/namei.c +fs/ext3/super.c +fs/ext3/symlink.c +fs/ext3/xattr.c 
+fs/ext3/xattr_user.c +fs/ext3/ext3-exports.c +fs/jfs/jfs_xattr.h +fs/jfs/xattr.c +fs/mbcache.c +include/asm-arm/unistd.h +include/asm-ppc64/unistd.h +include/asm-s390/unistd.h +include/asm-s390x/unistd.h +include/asm-sparc/unistd.h +include/asm-sparc64/unistd.h +include/linux/cache_def.h +include/linux/errno.h +include/linux/ext2_fs.h +include/linux/ext2_xattr.h +include/linux/ext3_fs.h +include/linux/ext3_jbd.h +include/linux/ext3_xattr.h +include/linux/fs.h +include/linux/mbcache.h +kernel/ksyms.c +mm/vmscan.c diff --git a/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54.pc b/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54.pc new file mode 100644 index 0000000..2de1b2c --- /dev/null +++ b/lustre/kernel_patches/pc/linux-2.4.20-xattr-0.8.54.pc @@ -0,0 +1,64 @@ +Documentation/Configure.help +arch/alpha/defconfig +arch/alpha/kernel/entry.S +arch/arm/defconfig +arch/arm/kernel/calls.S +arch/i386/defconfig +arch/ia64/defconfig +arch/ia64/kernel/entry.S +arch/m68k/defconfig +arch/mips/defconfig +arch/mips64/defconfig +arch/ppc/defconfig +arch/ppc64/kernel/misc.S +arch/s390/defconfig +arch/s390/kernel/entry.S +arch/s390x/defconfig +arch/s390x/kernel/entry.S +arch/s390x/kernel/wrapper32.S +arch/sparc/defconfig +arch/sparc/kernel/systbls.S +arch/sparc64/defconfig +arch/sparc64/kernel/systbls.S +fs/Config.in +fs/Makefile +fs/ext2/Makefile +fs/ext2/file.c +fs/ext2/ialloc.c +fs/ext2/inode.c +fs/ext2/namei.c +fs/ext2/super.c +fs/ext2/symlink.c +fs/ext2/xattr.c +fs/ext2/xattr_user.c +fs/ext3/Makefile +fs/ext3/file.c +fs/ext3/ialloc.c +fs/ext3/inode.c +fs/ext3/namei.c +fs/ext3/super.c +fs/ext3/symlink.c +fs/ext3/xattr.c +fs/ext3/xattr_user.c +fs/jfs/jfs_xattr.h +fs/jfs/xattr.c +fs/mbcache.c +include/asm-arm/unistd.h +include/asm-ia64/unistd.h +include/asm-ppc64/unistd.h +include/asm-s390/unistd.h +include/asm-s390x/unistd.h +include/asm-sparc/unistd.h +include/asm-sparc64/unistd.h +include/linux/cache_def.h +include/linux/errno.h +include/linux/ext2_fs.h 
+include/linux/ext2_xattr.h +include/linux/ext3_fs.h +include/linux/ext3_jbd.h +include/linux/ext3_xattr.h +include/linux/fs.h +include/linux/mbcache.h +kernel/ksyms.c +mm/vmscan.c +fs/ext3/ext3-exports.c diff --git a/lustre/kernel_patches/pc/lustre-2.5.63.pc b/lustre/kernel_patches/pc/lustre-2.5.63.pc new file mode 100644 index 0000000..daeea17 --- /dev/null +++ b/lustre/kernel_patches/pc/lustre-2.5.63.pc @@ -0,0 +1,12 @@ +arch/um/kernel/mem.c +fs/namei.c +fs/nfsd/vfs.c +fs/sysfs/inode.c +include/linux/dcache.h +include/linux/fs.h +include/linux/namei.h +include/linux/slab.h +kernel/ksyms.c +mm/slab.c +net/unix/af_unix.c +fs/dcache.c diff --git a/lustre/kernel_patches/pc/mcore-2.4.20-8.pc b/lustre/kernel_patches/pc/mcore-2.4.20-8.pc new file mode 100644 index 0000000..b290f60 --- /dev/null +++ b/lustre/kernel_patches/pc/mcore-2.4.20-8.pc @@ -0,0 +1,34 @@ +Makefile +Documentation/Configure.help +arch/i386/config.in +arch/i386/vmlinux.lds +arch/i386/boot/setup.S +arch/i386/kernel/Makefile +arch/i386/kernel/crash.c +arch/i386/kernel/nmi.c +arch/i386/kernel/process.c +arch/i386/kernel/setup.c +arch/i386/kernel/smp.c +arch/i386/kernel/traps.c +drivers/char/misc.c +drivers/char/sysrq.c +include/asm-i386/bootimg.h +include/asm-i386/crash.h +include/linux/bootimg.h +include/linux/crash.h +include/linux/mm.h +include/linux/reboot.h +include/linux/sysctl.h +init/main.c +kernel/Makefile +kernel/bootimg.c +kernel/bootimg_pic.c +kernel/crash.c +kernel/module.c +kernel/panic.c +kernel/sysctl.c +lib/Config.in +mm/memory.c +mm/page_alloc.c +arch/i386//boot/compressed/head.S +arch/i386//kernel/head.S diff --git a/lustre/kernel_patches/pc/patch-2.4.18-hp1_pnnl18.2.8qsnet.pc b/lustre/kernel_patches/pc/patch-2.4.18-hp1_pnnl18.2.8qsnet.pc deleted file mode 100644 index 44d4abf..0000000 --- a/lustre/kernel_patches/pc/patch-2.4.18-hp1_pnnl18.2.8qsnet.pc +++ /dev/null @@ -1,23 +0,0 @@ -./include/linux/lustre_version.h -./arch/ia64/mm/init.c -./arch/i386/mm/init.c -./drivers/block/blkpg.c 
-./drivers/block/loop.c -./drivers/ide/ide-disk.c -./fs/ext3/Makefile -./fs/ext3/super.c -./fs/jbd/commit.c -./fs/jbd/journal.c -./fs/jbd/transaction.c -./include/linux/blkdev.h -./include/linux/slab.h -./include/linux/jbd.h -./kernel/ksyms.c -./include/linux/dcache.h -./include/linux/fs.h -./fs/dcache.c -./fs/nfsd/vfs.c -./fs/namei.c -./fs/open.c -./fs/stat.c -./mm/slab.c diff --git a/lustre/kernel_patches/pc/tcp-zero-copy.pc b/lustre/kernel_patches/pc/tcp-zero-copy.pc new file mode 100644 index 0000000..02877c0 --- /dev/null +++ b/lustre/kernel_patches/pc/tcp-zero-copy.pc @@ -0,0 +1,5 @@ +include/linux/skbuff.h +include/net/tcp.h +net/netsyms.c +net/core/skbuff.c +net/ipv4/tcp.c diff --git a/lustre/kernel_patches/pc/uml-patch-2.4.20-4.pc b/lustre/kernel_patches/pc/uml-patch-2.4.20-4.pc new file mode 100644 index 0000000..887e3fa --- /dev/null +++ b/lustre/kernel_patches/pc/uml-patch-2.4.20-4.pc @@ -0,0 +1,394 @@ +CREDITS +Documentation/Configure.help +MAINTAINERS +Makefile +arch/um/Makefile +arch/um/Makefile-i386 +arch/um/Makefile-ia64 +arch/um/Makefile-os-Linux +arch/um/Makefile-ppc +arch/um/Makefile-skas +arch/um/Makefile-tt +arch/um/common.ld.in +arch/um/config.in +arch/um/config.release +arch/um/config_block.in +arch/um/config_char.in +arch/um/config_net.in +arch/um/config_scsi.in +arch/um/defconfig +arch/um/drivers/Makefile +arch/um/drivers/chan_kern.c +arch/um/drivers/chan_user.c +arch/um/drivers/daemon.h +arch/um/drivers/daemon_kern.c +arch/um/drivers/daemon_user.c +arch/um/drivers/fd.c +arch/um/drivers/harddog_kern.c +arch/um/drivers/harddog_user.c +arch/um/drivers/hostaudio_kern.c +arch/um/drivers/hostaudio_user.c +arch/um/drivers/line.c +arch/um/drivers/mcast.h +arch/um/drivers/mcast_kern.c +arch/um/drivers/mcast_user.c +arch/um/drivers/mconsole_kern.c +arch/um/drivers/mconsole_user.c +arch/um/drivers/mmapper_kern.c +arch/um/drivers/net_kern.c +arch/um/drivers/net_user.c +arch/um/drivers/null.c +arch/um/drivers/pcap_kern.c +arch/um/drivers/pcap_user.c 
+arch/um/drivers/pcap_user.h +arch/um/drivers/port.h +arch/um/drivers/port_kern.c +arch/um/drivers/port_user.c +arch/um/drivers/pty.c +arch/um/drivers/slip.h +arch/um/drivers/slip_kern.c +arch/um/drivers/slip_proto.h +arch/um/drivers/slip_user.c +arch/um/drivers/slirp.h +arch/um/drivers/slirp_kern.c +arch/um/drivers/slirp_user.c +arch/um/drivers/ssl.c +arch/um/drivers/ssl.h +arch/um/drivers/stdio_console.c +arch/um/drivers/stdio_console.h +arch/um/drivers/tty.c +arch/um/drivers/ubd_kern.c +arch/um/drivers/ubd_user.c +arch/um/drivers/xterm.c +arch/um/drivers/xterm.h +arch/um/drivers/xterm_kern.c +arch/um/dyn_link.ld.in +arch/um/fs/Makefile +arch/um/fs/hostfs/Makefile +arch/um/fs/hostfs/hostfs.h +arch/um/fs/hostfs/hostfs_kern.c +arch/um/fs/hostfs/hostfs_user.c +arch/um/fs/hppfs/Makefile +arch/um/fs/hppfs/hppfs_kern.c +arch/um/include/2_5compat.h +arch/um/include/Makefile +arch/um/include/chan_kern.h +arch/um/include/chan_user.h +arch/um/include/choose-mode.h +arch/um/include/frame.h +arch/um/include/frame_kern.h +arch/um/include/frame_user.h +arch/um/include/helper.h +arch/um/include/hostaudio.h +arch/um/include/init.h +arch/um/include/initrd.h +arch/um/include/irq_user.h +arch/um/include/kern.h +arch/um/include/kern_util.h +arch/um/include/line.h +arch/um/include/mconsole.h +arch/um/include/mconsole_kern.h +arch/um/include/mem.h +arch/um/include/mem_user.h +arch/um/include/mode.h +arch/um/include/mode_kern.h +arch/um/include/net_kern.h +arch/um/include/net_user.h +arch/um/include/os.h +arch/um/include/process.h +arch/um/include/ptrace_user.h +arch/um/include/sigcontext.h +arch/um/include/sigio.h +arch/um/include/signal_kern.h +arch/um/include/signal_user.h +arch/um/include/skas_ptrace.h +arch/um/include/syscall_user.h +arch/um/include/sysdep-i386/checksum.h +arch/um/include/sysdep-i386/frame.h +arch/um/include/sysdep-i386/frame_kern.h +arch/um/include/sysdep-i386/frame_user.h +arch/um/include/sysdep-i386/ptrace.h +arch/um/include/sysdep-i386/ptrace_user.h 
+arch/um/include/sysdep-i386/sigcontext.h +arch/um/include/sysdep-i386/syscalls.h +arch/um/include/sysdep-ia64/ptrace.h +arch/um/include/sysdep-ia64/sigcontext.h +arch/um/include/sysdep-ia64/syscalls.h +arch/um/include/sysdep-ppc/ptrace.h +arch/um/include/sysdep-ppc/sigcontext.h +arch/um/include/sysdep-ppc/syscalls.h +arch/um/include/sysrq.h +arch/um/include/tempfile.h +arch/um/include/time_user.h +arch/um/include/tlb.h +arch/um/include/ubd_user.h +arch/um/include/um_mmu.h +arch/um/include/um_uaccess.h +arch/um/include/umid.h +arch/um/include/uml_uaccess.h +arch/um/include/umn.h +arch/um/include/user.h +arch/um/include/user_util.h +arch/um/kernel/Makefile +arch/um/kernel/checksum.c +arch/um/kernel/config.c.in +arch/um/kernel/exec_kern.c +arch/um/kernel/exitcode.c +arch/um/kernel/frame.c +arch/um/kernel/frame_kern.c +arch/um/kernel/gmon_syms.c +arch/um/kernel/gprof_syms.c +arch/um/kernel/helper.c +arch/um/kernel/init_task.c +arch/um/kernel/initrd_kern.c +arch/um/kernel/initrd_user.c +arch/um/kernel/irq.c +arch/um/kernel/irq_user.c +arch/um/kernel/ksyms.c +arch/um/kernel/mem.c +arch/um/kernel/mem_user.c +arch/um/kernel/mprot.h +arch/um/kernel/process.c +arch/um/kernel/process_kern.c +arch/um/kernel/ptrace.c +arch/um/kernel/reboot.c +arch/um/kernel/resource.c +arch/um/kernel/sigio_kern.c +arch/um/kernel/sigio_user.c +arch/um/kernel/signal_kern.c +arch/um/kernel/signal_user.c +arch/um/kernel/skas/Makefile +arch/um/kernel/skas/exec_kern.c +arch/um/kernel/skas/exec_user.c +arch/um/kernel/skas/include/mmu.h +arch/um/kernel/skas/include/mode.h +arch/um/kernel/skas/include/mode_kern.h +arch/um/kernel/skas/include/proc_mm.h +arch/um/kernel/skas/include/ptrace-skas.h +arch/um/kernel/skas/include/skas.h +arch/um/kernel/skas/include/uaccess.h +arch/um/kernel/skas/mem.c +arch/um/kernel/skas/mem_user.c +arch/um/kernel/skas/mmu.c +arch/um/kernel/skas/process.c +arch/um/kernel/skas/process_kern.c +arch/um/kernel/skas/sys-i386/Makefile +arch/um/kernel/skas/sys-i386/sigcontext.c 
+arch/um/kernel/skas/syscall_kern.c +arch/um/kernel/skas/syscall_user.c +arch/um/kernel/skas/time.c +arch/um/kernel/skas/tlb.c +arch/um/kernel/skas/trap_user.c +arch/um/kernel/skas/util/Makefile +arch/um/kernel/skas/util/mk_ptregs.c +arch/um/kernel/smp.c +arch/um/kernel/sys_call_table.c +arch/um/kernel/syscall_kern.c +arch/um/kernel/syscall_user.c +arch/um/kernel/sysrq.c +arch/um/kernel/tempfile.c +arch/um/kernel/time.c +arch/um/kernel/time_kern.c +arch/um/kernel/tlb.c +arch/um/kernel/trap_kern.c +arch/um/kernel/trap_user.c +arch/um/kernel/tt/Makefile +arch/um/kernel/tt/exec_kern.c +arch/um/kernel/tt/exec_user.c +arch/um/kernel/tt/gdb.c +arch/um/kernel/tt/gdb_kern.c +arch/um/kernel/tt/include/debug.h +arch/um/kernel/tt/include/mmu.h +arch/um/kernel/tt/include/mode.h +arch/um/kernel/tt/include/mode_kern.h +arch/um/kernel/tt/include/ptrace-tt.h +arch/um/kernel/tt/include/tt.h +arch/um/kernel/tt/include/uaccess.h +arch/um/kernel/tt/ksyms.c +arch/um/kernel/tt/mem.c +arch/um/kernel/tt/mem_user.c +arch/um/kernel/tt/process_kern.c +arch/um/kernel/tt/ptproxy/Makefile +arch/um/kernel/tt/ptproxy/proxy.c +arch/um/kernel/tt/ptproxy/ptproxy.h +arch/um/kernel/tt/ptproxy/ptrace.c +arch/um/kernel/tt/ptproxy/sysdep.c +arch/um/kernel/tt/ptproxy/sysdep.h +arch/um/kernel/tt/ptproxy/wait.c +arch/um/kernel/tt/ptproxy/wait.h +arch/um/kernel/tt/sys-i386/Makefile +arch/um/kernel/tt/sys-i386/sigcontext.c +arch/um/kernel/tt/syscall_kern.c +arch/um/kernel/tt/syscall_user.c +arch/um/kernel/tt/time.c +arch/um/kernel/tt/tlb.c +arch/um/kernel/tt/tracer.c +arch/um/kernel/tt/trap_user.c +arch/um/kernel/tt/uaccess_user.c +arch/um/kernel/tt/unmap.c +arch/um/kernel/tty_log.c +arch/um/kernel/uaccess_user.c +arch/um/kernel/um_arch.c +arch/um/kernel/umid.c +arch/um/kernel/user_syms.c +arch/um/kernel/user_util.c +arch/um/link.ld.in +arch/um/main.c +arch/um/os-Linux/Makefile +arch/um/os-Linux/drivers/Makefile +arch/um/os-Linux/drivers/etap.h +arch/um/os-Linux/drivers/ethertap_kern.c 
+arch/um/os-Linux/drivers/ethertap_user.c +arch/um/os-Linux/drivers/tuntap.h +arch/um/os-Linux/drivers/tuntap_kern.c +arch/um/os-Linux/drivers/tuntap_user.c +arch/um/os-Linux/file.c +arch/um/os-Linux/include/file.h +arch/um/os-Linux/process.c +arch/um/os-Linux/tty.c +arch/um/sys-i386/Makefile +arch/um/sys-i386/bugs.c +arch/um/sys-i386/checksum.S +arch/um/sys-i386/fault.c +arch/um/sys-i386/ksyms.c +arch/um/sys-i386/ldt.c +arch/um/sys-i386/ptrace.c +arch/um/sys-i386/ptrace_user.c +arch/um/sys-i386/sigcontext.c +arch/um/sys-i386/syscalls.c +arch/um/sys-i386/sysrq.c +arch/um/sys-i386/util/Makefile +arch/um/sys-i386/util/mk_sc.c +arch/um/sys-i386/util/mk_thread_kern.c +arch/um/sys-i386/util/mk_thread_user.c +arch/um/sys-ia64/Makefile +arch/um/sys-ppc/Makefile +arch/um/sys-ppc/misc.S +arch/um/sys-ppc/miscthings.c +arch/um/sys-ppc/ptrace.c +arch/um/sys-ppc/ptrace_user.c +arch/um/sys-ppc/sigcontext.c +arch/um/sys-ppc/sysrq.c +arch/um/util/Makefile +arch/um/util/mk_constants_kern.c +arch/um/util/mk_constants_user.c +arch/um/util/mk_task_kern.c +arch/um/util/mk_task_user.c +drivers/char/Makefile +drivers/char/tty_io.c +drivers/net/setup.c +include/asm-i386/hardirq.h +include/asm-um/a.out.h +include/asm-um/arch-signal-i386.h +include/asm-um/archparam-i386.h +include/asm-um/archparam-ppc.h +include/asm-um/atomic.h +include/asm-um/bitops.h +include/asm-um/boot.h +include/asm-um/bugs.h +include/asm-um/byteorder.h +include/asm-um/cache.h +include/asm-um/checksum.h +include/asm-um/cobalt.h +include/asm-um/current.h +include/asm-um/delay.h +include/asm-um/desc.h +include/asm-um/div64.h +include/asm-um/dma.h +include/asm-um/elf.h +include/asm-um/errno.h +include/asm-um/fcntl.h +include/asm-um/fixmap.h +include/asm-um/floppy.h +include/asm-um/hardirq.h +include/asm-um/hdreg.h +include/asm-um/highmem.h +include/asm-um/hw_irq.h +include/asm-um/ide.h +include/asm-um/init.h +include/asm-um/io.h +include/asm-um/ioctl.h +include/asm-um/ioctls.h +include/asm-um/ipc.h 
+include/asm-um/ipcbuf.h +include/asm-um/irq.h +include/asm-um/keyboard.h +include/asm-um/kmap_types.h +include/asm-um/linux_logo.h +include/asm-um/locks.h +include/asm-um/mca_dma.h +include/asm-um/mman.h +include/asm-um/mmu.h +include/asm-um/mmu_context.h +include/asm-um/module.h +include/asm-um/msgbuf.h +include/asm-um/mtrr.h +include/asm-um/namei.h +include/asm-um/page.h +include/asm-um/page_offset.h +include/asm-um/param.h +include/asm-um/pci.h +include/asm-um/pgalloc.h +include/asm-um/pgtable.h +include/asm-um/poll.h +include/asm-um/posix_types.h +include/asm-um/processor-generic.h +include/asm-um/processor-i386.h +include/asm-um/processor-ppc.h +include/asm-um/ptrace-generic.h +include/asm-um/ptrace-i386.h +include/asm-um/resource.h +include/asm-um/rwlock.h +include/asm-um/rwsem.h +include/asm-um/scatterlist.h +include/asm-um/segment.h +include/asm-um/semaphore.h +include/asm-um/sembuf.h +include/asm-um/serial.h +include/asm-um/shmbuf.h +include/asm-um/shmparam.h +include/asm-um/sigcontext-generic.h +include/asm-um/sigcontext-i386.h +include/asm-um/sigcontext-ppc.h +include/asm-um/siginfo.h +include/asm-um/signal.h +include/asm-um/smp.h +include/asm-um/smplock.h +include/asm-um/socket.h +include/asm-um/sockios.h +include/asm-um/softirq.h +include/asm-um/spinlock.h +include/asm-um/stat.h +include/asm-um/statfs.h +include/asm-um/string.h +include/asm-um/system-generic.h +include/asm-um/system-i386.h +include/asm-um/system-ppc.h +include/asm-um/termbits.h +include/asm-um/termios.h +include/asm-um/timex.h +include/asm-um/tlb.h +include/asm-um/types.h +include/asm-um/uaccess.h +include/asm-um/ucontext.h +include/asm-um/unaligned.h +include/asm-um/unistd.h +include/asm-um/user.h +include/asm-um/vga.h +include/asm-um/xor.h +include/linux/blk.h +include/linux/fs.h +include/linux/hostfs_fs_i.h +include/linux/hppfs_fs_i.h +include/linux/kernel.h +include/linux/kernel_stat.h +include/linux/mm.h +include/linux/proc_mm.h +include/linux/tty.h +init/do_mounts.c 
+kernel/panic.c +mm/Makefile +mm/mmap.c +mm/mprotect.c +mm/proc_mm.c +mm/slab.c diff --git a/lustre/kernel_patches/pc/uml_check_get_page.pc b/lustre/kernel_patches/pc/uml_check_get_page.pc index 3dbf042..0e90ce5 100644 --- a/lustre/kernel_patches/pc/uml_check_get_page.pc +++ b/lustre/kernel_patches/pc/uml_check_get_page.pc @@ -1,2 +1 @@ arch/um/kernel/mem.c -arch/um/kernel/mem.c.uml-fixes diff --git a/lustre/kernel_patches/pc/uml_compile_fixes.pc b/lustre/kernel_patches/pc/uml_compile_fixes.pc index c1caa12..cd28cbd 100644 --- a/lustre/kernel_patches/pc/uml_compile_fixes.pc +++ b/lustre/kernel_patches/pc/uml_compile_fixes.pc @@ -1,2 +1 @@ include/asm-um/pgtable.h -include/asm-um/pgtable.h.orig diff --git a/lustre/kernel_patches/pc/uml_no_panic.pc b/lustre/kernel_patches/pc/uml_no_panic.pc index 3dbf042..0e90ce5 100644 --- a/lustre/kernel_patches/pc/uml_no_panic.pc +++ b/lustre/kernel_patches/pc/uml_no_panic.pc @@ -1,2 +1 @@ arch/um/kernel/mem.c -arch/um/kernel/mem.c.uml-fixes diff --git a/lustre/kernel_patches/pc/vanilla-2.4.18.pc b/lustre/kernel_patches/pc/vanilla-2.4.18.pc deleted file mode 100644 index c1ed719..0000000 --- a/lustre/kernel_patches/pc/vanilla-2.4.18.pc +++ /dev/null @@ -1,23 +0,0 @@ -include/linux/lustre_version.h -arch/ia64/mm/init.c -arch/i386/mm/init.c -drivers/block/blkpg.c -drivers/block/loop.c -drivers/ide/ide-disk.c -fs/ext3/Makefile -fs/ext3/super.c -fs/jbd/commit.c -fs/jbd/journal.c -fs/jbd/transaction.c -include/linux/blkdev.h -include/linux/slab.h -include/linux/jbd.h -kernel/ksyms.c -include/linux/dcache.h -include/linux/fs.h -fs/dcache.c -fs/nfsd/vfs.c -fs/namei.c -fs/open.c -fs/stat.c -mm/slab.c diff --git a/lustre/kernel_patches/pc/vanilla-2.4.19.pc b/lustre/kernel_patches/pc/vanilla-2.4.19.pc deleted file mode 100644 index bb5c390..0000000 --- a/lustre/kernel_patches/pc/vanilla-2.4.19.pc +++ /dev/null @@ -1,19 +0,0 @@ -include/linux/lustre_version.h -arch/ia64/mm/init.c -arch/i386/mm/init.c -drivers/block/blkpg.c 
-drivers/block/loop.c -drivers/ide/ide-disk.c -fs/ext3/Makefile -fs/ext3/super.c -include/linux/blkdev.h -include/linux/slab.h -kernel/ksyms.c -include/linux/dcache.h -include/linux/fs.h -fs/dcache.c -fs/nfsd/vfs.c -fs/namei.c -fs/open.c -fs/stat.c -mm/slab.c diff --git a/lustre/kernel_patches/pc/vfs_intent-2.4.18-18.pc b/lustre/kernel_patches/pc/vfs_intent-2.4.18-18.pc index dd2b1c8..8801aa7 100644 --- a/lustre/kernel_patches/pc/vfs_intent-2.4.18-18.pc +++ b/lustre/kernel_patches/pc/vfs_intent-2.4.18-18.pc @@ -2,6 +2,7 @@ fs/dcache.c fs/namei.c fs/nfsd/vfs.c fs/open.c +fs/proc/base.c fs/stat.c fs/exec.c include/linux/dcache.h diff --git a/lustre/kernel_patches/pc/vfs_intent.pc b/lustre/kernel_patches/pc/vfs_intent-2.4.20-rh.pc similarity index 82% rename from lustre/kernel_patches/pc/vfs_intent.pc rename to lustre/kernel_patches/pc/vfs_intent-2.4.20-rh.pc index 881576c..fbe6ff1 100644 --- a/lustre/kernel_patches/pc/vfs_intent.pc +++ b/lustre/kernel_patches/pc/vfs_intent-2.4.20-rh.pc @@ -6,3 +6,5 @@ fs/stat.c include/linux/dcache.h include/linux/fs.h kernel/ksyms.c +fs/exec.c +fs/proc/base.c diff --git a/lustre/kernel_patches/pc/vfs_intent_hp.pc b/lustre/kernel_patches/pc/vfs_intent-2.4.20-vanilla.pc similarity index 82% rename from lustre/kernel_patches/pc/vfs_intent_hp.pc rename to lustre/kernel_patches/pc/vfs_intent-2.4.20-vanilla.pc index 881576c..f8a99ea 100644 --- a/lustre/kernel_patches/pc/vfs_intent_hp.pc +++ b/lustre/kernel_patches/pc/vfs_intent-2.4.20-vanilla.pc @@ -1,8 +1,10 @@ +fs/exec.c fs/dcache.c fs/namei.c fs/nfsd/vfs.c fs/open.c fs/stat.c +fs/proc/base.c include/linux/dcache.h include/linux/fs.h kernel/ksyms.c diff --git a/lustre/kernel_patches/prepare_tree.sh b/lustre/kernel_patches/prepare_tree.sh index f512132..7d688db 100755 --- a/lustre/kernel_patches/prepare_tree.sh +++ b/lustre/kernel_patches/prepare_tree.sh @@ -67,7 +67,7 @@ REVINO=`(cd $TREE ; stat $REVERSE | awk '($3 == "Inode:") {print $4}')` [ $ABSINO != $REVINO ] && die "inodes differ, 
my reverse path is bad?" -echo export PATCHSCRIPTS=$REVERSE +echo export PATCHSCRIPTS_LIBDIR=$REVERSE cd $TREE ln -sf $REVERSE/series/$SERIES series diff --git a/lustre/kernel_patches/scripts/apatch b/lustre/kernel_patches/scripts/apatch index 4b63598..be1c68e 100755 --- a/lustre/kernel_patches/scripts/apatch +++ b/lustre/kernel_patches/scripts/apatch @@ -5,8 +5,6 @@ echo "Check your install, or go to the right directory" exit 1 } - - do_apply() { FILES=$(cat $P/pc/$PATCH_NAME.pc) @@ -70,7 +68,7 @@ apatch() echo "$PATCH_NAME" is already applied exit 1 fi - + if [ $opt_force != 0 ] then echo FORCING PATCH @@ -78,6 +76,7 @@ apatch() if [ $opt_force != 0 ] || can_apply $P/patches/"$PATCH_NAME".patch then + check_pc_match $P/patches/"$PATCH_NAME".patch $P/pc/"$PATCH_NAME".pc do_apply $P/patches/"$PATCH_NAME".patch add_to_db "$PATCH_NAME" echo applied $PATCH_NAME diff --git a/lustre/kernel_patches/scripts/cat-series b/lustre/kernel_patches/scripts/cat-series new file mode 100755 index 0000000..c38b1a8 --- /dev/null +++ b/lustre/kernel_patches/scripts/cat-series @@ -0,0 +1,17 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." 
+ echo "Check your install, or go to the right directory" + exit 1 +} + +if [ $# -eq 0 ] +then + cat_series +else + __cat_series $1 +fi diff --git a/lustre/kernel_patches/scripts/combine-applied b/lustre/kernel_patches/scripts/combine-applied index 8768b29..60ab7e9 100755 --- a/lustre/kernel_patches/scripts/combine-applied +++ b/lustre/kernel_patches/scripts/combine-applied @@ -23,21 +23,23 @@ fi need_file_there applied-patches CURRENT=$(mktemp /tmp/cmbd-XXXXXXXX) +APPLY_FILE=$(mktemp /tmp/cmbd-XXXXXXXX) for FILE in `cat applied-patches` do - NEXT=$(mktemp /tmp/cmbd-XXXXXXXX) - if [ -f $P/patches/$FILE ] + if [ -f $P/pc/$FILE.pc ] then - combinediff $CURRENT $P/patches/$FILE > $NEXT - elif [ -f $P/patches/$FILE.patch ] + cat $P/pc/$FILE.pc >> $CURRENT + elif [ -f $P/pc/$FILE ] then - combinediff $CURRENT $P/patches/$FILE.patch > $NEXT - elif [ -f $FILE ] - then - combinediff $CURRENT $FILE > $NEXT - fi - rm $CURRENT - CURRENT=$NEXT + cat $P/pc/$FILE >> $CURRENT + fi +done +cat $CURRENT | sort -u > $APPLY_FILE +echo > $1 +for FILE in `cat $APPLY_FILE` +do + diff -uNp $FILE~orig $FILE >> $1 done +rm -rf $APPLY_FILE +rm -rf $CURRENT -mv $NEXT "$1" diff --git a/lustre/kernel_patches/scripts/forkpatch b/lustre/kernel_patches/scripts/forkpatch new file mode 100755 index 0000000..cef297c --- /dev/null +++ b/lustre/kernel_patches/scripts/forkpatch @@ -0,0 +1,76 @@ +#!/bin/sh + +# +# Fork the next patch in the series +# + +. patchfns >/dev/null || . /usr/lib/patch-scripts/patchfns >/dev/null || { \ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: forkpatch <newname>" + exit 1 +} + +if [ $# -ne 1 ] +then + usage +fi + +NEW=$1 +BASE=`stripit $NEW` +SERIES=series + +if [ ! 
-e $SERIES ] +then + echo 'File "series" not found' + exit 1 +fi + +if [ -f $P/$BASE.patch ] ; then + echo "Patch $NEW already exists as a file" + exit 1 +fi + +if grep $BASE $SERIES >& /dev/null ; then + echo "Patch $NEW already exists in series" + exit 1 +fi + +TMPSERIES=$(mktemp /tmp/series-XXXXXXXX) +top=$(toppatch) +if [ x"$top" == x ] +then + todo=$(head -1 $SERIES) +else + last_in_series=$(stripit $(tail -1 $SERIES)) + if [ $last_in_series == $top ] + then + echo "Series fully applied. Ends at $top" + exit 0 + fi + todo=$(grep -C1 "^$top\.patch" $SERIES | tail -1) + if [ x$todo = x ] + then + todo=$(head -1 $SERIES) + fi +fi + +basetodo=`stripit $todo` + +sed "s/$todo/$BASE.patch/" < $SERIES > $TMPSERIES +cat $TMPSERIES > $SERIES +rm -f $TMPSERIES +cp -f $P/patches/$todo $P/patches/$BASE.patch +cp -f $P/pc/$basetodo.pc $P/pc/$BASE.pc +if [ -f $P/txt/$basetodo.txt ]; then + cp -f $P/txt/$basetodo.txt $P/txt/$BASE.txt +else + echo "Warning no documentation for $BASE" +fi + +echo "Cloned $todo to $BASE" diff --git a/lustre/kernel_patches/scripts/join-patch b/lustre/kernel_patches/scripts/join-patch new file mode 100755 index 0000000..065ea73 --- /dev/null +++ b/lustre/kernel_patches/scripts/join-patch @@ -0,0 +1,28 @@ +#!/bin/sh + +usage() +{ + echo "Usage: join-patch patchname" + exit 1 +} + +if [ $# -ne 1 ] +then + usage +fi + +PATCHNAME=$(stripit $1) + +if ! can_apply $PATCHNAME +then + echo Patch $PATCHNAME does not apply + exit 1 +fi + +pcpatch $PATCHNAME +for i in $(cat $P/pc/$PATCHNAME.pc) +do + fpatch $i +done + +patch -p1 -i "$P/patches/$PATCHNAME.patch" -f diff --git a/lustre/kernel_patches/scripts/patchfns b/lustre/kernel_patches/scripts/patchfns index b6cc468..78e494b 100644 --- a/lustre/kernel_patches/scripts/patchfns +++ b/lustre/kernel_patches/scripts/patchfns @@ -11,9 +11,9 @@ DB=applied-patches # Otherwise use "." 
# -if [ x$PATCHSCRIPTS != x ] +if [ x$PATCHSCRIPTS_LIBDIR != x ] then - P=$PATCHSCRIPTS + P=$PATCHSCRIPTS_LIBDIR elif [ -d ./patch-scripts ] then P=./patch-scripts @@ -69,7 +69,20 @@ is_applied() return 1 fi } - +check_pc_match() +{ + if [ -f /usr/bin/lsdiff ]; then + tmpfile=$(mktemp /tmp/p_XXXXXX) || exit 1 + lsdiff --strip=1 $1 > $tmpfile + diff $2 $tmpfile > /dev/null + if [ $? != 0 ]; then + echo " $1 do not match with $2 " + echo " $2 will be changed to match $2" + cat $tmpfile > $P/pc/$PATCH_NAME.pc + fi + rm -rf $tmpfile + fi +} can_apply() { if patch -p1 --dry-run -i "$1" -f @@ -166,19 +179,23 @@ copy_file_to_bup() file=$1 patch=$2 bup="$file"~"$patch" + orig="$file"~"orig" + src_dir=`pwd` if [ -e $bup ] then echo "Cannot install file $file in patch $patch: backup $bup exists" exit 1 fi - if [ -e $file ] then - cp $file "$file"~"$patch" + cp -p $file "$file"~"$patch" else echo "file $file appears to be newly added" fi + if [ ! -L "$orig" ]; then + ln -s "$src_dir/$bup" $orig + fi } install_file_in_patch() diff --git a/lustre/kernel_patches/scripts/poppatch b/lustre/kernel_patches/scripts/poppatch index 792cb9b..70055d6 100755 --- a/lustre/kernel_patches/scripts/poppatch +++ b/lustre/kernel_patches/scripts/poppatch @@ -64,7 +64,9 @@ do then if [ $STOP_AT == $(toppatch) ] then + sum-series applied-patch exit 0 fi fi done +sum-series applied-patch diff --git a/lustre/kernel_patches/scripts/pushpatch b/lustre/kernel_patches/scripts/pushpatch index 018716d..6702e63 100755 --- a/lustre/kernel_patches/scripts/pushpatch +++ b/lustre/kernel_patches/scripts/pushpatch @@ -78,7 +78,9 @@ do then if [ $STOP_AT == $(toppatch) ] then + sum-series applied-patch exit 0 fi fi done +sum-series applied-patch diff --git a/lustre/kernel_patches/scripts/refpatch b/lustre/kernel_patches/scripts/refpatch index 88f3caf..3195a57 100755 --- a/lustre/kernel_patches/scripts/refpatch +++ b/lustre/kernel_patches/scripts/refpatch @@ -28,4 +28,5 @@ fi TOP_PATCH=$(top_patch) mpatch $* 
$(top_patch) +sum-series applied-patch echo "Refreshed $TOP_PATCH" diff --git a/lustre/kernel_patches/scripts/rpatch b/lustre/kernel_patches/scripts/rpatch index 42e1533..5a8da38 100755 --- a/lustre/kernel_patches/scripts/rpatch +++ b/lustre/kernel_patches/scripts/rpatch @@ -6,15 +6,34 @@ exit 1 } +# do_remove() +# { +# if patch -R -p1 -s -i $P/patches/"$1".patch +# then +# true +# else +# echo SOMETHING WENT WRONG +# exit 1 +# fi +# } + do_remove() { - if patch -R -p1 -s -i $P/patches/"$1".patch - then - true - else - echo SOMETHING WENT WRONG - exit 1 - fi + FILES=$(cat $P/pc/$1.pc) + for file in $FILES ; do + base_dir=`pwd` + if [ -L "$file"~"orig" ]; then + if [ `readlink "$file"~"orig"` = "$base_dir/""$file"~"$1" ]; then + rm -rf "$file"~"orig" + fi + fi + if [ -f "$file"~"$1" ]; then + mv -f "$file"~"$1" "$file" + else + rm -f "$file" + fi + done + true } kill_old_ones() @@ -40,18 +59,20 @@ fi PATCH_NAME=$(stripit $1) warn_top_current - if is_applied "$PATCH_NAME" then - if can_remove "$PATCH_NAME" - then +# if can_remove "$PATCH_NAME" +# then + if [ ! -f $P/pc/$PATCH_NAME.pc ]; then + exit 1 + fi do_remove "$PATCH_NAME" kill_old_ones "$PATCH_NAME" remove_from_db "$PATCH_NAME" - else - echo "$PATCH_NAME" does not remove cleanly - exit 1 - fi +# else +# echo "$PATCH_NAME" does not remove cleanly +# exit 1 +# fi else echo "$PATCH_NAME" is not applied exit 1 diff --git a/lustre/kernel_patches/scripts/sum-series b/lustre/kernel_patches/scripts/sum-series new file mode 100755 index 0000000..5b628fb --- /dev/null +++ b/lustre/kernel_patches/scripts/sum-series @@ -0,0 +1,41 @@ +#!/bin/sh + +# +# Make superpatch from current series using combinediff. +# + +. patchfns >/dev/null || . /usr/lib/patch-scripts/patchfns >/dev/null || { \ + echo "Impossible to find my library 'patchfns'." 
+ echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: sum-series output-file" + exit 1 +} + +if [ $# -ne 1 ] +then + usage +fi + +need_file_there applied-patches +CURRENT=$(mktemp /tmp/cmbd-XXXXXXXX) +for FILE in $(cat applied-patches) +do +# echo "Adding patch $FILE...." + if [ -f $P/patches/$FILE ] + then + cat $P/patches/$FILE >> $CURRENT + elif [ -f $P/patches/$FILE.patch ] + then + cat $P/patches/$FILE.patch >> $CURRENT + elif [ -f $FILE ] + then + cat $FILE >> $CURRENT + fi +done + +mv $CURRENT "$1" diff --git a/lustre/kernel_patches/scripts/trypatch b/lustre/kernel_patches/scripts/trypatch new file mode 100755 index 0000000..2e3cd15 --- /dev/null +++ b/lustre/kernel_patches/scripts/trypatch @@ -0,0 +1,72 @@ +#!/bin/sh + +# +# Fork the next patch in the series +# + +. patchfns >/dev/null || . /usr/lib/patch-scripts/patchfns >/dev/null || { \ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: trypatch <newname>" + exit 1 +} + +if [ $# -ne 1 ] +then + usage +fi + +NEW=$1 +BASE=`stripit $NEW` +SERIES=series + +if [ ! -e $SERIES ] +then + echo 'File "series" not found' + exit 1 +fi + +if grep $BASE $SERIES >& /dev/null ; then + echo "Patch $NEW already exists in series" + exit 1 +fi + +if [ ! -f $P/patches/$BASE.patch ] ; then + echo "Patch $NEW doesn't exist as a file" + exit 1 +fi + +$TMPSERIES=$(mktemp /tmp/series-XXXXXXXX) +top=$(toppatch) +if [ x"$top" == x ] +then + todo=$(head -1 $SERIES) +else + last_in_series=$(stripit $(tail -1 $SERIES)) + if [ $last_in_series == $top ] + then + echo "Series fully applied. 
Ends at $top" + exit 0 + fi + todo=$(grep -C1 "^$top\.patch" $SERIES | tail -1) + if [ x$todo = x ] + then + todo=$(head -1 $SERIES) + fi +fi + +if patch -p1 -i $P/patches/$BASE.patch ; then + patch -R -p1 -i $P/patches/$BASE.patch + + $basetodo=$(basename $todo) + sed "s/$todo/$BASE/" < $SERIES > $TMPSERIES + mv -f $TMPSERIES $SERIES + echo "Replaced $todo with $BASE" +else + echo "Failed to replace $todo with $BASE" +fi diff --git a/lustre/kernel_patches/scripts/unused-patches b/lustre/kernel_patches/scripts/unused-patches new file mode 100755 index 0000000..2f3a70a --- /dev/null +++ b/lustre/kernel_patches/scripts/unused-patches @@ -0,0 +1,39 @@ +#!/bin/sh + +# +# List unused patches +# + +. patchfns >/dev/null || . /usr/lib/patch-scripts/patchfns >/dev/null || { \ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: unused-patches" + exit 1 +} + +if [ $# -ne 0 ] +then + usage +fi + +for FILE in $(ls $P/patches) +do + BASE=`stripit $FILE` +# echo checking $BASE in $P/patches + if grep $FILE $P/series/* >& /dev/null ; then + true +# echo $FILE found in $P/series + else + if [ $BASE != CVS ]; then + echo patches/$FILE + echo txt/$BASE.txt + echo pc/$BASE.pc + fi + fi +done + diff --git a/lustre/kernel_patches/series/chaos b/lustre/kernel_patches/series/chaos index 913ae18..00ae7fd 100644 --- a/lustre/kernel_patches/series/chaos +++ b/lustre/kernel_patches/series/chaos @@ -5,3 +5,16 @@ lustre_version.patch vfs_intent-2.4.18-18.patch invalidate_show.patch iod-rmap-exports.patch +export-truncate.patch +htree-ext3-2.4.18.patch +linux-2.4.18ea-0.8.26.patch +ext3-2.4-ino_t.patch +ext3-2.4.18-ino_sb_macro.patch +ext3-orphan_lock.patch +ext3-delete_thread-2.4.18.patch +extN-misc-fixup.patch +extN-noread.patch +extN-wantedi.patch +extN-san.patch +extN-2.4.18-ino_sb_fixup.patch +iopen-2.4.18.patch diff --git a/lustre/kernel_patches/series/hp-pnnl 
b/lustre/kernel_patches/series/hp-pnnl deleted file mode 100644 index bf276fb..0000000 --- a/lustre/kernel_patches/series/hp-pnnl +++ /dev/null @@ -1,8 +0,0 @@ -dev_read_only_hp.patch -exports_hp.patch -kmem_cache_validate_hp.patch -jbd-transno-cb.patch -lustre_version.patch -vfs_intent_hp.patch -invalidate_show.patch -iod-stock-24-exports_hp.patch diff --git a/lustre/kernel_patches/series/hp-pnnl-2.4.20 b/lustre/kernel_patches/series/hp-pnnl-2.4.20 new file mode 100644 index 0000000..b951209 --- /dev/null +++ b/lustre/kernel_patches/series/hp-pnnl-2.4.20 @@ -0,0 +1,25 @@ +dev_read_only_hp_2.4.20.patch +exports_2.4.20-rh-hp.patch +kmem_cache_validate_hp.patch +lustre_version.patch +vfs_intent-2.4.20-vanilla.patch +invalidate_show.patch +export-truncate.patch +iod-stock-24-exports_hp.patch +ext-2.4-patch-1.patch +ext-2.4-patch-2.patch +ext-2.4-patch-3.patch +ext-2.4-patch-4.patch +linux-2.4.20-xattr-0.8.54-hp.patch +ext3-2.4.20-fixes.patch +ext3-2.4-ino_t.patch +ext3-largefile.patch +ext3-truncate_blocks.patch +ext3-use-after-free.patch +ext3-orphan_lock.patch +ext3-delete_thread-2.4.20.patch +ext3-noread-2.4.20.patch +extN-wantedi.patch +ext3-san-2.4.20.patch +iopen-2.4.20.patch +tcp-zero-copy.patch diff --git a/lustre/kernel_patches/series/rh-2.4.18-18 b/lustre/kernel_patches/series/rh-2.4.18-18 index 51a833f..df7f536 100644 --- a/lustre/kernel_patches/series/rh-2.4.18-18 +++ b/lustre/kernel_patches/series/rh-2.4.18-18 @@ -7,4 +7,18 @@ uml_no_panic.patch vfs_intent-2.4.18-18.patch uml_compile_fixes.patch invalidate_show.patch +export-truncate.patch iod-rmap-exports.patch +htree-ext3-2.4.18.patch +linux-2.4.18ea-0.8.26.patch +ext3-2.4-ino_t.patch +ext3-2.4.18-ino_sb_macro.patch +ext3-orphan_lock.patch +ext3-delete_thread-2.4.18.patch +extN-misc-fixup.patch +extN-noread.patch +extN-wantedi.patch +extN-san.patch +extN-2.4.18-ino_sb_fixup.patch +iopen-2.4.18.patch +tcp-zero-copy.patch diff --git a/lustre/kernel_patches/series/rh-2.4.20 
b/lustre/kernel_patches/series/rh-2.4.20 new file mode 100644 index 0000000..a97c37c --- /dev/null +++ b/lustre/kernel_patches/series/rh-2.4.20 @@ -0,0 +1,23 @@ +mcore-2.4.20-8.patch +dsp.patch +dev_read_only_2.4.20-rh.patch +exports_2.4.20-rh-hp.patch +kmem_cache_validate_2.4.20-rh.patch +lustre_version.patch +vfs_intent-2.4.20-rh.patch +invalidate_show-2.4.20-rh.patch +iod-rmap-exports-2.4.20.patch +export-truncate.patch +ext-2.4-patch-1-chaos.patch +ext-2.4-patch-2.patch +ext-2.4-patch-3.patch +ext-2.4-patch-4.patch +linux-2.4.20-xattr-0.8.54-chaos.patch +ext3-2.4.20-fixes.patch +ext3_orphan_lock-2.4.20-rh.patch +ext3-delete_thread-2.4.20.patch +ext3-noread-2.4.20.patch +extN-wantedi.patch +ext3-san-2.4.20.patch +iopen-2.4.20.patch +tcp-zero-copy.patch diff --git a/lustre/kernel_patches/series/vanilla-2.4.18 b/lustre/kernel_patches/series/vanilla-2.4.18 deleted file mode 100644 index 5d2ab68..0000000 --- a/lustre/kernel_patches/series/vanilla-2.4.18 +++ /dev/null @@ -1,2 +0,0 @@ -vanilla-2.4.18 -invalidate_show.patch diff --git a/lustre/kernel_patches/series/vanilla-2.4.19 b/lustre/kernel_patches/series/vanilla-2.4.19 deleted file mode 100644 index 37cb65e..0000000 --- a/lustre/kernel_patches/series/vanilla-2.4.19 +++ /dev/null @@ -1,3 +0,0 @@ -vanilla-2.4.19.patch -jbd-transno-cb.patch -invalidate_show.patch diff --git a/lustre/kernel_patches/series/vanilla-2.4.20 b/lustre/kernel_patches/series/vanilla-2.4.20 new file mode 100644 index 0000000..e56cac6c --- /dev/null +++ b/lustre/kernel_patches/series/vanilla-2.4.20 @@ -0,0 +1,29 @@ +uml-patch-2.4.20-4.patch +dev_read_only_2.4.20.patch +exports_2.4.20.patch +kmem_cache_validate_2.4.20.patch +lustre_version.patch +vfs_intent-2.4.20-vanilla.patch +invalidate_show.patch +export-truncate.patch +iod-stock-24-exports.patch +uml_check_get_page.patch +uml_no_panic.patch +ext-2.4-patch-1.patch +ext-2.4-patch-2.patch +ext-2.4-patch-3.patch +ext-2.4-patch-4.patch +linux-2.4.20-xattr-0.8.54.patch +ext3-2.4.20-fixes.patch 
+ext3-2.4-ino_t.patch +ext3-largefile.patch +ext3-truncate_blocks.patch +ext3-unmount_sync.patch +ext3-use-after-free.patch +ext3-orphan_lock.patch +ext3-noread-2.4.20.patch +ext3-delete_thread-2.4.20.patch +extN-wantedi.patch +ext3-san-2.4.20.patch +iopen-2.4.20.patch +tcp-zero-copy.patch diff --git a/lustre/kernel_patches/series/vanilla-2.5 b/lustre/kernel_patches/series/vanilla-2.5 index 3269420..b77c77b 100644 --- a/lustre/kernel_patches/series/vanilla-2.5 +++ b/lustre/kernel_patches/series/vanilla-2.5 @@ -1,2 +1,2 @@ lustre_version.patch -lustre-2.5.patch +lustre-2.5.63.patch diff --git a/lustre/kernel_patches/txt/ext3-2.4.20-fixes.txt b/lustre/kernel_patches/txt/ext3-2.4.20-fixes.txt new file mode 100644 index 0000000..b890cbd --- /dev/null +++ b/lustre/kernel_patches/txt/ext3-2.4.20-fixes.txt @@ -0,0 +1,3 @@ +DESC +Fix for block allocation errors if block bitmap or inode block list is corrupt. +EDESC diff --git a/lustre/kernel_patches/txt/vfs_intent.txt b/lustre/kernel_patches/txt/vfs_intent.txt deleted file mode 100644 index 010cdb7..0000000 --- a/lustre/kernel_patches/txt/vfs_intent.txt +++ /dev/null @@ -1,3 +0,0 @@ -DESC -(undescribed patch) -EDESC diff --git a/lustre/kernel_patches/which_patch b/lustre/kernel_patches/which_patch index b7af3d9..2ef001d 100644 --- a/lustre/kernel_patches/which_patch +++ b/lustre/kernel_patches/which_patch @@ -1,10 +1,13 @@ -series/chaos - chaos-39 -series/rh-2.4.18-18 - redhat 2.4.18-18 -series/hp-pnnl ** Note: functionally equivalent to 2.4.19 - linux-2.4.18-hp2_pnnl2 -series/vanilla-2.4.19 ** Not officially supported - linux-2.4.19 -series/lin-2.5.44 - uml-2.5.44 +SERIES MNEMONIC COMMENT + +hp-pnnl-2.4.20 linux-2.4.20-hp4_pnnl1 same as vanilla but no uml +vanilla-2.4.20 linux-2.4.20 patch includes uml +chaos-2.4.20 linux-chaos-2.4.20 same as rh-2.4.20-8 +rh-2.4.20 linux-rh-2.4.20-8 same as chaos-2.4.20 +rh-2.4.18-18 linux-rh-2.4.18-18 same as chaos but includes uml +chaos linux-chaos-2.4.18 same as rh-2.4.18-18 but no
uml + +REVIEW: + +vanilla-2.5 linux-2.5.63 +hp-pnnl linux-2.4.19-hp2_pnnl6 diff --git a/lustre/ldlm/Makefile.am b/lustre/ldlm/Makefile.am index d0c4199..1ceb276 100644 --- a/lustre/ldlm/Makefile.am +++ b/lustre/ldlm/Makefile.am @@ -5,7 +5,7 @@ DEFS= -LDLMSOURCES= l_lock.c ldlm_lock.c ldlm_resource.c \ +LDLMSOURCES= l_lock.c ldlm_lock.c ldlm_resource.c ldlm_lib.c \ ldlm_extent.c ldlm_request.c ldlm_lockd.c if LIBLUSTRE @@ -20,5 +20,3 @@ ldlm_SOURCES = $(LDLMSOURCES) endif include $(top_srcdir)/Rules - - diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c index 9b10854..f6a9f5e 100644 --- a/lustre/ldlm/ldlm_extent.c +++ b/lustre/ldlm/ldlm_extent.c @@ -30,16 +30,11 @@ #include <linux/obd_support.h> #include <linux/lustre_lib.h> -/* This function will be called to judge if the granted queue of another child - * (read: another extent) is conflicting and needs its granted queue walked to - * issue callbacks. - * - * This helps to find conflicts between read and write locks on overlapping - * extents. */ +/* This function will be called to judge if one extent overlaps with another */ int ldlm_extent_compat(struct ldlm_lock *a, struct ldlm_lock *b) { - if (MAX(a->l_extent.start, b->l_extent.start) <= - MIN(a->l_extent.end, b->l_extent.end)) + if ((a->l_extent.start <= b->l_extent.end) && + (a->l_extent.end >= b->l_extent.start)) RETURN(0); RETURN(1); @@ -48,7 +43,7 @@ int ldlm_extent_compat(struct ldlm_lock *a, struct ldlm_lock *b) /* The purpose of this function is to return: * - the maximum extent * - containing the requested extent - * - and not overlapping existing extents outside the requested one + * - and not overlapping existing conflicting extents outside the requested one * * An alternative policy is to not shrink the new extent when conflicts exist. 
* @@ -62,21 +57,33 @@ static void policy_internal(struct list_head *queue, struct ldlm_extent *req_ex, struct ldlm_lock *lock; lock = list_entry(tmp, struct ldlm_lock, l_res_link); - if (lock->l_extent.end < req_ex->start) { - new_ex->start = MIN(lock->l_extent.end, new_ex->start); - } else { - if (lock->l_extent.start < req_ex->start && - !lockmode_compat(lock->l_req_mode, mode)) - /* Policy: minimize conflict overlap */ + /* if lock doesn't overlap new_ex, skip it. */ + if (lock->l_extent.end < new_ex->start || + lock->l_extent.start > new_ex->end) + continue; + + /* Locks are compatible, overlap doesn't matter */ + if (lockmode_compat(lock->l_req_mode, mode)) + continue; + + if (lock->l_extent.start < req_ex->start) { + if (lock->l_extent.end == ~0) { new_ex->start = req_ex->start; + new_ex->end = req_ex->end; + return; + } + new_ex->start = MIN(lock->l_extent.end + 1, + req_ex->start); } - if (lock->l_extent.start > req_ex->end) { - new_ex->end = MAX(lock->l_extent.start, new_ex->end); - } else { - if (lock->l_extent.end > req_ex->end && - !lockmode_compat(lock->l_req_mode, mode)) - /* Policy: minimize conflict overlap */ + + if (lock->l_extent.end > req_ex->end) { + if (lock->l_extent.start == 0) { + new_ex->start = req_ex->start; new_ex->end = req_ex->end; + return; + } + new_ex->end = MAX(lock->l_extent.start - 1, + req_ex->end); } } } @@ -104,8 +111,9 @@ int ldlm_extent_policy(struct ldlm_namespace *ns, struct ldlm_lock **lockp, memcpy(&lock->l_extent, &new_ex, sizeof(new_ex)); - LDLM_DEBUG(lock, "new extent "LPU64" -> "LPU64, new_ex.start, - new_ex.end); + LDLM_DEBUG(lock, "requested extent ["LPU64"->"LPU64"], new extent [" + LPU64"->"LPU64"]", + req_ex->start, req_ex->end, new_ex.start, new_ex.end); if (new_ex.end != req_ex->end || new_ex.start != req_ex->start) return ELDLM_LOCK_CHANGED; diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h new file mode 100644 index 0000000..b8bfdac --- /dev/null +++ b/lustre/ldlm/ldlm_internal.h @@ 
-0,0 +1 @@ +int ldlm_cancel_lru(struct ldlm_namespace *ns); diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c new file mode 100644 index 0000000..735e3831 --- /dev/null +++ b/lustre/ldlm/ldlm_lib.c @@ -0,0 +1,883 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define EXPORT_SYMTAB +#define DEBUG_SUBSYSTEM S_LDLM + +#ifdef __KERNEL__ +# include <linux/module.h> +#else +# include <liblustre.h> +#endif +#include <linux/obd_ost.h> +#include <linux/lustre_dlm.h> +#include <linux/lustre_mds.h> +#include <linux/lustre_net.h> + +int client_import_connect(struct lustre_handle *dlm_handle, + struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + struct obd_export *exp; + struct ptlrpc_request *request; + /* XXX maybe this is a good time to create a connect struct? */ + int rc, size[] = {sizeof(imp->imp_target_uuid), + sizeof(obd->obd_uuid), + sizeof(*dlm_handle)}; + char *tmp[] = {imp->imp_target_uuid.uuid, + obd->obd_uuid.uuid, + (char *)dlm_handle}; + int rq_opc = (obd->obd_type->typ_ops->o_brw) ? 
OST_CONNECT :MDS_CONNECT; + int msg_flags; + + ENTRY; + down(&cli->cl_sem); + rc = class_connect(dlm_handle, obd, cluuid); + if (rc) + GOTO(out_sem, rc); + + cli->cl_conn_count++; + if (cli->cl_conn_count > 1) + GOTO(out_sem, rc); + + if (obd->obd_namespace != NULL) + CERROR("already have namespace!\n"); + obd->obd_namespace = ldlm_namespace_new(obd->obd_name, + LDLM_NAMESPACE_CLIENT); + if (obd->obd_namespace == NULL) + GOTO(out_disco, rc = -ENOMEM); + + request = ptlrpc_prep_req(imp, rq_opc, 3, size, tmp); + if (!request) + GOTO(out_ldlm, rc = -ENOMEM); + + request->rq_level = LUSTRE_CONN_NEW; + request->rq_replen = lustre_msg_size(0, NULL); + + imp->imp_dlm_handle = *dlm_handle; + + imp->imp_level = LUSTRE_CONN_CON; + rc = ptlrpc_queue_wait(request); + if (rc) { + class_disconnect(dlm_handle, 0); + GOTO(out_req, rc); + } + + exp = class_conn2export(dlm_handle); + exp->exp_connection = ptlrpc_connection_addref(request->rq_connection); + class_export_put(exp); + + msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); + if (rq_opc == MDS_CONNECT || msg_flags & MSG_CONNECT_REPLAYABLE) { + imp->imp_replayable = 1; + CDEBUG(D_HA, "connected to replayable target: %s\n", + imp->imp_target_uuid.uuid); + } + imp->imp_level = LUSTRE_CONN_FULL; + imp->imp_remote_handle = request->rq_repmsg->handle; + CDEBUG(D_HA, "local import: %p, remote handle: "LPX64"\n", imp, + imp->imp_remote_handle.cookie); + + EXIT; +out_req: + ptlrpc_req_finished(request); + if (rc) { +out_ldlm: + ldlm_namespace_free(obd->obd_namespace); + obd->obd_namespace = NULL; +out_disco: + cli->cl_conn_count--; + class_disconnect(dlm_handle, 0); + } +out_sem: + up(&cli->cl_sem); + return rc; +} + +int client_import_disconnect(struct lustre_handle *dlm_handle, int failover) +{ + struct obd_device *obd = class_conn2obd(dlm_handle); + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + struct ptlrpc_request *request = NULL; + int rc = 0, err, rq_opc; + ENTRY; + + if (!obd) { + 
CERROR("invalid connection for disconnect: cookie "LPX64"\n", + dlm_handle ? dlm_handle->cookie : -1UL); + RETURN(-EINVAL); + } + + rq_opc = obd->obd_type->typ_ops->o_brw ? OST_DISCONNECT:MDS_DISCONNECT; + down(&cli->cl_sem); + if (!cli->cl_conn_count) { + CERROR("disconnecting disconnected device (%s)\n", + obd->obd_name); + GOTO(out_sem, rc = -EINVAL); + } + + cli->cl_conn_count--; + if (cli->cl_conn_count) + GOTO(out_no_disconnect, rc = 0); + + if (obd->obd_namespace != NULL) { + /* obd_no_recov == local only */ + ldlm_cli_cancel_unused(obd->obd_namespace, NULL, + obd->obd_no_recov, NULL); + ldlm_namespace_free(obd->obd_namespace); + obd->obd_namespace = NULL; + } + + /* Yeah, obd_no_recov also (mainly) means "forced shutdown". */ + if (obd->obd_no_recov) { + ptlrpc_abort_inflight(imp); + } else { + request = ptlrpc_prep_req(imp, rq_opc, 0, NULL, NULL); + if (!request) + GOTO(out_req, rc = -ENOMEM); + + request->rq_replen = lustre_msg_size(0, NULL); + + /* Process disconnects even if we're waiting for recovery. */ + request->rq_level = LUSTRE_CONN_RECOVD; + + rc = ptlrpc_queue_wait(request); + if (rc) + GOTO(out_req, rc); + } + EXIT; + out_req: + if (request) + ptlrpc_req_finished(request); + out_no_disconnect: + err = class_disconnect(dlm_handle, 0); + if (!rc && err) + rc = err; + out_sem: + up(&cli->cl_sem); + RETURN(rc); +} + +/* -------------------------------------------------------------------------- + * from old lib/target.c + * -------------------------------------------------------------------------- */ + +int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, + struct obd_uuid *cluuid) +{ + if (exp->exp_connection) { + struct lustre_handle *hdl; + hdl = &exp->exp_ldlm_data.led_import->imp_remote_handle; + /* Might be a re-connect after a partition. 
*/ + if (!memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) { + CERROR("%s reconnecting\n", cluuid->uuid); + conn->cookie = exp->exp_handle.h_cookie; + RETURN(EALREADY); + } else { + CERROR("%s reconnecting from %s, " + "handle mismatch (ours "LPX64", theirs " + LPX64")\n", cluuid->uuid, + exp->exp_connection->c_remote_uuid.uuid, + hdl->cookie, conn->cookie); + /* XXX disconnect them here? */ + memset(conn, 0, sizeof *conn); + /* This is a little scary, but right now we build this + * file separately into each server module, so I won't + * go _immediately_ to hell. + */ + RETURN(-EALREADY); + } + } + + conn->cookie = exp->exp_handle.h_cookie; + CDEBUG(D_INFO, "existing export for UUID '%s' at %p\n", + cluuid->uuid, exp); + CDEBUG(D_IOCTL,"connect: cookie "LPX64"\n", conn->cookie); + RETURN(0); +} + +int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) +{ + struct obd_device *target; + struct obd_export *export = NULL; + struct obd_import *dlmimp; + struct lustre_handle conn; + struct obd_uuid tgtuuid; + struct obd_uuid cluuid; + struct obd_uuid remote_uuid; + struct list_head *p; + char *str, *tmp; + int rc, i, abort_recovery; + ENTRY; + + LASSERT_REQSWAB (req, 0); + str = lustre_msg_string (req->rq_reqmsg, 0, sizeof (tgtuuid.uuid) - 1); + if (str == NULL) { + CERROR("bad target UUID for connect\n"); + GOTO(out, rc = -EINVAL); + } + obd_str2uuid (&tgtuuid, str); + + LASSERT_REQSWAB (req, 1); + str = lustre_msg_string (req->rq_reqmsg, 1, sizeof (cluuid.uuid) - 1); + if (str == NULL) { + CERROR("bad client UUID for connect\n"); + GOTO(out, rc = -EINVAL); + } + obd_str2uuid (&cluuid, str); + + i = class_uuid2dev(&tgtuuid); + if (i == -1) { + CERROR("UUID '%s' not found for connect\n", tgtuuid.uuid); + GOTO(out, rc = -ENODEV); + } + + target = &obd_dev[i]; + if (!target || target->obd_stopping || !target->obd_set_up) { + CERROR("UUID '%s' is not available for connect\n", str); + GOTO(out, rc = -ENODEV); + } + + /* XXX extract a nettype 
and format accordingly */ + snprintf(remote_uuid.uuid, sizeof remote_uuid, + "NET_"LPX64"_UUID", req->rq_peer.peer_nid); + + spin_lock_bh(&target->obd_processing_task_lock); + abort_recovery = target->obd_abort_recovery; + spin_unlock_bh(&target->obd_processing_task_lock); + if (abort_recovery) + target_abort_recovery(target); + + tmp = lustre_msg_buf(req->rq_reqmsg, 2, sizeof conn); + if (tmp == NULL) + GOTO(out, rc = -EPROTO); + + memcpy(&conn, tmp, sizeof conn); + + rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg); + if (rc) + GOTO(out, rc); + + /* lctl gets a backstage, all-access pass. */ + if (obd_uuid_equals(&cluuid, &lctl_fake_uuid)) + goto dont_check_exports; + + spin_lock(&target->obd_dev_lock); + list_for_each(p, &target->obd_exports) { + export = list_entry(p, struct obd_export, exp_obd_chain); + if (obd_uuid_equals(&cluuid, &export->exp_client_uuid)) { + spin_unlock(&target->obd_dev_lock); + LASSERT(export->exp_obd == target); + + rc = target_handle_reconnect(&conn, export, &cluuid); + break; + } + export = NULL; + } + /* If we found an export, we already unlocked. */ + if (!export) + spin_unlock(&target->obd_dev_lock); + + /* Tell the client if we're in recovery. */ + /* If this is the first client, start the recovery timer */ + if (target->obd_recovering) { + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); + target_start_recovery_timer(target, handler); + } + + /* Tell the client if we support replayable requests */ + if (target->obd_replayable) + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_REPLAYABLE); + + if (export == NULL) { + if (target->obd_recovering) { + CERROR("denying connection for new client %s: " + "in recovery\n", cluuid.uuid); + rc = -EBUSY; + } else { + dont_check_exports: + rc = obd_connect(&conn, target, &cluuid); + } + } + + /* If all else goes well, this is our RPC return code. 
*/ + req->rq_status = 0; + + if (rc && rc != EALREADY) + GOTO(out, rc); + + req->rq_repmsg->handle = conn; + + /* If the client and the server are the same node, we will already + * have an export that really points to the client's DLM export, + * because we have a shared handles table. + * + * XXX this will go away when shaver stops sending the "connect" handle + * in the real "remote handle" field of the request --phik 24 Apr 2003 + */ + if (req->rq_export != NULL) + class_export_put(req->rq_export); + + /* ownership of this export ref transfers to the request */ + export = req->rq_export = class_conn2export(&conn); + LASSERT(export != NULL); + + if (req->rq_connection != NULL) + ptlrpc_put_connection(req->rq_connection); + if (export->exp_connection != NULL) + ptlrpc_put_connection(export->exp_connection); + export->exp_connection = ptlrpc_get_connection(&req->rq_peer, + &remote_uuid); + req->rq_connection = ptlrpc_connection_addref(export->exp_connection); + + if (rc == EALREADY) { + /* We indicate the reconnection in a flag, not an error code. 
*/ + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT); + GOTO(out, rc = 0); + } + + memcpy(&conn, lustre_msg_buf(req->rq_reqmsg, 2, sizeof conn), + sizeof conn); + + if (export->exp_ldlm_data.led_import != NULL) + class_destroy_import(export->exp_ldlm_data.led_import); + dlmimp = export->exp_ldlm_data.led_import = class_new_import(); + dlmimp->imp_connection = ptlrpc_connection_addref(req->rq_connection); + dlmimp->imp_client = &export->exp_obd->obd_ldlm_client; + dlmimp->imp_remote_handle = conn; + dlmimp->imp_obd = target; + dlmimp->imp_dlm_fake = 1; + dlmimp->imp_level = LUSTRE_CONN_FULL; + class_import_put(dlmimp); +out: + if (rc) + req->rq_status = rc; + RETURN(rc); +} + +int target_handle_disconnect(struct ptlrpc_request *req) +{ + struct lustre_handle *conn = &req->rq_reqmsg->handle; + struct obd_import *dlmimp; + int rc; + ENTRY; + + rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg); + if (rc) + RETURN(rc); + + req->rq_status = obd_disconnect(conn, 0); + + dlmimp = req->rq_export->exp_ldlm_data.led_import; + class_destroy_import(dlmimp); + + class_export_put(req->rq_export); + req->rq_export = NULL; + RETURN(0); +} + +/* + * Recovery functions + */ + +void target_cancel_recovery_timer(struct obd_device *obd) +{ + del_timer(&obd->obd_recovery_timer); +} + +static void abort_delayed_replies(struct obd_device *obd) +{ + struct ptlrpc_request *req; + struct list_head *tmp, *n; + list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + DEBUG_REQ(D_ERROR, req, "aborted:"); + req->rq_status = -ENOTCONN; + req->rq_type = PTL_RPC_MSG_ERR; + ptlrpc_reply(req); + list_del(&req->rq_list); + OBD_FREE(req->rq_reqmsg, req->rq_reqlen); + OBD_FREE(req, sizeof *req); + } +} + +static void abort_recovery_queue(struct obd_device *obd) +{ + struct ptlrpc_request *req; + struct list_head *tmp, *n; + int rc; + + list_for_each_safe(tmp, n, &obd->obd_recovery_queue) { + req = 
list_entry(tmp, struct ptlrpc_request, rq_list); + DEBUG_REQ(D_ERROR, req, "aborted:"); + req->rq_status = -ENOTCONN; + req->rq_type = PTL_RPC_MSG_ERR; + rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, + &req->rq_repmsg); + if (rc == 0) { + ptlrpc_reply(req); + } else { + DEBUG_REQ(D_ERROR, req, + "packing failed for abort-reply; skipping"); + } + list_del(&req->rq_list); + class_export_put(req->rq_export); + OBD_FREE(req->rq_reqmsg, req->rq_reqlen); + OBD_FREE(req, sizeof *req); + } +} + +void target_abort_recovery(void *data) +{ + struct obd_device *obd = data; + + CERROR("disconnecting clients and aborting recovery\n"); + spin_lock_bh(&obd->obd_processing_task_lock); + if (!obd->obd_recovering) { + spin_unlock_bh(&obd->obd_processing_task_lock); + EXIT; + return; + } + + obd->obd_recovering = obd->obd_abort_recovery = 0; + obd->obd_recoverable_clients = 0; + wake_up(&obd->obd_next_transno_waitq); + target_cancel_recovery_timer(obd); + spin_unlock_bh(&obd->obd_processing_task_lock); + class_disconnect_exports(obd, 0); + abort_delayed_replies(obd); + abort_recovery_queue(obd); +} + +static void target_recovery_expired(unsigned long castmeharder) +{ + struct obd_device *obd = (struct obd_device *)castmeharder; + CERROR("recovery timed out, aborting\n"); + spin_lock_bh(&obd->obd_processing_task_lock); + obd->obd_abort_recovery = 1; + wake_up(&obd->obd_next_transno_waitq); + spin_unlock_bh(&obd->obd_processing_task_lock); +} + +static void reset_recovery_timer(struct obd_device *obd) +{ + int recovering; + spin_lock(&obd->obd_dev_lock); + recovering = obd->obd_recovering; + spin_unlock(&obd->obd_dev_lock); + + if (!recovering) + return; + CDEBUG(D_ERROR, "timer will expire in %ld seconds\n", + OBD_RECOVERY_TIMEOUT / HZ); + mod_timer(&obd->obd_recovery_timer, jiffies + OBD_RECOVERY_TIMEOUT); +} + + +/* Only start it the first time called */ +void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler) +{ + 
spin_lock_bh(&obd->obd_processing_task_lock); + if (obd->obd_recovery_handler) { + spin_unlock_bh(&obd->obd_processing_task_lock); + return; + } + CERROR("%s: starting recovery timer\n", obd->obd_name); + obd->obd_recovery_handler = handler; + obd->obd_recovery_timer.function = target_recovery_expired; + obd->obd_recovery_timer.data = (unsigned long)obd; + init_timer(&obd->obd_recovery_timer); + spin_unlock_bh(&obd->obd_processing_task_lock); + + reset_recovery_timer(obd); +} + +static int check_for_next_transno(struct obd_device *obd) +{ + struct ptlrpc_request *req; + int wake_up; + + req = list_entry(obd->obd_recovery_queue.next, + struct ptlrpc_request, rq_list); + LASSERT(req->rq_reqmsg->transno >= obd->obd_next_recovery_transno); + + wake_up = req->rq_reqmsg->transno == obd->obd_next_recovery_transno || + (obd->obd_recovering) == 0; + CDEBUG(D_HA, "check_for_next_transno: "LPD64" vs "LPD64", %d == %d\n", + req->rq_reqmsg->transno, obd->obd_next_recovery_transno, + obd->obd_recovering, wake_up); + return wake_up; +} + +static void process_recovery_queue(struct obd_device *obd) +{ + struct ptlrpc_request *req; + int abort_recovery = 0; + struct l_wait_info lwi = { 0 }; + ENTRY; + + for (;;) { + spin_lock_bh(&obd->obd_processing_task_lock); + LASSERT(obd->obd_processing_task == current->pid); + req = list_entry(obd->obd_recovery_queue.next, + struct ptlrpc_request, rq_list); + + if (req->rq_reqmsg->transno != obd->obd_next_recovery_transno) { + spin_unlock_bh(&obd->obd_processing_task_lock); + CDEBUG(D_HA, "Waiting for transno "LPD64" (1st is " + LPD64")\n", + obd->obd_next_recovery_transno, + req->rq_reqmsg->transno); + l_wait_event(obd->obd_next_transno_waitq, + check_for_next_transno(obd), &lwi); + spin_lock_bh(&obd->obd_processing_task_lock); + abort_recovery = obd->obd_abort_recovery; + spin_unlock_bh(&obd->obd_processing_task_lock); + if (abort_recovery) { + target_abort_recovery(obd); + return; + } + continue; + } + list_del_init(&req->rq_list); + 
spin_unlock_bh(&obd->obd_processing_task_lock); + + DEBUG_REQ(D_ERROR, req, "processing: "); + (void)obd->obd_recovery_handler(req); + reset_recovery_timer(obd); +#warning FIXME: mds_fsync_super(mds->mds_sb); + class_export_put(req->rq_export); + OBD_FREE(req->rq_reqmsg, req->rq_reqlen); + OBD_FREE(req, sizeof *req); + spin_lock_bh(&obd->obd_processing_task_lock); + obd->obd_next_recovery_transno++; + if (list_empty(&obd->obd_recovery_queue)) { + obd->obd_processing_task = 0; + spin_unlock_bh(&obd->obd_processing_task_lock); + break; + } + spin_unlock_bh(&obd->obd_processing_task_lock); + } + EXIT; +} + +int target_queue_recovery_request(struct ptlrpc_request *req, + struct obd_device *obd) +{ + struct list_head *tmp; + int inserted = 0; + __u64 transno = req->rq_reqmsg->transno; + struct ptlrpc_request *saved_req; + struct lustre_msg *reqmsg; + + /* CAVEAT EMPTOR: The incoming request message has been swabbed + * (i.e. buflens etc are in my own byte order), but type-dependent + * buffers (eg mds_body, ost_body etc) have NOT been swabbed. */ + + if (!transno) { + INIT_LIST_HEAD(&req->rq_list); + DEBUG_REQ(D_HA, req, "not queueing"); + return 1; + } + + /* XXX If I were a real man, these LBUGs would be sane cleanups. */ + /* XXX just like the request-dup code in queue_final_reply */ + OBD_ALLOC(saved_req, sizeof *saved_req); + if (!saved_req) + LBUG(); + OBD_ALLOC(reqmsg, req->rq_reqlen); + if (!reqmsg) + LBUG(); + + spin_lock_bh(&obd->obd_processing_task_lock); + + /* If we're processing the queue, we don't want to queue this + * message. + * + * Also, if this request has a transno less than the one we're waiting + * for, we should process it now. It could (and currently always will) + * be an open request for a descriptor that was opened some time ago. + */ + if (obd->obd_processing_task == current->pid || + transno < obd->obd_next_recovery_transno) { + /* Processing the queue right now, don't re-add. 
*/ + LASSERT(list_empty(&req->rq_list)); + spin_unlock_bh(&obd->obd_processing_task_lock); + OBD_FREE(reqmsg, req->rq_reqlen); + OBD_FREE(saved_req, sizeof *saved_req); + return 1; + } + + memcpy(saved_req, req, sizeof *req); + memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); + req = saved_req; + req->rq_reqmsg = reqmsg; + class_export_get(req->rq_export); + INIT_LIST_HEAD(&req->rq_list); + + /* XXX O(n^2) */ + list_for_each(tmp, &obd->obd_recovery_queue) { + struct ptlrpc_request *reqiter = + list_entry(tmp, struct ptlrpc_request, rq_list); + + if (reqiter->rq_reqmsg->transno > transno) { + list_add_tail(&req->rq_list, &reqiter->rq_list); + inserted = 1; + break; + } + } + + if (!inserted) { + list_add_tail(&req->rq_list, &obd->obd_recovery_queue); + } + + if (obd->obd_processing_task != 0) { + /* Someone else is processing this queue, we'll leave it to + * them. + */ + if (transno == obd->obd_next_recovery_transno) + wake_up(&obd->obd_next_transno_waitq); + spin_unlock_bh(&obd->obd_processing_task_lock); + return 0; + } + + /* Nobody is processing, and we know there's (at least) one to process + * now, so we'll do the honours. + */ + obd->obd_processing_task = current->pid; + spin_unlock_bh(&obd->obd_processing_task_lock); + + process_recovery_queue(obd); + return 0; +} + +struct obd_device * target_req2obd(struct ptlrpc_request *req) +{ + return req->rq_export->exp_obd; +} + +int target_queue_final_reply(struct ptlrpc_request *req, int rc) +{ + struct obd_device *obd = target_req2obd(req); + struct ptlrpc_request *saved_req; + struct lustre_msg *reqmsg; + int recovery_done = 0; + + if (rc) { + /* Just like ptlrpc_error, but without the sending. 
*/ + lustre_pack_msg(0, NULL, NULL, &req->rq_replen, + &req->rq_repmsg); + req->rq_type = PTL_RPC_MSG_ERR; + } + + LASSERT(list_empty(&req->rq_list)); + /* XXX just like the request-dup code in queue_recovery_request */ + OBD_ALLOC(saved_req, sizeof *saved_req); + if (!saved_req) + LBUG(); + OBD_ALLOC(reqmsg, req->rq_reqlen); + if (!reqmsg) + LBUG(); + memcpy(saved_req, req, sizeof *saved_req); + memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); + req = saved_req; + req->rq_reqmsg = reqmsg; + list_add(&req->rq_list, &obd->obd_delayed_reply_queue); + + spin_lock_bh(&obd->obd_processing_task_lock); + --obd->obd_recoverable_clients; + recovery_done = (obd->obd_recoverable_clients == 0); + spin_unlock_bh(&obd->obd_processing_task_lock); + + if (recovery_done) { + struct list_head *tmp, *n; + ldlm_reprocess_all_ns(req->rq_export->exp_obd->obd_namespace); + CDEBUG(D_ERROR, + "%s: all clients recovered, sending delayed replies\n", + obd->obd_name); + obd->obd_recovering = 0; + list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + DEBUG_REQ(D_ERROR, req, "delayed:"); + ptlrpc_reply(req); + list_del(&req->rq_list); + OBD_FREE(req->rq_reqmsg, req->rq_reqlen); + OBD_FREE(req, sizeof *req); + } + target_cancel_recovery_timer(obd); + } else { + CERROR("%s: %d recoverable clients remain\n", + obd->obd_name, obd->obd_recoverable_clients); + } + + return 1; +} + +static void ptlrpc_abort_reply (struct ptlrpc_request *req) +{ + /* On return, we must be sure that the ACK callback has either + * happened or will not happen. Note that the SENT callback will + * happen come what may since we successfully posted the PUT. */ + int rc; + struct l_wait_info lwi; + unsigned long flags; + + again: + /* serialise with ACK callback */ + spin_lock_irqsave (&req->rq_lock, flags); + if (!req->rq_want_ack) { + spin_unlock_irqrestore (&req->rq_lock, flags); + /* The ACK callback has happened already. 
Although the + * SENT callback might still be outstanding (yes really) we + * don't care; this is just like normal completion. */ + return; + } + spin_unlock_irqrestore (&req->rq_lock, flags); + + /* Have a bash at unlinking the MD. This will fail until the SENT + * callback has happened since the MD is busy from the PUT. If the + * ACK still hasn't arrived after then, a successful unlink will + * ensure the ACK callback never happens. */ + rc = PtlMDUnlink (req->rq_reply_md_h); + switch (rc) { + default: + LBUG (); + case PTL_OK: + /* SENT callback happened; ACK callback preempted */ + LASSERT (req->rq_want_ack); + spin_lock_irqsave (&req->rq_lock, flags); + req->rq_want_ack = 0; + spin_unlock_irqrestore (&req->rq_lock, flags); + return; + case PTL_INV_MD: + return; + case PTL_MD_INUSE: + /* Still sending or ACK callback in progress: wait until + * either callback has completed and try again. + * Actually we can't wait for the SENT callback because + * there's no state the SENT callback can touch that will + * allow it to communicate with us! So we just wait here + * for a short time, effectively polling for the SENT + * callback by calling PtlMDUnlink() again, to see if it + * has finished. Note that if the ACK does arrive, its + * callback wakes us in short order. --eeb */ + lwi = LWI_TIMEOUT (HZ/4, NULL, NULL); + rc = l_wait_event(req->rq_wait_for_rep, !req->rq_want_ack, + &lwi); + CDEBUG (D_HA, "Retrying req %p: %d\n", req, rc); + /* NB go back and test rq_want_ack with locking, to ensure + * if ACK callback happened, it has completely stopped + * referencing this req. */ + goto again; + } +} + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) +{ + int i; + int netrc; + unsigned long flags; + struct ptlrpc_req_ack_lock *ack_lock; + struct l_wait_info lwi = { 0 }; + wait_queue_t commit_wait; + struct obd_device *obd = + req->rq_export ? 
req->rq_export->exp_obd : NULL; + struct obd_export *exp = + (req->rq_export && req->rq_ack_locks[0].mode) ? + req->rq_export : NULL; + + if (exp) { + exp->exp_outstanding_reply = req; + spin_lock_irqsave (&req->rq_lock, flags); + req->rq_want_ack = 1; + spin_unlock_irqrestore (&req->rq_lock, flags); + } + + if (!OBD_FAIL_CHECK(fail_id | OBD_FAIL_ONCE)) { + if (rc) { + DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc); + netrc = ptlrpc_error(req); + } else { + DEBUG_REQ(D_NET, req, "sending reply"); + netrc = ptlrpc_reply(req); + } + } else { + obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED; + DEBUG_REQ(D_ERROR, req, "dropping reply"); + if (!exp && req->rq_repmsg) { + OBD_FREE(req->rq_repmsg, req->rq_replen); + req->rq_repmsg = NULL; + } + init_waitqueue_head(&req->rq_wait_for_rep); + netrc = 0; + } + + /* a failed send simulates the callbacks */ + LASSERT(netrc == 0 || req->rq_want_ack == 0); + if (exp == NULL) { + LASSERT(req->rq_want_ack == 0); + return; + } + LASSERT(obd != NULL); + + init_waitqueue_entry(&commit_wait, current); + add_wait_queue(&obd->obd_commit_waitq, &commit_wait); + rc = l_wait_event(req->rq_wait_for_rep, + !req->rq_want_ack || req->rq_resent || + req->rq_transno <= obd->obd_last_committed, &lwi); + remove_wait_queue(&obd->obd_commit_waitq, &commit_wait); + + spin_lock_irqsave (&req->rq_lock, flags); + /* If we got here because the ACK callback ran, this acts as a + * barrier to ensure the callback completed the wakeup. */ + spin_unlock_irqrestore (&req->rq_lock, flags); + + /* If we committed the transno already, then we might wake up before + * the ack arrives. We need to stop waiting for the ack before we can + * reuse this request structure. 
We are guaranteed by this point that + * this cannot abort the sending of the actual reply.*/ + ptlrpc_abort_reply(req); + + if (req->rq_resent) { + DEBUG_REQ(D_HA, req, "resent: not cancelling locks"); + return; + } + + LASSERT(rc == 0); + DEBUG_REQ(D_HA, req, "cancelling locks for %s", + req->rq_want_ack ? "commit" : "ack"); + + exp->exp_outstanding_reply = NULL; + + for (ack_lock = req->rq_ack_locks, i = 0; i < 4; i++, ack_lock++) { + if (!ack_lock->mode) + break; + ldlm_lock_decref(&ack_lock->lock, ack_lock->mode); + } +} + +int target_handle_ping(struct ptlrpc_request *req) +{ + return lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg); +} diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 81cc428..62272fa 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -24,16 +24,17 @@ #define DEBUG_SUBSYSTEM S_LDLM #ifdef __KERNEL__ -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/lustre_dlm.h> -#include <linux/lustre_mds.h> +# include <linux/slab.h> +# include <linux/module.h> +# include <linux/lustre_dlm.h> +# include <linux/lustre_mds.h> #else -#include <liblustre.h> -#include <linux/kp30.h> +# include <liblustre.h> +# include <linux/kp30.h> #endif #include <linux/obd_class.h> +#include "ldlm_internal.h" //struct lustre_lock ldlm_everything_lock; @@ -154,7 +155,7 @@ void ldlm_lock_put(struct ldlm_lock *lock) if (lock->l_parent) LDLM_LOCK_PUT(lock->l_parent); - PORTAL_SLAB_FREE(lock, ldlm_lock_slab, sizeof(*lock)); + OBD_SLAB_FREE(lock, ldlm_lock_slab, sizeof(*lock)); l_unlock(&ns->ns_lock); } @@ -248,7 +249,7 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_lock *parent, if (resource == NULL) LBUG(); - PORTAL_SLAB_ALLOC(lock, ldlm_lock_slab, sizeof(*lock)); + OBD_SLAB_ALLOC(lock, ldlm_lock_slab, SLAB_KERNEL, sizeof(*lock)); if (lock == NULL) RETURN(NULL); @@ -318,7 +319,6 @@ int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, void ldlm_lock2handle(struct ldlm_lock 
*lock, struct lustre_handle *lockh) { - POISON(&lockh->addr, 0x69, sizeof(lockh->addr)); lockh->cookie = lock->l_handle.h_cookie; } @@ -447,10 +447,6 @@ void ldlm_lock_addref_internal(struct ldlm_lock *lock, __u32 mode) LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]); } -/* Args: unlocked lock */ -int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, - struct ldlm_res_id, int flags); - void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode) { struct ldlm_namespace *ns; @@ -484,17 +480,14 @@ void ldlm_lock_decref_internal(struct ldlm_lock *lock, __u32 mode) "warning\n"); LDLM_DEBUG(lock, "final decref done on cbpending lock"); - - if (lock->l_blocking_ast == NULL) { - /* The lock wasn't even fully formed; just destroy it */ - ldlm_lock_destroy(lock); - } l_unlock(&ns->ns_lock); /* FIXME: need a real 'desc' here */ if (lock->l_blocking_ast != NULL) lock->l_blocking_ast(lock, NULL, lock->l_data, LDLM_CB_BLOCKING); + else + LDLM_DEBUG(lock, "No blocking AST?"); } else if (ns->ns_client && !lock->l_readers && !lock->l_writers) { /* If this is a client-side namespace and this was the last * reference, put it on the LRU. */ @@ -533,8 +526,8 @@ void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode) LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); l_lock(&lock->l_resource->lr_namespace->ns_lock); lock->l_flags |= LDLM_FL_CBPENDING; - ldlm_lock_decref_internal(lock, mode); l_unlock(&lock->l_resource->lr_namespace->ns_lock); + ldlm_lock_decref_internal(lock, mode); LDLM_LOCK_PUT(lock); } @@ -630,7 +623,17 @@ static struct ldlm_lock *search_queue(struct list_head *queue, ldlm_mode_t mode, if (lock == old_lock) break; - if (lock->l_flags & LDLM_FL_CBPENDING) + /* llite sometimes wants to match locks that will be + * canceled when their users drop, but we allow it to match + * if it passes in CBPENDING and the lock still has users. 
* this is generally only going to be used by children + * whose parents already hold a lock so forward progress + * can still happen. */ + if (lock->l_flags & LDLM_FL_CBPENDING && + !(flags & LDLM_FL_CBPENDING)) + continue; + if (lock->l_flags & LDLM_FL_CBPENDING && + lock->l_readers == 0 && lock->l_writers == 0) continue; if (lock->l_req_mode != mode) @@ -666,6 +669,9 @@ static struct ldlm_lock *search_queue(struct list_head *queue, ldlm_mode_t mode, * server (ie, connh is NULL) * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted * list will be considered + * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked + * to be canceled can still be matched as long as they still have reader + * or writer references * * Returns 1 if it finds an already-existing lock that is compatible; in this * case, lockh is filled in with a addref()ed lock @@ -710,14 +716,15 @@ int ldlm_lock_match(struct ldlm_namespace *ns, int flags, GOTO(out, rc = 1); EXIT; - out: + out: ldlm_resource_putref(res); l_unlock(&ns->ns_lock); if (lock) { ldlm_lock2handle(lock, lockh); if (lock->l_completion_ast) - lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC, NULL); + lock->l_completion_ast(lock, LDLM_FL_WAIT_NOREPROC, + NULL); } if (rc) LDLM_DEBUG(lock, "matched"); @@ -734,7 +741,9 @@ int ldlm_lock_match(struct ldlm_namespace *ns, int flags, struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, struct lustre_handle *parent_lock_handle, struct ldlm_res_id res_id, __u32 type, - ldlm_mode_t mode, void *data, void *cp_data) + ldlm_mode_t mode, + ldlm_blocking_callback blocking, + void *data) { struct ldlm_resource *res, *parent_res = NULL; struct ldlm_lock *lock, *parent_lock = NULL; @@ -760,7 +769,7 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, lock->l_req_mode = mode; lock->l_data = data; - lock->l_cp_data = cp_data; + lock->l_blocking_ast = blocking; RETURN(lock); } @@ -769,8 +778,7 @@ ldlm_error_t ldlm_lock_enqueue(struct 
ldlm_namespace *ns, struct ldlm_lock **lockp, void *cookie, int cookie_len, int *flags, - ldlm_completion_callback completion, - ldlm_blocking_callback blocking) + ldlm_completion_callback completion) { struct ldlm_resource *res; struct ldlm_lock *lock = *lockp; @@ -779,7 +787,6 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, ENTRY; res = lock->l_resource; - lock->l_blocking_ast = blocking; if (res->lr_type == LDLM_EXTENT) memcpy(&lock->l_extent, cookie, sizeof(lock->l_extent)); @@ -867,12 +874,6 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, *flags |= LDLM_FL_BLOCK_GRANTED; GOTO(out, ELDLM_OK); } - - if (lock->l_granted_cb != NULL && lock->l_data != NULL) { - /* We just -know- */ - struct ptlrpc_request *req = lock->l_data; - lock->l_granted_cb(lock, req->rq_repmsg, 0); - } ldlm_grant_lock(lock, NULL, 0); EXIT; out: @@ -994,11 +995,14 @@ void ldlm_cancel_callback(struct ldlm_lock *lock) l_lock(&lock->l_resource->lr_namespace->ns_lock); if (!(lock->l_flags & LDLM_FL_CANCEL)) { lock->l_flags |= LDLM_FL_CANCEL; - if (lock->l_blocking_ast) + if (lock->l_blocking_ast) { + l_unlock(&lock->l_resource->lr_namespace->ns_lock); lock->l_blocking_ast(lock, NULL, lock->l_data, LDLM_CB_CANCELING); - else + return; + } else { LDLM_DEBUG(lock, "no blocking ast"); + } } l_unlock(&lock->l_resource->lr_namespace->ns_lock); } @@ -1023,7 +1027,7 @@ void ldlm_lock_cancel(struct ldlm_lock *lock) LBUG(); } - ldlm_cancel_callback(lock); + ldlm_cancel_callback(lock); /* XXX FIXME bug 1030 */ ldlm_resource_unlink_lock(lock); ldlm_lock_destroy(lock); @@ -1031,7 +1035,7 @@ void ldlm_lock_cancel(struct ldlm_lock *lock) EXIT; } -int ldlm_lock_set_data(struct lustre_handle *lockh, void *data, void *cp_data) +int ldlm_lock_set_data(struct lustre_handle *lockh, void *data) { struct ldlm_lock *lock = ldlm_handle2lock(lockh); ENTRY; @@ -1040,7 +1044,6 @@ int ldlm_lock_set_data(struct lustre_handle *lockh, void *data, void *cp_data) RETURN(-EINVAL); lock->l_data = data; - 
lock->l_cp_data = cp_data; LDLM_LOCK_PUT(lock); @@ -1118,6 +1121,7 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, void ldlm_lock_dump(int level, struct ldlm_lock *lock) { char ver[128]; + struct obd_device *obd; if (!((portal_debug | D_ERROR) & level)) return; @@ -1136,13 +1140,21 @@ void ldlm_lock_dump(int level, struct ldlm_lock *lock) CDEBUG(level, " -- Lock dump: %p (%s) (rc: %d)\n", lock, ver, atomic_read(&lock->l_refc)); - if (lock->l_export && lock->l_export->exp_connection) + obd = class_conn2obd(lock->l_connh); + if (lock->l_export && lock->l_export->exp_connection) { CDEBUG(level, " Node: NID "LPX64" on %s (rhandle: "LPX64")\n", lock->l_export->exp_connection->c_peer.peer_nid, lock->l_export->exp_connection->c_peer.peer_ni->pni_name, lock->l_remote_handle.cookie); - else + } else if (obd == NULL) { CDEBUG(level, " Node: local\n"); + } else { + struct obd_import *imp = obd->u.cli.cl_import; + CDEBUG(level, " Node: NID "LPX64" on %s (rhandle: "LPX64")\n", + imp->imp_connection->c_peer.peer_nid, + imp->imp_connection->c_peer.peer_ni->pni_name, + lock->l_remote_handle.cookie); + } CDEBUG(level, " Parent: %p\n", lock->l_parent); CDEBUG(level, " Resource: %p ("LPD64")\n", lock->l_resource, lock->l_resource->lr_name.name[0]); diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index dafcb6e..9d2857e7 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -28,13 +28,13 @@ # include <linux/module.h> # include <linux/slab.h> # include <linux/init.h> +# include <linux/wait.h> #else # include <liblustre.h> #endif #include <linux/lustre_dlm.h> #include <linux/obd_class.h> - extern kmem_cache_t *ldlm_resource_slab; extern kmem_cache_t *ldlm_lock_slab; extern struct lustre_lock ldlm_handle_lock; @@ -42,6 +42,10 @@ extern struct list_head ldlm_namespace_list; extern int (*mds_reint_p)(int offset, struct ptlrpc_request *req); extern int (*mds_getattr_name_p)(int offset, struct ptlrpc_request *req); +static 
int ldlm_already_setup = 0; + +#ifdef __KERNEL__ + inline unsigned long round_timeout(unsigned long timeout) { return ((timeout / HZ) + 1) * HZ; @@ -51,23 +55,103 @@ inline unsigned long round_timeout(unsigned long timeout) static struct list_head waiting_locks_list; static spinlock_t waiting_locks_spinlock; static struct timer_list waiting_locks_timer; -static int ldlm_already_setup = 0; + +static struct expired_lock_thread { + wait_queue_head_t elt_waitq; + int elt_state; + struct list_head elt_expired_locks; + spinlock_t elt_lock; +} expired_lock_thread; + +#define ELT_STOPPED 0 +#define ELT_READY 1 +#define ELT_TERMINATE 2 + +static inline int have_expired_locks(void) +{ + int need_to_run; + + spin_lock_bh(&expired_lock_thread.elt_lock); + need_to_run = !list_empty(&expired_lock_thread.elt_expired_locks); + spin_unlock_bh(&expired_lock_thread.elt_lock); + + RETURN(need_to_run); +} + +static int expired_lock_main(void *arg) +{ + struct list_head *expired = &expired_lock_thread.elt_expired_locks; + struct l_wait_info lwi = { 0 }; + unsigned long flags; + + ENTRY; + lock_kernel(); + kportal_daemonize("ldlm_elt"); + + SIGNAL_MASK_LOCK(current, flags); + sigfillset(¤t->blocked); + RECALC_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, flags); + + unlock_kernel(); + + expired_lock_thread.elt_state = ELT_READY; + wake_up(&expired_lock_thread.elt_waitq); + + while (1) { + l_wait_event(expired_lock_thread.elt_waitq, + have_expired_locks() || + expired_lock_thread.elt_state == ELT_TERMINATE, + &lwi); + + spin_lock_bh(&expired_lock_thread.elt_lock); + while (!list_empty(expired)) { + struct ldlm_lock *lock = list_entry(expired->next, + struct ldlm_lock, + l_pending_chain); + spin_unlock_bh(&expired_lock_thread.elt_lock); + + ptlrpc_fail_export(lock->l_export); + + spin_lock_bh(&expired_lock_thread.elt_lock); + } + spin_unlock_bh(&expired_lock_thread.elt_lock); + + if (expired_lock_thread.elt_state == ELT_TERMINATE) + break; + } + + expired_lock_thread.elt_state = ELT_STOPPED; + 
wake_up(&expired_lock_thread.elt_waitq); + RETURN(0); +} static void waiting_locks_callback(unsigned long unused) { - struct list_head *liter, *n; + struct ldlm_lock *lock; spin_lock_bh(&waiting_locks_spinlock); - list_for_each_safe(liter, n, &waiting_locks_list) { - struct ldlm_lock *l = list_entry(liter, struct ldlm_lock, - l_pending_chain); - if (l->l_callback_timeout > jiffies) + while (!list_empty(&waiting_locks_list)) { + lock = list_entry(waiting_locks_list.next, struct ldlm_lock, + l_pending_chain); + + if (lock->l_callback_timeout > jiffies) break; - CERROR("lock timer expired, lock %p\n", l); - LDLM_DEBUG(l, "timer expired, recovering exp %p on conn %p", - l->l_export, l->l_export->exp_connection); - recovd_conn_fail(l->l_export->exp_connection); + + LDLM_ERROR(lock, "lock callback timer expired: evicting client " + "%s@%s nid "LPU64, + lock->l_export->exp_client_uuid.uuid, + lock->l_export->exp_connection->c_remote_uuid.uuid, + lock->l_export->exp_connection->c_peer.peer_nid); + + spin_lock_bh(&expired_lock_thread.elt_lock); + list_del(&lock->l_pending_chain); + list_add(&lock->l_pending_chain, + &expired_lock_thread.elt_expired_locks); + spin_unlock_bh(&expired_lock_thread.elt_lock); + wake_up(&expired_lock_thread.elt_waitq); } + spin_unlock_bh(&waiting_locks_spinlock); } @@ -80,8 +164,8 @@ static void waiting_locks_callback(unsigned long unused) static int ldlm_add_waiting_lock(struct ldlm_lock *lock) { unsigned long timeout_rounded; - ENTRY; + LDLM_DEBUG(lock, "adding to wait list"); LASSERT(list_empty(&lock->l_pending_chain)); spin_lock_bh(&waiting_locks_spinlock); @@ -95,7 +179,9 @@ static int ldlm_add_waiting_lock(struct ldlm_lock *lock) } list_add_tail(&lock->l_pending_chain, &waiting_locks_list); /* FIFO */ spin_unlock_bh(&waiting_locks_spinlock); - RETURN(1); + /* We drop this ref when we get removed from the list. 
*/ + class_export_get(lock->l_export); + return 1; } /* @@ -107,13 +193,18 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) { struct list_head *list_next; - ENTRY; + if (lock->l_export == NULL) { + /* We don't have a "waiting locks list" on clients. */ + LDLM_DEBUG(lock, "client lock: no-op"); + return 0; + } spin_lock_bh(&waiting_locks_spinlock); if (list_empty(&lock->l_pending_chain)) { spin_unlock_bh(&waiting_locks_spinlock); - RETURN(0); + LDLM_DEBUG(lock, "wasn't waiting"); + return 0; } list_next = lock->l_pending_chain.next; @@ -132,13 +223,39 @@ int ldlm_del_waiting_lock(struct ldlm_lock *lock) } list_del_init(&lock->l_pending_chain); spin_unlock_bh(&waiting_locks_spinlock); + /* We got this ref when we were added to the list. */ + class_export_put(lock->l_export); + LDLM_DEBUG(lock, "removed"); + return 1; +} + +#else /* !__KERNEL__ */ + +static int ldlm_add_waiting_lock(struct ldlm_lock *lock) +{ RETURN(1); } -static inline void ldlm_failed_ast(struct ldlm_lock *lock) +int ldlm_del_waiting_lock(struct ldlm_lock *lock) { - /* XXX diagnostic */ - recovd_conn_fail(lock->l_export->exp_connection); + RETURN(0); +} + +#endif /* __KERNEL__ */ + +static inline void ldlm_failed_ast(struct ldlm_lock *lock, int rc, + char *ast_type) +{ + CERROR("%s AST failed (%d) for res "LPU64"/"LPU64 + ", mode %s: evicting client %s@%s NID "LPU64"\n", + ast_type, rc, + lock->l_resource->lr_name.name[0], + lock->l_resource->lr_name.name[1], + ldlm_lockname[lock->l_granted_mode], + lock->l_export->exp_client_uuid.uuid, + lock->l_export->exp_connection->c_remote_uuid.uuid, + lock->l_export->exp_connection->c_peer.peer_nid); + ptlrpc_fail_export(lock->l_export); } int ldlm_server_blocking_ast(struct ldlm_lock *lock, @@ -171,12 +288,19 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, RETURN(0); } - req = ptlrpc_prep_req(&lock->l_export->exp_ldlm_data.led_import, +#if 0 + if (LTIME_S(CURRENT_TIME) - lock->l_export->exp_last_request_time > 30){ + ldlm_failed_ast(lock, 
-ETIMEDOUT, "Not-attempted blocking"); + RETURN(-ETIMEDOUT); + } +#endif + + req = ptlrpc_prep_req(lock->l_export->exp_ldlm_data.led_import, LDLM_BL_CALLBACK, 1, &size, NULL); if (!req) RETURN(-ENOMEM); - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); memcpy(&body->lock_handle1, &lock->l_remote_handle, sizeof(body->lock_handle1)); memcpy(&body->lock_desc, desc, sizeof(*desc)); @@ -188,14 +312,28 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, l_unlock(&lock->l_resource->lr_namespace->ns_lock); req->rq_level = LUSTRE_CONN_RECOVD; - req->rq_timeout = 2; + req->rq_timeout = 2; /* 2 second timeout for initial AST reply */ rc = ptlrpc_queue_wait(req); if (rc == -ETIMEDOUT || rc == -EINTR) { ldlm_del_waiting_lock(lock); - ldlm_failed_ast(lock); + ldlm_failed_ast(lock, rc, "blocking"); } else if (rc) { - CERROR("client returned %d from blocking AST for lock %p\n", - req->rq_status, lock); + if (rc == -EINVAL) + CDEBUG(D_DLMTRACE, "client (nid "LPU64") returned %d " + "from blocking AST for lock %p--normal race\n", + req->rq_connection->c_peer.peer_nid, + req->rq_repmsg->status, lock); + else if (rc == -ENOTCONN) + CDEBUG(D_DLMTRACE, "client (nid "LPU64") returned %d " + "from blocking AST for lock %p--this client was " + "probably rebooted while it held a lock, nothing" + " serious\n",req->rq_connection->c_peer.peer_nid, + req->rq_repmsg->status, lock); + else + CDEBUG(D_ERROR, "client (nid "LPU64") returned %d " + "from blocking AST for lock %p\n", + req->rq_connection->c_peer.peer_nid, + req->rq_repmsg->status, lock); LDLM_DEBUG(lock, "client returned error %d from blocking AST", req->rq_status); ldlm_lock_cancel(lock); @@ -221,12 +359,12 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) RETURN(-EINVAL); } - req = ptlrpc_prep_req(&lock->l_export->exp_ldlm_data.led_import, + req = ptlrpc_prep_req(lock->l_export->exp_ldlm_data.led_import, LDLM_CP_CALLBACK, 1, &size, NULL); if 
(!req) RETURN(-ENOMEM); - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); memcpy(&body->lock_handle1, &lock->l_remote_handle, sizeof(body->lock_handle1)); body->lock_flags = flags; @@ -236,11 +374,11 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) req->rq_replen = lustre_msg_size(0, NULL); req->rq_level = LUSTRE_CONN_RECOVD; - req->rq_timeout = 2; + req->rq_timeout = 2; /* 2 second timeout for initial AST reply */ rc = ptlrpc_queue_wait(req); if (rc == -ETIMEDOUT || rc == -EINTR) { ldlm_del_waiting_lock(lock); - ldlm_failed_ast(lock); + ldlm_failed_ast(lock, rc, "completion"); } else if (rc) { CERROR("client returned %d from completion AST for lock %p\n", req->rq_status, lock); @@ -272,7 +410,13 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, LDLM_DEBUG_NOLOCK("server-side enqueue handler START"); - dlm_req = lustre_msg_buf(req->rq_reqmsg, 0); + dlm_req = lustre_swab_reqbuf (req, 0, sizeof (*dlm_req), + lustre_swab_ldlm_request); + if (dlm_req == NULL) { + CERROR ("Can't unpack dlm_req\n"); + RETURN (-EFAULT); + } + flags = dlm_req->lock_flags; if (dlm_req->lock_desc.l_resource.lr_type == LDLM_PLAIN && (flags & LDLM_FL_HAS_INTENT)) { @@ -298,7 +442,8 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, &dlm_req->lock_handle2, dlm_req->lock_desc.l_resource.lr_name, dlm_req->lock_desc.l_resource.lr_type, - dlm_req->lock_desc.l_req_mode, NULL, 0); + dlm_req->lock_desc.l_req_mode, + blocking_callback, NULL); if (!lock) GOTO(out, err = -ENOMEM); @@ -314,11 +459,11 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req, l_unlock(&lock->l_resource->lr_namespace->ns_lock); err = ldlm_lock_enqueue(obddev->obd_namespace, &lock, cookie, cookielen, - &flags, completion_callback, blocking_callback); + &flags, completion_callback); if (err) GOTO(out, err); - dlm_rep = lustre_msg_buf(req->rq_repmsg, 0); + dlm_rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*dlm_rep)); 
dlm_rep->lock_flags = flags; ldlm_lock2handle(lock, &dlm_rep->lock_handle); @@ -358,13 +503,19 @@ int ldlm_handle_convert(struct ptlrpc_request *req) int rc, size = sizeof(*dlm_rep); ENTRY; + dlm_req = lustre_swab_reqbuf (req, 0, sizeof (*dlm_req), + lustre_swab_ldlm_request); + if (dlm_req == NULL) { + CERROR ("Can't unpack dlm_req\n"); + RETURN (-EFAULT); + } + rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg); if (rc) { CERROR("out of memory\n"); RETURN(-ENOMEM); } - dlm_req = lustre_msg_buf(req->rq_reqmsg, 0); - dlm_rep = lustre_msg_buf(req->rq_repmsg, 0); + dlm_rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*dlm_rep)); dlm_rep->lock_flags = dlm_req->lock_flags; lock = ldlm_handle2lock(&dlm_req->lock_handle1); @@ -396,21 +547,24 @@ int ldlm_handle_cancel(struct ptlrpc_request *req) int rc; ENTRY; + dlm_req = lustre_swab_reqbuf (req, 0, sizeof (*dlm_req), + lustre_swab_ldlm_request); + if (dlm_req == NULL) { + CERROR("bad request buffer for cancel\n"); + RETURN(-EFAULT); + } + rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg); if (rc) { CERROR("out of memory\n"); RETURN(-ENOMEM); } - dlm_req = lustre_msg_buf(req->rq_reqmsg, 0); - if (!dlm_req) { - CERROR("bad request buffer for cancel\n"); - RETURN(-EINVAL); - } lock = ldlm_handle2lock(&dlm_req->lock_handle1); if (!lock) { - CERROR("received cancel for unknown lock cookie "LPX64"\n", - dlm_req->lock_handle1.cookie); + CERROR("received cancel for unknown lock cookie "LPX64 + " from nid "LPU64"\n", dlm_req->lock_handle1.cookie, + req->rq_connection->c_peer.peer_nid); LDLM_DEBUG_NOLOCK("server-side cancel handler stale lock " "(cookie "LPU64")", dlm_req->lock_handle1.cookie); @@ -423,7 +577,7 @@ int ldlm_handle_cancel(struct ptlrpc_request *req) req->rq_status = 0; } - if (ptlrpc_reply(req->rq_svc, req) != 0) + if (ptlrpc_reply(req) != 0) LBUG(); if (lock) { @@ -443,32 +597,28 @@ static void ldlm_handle_bl_callback(struct ptlrpc_request *req, int do_ast; ENTRY; - /* Try to 
narrow down this damn iozone bug */ - if (lock->l_resource == NULL) - CERROR("lock %p resource NULL\n", lock); - if (lock->l_resource->lr_type != LDLM_EXTENT) - if (lock->l_resource->lr_namespace != ns) - CERROR("lock %p namespace %p != passed ns %p\n", lock, - lock->l_resource->lr_namespace, ns); + l_lock(&ns->ns_lock); LDLM_DEBUG(lock, "client blocking AST callback handler START"); - l_lock(&ns->ns_lock); lock->l_flags |= LDLM_FL_CBPENDING; do_ast = (!lock->l_readers && !lock->l_writers); - l_unlock(&ns->ns_lock); if (do_ast) { LDLM_DEBUG(lock, "already unused, calling " "callback (%p)", lock->l_blocking_ast); - if (lock->l_blocking_ast != NULL) + if (lock->l_blocking_ast != NULL) { + l_unlock(&ns->ns_lock); lock->l_blocking_ast(lock, &dlm_req->lock_desc, lock->l_data, LDLM_CB_BLOCKING); + l_lock(&ns->ns_lock); + } } else { LDLM_DEBUG(lock, "Lock still has references, will be" " cancelled later"); } LDLM_DEBUG(lock, "client blocking callback handler END"); + l_unlock(&ns->ns_lock); LDLM_LOCK_PUT(lock); EXIT; } @@ -481,9 +631,8 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, LIST_HEAD(ast_list); ENTRY; - LDLM_DEBUG(lock, "client completion callback handler START"); - l_lock(&ns->ns_lock); + LDLM_DEBUG(lock, "client completion callback handler START"); /* If we receive the completion AST before the actual enqueue returned, * then we might need to switch lock modes, resources, or extents. */ @@ -491,9 +640,22 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, lock->l_req_mode = dlm_req->lock_desc.l_granted_mode; LDLM_DEBUG(lock, "completion AST, new lock mode"); } - if (lock->l_resource->lr_type == LDLM_EXTENT) + if (lock->l_resource->lr_type == LDLM_EXTENT) { memcpy(&lock->l_extent, &dlm_req->lock_desc.l_extent, sizeof(lock->l_extent)); + + if ((lock->l_extent.end & ~PAGE_MASK) != ~PAGE_MASK) { + /* XXX Old versions of BA OST code have a fencepost bug + * which will cause them to grant a lock that's one + * byte too large. 
This can be safely removed after BA + * ships their next release -phik (02 Apr 2003) */ + lock->l_extent.end--; + } else if ((lock->l_extent.start & ~PAGE_MASK) == + ~PAGE_MASK) { + lock->l_extent.start++; + } + } + ldlm_resource_unlink_lock(lock); if (memcmp(&dlm_req->lock_desc.l_resource.lr_name, &lock->l_resource->lr_name, @@ -505,8 +667,8 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, lock->l_resource->lr_tmp = &ast_list; ldlm_grant_lock(lock, req, sizeof(*req)); lock->l_resource->lr_tmp = NULL; - l_unlock(&ns->ns_lock); LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work"); + l_unlock(&ns->ns_lock); LDLM_LOCK_PUT(lock); ldlm_run_ast_work(&ast_list); @@ -523,7 +685,7 @@ static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) &req->rq_repmsg); if (rc) return rc; - return ptlrpc_reply(req->rq_svc, req); + return ptlrpc_reply(req); } static int ldlm_callback_handler(struct ptlrpc_request *req) @@ -531,26 +693,29 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) struct ldlm_namespace *ns; struct ldlm_request *dlm_req; struct ldlm_lock *lock; - int rc; ENTRY; - rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); - if (rc) { - CERROR("Invalid request: %d\n", rc); - RETURN(rc); - } + /* Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. 
*/ if (req->rq_export == NULL) { struct ldlm_request *dlm_req; - CERROR("operation %d with bad export (ptl req %d/rep %d)\n", - req->rq_reqmsg->opc, req->rq_request_portal, - req->rq_reply_portal); - CERROR("--> export addr: "LPX64", cookie: "LPX64"\n", - req->rq_reqmsg->addr, req->rq_reqmsg->cookie); - dlm_req = lustre_msg_buf(req->rq_reqmsg, 0); - CERROR("--> lock addr: "LPX64", cookie: "LPX64"\n", - dlm_req->lock_handle1.addr,dlm_req->lock_handle1.cookie); + CDEBUG(D_RPCTRACE, "operation %d from nid "LPU64" with bad " + "export cookie "LPX64" (ptl req %d/rep %d); this is " + "normal if this node rebooted with a lock held\n", + req->rq_reqmsg->opc, req->rq_connection->c_peer.peer_nid, + req->rq_reqmsg->handle.cookie, + req->rq_request_portal, req->rq_reply_portal); + + dlm_req = lustre_swab_reqbuf(req, 0, sizeof (*dlm_req), + lustre_swab_ldlm_request); + if (dlm_req != NULL) + CDEBUG(D_RPCTRACE, "--> lock cookie: "LPX64"\n", + dlm_req->lock_handle1.cookie); + ldlm_callback_reply(req, -ENOTCONN); RETURN(0); } @@ -560,7 +725,7 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) } else if (req->rq_reqmsg->opc == LDLM_CP_CALLBACK) { OBD_FAIL_RETURN(OBD_FAIL_LDLM_CP_CALLBACK, 0); } else { - ldlm_callback_reply(req, -EIO); + ldlm_callback_reply(req, -EPROTO); RETURN(0); } @@ -569,7 +734,14 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) ns = req->rq_export->exp_obd->obd_namespace; LASSERT(ns != NULL); - dlm_req = lustre_msg_buf(req->rq_reqmsg, 0); + dlm_req = lustre_swab_reqbuf (req, 0, sizeof (*dlm_req), + lustre_swab_ldlm_request); + if (dlm_req == NULL) { + CERROR ("can't unpack dlm_req\n"); + ldlm_callback_reply (req, -EPROTO); + RETURN (0); + } + lock = ldlm_handle2lock_ns(ns, &dlm_req->lock_handle1); if (!lock) { CDEBUG(D_INODE, "callback on lock "LPX64" - lock disappeared\n", @@ -592,6 +764,8 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) CDEBUG(D_INODE, "completion ast\n"); ldlm_handle_cp_callback(req, ns, dlm_req, 
lock); break; + default: + LBUG(); /* checked above */ } RETURN(0); @@ -602,27 +776,28 @@ static int ldlm_cancel_handler(struct ptlrpc_request *req) int rc; ENTRY; - rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); - if (rc) { - CERROR("lustre_ldlm: Invalid request: %d\n", rc); - RETURN(rc); - } + /* Requests arrive in sender's byte order. The ptlrpc service + * handler has already checked and, if necessary, byte-swapped the + * incoming request message body, but I am responsible for the + * message buffers. */ if (req->rq_export == NULL) { struct ldlm_request *dlm_req; CERROR("operation %d with bad export (ptl req %d/rep %d)\n", req->rq_reqmsg->opc, req->rq_request_portal, req->rq_reply_portal); - CERROR("--> export addr: "LPX64", cookie: "LPX64"\n", - req->rq_reqmsg->addr, req->rq_reqmsg->cookie); - dlm_req = lustre_msg_buf(req->rq_reqmsg, 0); - ldlm_lock_dump_handle(D_ERROR, &dlm_req->lock_handle1); + CERROR("--> export cookie: "LPX64"\n", + req->rq_reqmsg->handle.cookie); + dlm_req = lustre_swab_reqbuf(req, 0, sizeof (*dlm_req), + lustre_swab_ldlm_request); + if (dlm_req != NULL) + ldlm_lock_dump_handle(D_ERROR, &dlm_req->lock_handle1); RETURN(-ENOTCONN); } switch (req->rq_reqmsg->opc) { - /* XXX FIXME move this back to mds/handler.c, bug 625069 */ + /* XXX FIXME move this back to mds/handler.c, bug 249 */ case LDLM_CANCEL: CDEBUG(D_INODE, "cancel\n"); OBD_FAIL_RETURN(OBD_FAIL_LDLM_CANCEL, 0); @@ -696,11 +871,18 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(rc); #ifdef __KERNEL__ + inter_module_register("ldlm_cli_cancel_unused", THIS_MODULE, + ldlm_cli_cancel_unused); + inter_module_register("ldlm_namespace_cleanup", THIS_MODULE, + ldlm_namespace_cleanup); + inter_module_register("ldlm_replay_locks", THIS_MODULE, + ldlm_replay_locks); + ldlm->ldlm_cb_service = ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE, LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, - ldlm_callback_handler, "ldlm_cbd"); + 
ldlm_callback_handler, "ldlm_cbd", obddev); if (!ldlm->ldlm_cb_service) { CERROR("failed to start service\n"); @@ -711,7 +893,7 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf) ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE, LDLM_CANCEL_REQUEST_PORTAL, LDLM_CANCEL_REPLY_PORTAL, - ldlm_cancel_handler, "ldlm_canceld"); + ldlm_cancel_handler, "ldlm_canceld", obddev); if (!ldlm->ldlm_cancel_service) { CERROR("failed to start service\n"); @@ -741,12 +923,26 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf) } } -#endif + INIT_LIST_HEAD(&expired_lock_thread.elt_expired_locks); + spin_lock_init(&expired_lock_thread.elt_lock); + expired_lock_thread.elt_state = ELT_STOPPED; + init_waitqueue_head(&expired_lock_thread.elt_waitq); + + rc = kernel_thread(expired_lock_main, NULL, CLONE_VM | CLONE_FS); + if (rc < 0) { + CERROR("Cannot start ldlm expired-lock thread: %d\n", rc); + GOTO(out_thread, rc); + } + + wait_event(expired_lock_thread.elt_waitq, + expired_lock_thread.elt_state == ELT_READY); + INIT_LIST_HEAD(&waiting_locks_list); spin_lock_init(&waiting_locks_spinlock); waiting_locks_timer.function = waiting_locks_callback; waiting_locks_timer.data = 0; init_timer(&waiting_locks_timer); +#endif ldlm_already_setup = 1; @@ -765,30 +961,49 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf) return rc; } -static int ldlm_cleanup(struct obd_device *obddev) +static int ldlm_cleanup(struct obd_device *obddev, int force, int failover) { struct ldlm_obd *ldlm = &obddev->u.ldlm; ENTRY; if (!list_empty(&ldlm_namespace_list)) { CERROR("ldlm still has namespaces; clean these up first.\n"); + ldlm_dump_all_namespaces(); RETURN(-EBUSY); } #ifdef __KERNEL__ + if (force) { + ptlrpc_put_ldlm_hooks(); + } else if (ptlrpc_ldlm_hooks_referenced()) { + CERROR("Some connections weren't cleaned up; run lconf with " + "--force to forcibly unload.\n"); + ptlrpc_dump_connections(); + 
RETURN(-EBUSY); + } + ptlrpc_stop_all_threads(ldlm->ldlm_cb_service); ptlrpc_unregister_service(ldlm->ldlm_cb_service); ptlrpc_stop_all_threads(ldlm->ldlm_cancel_service); ptlrpc_unregister_service(ldlm->ldlm_cancel_service); ldlm_proc_cleanup(obddev); + + expired_lock_thread.elt_state = ELT_TERMINATE; + wake_up(&expired_lock_thread.elt_waitq); + wait_event(expired_lock_thread.elt_waitq, + expired_lock_thread.elt_state == ELT_STOPPED); + + inter_module_unregister("ldlm_namespace_cleanup"); + inter_module_unregister("ldlm_cli_cancel_unused"); + inter_module_unregister("ldlm_replay_locks"); #endif + ldlm_already_setup = 0; RETURN(0); } static int ldlm_connect(struct lustre_handle *conn, struct obd_device *src, - struct obd_uuid *cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) + struct obd_uuid *cluuid) { return class_connect(conn, src, cluuid); } @@ -896,6 +1111,18 @@ EXPORT_SYMBOL(ldlm_namespace_dump); EXPORT_SYMBOL(l_lock); EXPORT_SYMBOL(l_unlock); +/* ldlm_lib.c */ +EXPORT_SYMBOL(client_import_connect); +EXPORT_SYMBOL(client_import_disconnect); +EXPORT_SYMBOL(target_abort_recovery); +EXPORT_SYMBOL(target_handle_connect); +EXPORT_SYMBOL(target_cancel_recovery_timer); +EXPORT_SYMBOL(target_send_reply); +EXPORT_SYMBOL(target_queue_recovery_request); +EXPORT_SYMBOL(target_handle_ping); +EXPORT_SYMBOL(target_handle_disconnect); +EXPORT_SYMBOL(target_queue_final_reply); + #ifdef __KERNEL__ MODULE_AUTHOR("Cluster File Systems, Inc. 
<info@clusterfs.com>"); MODULE_DESCRIPTION("Lustre Lock Management Module v0.1"); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index d64a402..e5d9c24 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -29,41 +29,54 @@ #include <linux/obd_class.h> #include <linux/obd.h> -static int interrupted_completion_wait(void *data) +static void interrupted_completion_wait(void *data) { - RETURN(1); } +struct lock_wait_data { + struct ldlm_lock *lwd_lock; + int lwd_generation; +}; + int ldlm_expired_completion_wait(void *data) { - struct ldlm_lock *lock = data; - struct ptlrpc_connection *conn; - struct obd_device *obd; + struct lock_wait_data *lwd = data; + struct ldlm_lock *lock = lwd->lwd_lock; + struct obd_device *obd = class_conn2obd(lock->l_connh); - if (!lock) - CERROR("NULL lock\n"); - else if (!lock->l_connh) - CERROR("lock %p has NULL connh\n", lock); - else if (!(obd = class_conn2obd(lock->l_connh))) - CERROR("lock %p has NULL obd\n", lock); - else if (!(conn = obd->u.cli.cl_import.imp_connection)) - CERROR("lock %p has NULL connection\n", lock); - else { - LDLM_DEBUG(lock, "timed out waiting for completion"); - CERROR("lock %p timed out from %s\n", lock, - conn->c_remote_uuid.uuid); - ldlm_lock_dump(D_ERROR, lock); - class_signal_connection_failure(conn); + if (obd == NULL) { + LDLM_ERROR(lock, "lock timed out; mot entering recovery in " + "server code, just going back to sleep"); + } else { + struct obd_import *imp = obd->u.cli.cl_import; + ptlrpc_fail_import(imp, lwd->lwd_generation); + LDLM_ERROR(lock, "lock timed out, entering recovery for %s@%s", + imp->imp_target_uuid.uuid, + imp->imp_connection->c_remote_uuid.uuid); } + RETURN(0); } int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data) { - struct l_wait_info lwi = - LWI_TIMEOUT_INTR(obd_timeout * HZ, ldlm_expired_completion_wait, - interrupted_completion_wait, lock); + struct lock_wait_data lwd; + unsigned long irqflags; + struct obd_device 
*obd; + struct obd_import *imp = NULL; int rc = 0; + struct l_wait_info lwi; + + obd = class_conn2obd(lock->l_connh); + + /* if this is a local lock, then there is no import */ + if (obd != NULL) + imp = obd->u.cli.cl_import; + + lwd.lwd_lock = lock; + + lwi = LWI_TIMEOUT_INTR(obd_timeout * HZ, ldlm_expired_completion_wait, + interrupted_completion_wait, &lwd); ENTRY; if (flags == LDLM_FL_WAIT_NOREPROC) @@ -84,6 +97,12 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data) ldlm_reprocess_all(lock->l_resource); noreproc: + if (imp != NULL) { + spin_lock_irqsave(&imp->imp_lock, irqflags); + lwd.lwd_generation = imp->imp_generation; + spin_unlock_irqrestore(&imp->imp_lock, irqflags); + } + /* Go to sleep until the lock is granted or cancelled. */ rc = l_wait_event(lock->l_waitq, ((lock->l_req_mode == lock->l_granted_mode) || @@ -114,7 +133,6 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, ldlm_completion_callback completion, ldlm_blocking_callback blocking, void *data, - void *cp_data, struct lustre_handle *lockh) { struct ldlm_lock *lock; @@ -127,7 +145,7 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, } lock = ldlm_lock_create(ns, parent_lockh, res_id, type, mode, - data, cp_data); + blocking, data); if (!lock) GOTO(out_nolock, err = -ENOMEM); LDLM_DEBUG(lock, "client-side local enqueue handler, new lock created"); @@ -136,8 +154,8 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, ldlm_lock2handle(lock, lockh); lock->l_flags |= LDLM_FL_LOCAL; - err = ldlm_lock_enqueue(ns, &lock, cookie, cookielen, flags, completion, - blocking); + err = ldlm_lock_enqueue(ns, &lock, cookie, cookielen, flags, + completion); if (err != ELDLM_OK) GOTO(out, err); @@ -172,7 +190,6 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, ldlm_completion_callback completion, ldlm_blocking_callback blocking, void *data, - void *cp_data, struct lustre_handle *lockh) { struct ldlm_lock *lock; @@ -188,7 +205,7 @@ int 
ldlm_cli_enqueue(struct lustre_handle *connh, rc = ldlm_cli_enqueue_local(ns, parent_lock_handle, res_id, type, cookie, cookielen, mode, flags, completion, blocking, data, - cp_data, lockh); + lockh); RETURN(rc); } @@ -200,7 +217,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, LASSERT(connh == lock->l_connh); } else { lock = ldlm_lock_create(ns, parent_lock_handle, res_id, type, - mode, data, cp_data); + mode, blocking, data); if (lock == NULL) GOTO(out_nolock, rc = -ENOMEM); /* ugh. I set this early (instead of waiting for _enqueue) @@ -227,7 +244,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, LBUG(); /* Dump lock data into the request buffer */ - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); ldlm_lock2desc(lock, &body->lock_desc); body->lock_flags = *flags; @@ -243,6 +260,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, } lock->l_connh = connh; lock->l_export = NULL; + lock->l_blocking_ast = blocking; LDLM_DEBUG(lock, "sending request"); rc = ptlrpc_queue_wait(req); @@ -253,26 +271,54 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, rc == ELDLM_LOCK_ABORTED ? 
"ABORTED" : "FAILED"); /* Set a flag to prevent us from sending a CANCEL (bug 407) */ l_lock(&ns->ns_lock); - lock->l_flags |= LDLM_FL_CANCELING; + lock->l_flags |= LDLM_FL_LOCAL_ONLY; l_unlock(&ns->ns_lock); ldlm_lock_decref_and_cancel(lockh, mode); + + if (rc == ELDLM_LOCK_ABORTED) { + /* caller expects reply buffer 0 to have been swabbed */ + reply = lustre_swab_repbuf(req, 0, sizeof (*reply), + lustre_swab_ldlm_reply); + if (reply == NULL) { + CERROR ("Can't unpack ldlm_reply\n"); + GOTO (out_req, rc = -EPROTO); + } + } GOTO(out_req, rc); } - reply = lustre_msg_buf(req->rq_repmsg, 0); + reply = lustre_swab_repbuf(req, 0, sizeof (*reply), + lustre_swab_ldlm_reply); + if (reply == NULL) { + CERROR ("Can't unpack ldlm_reply\n"); + GOTO (out_req, rc = -EPROTO); + } + memcpy(&lock->l_remote_handle, &reply->lock_handle, sizeof(lock->l_remote_handle)); *flags = reply->lock_flags; - CDEBUG(D_INFO, "local: %p, remote: %p, flags: %d\n", lock, - (void *)(unsigned long)reply->lock_handle.addr, *flags); + CDEBUG(D_INFO, "local: %p, remote cookie: "LPX64", flags: %d\n", lock, + reply->lock_handle.cookie, *flags); if (type == LDLM_EXTENT) { CDEBUG(D_INFO, "requested extent: "LPU64" -> "LPU64", got " "extent "LPU64" -> "LPU64"\n", body->lock_desc.l_extent.start, body->lock_desc.l_extent.end, reply->lock_extent.start, reply->lock_extent.end); + + if ((reply->lock_extent.end & ~PAGE_MASK) != ~PAGE_MASK) { + /* XXX Old versions of BA OST code have a fencepost bug + * which will cause them to grant a lock that's one + * byte too large. 
This can be safely removed after BA + * ships their next release -phik (02 Apr 2003) */ + reply->lock_extent.end--; + } else if ((reply->lock_extent.start & ~PAGE_MASK) == + ~PAGE_MASK) { + reply->lock_extent.start++; + } + cookie = &reply->lock_extent; /* FIXME bug 267 */ cookielen = sizeof(reply->lock_extent); } @@ -310,7 +356,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, l_lock(&ns->ns_lock); lock->l_completion_ast = NULL; rc = ldlm_lock_enqueue(ns, &lock, cookie, cookielen, flags, - completion, blocking); + completion); l_unlock(&ns->ns_lock); if (lock->l_completion_ast) lock->l_completion_ast(lock, *flags, NULL); @@ -339,7 +385,6 @@ int ldlm_match_or_enqueue(struct lustre_handle *connh, ldlm_completion_callback completion, ldlm_blocking_callback blocking, void *data, - void *cp_data, struct lustre_handle *lockh) { int rc; @@ -357,7 +402,7 @@ int ldlm_match_or_enqueue(struct lustre_handle *connh, rc = ldlm_cli_enqueue(connh, req, ns, parent_lock_handle, res_id, type, cookie, cookielen, mode, flags, completion, blocking, data, - cp_data, lockh); + lockh); if (rc != ELDLM_OK) CERROR("ldlm_cli_enqueue: err: %d\n", rc); RETURN(rc); @@ -373,7 +418,7 @@ int ldlm_cli_replay_enqueue(struct ldlm_lock *lock) ldlm_lock2handle(lock, &lockh); return ldlm_cli_enqueue(lock->l_connh, NULL, NULL, NULL, junk, lock->l_resource->lr_type, NULL, 0, -1, &flags, - NULL, NULL, NULL, 0, &lockh); + NULL, NULL, NULL, &lockh); } static int ldlm_cli_convert_local(struct ldlm_lock *lock, int new_mode, @@ -425,7 +470,7 @@ int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, int *flags) if (!req) GOTO(out, rc = -ENOMEM); - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); memcpy(&body->lock_handle1, &lock->l_remote_handle, sizeof(body->lock_handle1)); @@ -439,7 +484,13 @@ int ldlm_cli_convert(struct lustre_handle *lockh, int new_mode, int *flags) if (rc != ELDLM_OK) GOTO(out, rc); - reply = lustre_msg_buf(req->rq_repmsg, 
0); + reply = lustre_swab_repbuf(req, 0, sizeof (*reply), + lustre_swab_ldlm_reply); + if (reply == NULL) { + CERROR ("Can't unpack ldlm_reply\n"); + GOTO (out, rc = -EPROTO); + } + res = ldlm_lock_convert(lock, new_mode, &reply->lock_flags); if (res != NULL) ldlm_reprocess_all(res); @@ -469,23 +520,30 @@ int ldlm_cli_cancel(struct lustre_handle *lockh) if (lock->l_connh) { int local_only; + struct obd_import *imp; LDLM_DEBUG(lock, "client-side cancel"); /* Set this flag to prevent others from getting new references*/ l_lock(&lock->l_resource->lr_namespace->ns_lock); lock->l_flags |= LDLM_FL_CBPENDING; - ldlm_cancel_callback(lock); local_only = (lock->l_flags & LDLM_FL_LOCAL_ONLY); l_unlock(&lock->l_resource->lr_namespace->ns_lock); + ldlm_cancel_callback(lock); if (local_only) { CDEBUG(D_INFO, "not sending request (at caller's " - "instruction\n"); + "instruction)\n"); + goto local_cancel; + } + + imp = class_conn2cliimp(lock->l_connh); + if (imp == NULL || imp->imp_invalid) { + CDEBUG(D_HA, "skipping cancel on invalid import %p\n", + imp); goto local_cancel; } - req = ptlrpc_prep_req(class_conn2cliimp(lock->l_connh), - LDLM_CANCEL, 1, &size, NULL); + req = ptlrpc_prep_req(imp, LDLM_CANCEL, 1, &size, NULL); if (!req) GOTO(out, rc = -ENOMEM); @@ -493,21 +551,23 @@ int ldlm_cli_cancel(struct lustre_handle *lockh) req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); memcpy(&body->lock_handle1, &lock->l_remote_handle, sizeof(body->lock_handle1)); req->rq_replen = lustre_msg_size(0, NULL); rc = ptlrpc_queue_wait(req); - ptlrpc_req_finished(req); - if (rc == ESTALE) { - CERROR("client/server out of sync\n"); - LBUG(); - } - if (rc != ELDLM_OK) + + if (rc == ESTALE) + CERROR("client/server (nid "LPU64") out of sync--not " + "fatal\n", + req->rq_import->imp_connection->c_peer.peer_nid); + else if (rc != ELDLM_OK) 
CERROR("Got rc %d from cancel RPC: canceling " "anyway\n", rc); + + ptlrpc_req_finished(req); local_cancel: ldlm_lock_cancel(lock); } else { @@ -585,8 +645,9 @@ int ldlm_cancel_lru(struct ldlm_namespace *ns) RETURN(rc); } -int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, - struct ldlm_res_id res_id, int flags) +static int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + struct ldlm_res_id res_id, int flags, + void *opaque) { struct ldlm_resource *res; struct list_head *tmp, *next, list = LIST_HEAD_INIT(list); @@ -605,8 +666,17 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock; lock = list_entry(tmp, struct ldlm_lock, l_res_link); - if (lock->l_readers || lock->l_writers) - continue; + if (lock->l_readers || lock->l_writers) { + if (flags & LDLM_FL_WARN) { + LDLM_ERROR(lock, "lock in use"); + LBUG(); + } + } + if (opaque != NULL && lock->l_data != opaque) { + LDLM_ERROR(lock, "data %p doesn't match opaque %p", + lock->l_data, opaque); + LBUG(); + } /* See CBPENDING comment in ldlm_cancel_lru */ lock->l_flags |= LDLM_FL_CBPENDING; @@ -653,9 +723,10 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, * * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying * to notify the server. - * If flags & LDLM_FL_NO_CALLBACK, don't run the cancel callback. */ + * If flags & LDLM_FL_NO_CALLBACK, don't run the cancel callback. + * If flags & LDLM_FL_WARN, print a warning if some locks are still in use. 
*/ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, - struct ldlm_res_id *res_id, int flags) + struct ldlm_res_id *res_id, int flags, void *opaque) { int i; ENTRY; @@ -664,7 +735,8 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, RETURN(ELDLM_OK); if (res_id) - RETURN(ldlm_cli_cancel_unused_resource(ns, *res_id, flags)); + RETURN(ldlm_cli_cancel_unused_resource(ns, *res_id, flags, + opaque)); l_lock(&ns->ns_lock); for (i = 0; i < RES_HASH_SIZE; i++) { @@ -676,7 +748,7 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, ldlm_resource_getref(res); rc = ldlm_cli_cancel_unused_resource(ns, res->lr_name, - flags); + flags, opaque); if (rc) CERROR("cancel_unused_res ("LPU64"): %d\n", @@ -827,7 +899,7 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) /* We're part of recovery, so don't wait for it. */ req->rq_level = LUSTRE_CONN_RECOVD; - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); ldlm_lock2desc(lock, &body->lock_desc); body->lock_flags = flags; @@ -839,8 +911,14 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) rc = ptlrpc_queue_wait(req); if (rc != ELDLM_OK) GOTO(out, rc); - - reply = lustre_msg_buf(req->rq_repmsg, 0); + + reply = lustre_swab_repbuf(req, 0, sizeof (*reply), + lustre_swab_ldlm_reply); + if (reply == NULL) { + CERROR("Can't unpack ldlm_reply\n"); + GOTO (out, rc = -EPROTO); + } + memcpy(&lock->l_remote_handle, &reply->lock_handle, sizeof(lock->l_remote_handle)); LDLM_DEBUG(lock, "replayed lock:"); diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index 0f9f4e2..84fdecc 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -103,7 +103,7 @@ void ldlm_proc_namespace(struct ldlm_namespace *ns) #endif #undef MAX_STRING_SIZE -#define LDLM_MAX_UNUSED 20 +#define LDLM_MAX_UNUSED 100 struct ldlm_namespace *ldlm_namespace_new(char *name, __u32 client) { struct ldlm_namespace *ns = NULL; @@ 
-280,13 +280,6 @@ int ldlm_namespace_free(struct ldlm_namespace *ns) return ELDLM_OK; } -int ldlm_client_free(struct obd_export *exp) -{ - struct ldlm_export_data *led = &exp->exp_ldlm_data; - ptlrpc_cleanup_client(&led->led_import); - RETURN(0); -} - static __u32 ldlm_hash_fn(struct ldlm_resource *parent, struct ldlm_res_id name) { __u32 hash = 0; @@ -304,7 +297,7 @@ static struct ldlm_resource *ldlm_resource_new(void) { struct ldlm_resource *res; - res = kmem_cache_alloc(ldlm_resource_slab, SLAB_KERNEL); + OBD_SLAB_ALLOC(res, ldlm_resource_slab, SLAB_KERNEL, sizeof *res); if (res == NULL) { LBUG(); return NULL; @@ -461,8 +454,7 @@ int ldlm_resource_putref(struct ldlm_resource *res) list_del_init(&res->lr_hash); list_del_init(&res->lr_childof); - POISON(res, 0x5a, sizeof(*res)); - kmem_cache_free(ldlm_resource_slab, res); + OBD_SLAB_FREE(res, ldlm_resource_slab, sizeof *res); l_unlock(&ns->ns_lock); spin_lock(&ns->ns_counter_lock); diff --git a/lustre/lib/Makefile.am b/lustre/lib/Makefile.am deleted file mode 100644 index 1bcc388..0000000 --- a/lustre/lib/Makefile.am +++ /dev/null @@ -1,4 +0,0 @@ -EXTRA_DIST = mds_updates.c obd_pack.c simple.c -EXTRA_DIST += client.c target.c - -include $(top_srcdir)/Rules diff --git a/lustre/lib/client.c b/lustre/lib/client.c deleted file mode 100644 index ae490d9..0000000 --- a/lustre/lib/client.c +++ /dev/null @@ -1,406 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001-2003 Cluster File Systems, Inc. - * Author: Peter J. Braam <braam@clusterfs.com> - * Author: Phil Schwan <phil@clusterfs.com> - * Author: Mike Shaver <shaver@clusterfs.com> - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Client-common OBD method implementations and utility functions. - */ - -#define EXPORT_SYMTAB -#define DEBUG_SUBSYSTEM S_OST /* XXX WRONG */ - -#ifdef __KERNEL__ -#include <linux/module.h> -#else -#include <liblustre.h> -#endif - -#include <linux/obd.h> -#include <linux/obd_ost.h> -#include <linux/lustre_net.h> -#include <linux/lustre_dlm.h> - -struct client_obd *client_conn2cli(struct lustre_handle *conn) -{ - struct obd_export *export = class_conn2export(conn); - if (!export) - LBUG(); - return &export->exp_obd->u.cli; -} - -struct obd_device *client_tgtuuid2obd(struct obd_uuid *tgtuuid) -{ - int i; - - for (i = 0; i < MAX_OBD_DEVICES; i++) { - struct obd_device *obd = &obd_dev[i]; - if ((strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0) || - (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0)) { - struct client_obd *cli = &obd->u.cli; - if (strncmp(tgtuuid->uuid, cli->cl_target_uuid.uuid, - sizeof(cli->cl_target_uuid.uuid)) == 0) - return obd; - } - } - - return NULL; -} - -int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) -{ - struct obd_ioctl_data* data = buf; - int rq_portal, rp_portal; - char *name; - struct client_obd *cli = &obddev->u.cli; - struct obd_import *imp = &cli->cl_import; - struct obd_uuid server_uuid; - ENTRY; - - if (obddev->obd_type->typ_ops->o_brw) { - rq_portal = OST_REQUEST_PORTAL; - rp_portal = OSC_REPLY_PORTAL; - name = "osc"; - } else { - rq_portal = MDS_REQUEST_PORTAL; - rp_portal = MDC_REPLY_PORTAL; - name = "mdc"; - } - - if (data->ioc_inllen1 < 1) { - 
CERROR("requires a TARGET UUID\n"); - RETURN(-EINVAL); - } - - if (data->ioc_inllen1 > 37) { - CERROR("client UUID must be less than 38 characters\n"); - RETURN(-EINVAL); - } - - if (data->ioc_inllen2 < 1) { - CERROR("setup requires a SERVER UUID\n"); - RETURN(-EINVAL); - } - - if (data->ioc_inllen2 > 37) { - CERROR("target UUID must be less than 38 characters\n"); - RETURN(-EINVAL); - } - - sema_init(&cli->cl_sem, 1); - cli->cl_conn_count = 0; - memcpy(cli->cl_target_uuid.uuid, data->ioc_inlbuf1, data->ioc_inllen1); - memcpy(server_uuid.uuid, data->ioc_inlbuf2, MIN(data->ioc_inllen2, - sizeof(server_uuid))); - - imp->imp_connection = ptlrpc_uuid_to_connection(&server_uuid); - if (!imp->imp_connection) - RETURN(-ENOENT); - - INIT_LIST_HEAD(&imp->imp_replay_list); - INIT_LIST_HEAD(&imp->imp_sending_list); - INIT_LIST_HEAD(&imp->imp_delayed_list); - spin_lock_init(&imp->imp_lock); - - ptlrpc_init_client(rq_portal, rp_portal, name, - &obddev->obd_ldlm_client); - imp->imp_client = &obddev->obd_ldlm_client; - imp->imp_obd = obddev; - - cli->cl_max_mds_easize = sizeof(struct lov_mds_md); -#if !defined(__KERNEL__) || (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - cli->cl_sandev = 0; -#else - cli->cl_sandev.value = 0; -#endif - - RETURN(0); -} - -#ifdef __KERNEL__ -/* convert a pathname into a kdev_t */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -static kdev_t path2dev(char *path) -{ - struct dentry *dentry; - struct nameidata nd; - kdev_t dev = 0; - - if (!path_init(path, LOOKUP_FOLLOW, &nd)) - return 0; - - if (path_walk(path, &nd)) - return 0; - - dentry = nd.dentry; - if (dentry->d_inode && !is_bad_inode(dentry->d_inode) && - S_ISBLK(dentry->d_inode->i_mode)) - dev = dentry->d_inode->i_rdev; - path_release(&nd); - - return dev; -} -#else -static int path2dev(char *path) -{ - struct dentry *dentry; - struct nameidata nd; - int dev = 0; - - if (!path_init(path, LOOKUP_FOLLOW, &nd)) - return 0; - - if (path_walk(path, &nd)) - return 0; - - dentry = nd.dentry; - if 
(dentry->d_inode && !is_bad_inode(dentry->d_inode) && - S_ISBLK(dentry->d_inode->i_mode)) -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - dev = dentry->d_inode->i_rdev; -#else - dev = dentry->d_inode->i_rdev.value; -#endif - path_release(&nd); - - return dev; -} -#endif - -int client_sanobd_setup(struct obd_device *obddev, obd_count len, void *buf) -{ - struct obd_ioctl_data* data = buf; - struct client_obd *cli = &obddev->u.cli; - struct obd_import *imp = &cli->cl_import; - struct obd_uuid server_uuid; - ENTRY; - - if (data->ioc_inllen1 < 1) { - CERROR("requires a TARGET UUID\n"); - RETURN(-EINVAL); - } - - if (data->ioc_inllen1 > 37) { - CERROR("client UUID must be less than 38 characters\n"); - RETURN(-EINVAL); - } - - if (data->ioc_inllen2 < 1) { - CERROR("setup requires a SERVER UUID\n"); - RETURN(-EINVAL); - } - - if (data->ioc_inllen2 > 37) { - CERROR("target UUID must be less than 38 characters\n"); - RETURN(-EINVAL); - } - - if (data->ioc_inllen3 < 1) { - CERROR("setup requires a SAN device pathname\n"); - RETURN(-EINVAL); - } - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - cli->cl_sandev = path2dev(data->ioc_inlbuf3); - if (!cli->cl_sandev) { - CERROR("%s seems not a valid SAN device\n", data->ioc_inlbuf3); - RETURN(-EINVAL); - } -#else - cli->cl_sandev.value = path2dev(data->ioc_inlbuf3); - if (!cli->cl_sandev.value) { - CERROR("%s seems not a valid SAN device\n", data->ioc_inlbuf3); - RETURN(-EINVAL); - } -#endif - - sema_init(&cli->cl_sem, 1); - cli->cl_conn_count = 0; - memcpy(cli->cl_target_uuid.uuid, data->ioc_inlbuf1, data->ioc_inllen1); - memcpy(server_uuid.uuid, data->ioc_inlbuf2, MIN(data->ioc_inllen2, - sizeof(server_uuid))); - - imp->imp_connection = ptlrpc_uuid_to_connection(&server_uuid); - if (!imp->imp_connection) - RETURN(-ENOENT); - - INIT_LIST_HEAD(&imp->imp_replay_list); - INIT_LIST_HEAD(&imp->imp_sending_list); - INIT_LIST_HEAD(&imp->imp_delayed_list); - spin_lock_init(&imp->imp_lock); - - 
ptlrpc_init_client(OST_REQUEST_PORTAL, OSC_REPLY_PORTAL, - "sanosc", &obddev->obd_ldlm_client); - imp->imp_client = &obddev->obd_ldlm_client; - imp->imp_obd = obddev; - - cli->cl_max_mds_easize = sizeof(struct lov_mds_md); - - RETURN(0); -} -#endif - -int client_obd_cleanup(struct obd_device * obddev) -{ - struct client_obd *obd = &obddev->u.cli; - - ptlrpc_cleanup_client(&obd->cl_import); - ptlrpc_put_connection(obd->cl_import.imp_connection); - - return 0; -} - -int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd, - struct obd_uuid *cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) -{ - struct client_obd *cli = &obd->u.cli; - struct ptlrpc_request *request; - int rc, size[] = {sizeof(cli->cl_target_uuid), - sizeof(obd->obd_uuid) }; - char *tmp[] = {cli->cl_target_uuid.uuid, obd->obd_uuid.uuid}; - int rq_opc = (obd->obd_type->typ_ops->o_brw) ? OST_CONNECT :MDS_CONNECT; - struct ptlrpc_connection *c; - struct obd_import *imp = &cli->cl_import; - int msg_flags; - - ENTRY; - down(&cli->cl_sem); - rc = class_connect(conn, obd, cluuid); - if (rc) - GOTO(out_sem, rc); - - cli->cl_conn_count++; - if (cli->cl_conn_count > 1) - GOTO(out_sem, rc); - - if (obd->obd_namespace != NULL) - CERROR("already have namespace!\n"); - obd->obd_namespace = ldlm_namespace_new(obd->obd_name, - LDLM_NAMESPACE_CLIENT); - if (obd->obd_namespace == NULL) - GOTO(out_disco, rc = -ENOMEM); - - INIT_LIST_HEAD(&imp->imp_chain); - imp->imp_max_transno = 0; - imp->imp_peer_committed_transno = 0; - - request = ptlrpc_prep_req(&cli->cl_import, rq_opc, 2, size, tmp); - if (!request) - GOTO(out_ldlm, rc = -ENOMEM); - - request->rq_level = LUSTRE_CONN_NEW; - request->rq_replen = lustre_msg_size(0, NULL); - request->rq_reqmsg->addr = conn->addr; - request->rq_reqmsg->cookie = conn->cookie; - c = class_conn2export(conn)->exp_connection = - ptlrpc_connection_addref(request->rq_connection); - list_add(&imp->imp_chain, &c->c_imports); - recovd_conn_manage(c, recovd, 
recover); - - imp->imp_level = LUSTRE_CONN_CON; - rc = ptlrpc_queue_wait(request); - if (rc) - GOTO(out_req, rc); - - msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); - if (rq_opc == MDS_CONNECT || msg_flags & MSG_CONNECT_REPLAYABLE) { - imp->imp_flags |= IMP_REPLAYABLE; - CDEBUG(D_HA, "connected to replayable target: %s\n", cli->cl_target_uuid.uuid); - } - imp->imp_level = LUSTRE_CONN_FULL; - imp->imp_handle.addr = request->rq_repmsg->addr; - imp->imp_handle.cookie = request->rq_repmsg->cookie; - - EXIT; -out_req: - ptlrpc_req_finished(request); - if (rc) { -out_ldlm: - ldlm_namespace_free(obd->obd_namespace); - obd->obd_namespace = NULL; -out_disco: - cli->cl_conn_count--; - class_disconnect(conn); - } -out_sem: - up(&cli->cl_sem); - return rc; -} - -int client_obd_disconnect(struct lustre_handle *conn) -{ - struct obd_device *obd = class_conn2obd(conn); - struct client_obd *cli = &obd->u.cli; - int rq_opc; - struct ptlrpc_request *request = NULL; - int rc, err; - ENTRY; - - if (!obd) { - CERROR("invalid connection for disconnect: addr "LPX64 - ", cookie "LPX64"\n", conn ? conn->addr : -1UL, - conn ? conn->cookie : -1UL); - RETURN(-EINVAL); - } - - rq_opc = obd->obd_type->typ_ops->o_brw ? OST_DISCONNECT:MDS_DISCONNECT; - down(&cli->cl_sem); - if (!cli->cl_conn_count) { - CERROR("disconnecting disconnected device (%s)\n", - obd->obd_name); - GOTO(out_sem, rc = -EINVAL); - } - - cli->cl_conn_count--; - if (cli->cl_conn_count) - GOTO(out_no_disconnect, rc = 0); - - if (obd->obd_namespace != NULL) { - ldlm_cli_cancel_unused(obd->obd_namespace, NULL, 0); - ldlm_namespace_free(obd->obd_namespace); - obd->obd_namespace = NULL; - } - request = ptlrpc_prep_req(&cli->cl_import, rq_opc, 0, NULL, NULL); - if (!request) - GOTO(out_req, rc = -ENOMEM); - - request->rq_replen = lustre_msg_size(0, NULL); - - /* Process disconnects even if we're waiting for recovery. 
*/ - request->rq_level = LUSTRE_CONN_RECOVD; - - rc = ptlrpc_queue_wait(request); - if (rc) - GOTO(out_req, rc); - - EXIT; - out_req: - if (request) - ptlrpc_req_finished(request); - list_del_init(&cli->cl_import.imp_chain); - out_no_disconnect: - err = class_disconnect(conn); - if (!rc && err) - rc = err; - out_sem: - up(&cli->cl_sem); - RETURN(rc); -} diff --git a/lustre/lib/mds_updates.c b/lustre/lib/mds_updates.c deleted file mode 100644 index aa666ad4..0000000 --- a/lustre/lib/mds_updates.c +++ /dev/null @@ -1,604 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Lustre Lite Update Records - * - * Copyright (c) 2002, 2003 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/stat.h> -#include <linux/errno.h> -#include <linux/version.h> -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -#include <linux/locks.h> // for wait_on_buffer -#else -#include <linux/buffer_head.h> // for wait_on_buffer -#endif -#include <linux/unistd.h> - -#include <asm/system.h> -#include <asm/uaccess.h> - -#include <linux/fs.h> -#include <linux/stat.h> -#include <asm/uaccess.h> -#include <linux/slab.h> -#include <asm/segment.h> - -#define DEBUG_SUBSYSTEM S_MDS - -#include <linux/obd_support.h> -#include <linux/lustre_lib.h> -#include <linux/lustre_mds.h> -#include <linux/lustre_lite.h> - -void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode) -{ - fid->id = HTON__u64(inode->i_ino); - fid->generation = HTON__u32(inode->i_generation); - fid->f_type = HTON__u32(S_IFMT & inode->i_mode); -} - -void mds_pack_inode2body(struct mds_body *b, struct inode *inode) -{ - b->valid = OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | - OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | - OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLTYPE | OBD_MD_FLMODE | - OBD_MD_FLNLINK | OBD_MD_FLGENER; - - /* The MDS file size isn't authoritative for regular files, so don't - * even pretend. 
*/ - if (S_ISREG(inode->i_mode)) - b->valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); - - b->ino = HTON__u32(inode->i_ino); -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - b->atime = HTON__u32(inode->i_atime); - b->mtime = HTON__u32(inode->i_mtime); - b->ctime = HTON__u32(inode->i_ctime); -#else - b->atime = HTON__u32(inode->i_atime.tv_sec); - b->mtime = HTON__u32(inode->i_mtime.tv_sec); - b->ctime = HTON__u32(inode->i_ctime.tv_sec); -#endif - b->mode = HTON__u32(inode->i_mode); - b->size = HTON__u64(inode->i_size); - b->blocks = HTON__u64(inode->i_blocks); - b->uid = HTON__u32(inode->i_uid); - b->gid = HTON__u32(inode->i_gid); - b->flags = HTON__u32(inode->i_flags); - b->rdev = HTON__u32(b->rdev); - b->nlink = HTON__u32(inode->i_nlink); - b->generation = HTON__u32(inode->i_generation); - b->suppgid = HTON__u32(-1); -} - - -void mds_pack_fid(struct ll_fid *fid) -{ - fid->id = HTON__u64(fid->id); - fid->generation = HTON__u32(fid->generation); - fid->f_type = HTON__u32(fid->f_type); -} - -static void mds_pack_body(struct mds_body *b) -{ - if (b == NULL) - LBUG(); - - b->fsuid = HTON__u32(current->fsuid); - b->fsgid = HTON__u32(current->fsgid); - b->capability = HTON__u32(current->cap_effective); - - mds_pack_fid(&b->fid1); - mds_pack_fid(&b->fid2); - b->size = HTON__u64(b->size); - b->ino = HTON__u32(b->ino); - b->valid = HTON__u32(b->valid); - b->mode = HTON__u32(b->mode); - b->uid = HTON__u32(b->uid); - b->gid = HTON__u32(b->gid); - b->mtime = HTON__u32(b->mtime); - b->ctime = HTON__u32(b->ctime); - b->atime = HTON__u32(b->atime); - b->flags = HTON__u32(b->flags); - b->rdev = HTON__u32(b->rdev); - b->nlink = HTON__u32(b->nlink); - b->generation = HTON__u32(b->generation); - b->suppgid = HTON__u32(b->suppgid); -} - -void mds_getattr_pack(struct ptlrpc_request *req, int valid, int offset, - int flags, - struct inode *inode, const char *name, int namelen) -{ - struct mds_body *b; - b = lustre_msg_buf(req->rq_reqmsg, offset); - - b->fsuid = HTON__u32(current->fsuid); 
- b->fsgid = HTON__u32(current->fsgid); - b->capability = HTON__u32(current->cap_effective); - b->valid = HTON__u32(valid); - b->flags = HTON__u32(flags); - if (in_group_p(inode->i_gid)) - b->suppgid = HTON__u32(inode->i_gid); - else - b->suppgid = HTON__u32(-1); - - ll_inode2fid(&b->fid1, inode); - if (name) { - char *tmp; - tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1); - LOGL0(name, namelen, tmp); - } -} - -void mds_readdir_pack(struct ptlrpc_request *req, __u64 offset, - obd_id ino, int type, __u64 xid) -{ - struct mds_body *b; - - b = lustre_msg_buf(req->rq_reqmsg, 0); - b->fsuid = HTON__u32(current->fsuid); - b->fsgid = HTON__u32(current->fsgid); - b->capability = HTON__u32(current->cap_effective); - b->fid1.id = HTON__u64(ino); - b->fid1.f_type = HTON__u32(type); - b->size = HTON__u64(offset); - b->suppgid = HTON__u32(-1); - b->blocks = HTON__u64(xid); -} - - -void mds_pack_req_body(struct ptlrpc_request *req) -{ - struct mds_body *b = lustre_msg_buf(req->rq_reqmsg, 0); - mds_pack_body(b); -} - -void mds_pack_rep_body(struct ptlrpc_request *req) -{ - struct mds_body *b = lustre_msg_buf(req->rq_repmsg, 0); - mds_pack_body(b); -} - - -/* packing of MDS records */ -void mds_create_pack(struct ptlrpc_request *req, int offset, struct inode *dir, - __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time, - const char *name, int namelen, - const void *data, int datalen) -{ - struct mds_rec_create *rec; - char *tmp; - rec = lustre_msg_buf(req->rq_reqmsg, offset); - - rec->cr_opcode = HTON__u32(REINT_CREATE); - rec->cr_fsuid = HTON__u32(current->fsuid); - rec->cr_fsgid = HTON__u32(current->fsgid); - rec->cr_cap = HTON__u32(current->cap_effective); - ll_inode2fid(&rec->cr_fid, dir); - memset(&rec->cr_replayfid, 0, sizeof(rec->cr_replayfid)); - rec->cr_mode = HTON__u32(mode); - rec->cr_rdev = HTON__u64(rdev); - rec->cr_uid = HTON__u32(uid); - rec->cr_gid = HTON__u32(gid); - rec->cr_time = HTON__u64(time); - if (in_group_p(dir->i_gid)) - rec->cr_suppgid = 
HTON__u32(dir->i_gid); - else - rec->cr_suppgid = HTON__u32(-1); - - tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1); - LOGL0(name, namelen, tmp); - - if (data) { - tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2); - LOGL0(data, datalen, tmp); - } -} -/* packing of MDS records */ -void mds_open_pack(struct ptlrpc_request *req, int offset, struct inode *dir, - __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time, - __u32 flags, - const char *name, int namelen, - const void *data, int datalen) -{ - struct mds_rec_create *rec; - char *tmp; - rec = lustre_msg_buf(req->rq_reqmsg, offset); - - /* XXX do something about time, uid, gid */ - rec->cr_opcode = HTON__u32(REINT_OPEN); - rec->cr_fsuid = HTON__u32(current->fsuid); - rec->cr_fsgid = HTON__u32(current->fsgid); - rec->cr_cap = HTON__u32(current->cap_effective); - ll_inode2fid(&rec->cr_fid, dir); - memset(&rec->cr_replayfid, 0, sizeof(rec->cr_replayfid)); - rec->cr_mode = HTON__u32(mode); - rec->cr_flags = HTON__u32(flags); - rec->cr_rdev = HTON__u64(rdev); - rec->cr_uid = HTON__u32(uid); - rec->cr_gid = HTON__u32(gid); - rec->cr_time = HTON__u64(time); - if (in_group_p(dir->i_gid)) - rec->cr_suppgid = HTON__u32(dir->i_gid); - else - rec->cr_suppgid = HTON__u32(-1); - - tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1); - LOGL0(name, namelen, tmp); - - if (data) { - tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2); - LOGL0(data, datalen, tmp); - } -} - -void mds_setattr_pack(struct ptlrpc_request *req, - struct inode *inode, struct iattr *iattr, - void *ea, int ealen) -{ - struct mds_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, 0); - - rec->sa_opcode = HTON__u32(REINT_SETATTR); - rec->sa_fsuid = HTON__u32(current->fsuid); - rec->sa_fsgid = HTON__u32(current->fsgid); - rec->sa_cap = HTON__u32(current->cap_effective); - ll_inode2fid(&rec->sa_fid, inode); - - if (iattr) { - rec->sa_valid = HTON__u32(iattr->ia_valid); - rec->sa_mode = HTON__u32(iattr->ia_mode); - rec->sa_uid = HTON__u32(iattr->ia_uid); - 
rec->sa_gid = HTON__u32(iattr->ia_gid); - rec->sa_size = HTON__u64(iattr->ia_size); -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - rec->sa_atime = HTON__u64(iattr->ia_atime); - rec->sa_mtime = HTON__u64(iattr->ia_mtime); - rec->sa_ctime = HTON__u64(iattr->ia_ctime); -#else - rec->sa_atime = HTON__u64(iattr->ia_atime.tv_sec); - rec->sa_mtime = HTON__u64(iattr->ia_mtime.tv_sec); - rec->sa_ctime = HTON__u64(iattr->ia_ctime.tv_sec); -#endif - rec->sa_attr_flags = HTON__u32(iattr->ia_attr_flags); - - if ((iattr->ia_valid & ATTR_GID) && in_group_p(iattr->ia_gid)) - rec->sa_suppgid = HTON__u32(iattr->ia_gid); - else if ((iattr->ia_valid & ATTR_MODE) && - in_group_p(inode->i_gid)) - rec->sa_suppgid = HTON__u32(inode->i_gid); - else - rec->sa_suppgid = HTON__u32(-1); - } - - if (ealen) - memcpy(lustre_msg_buf(req->rq_reqmsg, 1), ea, ealen); -} - -void mds_unlink_pack(struct ptlrpc_request *req, int offset, - struct inode *inode, struct inode *child, __u32 mode, - const char *name, int namelen) -{ - struct mds_rec_unlink *rec; - char *tmp; - - rec = lustre_msg_buf(req->rq_reqmsg, offset); - - rec->ul_opcode = HTON__u32(REINT_UNLINK); - rec->ul_fsuid = HTON__u32(current->fsuid); - rec->ul_fsgid = HTON__u32(current->fsgid); - rec->ul_cap = HTON__u32(current->cap_effective); - rec->ul_mode = HTON__u32(mode); - if (in_group_p(inode->i_gid)) - rec->ul_suppgid = HTON__u32(inode->i_gid); - else - rec->ul_suppgid = HTON__u32(-1); - ll_inode2fid(&rec->ul_fid1, inode); - if (child) - ll_inode2fid(&rec->ul_fid2, child); - - tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1); - LOGL0(name, namelen, tmp); -} - -void mds_link_pack(struct ptlrpc_request *req, int offset, - struct inode *inode, struct inode *dir, - const char *name, int namelen) -{ - struct mds_rec_link *rec; - char *tmp; - - rec = lustre_msg_buf(req->rq_reqmsg, offset); - - rec->lk_opcode = HTON__u32(REINT_LINK); - rec->lk_fsuid = HTON__u32(current->fsuid); - rec->lk_fsgid = HTON__u32(current->fsgid); - rec->lk_cap = 
HTON__u32(current->cap_effective); - if (in_group_p(dir->i_gid)) - rec->lk_suppgid = HTON__u32(dir->i_gid); - else - rec->lk_suppgid = HTON__u32(-1); - ll_inode2fid(&rec->lk_fid1, inode); - ll_inode2fid(&rec->lk_fid2, dir); - - tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1); - LOGL0(name, namelen, tmp); -} - -void mds_rename_pack(struct ptlrpc_request *req, int offset, - struct inode *srcdir, struct inode *tgtdir, - const char *old, int oldlen, const char *new, int newlen) -{ - struct mds_rec_rename *rec; - char *tmp; - - rec = lustre_msg_buf(req->rq_reqmsg, offset); - - /* XXX do something about time, uid, gid */ - rec->rn_opcode = HTON__u32(REINT_RENAME); - rec->rn_fsuid = HTON__u32(current->fsuid); - rec->rn_fsgid = HTON__u32(current->fsgid); - rec->rn_cap = HTON__u32(current->cap_effective); - if (in_group_p(srcdir->i_gid)) - rec->rn_suppgid1 = HTON__u32(srcdir->i_gid); - else - rec->rn_suppgid1 = HTON__u32(-1); - if (in_group_p(tgtdir->i_gid)) - rec->rn_suppgid2 = HTON__u32(tgtdir->i_gid); - else - rec->rn_suppgid2 = HTON__u32(-1); - ll_inode2fid(&rec->rn_fid1, srcdir); - ll_inode2fid(&rec->rn_fid2, tgtdir); - - tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1); - LOGL0(old, oldlen, tmp); - - if (new) { - tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2); - LOGL0(new, newlen, tmp); - } -} - -/* unpacking */ -void mds_unpack_fid(struct ll_fid *fid) -{ - fid->id = NTOH__u64(fid->id); - fid->generation = NTOH__u32(fid->generation); - fid->f_type = NTOH__u32(fid->f_type); -} - -void mds_unpack_body(struct mds_body *b) -{ - if (b == NULL) - LBUG(); - - mds_unpack_fid(&b->fid1); - mds_unpack_fid(&b->fid2); - b->size = NTOH__u64(b->size); - b->blocks = NTOH__u64(b->blocks); - b->valid = NTOH__u32(b->valid); - b->fsuid = NTOH__u32(b->fsuid); - b->fsgid = NTOH__u32(b->fsgid); - b->capability = NTOH__u32(b->capability); - b->ino = NTOH__u32(b->ino); - b->mode = NTOH__u32(b->mode); - b->uid = NTOH__u32(b->uid); - b->gid = NTOH__u32(b->gid); - b->mtime = 
NTOH__u32(b->mtime); - b->ctime = NTOH__u32(b->ctime); - b->atime = NTOH__u32(b->atime); - b->flags = NTOH__u32(b->flags); - b->rdev = NTOH__u32(b->rdev); - b->nlink = NTOH__u32(b->nlink); - b->generation = NTOH__u32(b->generation); - b->suppgid = NTOH__u32(b->suppgid); -} - -static int mds_setattr_unpack(struct ptlrpc_request *req, int offset, - struct mds_update_record *r) -{ - struct iattr *attr = &r->ur_iattr; - struct mds_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, offset); - ENTRY; - - if (req->rq_reqmsg->bufcount < offset + 1 || - req->rq_reqmsg->buflens[offset] != sizeof(*rec)) - RETURN(-EFAULT); - - r->ur_fsuid = NTOH__u32(rec->sa_fsuid); - r->ur_fsgid = NTOH__u32(rec->sa_fsgid); - r->ur_cap = NTOH__u32(rec->sa_cap); - r->ur_suppgid1 = NTOH__u32(rec->sa_suppgid); - r->ur_suppgid2 = NTOH__u32(-1); - r->ur_fid1 = &rec->sa_fid; - attr->ia_valid = NTOH__u32(rec->sa_valid); - attr->ia_mode = NTOH__u32(rec->sa_mode); - attr->ia_uid = NTOH__u32(rec->sa_uid); - attr->ia_gid = NTOH__u32(rec->sa_gid); - attr->ia_size = NTOH__u64(rec->sa_size); -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - attr->ia_atime = NTOH__u64(rec->sa_atime); - attr->ia_mtime = NTOH__u64(rec->sa_mtime); - attr->ia_ctime = NTOH__u64(rec->sa_ctime); -#else - attr->ia_atime.tv_sec = NTOH__u64(rec->sa_atime); - attr->ia_mtime.tv_sec = NTOH__u64(rec->sa_mtime); - attr->ia_ctime.tv_sec = NTOH__u64(rec->sa_ctime); -#endif - attr->ia_attr_flags = NTOH__u32(rec->sa_attr_flags); - - if (req->rq_reqmsg->bufcount == offset + 2) { - r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; - r->ur_name = lustre_msg_buf(req->rq_reqmsg, offset + 1); - } else { - r->ur_namelen = 0; - } - - RETURN(0); -} - -static int mds_create_unpack(struct ptlrpc_request *req, int offset, - struct mds_update_record *r) -{ - struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, offset); - ENTRY; - - if (req->rq_reqmsg->bufcount < offset + 2 || - req->rq_reqmsg->buflens[offset] != sizeof(*rec)) - RETURN(-EFAULT); 
- - r->ur_fsuid = NTOH__u32(rec->cr_fsuid); - r->ur_fsgid = NTOH__u32(rec->cr_fsgid); - r->ur_cap = NTOH__u32(rec->cr_cap); - r->ur_fid1 = &rec->cr_fid; - r->ur_fid2 = &rec->cr_replayfid; - r->ur_mode = NTOH__u32(rec->cr_mode); - r->ur_rdev = NTOH__u64(rec->cr_rdev); - r->ur_uid = NTOH__u32(rec->cr_uid); - r->ur_gid = NTOH__u32(rec->cr_gid); - r->ur_time = NTOH__u64(rec->cr_time); - r->ur_flags = NTOH__u32(rec->cr_flags); - r->ur_suppgid1 = NTOH__u32(rec->cr_suppgid); - r->ur_suppgid2 = NTOH__u32(-1); - - r->ur_name = lustre_msg_buf(req->rq_reqmsg, offset + 1); - r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; - - if (req->rq_reqmsg->bufcount == offset + 3) { - r->ur_tgt = lustre_msg_buf(req->rq_reqmsg, offset + 2); - r->ur_tgtlen = req->rq_reqmsg->buflens[offset + 2]; - } else { - r->ur_tgt = NULL; - r->ur_tgtlen = 0; - } - RETURN(0); -} - -static int mds_link_unpack(struct ptlrpc_request *req, int offset, - struct mds_update_record *r) -{ - struct mds_rec_link *rec = lustre_msg_buf(req->rq_reqmsg, offset); - ENTRY; - - if (req->rq_reqmsg->bufcount != offset + 2 || - req->rq_reqmsg->buflens[offset] != sizeof(*rec)) - RETURN(-EFAULT); - - r->ur_fsuid = NTOH__u32(rec->lk_fsuid); - r->ur_fsgid = NTOH__u32(rec->lk_fsgid); - r->ur_cap = NTOH__u32(rec->lk_cap); - r->ur_suppgid1 = NTOH__u32(rec->lk_suppgid); - r->ur_suppgid2 = NTOH__u32(-1); - r->ur_fid1 = &rec->lk_fid1; - r->ur_fid2 = &rec->lk_fid2; - - r->ur_name = lustre_msg_buf(req->rq_reqmsg, offset + 1); - r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; - RETURN(0); -} - -static int mds_unlink_unpack(struct ptlrpc_request *req, int offset, - struct mds_update_record *r) -{ - struct mds_rec_unlink *rec = lustre_msg_buf(req->rq_reqmsg, offset); - ENTRY; - - if (req->rq_reqmsg->bufcount != offset + 2 || - req->rq_reqmsg->buflens[offset] != sizeof(*rec)) - RETURN(-EFAULT); - - r->ur_fsuid = NTOH__u32(rec->ul_fsuid); - r->ur_fsgid = NTOH__u32(rec->ul_fsgid); - r->ur_cap = NTOH__u32(rec->ul_cap); - r->ur_mode = 
NTOH__u32(rec->ul_mode); - r->ur_suppgid1 = NTOH__u32(rec->ul_suppgid); - r->ur_suppgid2 = NTOH__u32(-1); - r->ur_fid1 = &rec->ul_fid1; - r->ur_fid2 = &rec->ul_fid2; - - r->ur_name = lustre_msg_buf(req->rq_reqmsg, offset + 1); - r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; - RETURN(0); -} - -static int mds_rename_unpack(struct ptlrpc_request *req, int offset, - struct mds_update_record *r) -{ - struct mds_rec_rename *rec = lustre_msg_buf(req->rq_reqmsg, offset); - ENTRY; - - if (req->rq_reqmsg->bufcount != offset + 3 || - req->rq_reqmsg->buflens[offset] != sizeof(*rec)) - RETURN(-EFAULT); - - r->ur_fsuid = NTOH__u32(rec->rn_fsuid); - r->ur_fsgid = NTOH__u32(rec->rn_fsgid); - r->ur_cap = NTOH__u32(rec->rn_cap); - r->ur_suppgid1 = NTOH__u32(rec->rn_suppgid1); - r->ur_suppgid2 = NTOH__u32(rec->rn_suppgid2); - r->ur_fid1 = &rec->rn_fid1; - r->ur_fid2 = &rec->rn_fid2; - - r->ur_name = lustre_msg_buf(req->rq_reqmsg, offset + 1); - r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; - - r->ur_tgt = lustre_msg_buf(req->rq_reqmsg, offset + 2); - r->ur_tgtlen = req->rq_reqmsg->buflens[offset + 2]; - RETURN(0); -} - -typedef int (*update_unpacker)(struct ptlrpc_request *req, int offset, - struct mds_update_record *r); - -static update_unpacker mds_unpackers[REINT_MAX + 1] = { - [REINT_SETATTR] mds_setattr_unpack, - [REINT_CREATE] mds_create_unpack, - [REINT_LINK] mds_link_unpack, - [REINT_UNLINK] mds_unlink_unpack, - [REINT_RENAME] mds_rename_unpack, - [REINT_OPEN] mds_create_unpack, -}; - -int mds_update_unpack(struct ptlrpc_request *req, int offset, - struct mds_update_record *rec) -{ - __u32 *opcode = lustre_msg_buf(req->rq_reqmsg, offset); - int rc, realop; - ENTRY; - - if (!opcode || req->rq_reqmsg->buflens[offset] < sizeof(*opcode)) - RETURN(-EFAULT); - - realop = rec->ur_opcode = NTOH__u32(*opcode); - realop &= REINT_OPCODE_MASK; - - if (realop < 0 || realop > REINT_MAX) { - LBUG(); - RETURN(-EFAULT); - } - - rc = mds_unpackers[realop](req, offset, rec); - 
RETURN(rc); -} diff --git a/lustre/lib/obd_pack.c b/lustre/lib/obd_pack.c deleted file mode 100644 index c76ff32..0000000 --- a/lustre/lib/obd_pack.c +++ /dev/null @@ -1,64 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com> - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * (Un)packing of OST requests - * - */ - -#define DEBUG_SUBSYSTEM S_OST -#ifndef __KERNEL__ -#include <liblustre.h> -#endif - -#include <linux/obd_ost.h> -#include <linux/lustre_net.h> - -void ost_pack_ioo(struct obd_ioobj *ioo, struct lov_stripe_md *lsm, int bufcnt) -{ - ioo->ioo_id = HTON__u64(lsm->lsm_object_id); - ioo->ioo_gr = HTON__u64(0); - ioo->ioo_type = HTON__u32(S_IFREG); - ioo->ioo_bufcnt = HTON__u32(bufcnt); -} - -void ost_unpack_ioo(struct obd_ioobj *dst, struct obd_ioobj *src) -{ - dst->ioo_id = NTOH__u64(src->ioo_id); - dst->ioo_gr = NTOH__u64(src->ioo_gr); - dst->ioo_type = NTOH__u32(src->ioo_type); - dst->ioo_bufcnt = NTOH__u32(src->ioo_bufcnt); -} - -void ost_pack_niobuf(struct niobuf_remote *nb, __u64 offset, __u32 len, - __u32 flags, __u32 xid) -{ - nb->offset = HTON__u64(offset); - nb->len = HTON__u32(len); - nb->xid = HTON__u32(xid); - nb->flags = HTON__u32(flags); -} - -void ost_unpack_niobuf(struct niobuf_remote *dst, struct niobuf_remote *src) -{ - dst->offset = NTOH__u64(src->offset); - dst->len = NTOH__u32(src->len); - dst->xid = NTOH__u32(src->xid); - dst->flags = NTOH__u32(src->flags); -} diff --git a/lustre/lib/target.c b/lustre/lib/target.c deleted file mode 100644 index 82f1164..0000000 --- a/lustre/lib/target.c +++ /dev/null @@ -1,524 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001-2003 Cluster File Systems, Inc. - * Author: Peter J. Braam <braam@clusterfs.com> - * Author: Phil Schwan <phil@clusterfs.com> - * Author: Mike Shaver <shaver@clusterfs.com> - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Target-common OBD method implementations and utility functions. - */ - -#define EXPORT_SYMTAB -#define DEBUG_SUBSYSTEM S_OST /* XXX WRONG */ - -#include <linux/module.h> -#include <linux/obd_ost.h> -#include <linux/lustre_net.h> -#include <linux/lustre_dlm.h> - -int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, - struct obd_uuid *cluuid) -{ - if (exp->exp_connection) { - struct lustre_handle *hdl; - hdl = &exp->exp_ldlm_data.led_import.imp_handle; - /* Might be a re-connect after a partition. */ - if (!memcmp(conn, hdl, sizeof *conn)) { - CERROR("%s reconnecting\n", cluuid->uuid); - conn->addr = (__u64) (unsigned long)exp; - conn->cookie = exp->exp_cookie; - RETURN(EALREADY); - } else { - CERROR("%s reconnecting from %s, " - "handle mismatch (ours "LPX64"/"LPX64", " - "theirs "LPX64"/"LPX64")\n", cluuid->uuid, - exp->exp_connection->c_remote_uuid.uuid, - hdl->addr, - hdl->cookie, conn->addr, conn->cookie); - /* XXX disconnect them here? */ - memset(conn, 0, sizeof *conn); - /* This is a little scary, but right now we build this - * file separately into each server module, so I won't - * go _immediately_ to hell. 
- */ - RETURN(-EALREADY); - } - } - - conn->addr = (__u64) (unsigned long)exp; - conn->cookie = exp->exp_cookie; - CDEBUG(D_INFO, "existing export for UUID '%s' at %p\n", cluuid->uuid, exp); - CDEBUG(D_IOCTL,"connect: addr %Lx cookie %Lx\n", - (long long)conn->addr, (long long)conn->cookie); - RETURN(0); -} - - -int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) -{ - struct obd_device *target; - struct obd_export *export = NULL; - struct obd_import *dlmimp; - struct lustre_handle conn; - struct obd_uuid tgtuuid; - struct obd_uuid cluuid; - struct list_head *p; - int rc, i; - ENTRY; - - if (req->rq_reqmsg->buflens[0] > 37) { - CERROR("bad target UUID for connect\n"); - GOTO(out, rc = -EINVAL); - } - obd_str2uuid(&tgtuuid, lustre_msg_buf(req->rq_reqmsg, 0)); - - if (req->rq_reqmsg->buflens[1] > 37) { - CERROR("bad client UUID for connect\n"); - GOTO(out, rc = -EINVAL); - } - obd_str2uuid(&cluuid, lustre_msg_buf(req->rq_reqmsg, 1)); - - i = class_uuid2dev(&tgtuuid); - if (i == -1) { - CERROR("UUID '%s' not found for connect\n", tgtuuid.uuid); - GOTO(out, rc = -ENODEV); - } - - target = &obd_dev[i]; - if (!target) - GOTO(out, rc = -ENODEV); - - spin_lock_bh(&target->obd_processing_task_lock); - if (target->obd_flags & OBD_ABORT_RECOVERY) - target_abort_recovery(target); - spin_unlock_bh(&target->obd_processing_task_lock); - - conn.addr = req->rq_reqmsg->addr; - conn.cookie = req->rq_reqmsg->cookie; - - rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg); - if (rc) - GOTO(out, rc); - - /* lctl gets a backstage, all-access pass. 
*/ - if (!strcmp(cluuid.uuid, "OBD_CLASS_UUID")) - goto dont_check_exports; - - spin_lock(&target->obd_dev_lock); - list_for_each(p, &target->obd_exports) { - export = list_entry(p, struct obd_export, exp_obd_chain); - if (!memcmp(&cluuid, &export->exp_client_uuid, - sizeof(export->exp_client_uuid))) { - spin_unlock(&target->obd_dev_lock); - LASSERT(export->exp_obd == target); - - rc = target_handle_reconnect(&conn, export, &cluuid); - break; - } - export = NULL; - } - /* If we found an export, we already unlocked. */ - if (!export) - spin_unlock(&target->obd_dev_lock); - - /* Tell the client if we're in recovery. */ - /* If this is the first client, start the recovery timer */ - if (target->obd_flags & OBD_RECOVERING) { - lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); - target_start_recovery_timer(target, handler); - } - - /* Tell the client if we support replayable requests */ - if (target->obd_flags & OBD_REPLAYABLE) - lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_REPLAYABLE); - - if (!export) { - if (target->obd_flags & OBD_RECOVERING) { - CERROR("denying connection for new client %s: " - "in recovery\n", cluuid.uuid); - rc = -EBUSY; - } else { - dont_check_exports: - rc = obd_connect(&conn, target, &cluuid, ptlrpc_recovd, - target_revoke_connection); - } - } - - /* If all else goes well, this is our RPC return code. */ - req->rq_status = 0; - - if (rc && rc != EALREADY) - GOTO(out, rc); - - req->rq_repmsg->addr = conn.addr; - req->rq_repmsg->cookie = conn.cookie; - - export = class_conn2export(&conn); - LASSERT(export); - - req->rq_export = export; - export->exp_connection = ptlrpc_get_connection(&req->rq_peer, &cluuid); - if (req->rq_connection != NULL) - ptlrpc_put_connection(req->rq_connection); - req->rq_connection = ptlrpc_connection_addref(export->exp_connection); - - if (rc == EALREADY) { - /* We indicate the reconnection in a flag, not an error code. 
*/ - lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT); - GOTO(out, rc = 0); - } - - spin_lock(&export->exp_connection->c_lock); - list_add(&export->exp_conn_chain, &export->exp_connection->c_exports); - spin_unlock(&export->exp_connection->c_lock); - recovd_conn_manage(export->exp_connection, ptlrpc_recovd, - target_revoke_connection); - - dlmimp = &export->exp_ldlm_data.led_import; - dlmimp->imp_connection = req->rq_connection; - dlmimp->imp_client = &export->exp_obd->obd_ldlm_client; - dlmimp->imp_handle.addr = req->rq_reqmsg->addr; - dlmimp->imp_handle.cookie = req->rq_reqmsg->cookie; - dlmimp->imp_obd = target; - dlmimp->imp_recover = NULL; - INIT_LIST_HEAD(&dlmimp->imp_replay_list); - INIT_LIST_HEAD(&dlmimp->imp_sending_list); - INIT_LIST_HEAD(&dlmimp->imp_delayed_list); - spin_lock_init(&dlmimp->imp_lock); - dlmimp->imp_level = LUSTRE_CONN_FULL; -out: - if (rc) - req->rq_status = rc; - RETURN(rc); -} - -int target_handle_disconnect(struct ptlrpc_request *req) -{ - struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; - int rc; - ENTRY; - - rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg); - if (rc) - RETURN(rc); - - req->rq_status = obd_disconnect(conn); - req->rq_export = NULL; - RETURN(0); -} - -static int target_disconnect_client(struct ptlrpc_connection *conn) -{ - struct list_head *expiter, *n; - struct lustre_handle hdl; - struct obd_export *exp; - int rc; - ENTRY; - - list_for_each_safe(expiter, n, &conn->c_exports) { - exp = list_entry(expiter, struct obd_export, exp_conn_chain); - - CDEBUG(D_HA, "disconnecting export %p/%s\n", - exp, exp->exp_client_uuid.uuid); - hdl.addr = (__u64)(unsigned long)exp; - hdl.cookie = exp->exp_cookie; - rc = obd_disconnect(&hdl); - if (rc) - CERROR("disconnecting export %p failed: %d\n", exp, rc); - } - - /* XXX spank the connection (it's frozen in _RECOVD for now!) 
*/ - RETURN(0); -} - -static int target_fence_failed_connection(struct ptlrpc_connection *conn) -{ - ENTRY; - - conn->c_recovd_data.rd_phase = RD_PREPARED; - - RETURN(0); -} - -int target_revoke_connection(struct recovd_data *rd, int phase) -{ - struct ptlrpc_connection *conn = class_rd2conn(rd); - - LASSERT(conn); - ENTRY; - - switch (phase) { - case PTLRPC_RECOVD_PHASE_PREPARE: - RETURN(target_fence_failed_connection(conn)); - case PTLRPC_RECOVD_PHASE_RECOVER: - RETURN(target_disconnect_client(conn)); - case PTLRPC_RECOVD_PHASE_FAILURE: - LBUG(); - RETURN(0); - } - - LBUG(); - RETURN(-ENOSYS); -} - -/* - * Recovery functions - */ - -static void abort_delayed_replies(struct obd_device *obd) -{ - struct ptlrpc_request *req; - struct list_head *tmp, *n; - list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) { - req = list_entry(tmp, struct ptlrpc_request, rq_list); - DEBUG_REQ(D_ERROR, req, "aborted:"); - req->rq_status = -ENOTCONN; - req->rq_type = PTL_RPC_MSG_ERR; - ptlrpc_reply(req->rq_svc, req); - list_del(&req->rq_list); - OBD_FREE(req, sizeof *req); - } -} - -void target_abort_recovery(void *data) -{ - struct obd_device *obd = data; - CERROR("disconnecting clients and aborting recovery\n"); - obd->obd_recoverable_clients = 0; - obd->obd_flags &= ~(OBD_RECOVERING | OBD_ABORT_RECOVERY); - abort_delayed_replies(obd); - spin_unlock_bh(&obd->obd_processing_task_lock); - class_disconnect_all(obd); - spin_lock_bh(&obd->obd_processing_task_lock); -} - -static void target_recovery_expired(unsigned long castmeharder) -{ - struct obd_device *obd = (struct obd_device *)castmeharder; - CERROR("recovery timed out, aborting\n"); - spin_lock_bh(&obd->obd_processing_task_lock); - obd->obd_flags |= OBD_ABORT_RECOVERY; - wake_up(&obd->obd_next_transno_waitq); - spin_unlock_bh(&obd->obd_processing_task_lock); -} - -static void reset_recovery_timer(struct obd_device *obd) -{ - CDEBUG(D_ERROR, "timer will expire in %ld seconds\n", - OBD_RECOVERY_TIMEOUT / HZ); - 
mod_timer(&obd->obd_recovery_timer, jiffies + OBD_RECOVERY_TIMEOUT); -} - - -/* Only start it the first time called */ -void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler) -{ - spin_lock_bh(&obd->obd_processing_task_lock); - if (obd->obd_recovery_handler) { - spin_unlock_bh(&obd->obd_processing_task_lock); - return; - } - CERROR("%s: starting recovery timer\n", obd->obd_name); - obd->obd_recovery_handler = handler; - obd->obd_recovery_timer.function = target_recovery_expired; - obd->obd_recovery_timer.data = (unsigned long)obd; - init_timer(&obd->obd_recovery_timer); - spin_unlock_bh(&obd->obd_processing_task_lock); - - reset_recovery_timer(obd); -} - -static void cancel_recovery_timer(struct obd_device *obd) -{ - del_timer(&obd->obd_recovery_timer); -} - -static int check_for_next_transno(struct obd_device *obd) -{ - struct ptlrpc_request *req; - req = list_entry(obd->obd_recovery_queue.next, - struct ptlrpc_request, rq_list); - LASSERT(req->rq_reqmsg->transno >= obd->obd_next_recovery_transno); - - return req->rq_reqmsg->transno == obd->obd_next_recovery_transno || - (obd->obd_flags & OBD_RECOVERING) == 0; -} - -static void process_recovery_queue(struct obd_device *obd) -{ - struct ptlrpc_request *req; - int aborted = 0; - ENTRY; - - for (;;) { - spin_lock_bh(&obd->obd_processing_task_lock); - LASSERT(obd->obd_processing_task == current->pid); - req = list_entry(obd->obd_recovery_queue.next, - struct ptlrpc_request, rq_list); - - if (req->rq_reqmsg->transno != obd->obd_next_recovery_transno) { - struct l_wait_info lwi = { 0 }; - spin_unlock_bh(&obd->obd_processing_task_lock); - CDEBUG(D_HA, "Waiting for transno "LPD64" (1st is " - LPD64")\n", - obd->obd_next_recovery_transno, - req->rq_reqmsg->transno); - l_wait_event(obd->obd_next_transno_waitq, - check_for_next_transno(obd), &lwi); - spin_lock_bh(&obd->obd_processing_task_lock); - if (obd->obd_flags & OBD_ABORT_RECOVERY) { - target_abort_recovery(obd); - aborted = 1; - } - 
spin_unlock_bh(&obd->obd_processing_task_lock); - if (aborted) - return; - continue; - } - list_del_init(&req->rq_list); - spin_unlock_bh(&obd->obd_processing_task_lock); - - DEBUG_REQ(D_ERROR, req, "processing: "); - (void)obd->obd_recovery_handler(req); - reset_recovery_timer(obd); -#warning FIXME: mds_fsync_super(mds->mds_sb); - OBD_FREE(req, sizeof *req); - spin_lock_bh(&obd->obd_processing_task_lock); - obd->obd_next_recovery_transno++; - if (list_empty(&obd->obd_recovery_queue)) { - obd->obd_processing_task = 0; - spin_unlock_bh(&obd->obd_processing_task_lock); - break; - } - spin_unlock_bh(&obd->obd_processing_task_lock); - } - EXIT; -} - -int target_queue_recovery_request(struct ptlrpc_request *req, - struct obd_device *obd) -{ - struct list_head *tmp; - int inserted = 0; - __u64 transno = req->rq_reqmsg->transno; - struct ptlrpc_request *saved_req; - - if (!transno) { - INIT_LIST_HEAD(&req->rq_list); - DEBUG_REQ(D_HA, req, "not queueing"); - return 1; - } - - spin_lock_bh(&obd->obd_processing_task_lock); - - if (obd->obd_processing_task == current->pid) { - /* Processing the queue right now, don't re-add. */ - LASSERT(list_empty(&req->rq_list)); - spin_unlock_bh(&obd->obd_processing_task_lock); - return 1; - } - - OBD_ALLOC(saved_req, sizeof *saved_req); - if (!saved_req) - LBUG(); - memcpy(saved_req, req, sizeof *req); - req = saved_req; - INIT_LIST_HEAD(&req->rq_list); - - /* XXX O(n^2) */ - list_for_each(tmp, &obd->obd_recovery_queue) { - struct ptlrpc_request *reqiter = - list_entry(tmp, struct ptlrpc_request, rq_list); - - if (reqiter->rq_reqmsg->transno > transno) { - list_add_tail(&req->rq_list, &reqiter->rq_list); - inserted = 1; - break; - } - } - - if (!inserted) { - list_add_tail(&req->rq_list, &obd->obd_recovery_queue); - } - - if (obd->obd_processing_task != 0) { - /* Someone else is processing this queue, we'll leave it to - * them. 
- */ - if (transno == obd->obd_next_recovery_transno) - wake_up(&obd->obd_next_transno_waitq); - spin_unlock_bh(&obd->obd_processing_task_lock); - return 0; - } - - /* Nobody is processing, and we know there's (at least) one to process - * now, so we'll do the honours. - */ - obd->obd_processing_task = current->pid; - spin_unlock_bh(&obd->obd_processing_task_lock); - - process_recovery_queue(obd); - return 0; -} - -struct obd_device * target_req2obd(struct ptlrpc_request *req) -{ - return req->rq_export->exp_obd; -} - -int target_queue_final_reply(struct ptlrpc_request *req, int rc) -{ - struct obd_device *obd = target_req2obd(req); - struct ptlrpc_request *saved_req; - - spin_lock_bh(&obd->obd_processing_task_lock); - if (rc) { - /* Just like ptlrpc_error, but without the sending. */ - lustre_pack_msg(0, NULL, NULL, &req->rq_replen, - &req->rq_repmsg); - req->rq_type = PTL_RPC_MSG_ERR; - } - - LASSERT(list_empty(&req->rq_list)); - OBD_ALLOC(saved_req, sizeof *saved_req); - memcpy(saved_req, req, sizeof *saved_req); - req = saved_req; - list_add(&req->rq_list, &obd->obd_delayed_reply_queue); - if (--obd->obd_recoverable_clients == 0) { - struct list_head *tmp, *n; - ldlm_reprocess_all_ns(req->rq_export->exp_obd->obd_namespace); - CDEBUG(D_ERROR, - "all clients recovered, sending delayed replies\n"); - obd->obd_flags &= ~OBD_RECOVERING; - list_for_each_safe(tmp, n, &obd->obd_delayed_reply_queue) { - req = list_entry(tmp, struct ptlrpc_request, rq_list); - DEBUG_REQ(D_ERROR, req, "delayed:"); - ptlrpc_reply(req->rq_svc, req); - list_del(&req->rq_list); - OBD_FREE(req, sizeof *req); - } - cancel_recovery_timer(obd); - } else { - CERROR("%d recoverable clients remain\n", - obd->obd_recoverable_clients); - } - - spin_unlock_bh(&obd->obd_processing_task_lock); - return 1; -} diff --git a/lustre/liblustre/Makefile.am b/lustre/liblustre/Makefile.am index 665295e..6648aa8 100644 --- a/lustre/liblustre/Makefile.am +++ b/lustre/liblustre/Makefile.am @@ -1,18 +1,30 @@ # 
Administration utilities Makefile DEFS= -CFLAGS:=-g -O2 -I$(top_srcdir)/utils -I$(PORTALS)/include -I$(srcdir)/../include -Wall -L$(PORTALSLIB) +CFLAGS:=-g -I$(top_srcdir)/utils -I$(top_srcdir)/portals/include -I$(srcdir)/../include -I$(top_srcdir)/../libsysio/include -Wall -L../portals/utils KFLAGS:= -CPPFLAGS = $(HAVE_EFENCE) +CPPFLAGS = $(HAVE_EFENCE) -D_LARGEFILE64_SOURCE=1 LIBS = $(LIBEFENCE) -LLIBS= ../lov/liblov.a ../obdecho/libobdecho.a ../osc/libosc.a ../ldlm/libldlm.a ../ptlrpc/libptlrpc.a ../obdclass/liblustreclass.a +LLIBS= ./libllite.a ../lov/liblov.a ../obdecho/libobdecho.a ../osc/libosc.a ../ldlm/libldlm.a ../ptlrpc/libptlrpc.a ../obdclass/liblustreclass.a ../mdc/libmdc.a + +lib_LIBRARIES = libllite.a +libllite_a_SOURCES = llite_lib.c super.c file.c rw.c + +bin_PROGRAMS = libtest lltest libtest_LDADD := $(LIBREADLINE) $(LLIBS) \ - $(PORTALS)/user/procbridge/libprocbridge.a $(PORTALS)/user/tcpnal/libtcpnal.a \ - $(PORTALS)/user/util/libtcpnalutil.a $(PORTALS)/api/libptlapi.a \ - $(PORTALS)/lib/libptllib.a -lptlctl -lpthread -bin_PROGRAMS = libtest + ../portals/unals/libtpcnal.a \ + ../portals/portals/libportals.a\ + -lptlctl -lpthread libtest_SOURCES = libtest.c +lltest_LDADD := $(LIBREADLINE) $(LLIBS) \ + ../../libsysio/src/libsysio.a ../../libsysio/dev/stdfd/libsysio_stdfd.a \ + -lc \ + ../portals/unals/libtcpnal.a ../portals/portals/libportals.a \ + -lptlctl -lpthread +lltest_SOURCES = lltest.c + include $(top_srcdir)/Rules + diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c new file mode 100644 index 0000000..8344af5 --- /dev/null +++ b/lustre/liblustre/file.c @@ -0,0 +1,553 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Lustre Light Super operations + * + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. 
+ * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include <stdlib.h> +#include <string.h> +#include <error.h> +#include <assert.h> +#include <time.h> +#include <sys/types.h> +#include <sys/queue.h> + +#include <sysio.h> +#include <fs.h> +#include <mount.h> +#include <inode.h> +#include <file.h> + +#include "llite_lib.h" + +void llu_prepare_mdc_op_data(struct mdc_op_data *data, + struct inode *i1, + struct inode *i2, + const char *name, + int namelen, + int mode) +{ + struct llu_inode_info *lli1, *lli2; + + LASSERT(i1); + + lli1 = llu_i2info(i1); + data->ino1 = lli1->lli_st_ino; + data->gen1 = lli1->lli_st_generation; + data->typ1 = lli1->lli_st_mode & S_IFMT; + data->gid1 = lli1->lli_st_gid; + + if (i2) { + lli2 = llu_i2info(i2); + data->ino2 = lli2->lli_st_ino; + data->gen2 = lli2->lli_st_generation; + data->typ2 = lli2->lli_st_mode & S_IFMT; + data->gid2 = lli2->lli_st_gid; + } else + data->ino2 = 0; + + data->name = name; + data->namelen = namelen; + data->mode = mode; +} + +static struct inode *llu_create_node(struct inode *dir, const char *name, + int namelen, const void *data, int datalen, + int mode, __u64 extra, + struct lookup_intent *it) +{ + struct inode *inode; + struct ptlrpc_request *request = NULL; + struct mds_body *body; + time_t time = 123456;//time(NULL); + struct llu_sb_info *sbi = llu_i2sbi(dir); + + if (it && it->it_disposition) 
{ + LBUG(); +#if 0 + ll_invalidate_inode_pages(dir); +#endif + request = it->it_data; + body = lustre_msg_buf(request->rq_repmsg, 1, sizeof(*body)); + } else { + struct mdc_op_data op_data; + struct llu_inode_info *lli_dir = llu_i2info(dir); + int gid = current->fsgid; + int rc; + + if (lli_dir->lli_st_mode & S_ISGID) { + gid = lli_dir->lli_st_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } + + llu_prepare_mdc_op_data(&op_data, dir, NULL, name, namelen, 0); + rc = mdc_create(&sbi->ll_mdc_conn, &op_data, + data, datalen, mode, current->fsuid, gid, + time, extra, &request); + if (rc) { + inode = (struct inode*)rc; + goto out; + } + body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body)); + } + + inode = llu_new_inode(dir->i_fs, body->ino, body->mode); + if (!inode) { + /* FIXME more cleanup needed? */ + goto out; + } + + llu_update_inode(inode, body, NULL); + + if (it && it->it_disposition) { + /* We asked for a lock on the directory, but were + * granted a lock on the inode. Since we finally have + * an inode pointer, stuff it in the lock. 
*/ +#if 0 + ll_mdc_lock_set_inode((struct lustre_handle *)it->it_lock_handle, + inode); +#endif + } + + out: + ptlrpc_req_finished(request); + return inode; +} + +int llu_create(struct inode *dir, struct pnode_base *pnode, int mode) +{ + struct inode *inode; +#if 0 + int rc = 0; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu,intent=%s\n", + dentry->d_name.name, dir->i_ino, LL_IT2STR(dentry->d_it)); + + it = dentry->d_it; + + rc = ll_it_open_error(IT_OPEN_CREATE, it); + if (rc) { + LL_GET_INTENT(dentry, it); + ptlrpc_req_finished(it->it_data); + RETURN(rc); + } +#endif + inode = llu_create_node(dir, pnode->pb_name.name, pnode->pb_name.len, + NULL, 0, mode, 0, NULL); + + if (IS_ERR(inode)) + RETURN(PTR_ERR(inode)); + + pnode->pb_ino = inode; + + return 0; +} + +static int llu_create_obj(struct lustre_handle *conn, struct inode *inode, + struct lov_stripe_md *lsm) +{ + struct ptlrpc_request *req = NULL; + struct llu_inode_info *lli = llu_i2info(inode); + struct lov_mds_md *lmm = NULL; + struct obdo *oa; + struct iattr iattr; + struct mdc_op_data op_data; + int rc, err, lmm_size = 0;; + ENTRY; + + oa = obdo_alloc(); + if (!oa) + RETURN(-ENOMEM); + + oa->o_mode = S_IFREG | 0600; + oa->o_id = lli->lli_st_ino; + /* Keep these 0 for now, because chown/chgrp does not change the + * ownership on the OST, and we don't want to allow BA OST NFS + * users to access these objects by mistake. 
+ */ + oa->o_uid = 0; + oa->o_gid = 0; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE | + OBD_MD_FLUID | OBD_MD_FLGID; + + rc = obd_create(conn, oa, &lsm, NULL); + if (rc) { + CERROR("error creating objects for inode %lu: rc = %d\n", + lli->lli_st_ino, rc); + if (rc > 0) { + CERROR("obd_create returned invalid rc %d\n", rc); + rc = -EIO; + } + GOTO(out_oa, rc); + } + + LASSERT(lsm && lsm->lsm_object_id); + rc = obd_packmd(conn, &lmm, lsm); + if (rc < 0) + GOTO(out_destroy, rc); + + lmm_size = rc; + + /* Save the stripe MD with this file on the MDS */ + memset(&iattr, 0, sizeof(iattr)); + iattr.ia_valid = ATTR_FROM_OPEN; + + llu_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); + + rc = mdc_setattr(&llu_i2sbi(inode)->ll_mdc_conn, &op_data, + &iattr, lmm, lmm_size, &req); + ptlrpc_req_finished(req); + + obd_free_diskmd(conn, &lmm); + + /* If we couldn't complete mdc_open() and store the stripe MD on the + * MDS, we need to destroy the objects now or they will be leaked. + */ + if (rc) { + CERROR("error: storing stripe MD for %lu: rc %d\n", + lli->lli_st_ino, rc); + GOTO(out_destroy, rc); + } + lli->lli_smd = lsm; + + EXIT; +out_oa: + obdo_free(oa); + return rc; + +out_destroy: + obdo_from_inode(oa, inode, OBD_MD_FLTYPE); + oa->o_id = lsm->lsm_object_id; + oa->o_valid |= OBD_MD_FLID; + err = obd_destroy(conn, oa, lsm, NULL); + obd_free_memmd(conn, &lsm); + if (err) { + CERROR("error uncreating inode %lu objects: rc %d\n", + lli->lli_st_ino, err); + } + goto out_oa; +} + +/* FIXME currently no "it" passed in */ +static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it) +{ + struct ll_file_data *fd; +#if 0 + struct ptlrpc_request *req = it->it_data; + struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1); + ENTRY; +#endif + LASSERT(!lli->lli_file_data); + + fd = malloc(sizeof(struct ll_file_data)); + /* We can't handle this well without reorganizing ll_file_open and + * ll_mdc_close, so don't even try right now. 
*/ + LASSERT(fd != NULL); + + memset(fd, 0, sizeof(*fd)); +#if 0 + memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle)); + fd->fd_mds_och.och_req = it->it_data; +#endif + lli->lli_file_data = fd; + + RETURN(0); +} + +static int llu_osc_open(struct lustre_handle *conn, struct inode *inode, + struct lov_stripe_md *lsm) +{ + struct ll_file_data *fd = llu_i2info(inode)->lli_file_data; + struct obdo *oa; + int rc; + ENTRY; + + oa = obdo_alloc(); + if (!oa) + RETURN(-ENOMEM); + oa->o_id = lsm->lsm_object_id; + oa->o_mode = S_IFREG; + oa->o_valid = (OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLBLOCKS | + OBD_MD_FLMTIME | OBD_MD_FLCTIME); + rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och); + if (rc) + GOTO(out, rc); + +// file->f_flags &= ~O_LOV_DELAY_CREATE; + obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | + OBD_MD_FLCTIME); + + EXIT; +out: + obdo_free(oa); + return rc; +} + +static int llu_file_open(struct inode *inode) +{ +#if 0 + struct llu_sb_info *sbi = llu_i2sbi(inode); +#endif + struct llu_inode_info *lli = llu_i2info(inode); + struct lustre_handle *conn = llu_i2obdconn(inode); + struct lookup_intent *it; + struct lov_stripe_md *lsm; + int rc = 0; + +#if 0 + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino); + LL_GET_INTENT(file->f_dentry, it); + rc = ll_it_open_error(IT_OPEN_OPEN, it); + if (rc) + RETURN(rc); +#endif + rc = llu_local_open(lli, it); + if (rc) + LBUG(); +#if 0 + mdc_set_open_replay_data(&((struct ll_file_data *) + file->private_data)->fd_mds_och); +#endif + lsm = lli->lli_smd; + if (lsm == NULL) { +#if 0 + if (file->f_flags & O_LOV_DELAY_CREATE) { + CDEBUG(D_INODE, "delaying object creation\n"); + RETURN(0); + } +#endif + if (!lli->lli_smd) { + rc = llu_create_obj(conn, inode, NULL); + if (rc) + GOTO(out_close, rc); + } else { + CERROR("warning: stripe already set on ino %lu\n", + lli->lli_st_ino); + } + lsm = lli->lli_smd; + } + + rc = llu_osc_open(conn, inode, lsm); + if (rc) + GOTO(out_close, rc); + RETURN(0); + + 
out_close: +// ll_mdc_close(&sbi->ll_mdc_conn, inode, file); + return rc; +} + +int llu_iop_open(struct pnode *pnode, int flags, mode_t mode) +{ + struct inode *dir = pnode->p_parent->p_base->pb_ino; + int rc; + /* FIXME later we must add the ldlm here */ + + LASSERT(dir); + + /* libsysio forgot to guarantee mode is valid XXX */ + mode |= S_IFREG; + + if (!pnode->p_base->pb_ino) { + rc = llu_create(dir, pnode->p_base, mode); + if (rc) + return rc; + } + + LASSERT(pnode->p_base->pb_ino); + return llu_file_open(pnode->p_base->pb_ino); +} + + +static int llu_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode) +{ + struct llu_inode_info *lli = llu_i2info(inode); + struct ll_file_data *fd = lli->lli_file_data; + struct ptlrpc_request *req = NULL; + unsigned long flags; + struct obd_import *imp; + int rc = 0; /* must start at 0: the mdc_close() path below is compiled out, and rc is returned */ + + /* FIXME add following code later FIXME */ +#if 0 + /* Complete the open request and remove it from replay list */ + rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, lli->lli_st_ino, + inode->i_mode, &fd->fd_mds_och.och_fh, &req); + if (rc) + CERROR("inode %lu close failed: rc = %d\n", + lli->lli_st_ino, rc); + + imp = fd->fd_mds_och.och_req->rq_import; + LASSERT(imp != NULL); + spin_lock_irqsave(&imp->imp_lock, flags); + + DEBUG_REQ(D_HA, fd->fd_mds_och.och_req, "matched open req %p", + fd->fd_mds_och.och_req); + + /* We held on to the request for replay until we saw a close for that + * file. Now that we've closed it, it gets replayed on the basis of + * its transno only. */ + spin_lock (&fd->fd_mds_och.och_req->rq_lock); + fd->fd_mds_och.och_req->rq_replay = 0; + spin_unlock (&fd->fd_mds_och.och_req->rq_lock); + + if (fd->fd_mds_och.och_req->rq_transno) { + /* This open created a file, so it needs replay as a + * normal transaction now. Our reference to it now + * effectively owned by the imp_replay_list, and it'll + * be committed just like other transno-having + * requests from here on out. 
*/ + + /* We now retain this close request, so that it is + * replayed if the open is replayed. We duplicate the + * transno, so that we get freed at the right time, + * and rely on the difference in xid to keep + * everything ordered correctly. + * + * But! If this close was already given a transno + * (because it caused real unlinking of an + * open-unlinked file, f.e.), then we'll be ordered on + * the basis of that and we don't need to do anything + * magical here. */ + if (!req->rq_transno) { + req->rq_transno = fd->fd_mds_och.och_req->rq_transno; + ptlrpc_retain_replayable_request(req, imp); + } + spin_unlock_irqrestore(&imp->imp_lock, flags); + + /* Should we free_committed now? we always free before + * replay, so it's probably a wash. We could check to + * see if the fd_req should already be committed, in + * which case we can avoid the whole retain_replayable + * dance. */ + } else { + /* No transno means that we can just drop our ref. */ + spin_unlock_irqrestore(&imp->imp_lock, flags); + } + ptlrpc_req_finished(fd->fd_mds_och.och_req); + + /* Do this after the fd_req->rq_transno check, because we don't want + * to bounce off zero references. */ + ptlrpc_req_finished(req); + fd->fd_mds_och.och_fh.cookie = DEAD_HANDLE_MAGIC; +#endif + lli->lli_file_data = NULL; + free(fd); + + RETURN(-abs(rc)); +} + +static int llu_file_release(struct inode *inode) +{ + struct llu_sb_info *sbi = llu_i2sbi(inode); + struct llu_inode_info *lli = llu_i2info(inode); + struct lov_stripe_md *lsm = lli->lli_smd; + struct ll_file_data *fd; + struct obdo oa; + int rc = 0, rc2; + + fd = lli->lli_file_data; + if (!fd) /* no process opened the file after an mcreate */ + RETURN(rc = 0); + + /* we might not be able to get a valid handle on this file + * again so we really want to flush our write cache.. 
*/ + if (S_ISREG(inode->i_mode) && lsm) { + memset(&oa, 0, sizeof(oa)); + oa.o_id = lsm->lsm_object_id; + oa.o_mode = S_IFREG; + oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID; + + memcpy(&oa.o_inline, &fd->fd_ost_och, FD_OSTDATA_SIZE); + oa.o_valid |= OBD_MD_FLHANDLE; + + rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL); + if (rc) + CERROR("inode %lu object close failed: rc = " + "%d\n", lli->lli_st_ino, rc); + } + + rc2 = llu_mdc_close(&sbi->ll_mdc_conn, inode); + if (rc2 && !rc) + rc = rc2; + + RETURN(rc); +} + +int llu_iop_close(struct inode *inode) +{ + return llu_file_release(inode); +} + +int llu_iop_ipreadv(struct inode *ino, + struct io_arguments *ioargs, + struct ioctx **ioctxp) +{ + struct ioctx *ioctx; + + if (!ioargs->ioarg_iovlen) + return 0; + if (ioargs->ioarg_iovlen < 0) + return -EINVAL; + + ioctx = _sysio_ioctx_new(ino, ioargs); + if (!ioctx) + return -ENOMEM; + + ioctx->ioctx_cc = llu_file_read(ino, + ioctx->ioctx_iovec, + ioctx->ioctx_iovlen, + ioctx->ioctx_offset); + if (ioctx->ioctx_cc < 0) + ioctx->ioctx_errno = ioctx->ioctx_cc; + + *ioctxp = ioctx; + return 0; +} + +int llu_iop_ipwritev(struct inode *ino, + struct io_arguments *ioargs, + struct ioctx **ioctxp) +{ + struct ioctx *ioctx; + + if (!ioargs->ioarg_iovlen) + return 0; + if (ioargs->ioarg_iovlen < 0) + return -EINVAL; + + ioctx = _sysio_ioctx_new(ino, ioargs); + if (!ioctx) + return -ENOMEM; + + ioctx->ioctx_cc = llu_file_write(ino, + ioctx->ioctx_iovec, + ioctx->ioctx_iovlen, + ioctx->ioctx_offset); + if (ioctx->ioctx_cc < 0) + ioctx->ioctx_errno = ioctx->ioctx_cc; + + *ioctxp = ioctx; + return 0; +} + diff --git a/lustre/liblustre/libtest.c b/lustre/liblustre/libtest.c index c344198..1d523a6 100644 --- a/lustre/liblustre/libtest.c +++ b/lustre/liblustre/libtest.c @@ -12,6 +12,28 @@ #include <linux/obd_class.h> #include <portals/procbridge.h> +struct ldlm_namespace; +struct ldlm_res_id; +struct obd_import; + +extern int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, struct 
ldlm_res_id *res_id, int flags); +extern int ldlm_namespace_cleanup(struct ldlm_namespace *ns, int local_only); +extern int ldlm_replay_locks(struct obd_import *imp); + +void *inter_module_get(char *arg) +{ + if (!strcmp(arg, "tcpnal_ni")) + return &tcpnal_ni; + else if (!strcmp(arg, "ldlm_cli_cancel_unused")) + return ldlm_cli_cancel_unused; + else if (!strcmp(arg, "ldlm_namespace_cleanup")) + return ldlm_namespace_cleanup; + else if (!strcmp(arg, "ldlm_replay_locks")) + return ldlm_replay_locks; + else + return NULL; +} + ptl_handle_ni_t tcpnal_ni; struct pingcli_args { @@ -27,7 +49,7 @@ struct task_struct *current; struct obd_class_user_state ocus; /* portals interfaces */ -inline const ptl_handle_ni_t * +ptl_handle_ni_t * kportal_get_ni (int nal) { return &tcpnal_ni; @@ -101,10 +123,10 @@ int main(int argc, char **argv) init_lib_portals(args); ptlrpc_init(); ldlm_init(); + mdc_init(); + lov_init(); osc_init(); echo_client_init(); - /* XXX need mdc_getlovinfo before lov_init can work.. */ - // lov_init(); parse_dump("/tmp/DUMP_FILE", lib_ioctl); diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c new file mode 100644 index 0000000..b11de88 --- /dev/null +++ b/lustre/liblustre/llite_lib.c @@ -0,0 +1,226 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Lustre Light Super operations + * + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include <stdlib.h> +#include <string.h> +#include <error.h> +#include <assert.h> +#include <sys/types.h> +#include <sys/queue.h> + +#include <sysio.h> +#include <fs.h> +#include <mount.h> +#include <inode.h> +#include <file.h> + +#include <netinet/in.h> +#include <sys/socket.h> +#include <arpa/inet.h> + +#include <portals/api-support.h> /* needed for ptlctl.h */ +#include <portals/ptlctl.h> /* needed for parse_dump */ + +#include "llite_lib.h" + + +ptl_handle_ni_t tcpnal_ni; +struct task_struct *current; +struct obd_class_user_state ocus; + +/* portals interfaces */ +ptl_handle_ni_t * +kportal_get_ni (int nal) +{ + return &tcpnal_ni; +} + +inline void +kportal_put_ni (int nal) +{ + return; +} + +struct ldlm_namespace; +struct ldlm_res_id; +struct obd_import; + +extern int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, struct ldlm_res_id *res_id, int flags); +extern int ldlm_namespace_cleanup(struct ldlm_namespace *ns, int local_only); +extern int ldlm_replay_locks(struct obd_import *imp); + +void *inter_module_get(char *arg) +{ + if (!strcmp(arg, "tcpnal_ni")) + return &tcpnal_ni; + else if (!strcmp(arg, "ldlm_cli_cancel_unused")) + return ldlm_cli_cancel_unused; + else if (!strcmp(arg, "ldlm_namespace_cleanup")) + return ldlm_namespace_cleanup; + else if (!strcmp(arg, "ldlm_replay_locks")) + return ldlm_replay_locks; + else + return NULL; +} + +void init_current(char *comm) +{ + current = malloc(sizeof(*current)); + current->fs = malloc(sizeof(*current->fs)); + current->fs->umask = umask(0777); + umask(current->fs->umask); + strncpy(current->comm, comm, sizeof(current->comm)); + current->pid = getpid(); + current->fsuid = 0; + current->fsgid = 0; + current->cap_effective = 0; + memset(&current->pending, 0, sizeof(current->pending)); 
+} + +ptl_nid_t tcpnal_mynid; + +int init_lib_portals() +{ + int rc; + + PtlInit(); + rc = PtlNIInit(procbridge_interface, 0, 0, 0, &tcpnal_ni); + if (rc != 0) { + CERROR("ksocknal: PtlNIInit failed: error %d\n", rc); + PtlFini(); + RETURN (rc); + } + PtlNIDebug(tcpnal_ni, ~0); + return rc; +} + +extern int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, unsigned long arg); + +struct mount_option_s mount_option = {NULL, NULL}; + +/* FIXME simple arg parser FIXME */ +void parse_mount_options(void *arg) +{ + char *buf = NULL; + struct obd_ioctl_data *data; + char *ptr, *comma, *eq, **tgt, *v; + int len; + + if (obd_ioctl_getdata(&buf, &len, arg)) { + CERROR("OBD ioctl: data error\n"); + return; + } + data = (struct obd_ioctl_data *)buf; + ptr = data->ioc_inlbuf1; + printf("mount option: %s\n", ptr); + + while (ptr) { + eq = strchr(ptr, '='); + if (!eq) + return; + + *eq = 0; + if (!strcmp("osc", ptr)) + tgt = &mount_option.osc_uuid; + else if (!strcmp("mdc", ptr)) + tgt = &mount_option.mdc_uuid; + else { + printf("Unknown mount option %s\n", ptr); + return; + } + + v = eq + 1; + comma = strchr(v, ','); + if (comma) { + *comma = 0; + ptr = comma + 1; + } else + ptr = NULL; + + *tgt = malloc(strlen(v)+1); + strcpy(*tgt, v); + } + + if (buf) + obd_ioctl_freedata(buf, len); +} + +int lib_ioctl(int dev_id, int opc, void * ptr) +{ + int rc; + + if (dev_id == OBD_DEV_ID) { + struct obd_ioctl_data *ioc = ptr; + + if (opc == OBD_IOC_MOUNTOPT) { + parse_mount_options(ptr); + return 0; + } + + rc = class_handle_ioctl(&ocus, opc, (unsigned long)ptr); + + /* you _may_ need to call obd_ioctl_unpack or some + other verification function if you want to use ioc + directly here */ + printf ("processing ioctl cmd: %x buf len: %d, rc %d\n", + opc, ioc->ioc_len, rc); + + if (rc) + return rc; + } + return (0); +} + +int lllib_init(char *arg) +{ + tcpnal_mynid = ntohl(inet_addr(arg)); + INIT_LIST_HEAD(&ocus.ocus_conns); + + init_current("dummy"); + if 
(init_obdclass() || + init_lib_portals() || + ptlrpc_init() || + ldlm_init() || + mdc_init() || + lov_init() || + osc_init()) + return -1; + + if (parse_dump("/tmp/DUMP_FILE", lib_ioctl)) + return -1; + + return _sysio_fssw_register("llite", &llu_fssw_ops); +} + +/* FIXME */ +void generate_random_uuid(unsigned char uuid_out[16]) +{ + int *arr = (int*)uuid_out; + int i; + + for (i = 0; i < 16/sizeof(int); i++) /* not sizeof(uuid_out): an array parameter is a pointer */ + arr[i] = rand(); +} + diff --git a/lustre/liblustre/llite_lib.h b/lustre/liblustre/llite_lib.h new file mode 100644 index 0000000..ce2e23b --- /dev/null +++ b/lustre/liblustre/llite_lib.h @@ -0,0 +1,135 @@ +#ifndef __LLU_H_ +#define __LLU_H_ + +#include <liblustre.h> +#include <linux/obd.h> +#include <linux/obd_class.h> +#include <portals/procbridge.h> +#include <linux/lustre_lite.h> + +#include <sys/types.h> +#include <sys/stat.h> + +struct ll_file_data { + struct obd_client_handle fd_mds_och; + struct obd_client_handle fd_ost_och; + __u32 fd_flags; +}; + +struct llu_sb_info +{ + struct obd_uuid ll_sb_uuid; + struct lustre_handle ll_mdc_conn; + struct lustre_handle ll_osc_conn; + obd_id ll_rootino; + int ll_flags; + struct list_head ll_conn_chain; +}; + +struct llu_inode_info { + struct llu_sb_info *lli_sbi; + struct ll_fid lli_fid; + struct lov_stripe_md *lli_smd; + char *lli_symlink_name; + /*struct semaphore lli_open_sem;*/ + unsigned long lli_flags; + struct list_head lli_read_extents; + + /* in libsysio we have no chance to store data in file, + * so place it here */ + struct ll_file_data *lli_file_data; + + /* stat FIXME not 64 bit clean */ + dev_t lli_st_dev; + ino_t lli_st_ino; + mode_t lli_st_mode; + nlink_t lli_st_nlink; + uid_t lli_st_uid; + gid_t lli_st_gid; + dev_t lli_st_rdev; + loff_t lli_st_size; + unsigned int lli_st_blksize; + unsigned int lli_st_blocks; + time_t lli_st_atime; + time_t lli_st_mtime; + time_t lli_st_ctime; + + /* not for stat, change it later */ + int lli_st_flags; + unsigned long lli_st_generation; +}; + 
+static inline struct llu_sb_info *llu_fs2sbi(struct filesys *fs) +{ + return (struct llu_sb_info*)(fs->fs_private); +} + +static inline struct llu_inode_info *llu_i2info(struct inode *inode) +{ + return (struct llu_inode_info*)(inode->i_private); +} + +static inline struct llu_sb_info *llu_i2sbi(struct inode *inode) +{ + return llu_i2info(inode)->lli_sbi; +} + +static inline struct client_obd *sbi2mdc(struct llu_sb_info *sbi) +{ + struct obd_device *obd = class_conn2obd(&sbi->ll_mdc_conn); + if (obd == NULL) + LBUG(); + return &obd->u.cli; +} + +static inline struct lustre_handle *llu_i2obdconn(struct inode *inode) +{ + return &(llu_i2info(inode)->lli_sbi->ll_osc_conn); +} + + +struct mount_option_s +{ + char *mdc_uuid; + char *osc_uuid; +}; + +/* llite_lib.c */ +void generate_random_uuid(unsigned char uuid_out[16]); + +extern struct mount_option_s mount_option; + +/* super.c */ +void llu_update_inode(struct inode *inode, struct mds_body *body, + struct lov_stripe_md *lmm); +void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid); +void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid); +struct inode* llu_new_inode(struct filesys *fs, ino_t ino, mode_t mode); + +extern struct fssw_ops llu_fssw_ops; + +/* file.c */ +void llu_prepare_mdc_op_data(struct mdc_op_data *data, + struct inode *i1, + struct inode *i2, + const char *name, + int namelen, + int mode); +int llu_create(struct inode *dir, struct pnode_base *pnode, int mode); +int llu_iop_open(struct pnode *pnode, int flags, mode_t mode); +int llu_iop_close(struct inode *inode); +int llu_iop_ipreadv(struct inode *ino, + struct io_arguments *ioargs, + struct ioctx **ioctxp); +int llu_iop_ipwritev(struct inode *ino, + struct io_arguments *ioargs, + struct ioctx **ioctxp); + +/* rw.c */ +int llu_iop_iodone(struct ioctx *ioctxp __IS_UNUSED); +ssize_t llu_file_write(struct inode *inode, const struct iovec *iovec, + size_t iovlen, loff_t pos); +ssize_t llu_file_read(struct inode 
*inode, const struct iovec *iovec, + size_t iovlen, loff_t pos); + +#endif diff --git a/lustre/liblustre/lltest.c b/lustre/liblustre/lltest.c new file mode 100644 index 0000000..acdc47e --- /dev/null +++ b/lustre/liblustre/lltest.c @@ -0,0 +1,159 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Lustre Light user test program + * + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define _BSD_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <getopt.h> +#include <errno.h> +#include <string.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/queue.h> +#include <sys/statvfs.h> + +#include <sysio.h> +#include <mount.h> + + +int do_stat(const char *name) +{ + struct stat stat; + + if (lstat(name, &stat)) { + perror("failed to stat: "); + return -1; + } + printf("******* stat '%s' ********\n", name); + printf("ino:\t\t%lu\n",stat.st_ino); + printf("mode:\t\t%o\n",stat.st_mode); + printf("nlink:\t\t%d\n",stat.st_nlink); + printf("uid/gid:\t%d/%d\n", stat.st_uid, stat.st_gid); + printf("size:\t\t%ld\n", stat.st_size); + printf("blksize:\t%ld\n", stat.st_blksize); + printf("block count:\t%ld\n", stat.st_blocks); + printf("atime:\t\t%lu\n",stat.st_atime); + printf("mtime:\t\t%lu\n",stat.st_mtime); + printf("ctime:\t\t%lu\n",stat.st_ctime); + printf("******* end stat ********\n"); + + return 0; +} +/* + * Get stats of file and file system. + * + * Usage: test_stats [-a] [-r <root-path>] [-m <root-driver>] [<path> ...] 
+ */ + +extern int lllib_init(char *arg); + +char *root_driver = "llite"; +char *root_path = "/"; +unsigned mntflgs = 0; +struct mount root_mount; + +extern int portal_debug; +extern int portal_subsystem_debug; + +char* files[] = {"/dir1", "/dir1/file1", "/dir1/file2", "/dir1/dir2", "/dir1/dir2/file3"}; + +int +main(int argc, char * const argv[]) +{ + struct stat statbuf; + int rc, err, i, fd, written, readed; + char pgbuf[4096], readbuf[4096]; + int npages; + + if (_sysio_init() != 0) { + perror("init sysio"); + exit(1); + } + err = lllib_init(argv[1]); + if (err) { + perror("init llite driver"); + exit(1); + } + + err = _sysio_mount_root(root_path, root_driver, mntflgs, NULL); + if (err) { + errno = -err; + perror(root_driver); + exit(1); + } +#if 0 + for (i=0; i< sizeof(files)/sizeof(char*); i++) { + printf("******** stat %s *********\n", files[i]); + /* XXX ugly, only for testing */ + err = fixme_lstat(files[i], &statbuf); + if (err) + perror(root_driver); + printf("******** end stat %s: %d*********\n", files[i], err); + } +#endif +#if 0 + portal_debug = 0; + portal_subsystem_debug = 0; + npages = 10; + + fd = open("/newfile01", O_RDWR|O_CREAT|O_TRUNC, 00664); + printf("***************** open return %d ****************\n", fd); + + printf("***************** begin write pages ****************\n"); + for (i = 0; i < npages; i++ ) { + memset(pgbuf, ('A'+ i%10), 4096); + written = write(fd, pgbuf, 4096); + printf(">>> page %d: %d bytes written\n", i, written); + } + + printf("***************** begin read pages ****************\n"); + lseek(fd, 0, SEEK_SET); + + for (i = 0; i < npages; i++ ) { + memset(readbuf, '8', 4096); + readed = read(fd, readbuf, 4096); + readbuf[10] = 0; + printf("<<< page %d: %d bytes (%s)\n", i, readed, readbuf); + } + close(fd); +#endif + +#if 1 + //rc = chown("/newfile01", 10, 20); + rc = chmod("/newfile01", 0777); + printf("-------------- chmod return %d -----------\n", rc); + do_stat("/newfile01"); +#endif + + printf("sysio is about 
shutdown\n"); + /* + * Clean up. + */ + _sysio_shutdown(); + + printf("complete successfully\n"); + return 0; +} diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c new file mode 100644 index 0000000..847b1d0 --- /dev/null +++ b/lustre/liblustre/rw.c @@ -0,0 +1,519 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Lustre Light Super operations + * + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include <stdlib.h> +#include <string.h> +#include <error.h> +#include <assert.h> +#include <time.h> +#include <sys/types.h> +#include <sys/queue.h> + +#include <sysio.h> +#include <fs.h> +#include <mount.h> +#include <inode.h> +#include <file.h> + +#include "llite_lib.h" + +int llu_iop_iodone(struct ioctx *ioctxp __IS_UNUSED) +{ + return 1; +} + +/* + * this grabs a lock and manually implements behaviour that makes it look + * like the OST is returning the file size with each lock acquisition + */ +int llu_extent_lock(struct ll_file_data *fd, struct inode *inode, + struct lov_stripe_md *lsm, + int mode, struct ldlm_extent *extent, + struct lustre_handle *lockh) +{ +#if 0 + struct ll_inode_info *lli = ll_i2info(inode); + int rc; + ENTRY; + + rc = ll_extent_lock_no_validate(fd, inode, lsm, mode, extent, lockh); + if (rc != ELDLM_OK) + RETURN(rc); + + /* always do a getattr for the first person to pop out of lock + * acquisition.. the DID_GETATTR flag and semaphore serialize + * this initial race. we used to make a decision based on whether + * the lock was matched or acquired, but the matcher could win the + * waking race with the first issuer so that was no good.. + */ + if (test_bit(LLI_F_DID_GETATTR, &lli->lli_flags)) + RETURN(ELDLM_OK); + + down(&lli->lli_getattr_sem); + + if (!test_bit(LLI_F_DID_GETATTR, &lli->lli_flags)) { + rc = ll_inode_getattr(inode, lsm, fd ? &fd->fd_ost_och : NULL); + if (rc == 0) { + set_bit(LLI_F_DID_GETATTR, &lli->lli_flags); + } else { + /* XXX can this fail? */ + ll_extent_unlock(fd, inode, lsm, mode, lockh); + } + } + + up(&lli->lli_getattr_sem); + RETURN(rc); +#else + return ELDLM_OK; +#endif +} + +int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode, + struct lov_stripe_md *lsm, int mode, + struct lustre_handle *lockh) +{ +#if 0 + struct ll_sb_info *sbi = ll_i2sbi(inode); + int rc; + ENTRY; + + /* XXX phil: can we do this? won't it screw the file size up? 
*/ + if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) || + (sbi->ll_flags & LL_SBI_NOLCK)) + RETURN(0); + + rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockh); + + RETURN(rc); +#else + return 0; +#endif +} + +static int llu_brw(int cmd, struct inode *inode, struct page *page, int flags) +{ + struct llu_inode_info *lli = llu_i2info(inode); + struct lov_stripe_md *lsm = lli->lli_smd; + struct brw_page pg; + int rc; + ENTRY; + + pg.pg = page; + pg.off = ((obd_off)page->index) << PAGE_SHIFT; + + /* FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME */ +#if 0 + if (cmd == OBD_BRW_WRITE && (pg.off + PAGE_SIZE > lli->lli_st_size)) + pg.count = lli->lli_st_size % PAGE_SIZE; + else +#endif + pg.count = PAGE_SIZE; + + CDEBUG(D_PAGE, "%s %d bytes ino %lu at "LPU64"/"LPX64"\n", + cmd & OBD_BRW_WRITE ? "write" : "read", pg.count, lli->lli_st_ino, + pg.off, pg.off); + if (pg.count == 0) { + LBUG(); + } + + pg.flag = flags; + + rc = obd_brw(cmd, llu_i2obdconn(inode), lsm, 1, &pg, set, NULL); + if (rc) { + CERROR("error from obd_brw: rc = %d\n", rc); + } + + RETURN(rc); +} + +static int llu_prepare_write(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + struct llu_inode_info *lli = llu_i2info(inode); + obd_off offset = ((obd_off)page->index) << PAGE_SHIFT; + int rc = 0; + ENTRY; + +#if 0 + if (!PageLocked(page)) + LBUG(); + + if (PageUptodate(page)) + RETURN(0); + + //POISON(addr + from, 0xca, to - from); +#endif + /* We're completely overwriting an existing page, so _don't_ set it up + * to date until commit_write */ + if (from == 0 && to == PAGE_SIZE) + RETURN(0); + + /* If are writing to a new page, no need to read old data. 
+ * the extent locking and getattr procedures in ll_file_write have + * guaranteed that i_size is stable enough for our zeroing needs */ + if (lli->lli_st_size <= offset) { + memset(kmap(page), 0, PAGE_SIZE); + kunmap(page); + GOTO(prepare_done, rc = 0); + } + + rc = llu_brw(OBD_BRW_READ, inode, page, 0); + + EXIT; + + prepare_done: + return rc; +} + +static int llu_commit_write(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + struct llu_inode_info *lli = llu_i2info(inode); + loff_t size; + int rc; + ENTRY; +#if 0 + LASSERT(inode == file->f_dentry->d_inode); + LASSERT(PageLocked(page)); + + CDEBUG(D_INODE, "inode %p is writing page %p from %d to %d at %lu\n", + inode, page, from, to, page->index); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu,from=%d,to=%d\n", + inode->i_ino, from, to); + /* to match full page case in prepare_write */ + SetPageUptodate(page); + /* mark the page dirty, put it on mapping->dirty, + * mark the inode PAGES_DIRTY, put it on sb->dirty */ + set_page_dirty(page); +#endif + rc = llu_brw(OBD_BRW_WRITE, inode, page, 0); + if (rc) + return rc; + + /* this is matched by a hack in obdo_to_inode at the moment */ + size = (((obd_off)page->index) << PAGE_SHIFT) + to; + if (size > lli->lli_st_size) + lli->lli_st_size = size; + + RETURN(0); +} /* ll_commit_write */ + +ssize_t +llu_generic_file_write(struct inode *inode, const char *buf, + size_t count, loff_t pos) +{ + struct page *page; + ssize_t written; + long status = 0; + int err; + unsigned bytes; + + if ((ssize_t) count < 0) + return -EINVAL; +#if 0 + down(&inode->i_sem); +#endif + if (pos < 0) + return -EINVAL; + + written = 0; + +#if 0 + remove_suid(inode); + update_inode_times(inode); +#endif + do { + unsigned long index, offset; + char *kaddr; + + /* + * Try to find the page in the cache. If it isn't there, + * allocate a free page. 
+ */ + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) { + bytes = count; + } + + status = -ENOMEM; /* we'll assign it later anyway */ + page = __grab_cache_page(index); + if (!page) + break; + + kaddr = kmap(page); + status = llu_prepare_write(inode, page, offset, offset+bytes); + if (status) + goto sync_failure; + + memcpy(kaddr+offset, buf, bytes); + + status = llu_commit_write(inode, page, offset, offset+bytes); + if (!status) + status = bytes; + + if (status >= 0) { + written += status; + count -= status; + pos += status; + buf += status; + } +unlock: + kunmap(page); + page_cache_release(page); + + if (status < 0) + break; + } while (count); +done: + err = written ? written : status; + +#if 0 + up(&inode->i_sem); +#endif + return err; + + status = -EFAULT; + goto unlock; + +sync_failure: + /* + * If blocksize < pagesize, prepare_write() may have instantiated a + * few blocks outside i_size. Trim these off again. + */ + kunmap(page); + page_cache_release(page); + goto done; +} + +ssize_t llu_file_write(struct inode *inode, const struct iovec *iovec, + size_t iovlen, loff_t pos) +{ + struct llu_inode_info *lli = llu_i2info(inode); + struct ll_file_data *fd = lli->lli_file_data; /* XXX not ready don't use it now */ + struct lustre_handle lockh = { 0 }; + struct lov_stripe_md *lsm = lli->lli_smd; + struct ldlm_extent extent; + ldlm_error_t err; + ssize_t retval = 0; + ENTRY; + + /* XXX consider other types later */ + if (!S_ISREG(lli->lli_st_mode)) + LBUG(); +#if 0 + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu,size="LPSZ",offset=%Ld\n", + inode->i_ino, count, *ppos); + + /* + * sleep doing some writeback work of this mount's dirty data + * if the VM thinks we're low on memory.. other dirtying code + * paths should think about doing this, too, but they should be + * careful not to hold locked pages while they do so. like + * ll_prepare_write. 
*cough* + */ + ll_check_dirty(inode->i_sb); +#endif + while (iovlen--) { + const char *buf = iovec[iovlen].iov_base; + size_t count = iovec[iovlen].iov_len; + + /* POSIX, but surprised the VFS doesn't check this already */ + if (count == 0) + continue; + +#if 0 + if (!S_ISBLK(lli->lli_st_mode) && file->f_flags & O_APPEND) { + extent.start = 0; + extent.end = OBD_OBJECT_EOF; + } else { + extent.start = *ppos; + extent.end = *ppos + count - 1; + } +#else + extent.start = pos; + extent.end = pos + count - 1; +#endif + + err = llu_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh); + if (err != ELDLM_OK) + RETURN(-ENOLCK); + +#if 0 + if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) + *ppos = inode->i_size; + + CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n", + inode->i_ino, count, *ppos); +#endif + retval += llu_generic_file_write(inode, buf, count, pos); + } + + /* XXX errors? */ + ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh); + return(retval); +} + +static void llu_update_atime(struct inode *inode) +{ +#if 0 + struct llu_inode_info *lli = llu_i2info(inode); + +#ifdef USE_ATIME + struct iattr attr; + + attr.ia_atime = LTIME_S(CURRENT_TIME); + attr.ia_valid = ATTR_ATIME; + + if (lli->lli_st_atime == attr.ia_atime) return; + if (IS_RDONLY(inode)) return; + if (IS_NOATIME(inode)) return; + + /* ll_inode_setattr() sets inode->i_atime from attr.ia_atime */ + llu_inode_setattr(inode, &attr, 0); +#else + /* update atime, but don't explicitly write it out just this change */ + inode->i_atime = CURRENT_TIME; +#endif +#endif +} + +static size_t llu_generic_file_read(struct inode *inode, char *buf, + size_t count, loff_t pos) +{ + struct llu_inode_info *lli = llu_i2info(inode); + unsigned long index, offset; + int error = 0; + size_t readed = 0; + + index = pos >> PAGE_CACHE_SHIFT; + offset = pos & ~PAGE_CACHE_MASK; + + do { + struct page *page; + unsigned long end_index, nr; + + end_index = lli->lli_st_size >> PAGE_CACHE_SHIFT; + + if (index > 
end_index) + break; + nr = PAGE_CACHE_SIZE; + if (index == end_index) { + nr = lli->lli_st_size & ~PAGE_CACHE_MASK; + if (nr <= offset) + break; + } + + nr = nr - offset; + if (nr > count) + nr = count; + + page = grab_cache_page(index); + if (!page) { + error = -ENOMEM; + break; + } + + error = llu_brw(OBD_BRW_READ, inode, page, 0); + if (error) { + page_cache_release(page); + break; + } + + memcpy(buf, kmap(page)+offset, nr); + offset += nr; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + readed += nr; + count -= nr; + + page_cache_release(page); + } while (count); + + if (error) + return error; + return readed; +} + +ssize_t llu_file_read(struct inode *inode, const struct iovec *iovec, + size_t iovlen, loff_t pos) +{ + struct llu_inode_info *lli = llu_i2info(inode); + struct ll_file_data *fd = lli->lli_file_data; + struct lov_stripe_md *lsm = lli->lli_smd; + struct lustre_handle lockh = { 0 }; +#if 0 + struct ll_read_extent rextent; +#else + struct ldlm_extent extent; +#endif + ldlm_error_t err; + ssize_t retval = 0; + ENTRY; + + while (iovlen--) { + char *buf = iovec[iovlen].iov_base; + size_t count = iovec[iovlen].iov_len; + + /* "If nbyte is 0, read() will return 0 and have no other results." 
+ * -- Single Unix Spec */ + if (count == 0) + RETURN(0); + +#if 0 + rextent.re_extent.start = pos; + rextent.re_extent.end = pos + count - 1; +#else + extent.start = pos; + extent.end = pos + count - 1; +#endif + err = llu_extent_lock(fd, inode, lsm, LCK_PR, &extent, &lockh); + if (err != ELDLM_OK) + RETURN(-ENOLCK); +#if 0 + rextent.re_task = current; + spin_lock(&lli->lli_read_extent_lock); + list_add(&rextent.re_lli_item, &lli->lli_read_extents); + spin_unlock(&lli->lli_read_extent_lock); +#endif + CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset %Ld\n", + lli->lli_st_ino, count, pos); + retval = llu_generic_file_read(inode, buf, count, pos); +#if 0 + spin_lock(&lli->lli_read_extent_lock); + list_del(&rextent.re_lli_item); + spin_unlock(&lli->lli_read_extent_lock); +#endif + } + + if (retval > 0) + llu_update_atime(inode); + + /* XXX errors? */ + ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh); + RETURN(retval); +} + diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c new file mode 100644 index 0000000..27ac231 --- /dev/null +++ b/lustre/liblustre/super.c @@ -0,0 +1,781 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Lustre Light Super operations + * + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include <stdlib.h> +#include <string.h> +#include <error.h> +#include <assert.h> +#include <time.h> +#include <sys/types.h> +#include <sys/queue.h> + +#include <sysio.h> +#include <fs.h> +#include <mount.h> +#include <inode.h> +#include <file.h> + +#include "llite_lib.h" + +static void llu_fsop_gone(struct filesys *fs) +{ + /* FIXME */ +} + +static struct inode_ops llu_inode_ops; + +void llu_update_inode(struct inode *inode, struct mds_body *body, + struct lov_stripe_md *lsm) +{ + struct llu_inode_info *lli = llu_i2info(inode); + + LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); + if (lsm != NULL) { + if (lli->lli_smd == NULL) + lli->lli_smd = lsm; + else + LASSERT (!memcmp (lli->lli_smd, lsm, + sizeof (*lsm))); + } + + if (body->valid & OBD_MD_FLID) + lli->lli_st_ino = body->ino; + if (body->valid & OBD_MD_FLATIME) + LTIME_S(lli->lli_st_atime) = body->atime; + if (body->valid & OBD_MD_FLMTIME) + LTIME_S(lli->lli_st_mtime) = body->mtime; + if (body->valid & OBD_MD_FLCTIME) + LTIME_S(lli->lli_st_ctime) = body->ctime; + if (body->valid & OBD_MD_FLMODE) + lli->lli_st_mode = (lli->lli_st_mode & S_IFMT)|(body->mode & ~S_IFMT); + if (body->valid & OBD_MD_FLTYPE) + lli->lli_st_mode = (lli->lli_st_mode & ~S_IFMT)|(body->mode & S_IFMT); + if (body->valid & OBD_MD_FLUID) + lli->lli_st_uid = body->uid; + if (body->valid & OBD_MD_FLGID) + lli->lli_st_gid = body->gid; + if (body->valid & OBD_MD_FLFLAGS) + lli->lli_st_flags = body->flags; + if (body->valid & OBD_MD_FLNLINK) + lli->lli_st_nlink = body->nlink; + if (body->valid & OBD_MD_FLGENER) + lli->lli_st_generation = body->generation; + if (body->valid & OBD_MD_FLRDEV) + lli->lli_st_rdev = body->rdev; + if (body->valid & OBD_MD_FLSIZE) + lli->lli_st_size = body->size; + if 
(body->valid & OBD_MD_FLBLOCKS) + lli->lli_st_blocks = body->blocks; + + /* fillin fid */ + if (body->valid & OBD_MD_FLID) + lli->lli_fid.id = body->ino; + if (body->valid & OBD_MD_FLGENER) + lli->lli_fid.generation = body->generation; + if (body->valid & OBD_MD_FLTYPE) + lli->lli_fid.f_type = body->mode & S_IFMT; +} + +void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid) +{ + struct llu_inode_info *lli = llu_i2info(dst); + + valid &= src->o_valid; + + if (valid & OBD_MD_FLATIME) + LTIME_S(lli->lli_st_atime) = src->o_atime; + if (valid & OBD_MD_FLMTIME) + LTIME_S(lli->lli_st_mtime) = src->o_mtime; + if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(lli->lli_st_ctime)) + LTIME_S(lli->lli_st_ctime) = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + lli->lli_st_size = src->o_size; + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + lli->lli_st_blocks = src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + lli->lli_st_blksize = src->o_blksize; + if (valid & OBD_MD_FLTYPE) + lli->lli_st_mode = (lli->lli_st_mode & ~S_IFMT) | (src->o_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + lli->lli_st_mode = (lli->lli_st_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + lli->lli_st_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + lli->lli_st_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + lli->lli_st_flags = src->o_flags; + if (valid & OBD_MD_FLNLINK) + lli->lli_st_nlink = src->o_nlink; + if (valid & OBD_MD_FLGENER) + lli->lli_st_generation = src->o_generation; + if (valid & OBD_MD_FLRDEV) + lli->lli_st_rdev = src->o_rdev; +} + +void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid) +{ + struct llu_inode_info *lli = llu_i2info(src); + + if (valid & OBD_MD_FLATIME) + dst->o_atime = LTIME_S(lli->lli_st_atime); + if (valid & OBD_MD_FLMTIME) + dst->o_mtime = LTIME_S(lli->lli_st_mtime); + if (valid & OBD_MD_FLCTIME) + dst->o_ctime = LTIME_S(lli->lli_st_ctime); + if (valid & OBD_MD_FLSIZE) + dst->o_size = lli->lli_st_size; + if 
(valid & OBD_MD_FLBLOCKS) /* allocation of space */ + dst->o_blocks = lli->lli_st_blocks; + if (valid & OBD_MD_FLBLKSZ) + dst->o_blksize = lli->lli_st_blksize; + if (valid & OBD_MD_FLTYPE) + dst->o_mode = (dst->o_mode & ~S_IFMT) | (lli->lli_st_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + dst->o_mode = (dst->o_mode & S_IFMT) | (lli->lli_st_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->o_uid = lli->lli_st_uid; + if (valid & OBD_MD_FLGID) + dst->o_gid = lli->lli_st_gid; + if (valid & OBD_MD_FLFLAGS) + dst->o_flags = lli->lli_st_flags; + if (valid & OBD_MD_FLNLINK) + dst->o_nlink = lli->lli_st_nlink; + if (valid & OBD_MD_FLGENER) + dst->o_generation = lli->lli_st_generation; + if (valid & OBD_MD_FLRDEV) + dst->o_rdev = (__u32)(lli->lli_st_rdev); + + dst->o_valid |= (valid & ~OBD_MD_FLID); +} + +int llu_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm, + char *ostdata) +{ + struct llu_sb_info *sbi = llu_i2sbi(inode); + struct obdo oa; + int rc; + ENTRY; + + LASSERT(lsm); + LASSERT(sbi); + + memset(&oa, 0, sizeof oa); + oa.o_id = lsm->lsm_object_id; + oa.o_mode = S_IFREG; + oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | + OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME; + + if (ostdata != NULL) { + memcpy(&oa.o_inline, ostdata, FD_OSTDATA_SIZE); + oa.o_valid |= OBD_MD_FLHANDLE; + } + + rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm); + if (rc) + RETURN(rc); + + obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | + OBD_MD_FLMTIME | OBD_MD_FLCTIME); + + RETURN(0); +} + +struct inode* llu_new_inode(struct filesys *fs, ino_t ino, mode_t mode) +{ + struct inode *inode; + struct llu_inode_info *lli; + + OBD_ALLOC(lli, sizeof(*lli)); + if (!lli) + return NULL; + + /* initialize lli here */ + lli->lli_sbi = llu_fs2sbi(fs); + lli->lli_smd = NULL; + lli->lli_symlink_name = NULL; + lli->lli_flags = 0; + INIT_LIST_HEAD(&lli->lli_read_extents); + lli->lli_file_data = NULL; + + /* could file_identifier be 0 ? 
FIXME */ + inode = _sysio_i_new(fs, ino, NULL, +#ifndef AUTOMOUNT_FILE_NAME + mode & S_IFMT, +#else + mode, /* all of the bits! */ +#endif + 0, + &llu_inode_ops, lli); + + if (!inode) + OBD_FREE(lli, sizeof(*lli)); + + return inode; +} + +/* + * Look up the name held in pnode under its parent directory. The mount + * root is handed back directly after I_REF(); any other name is resolved + * with mdc_getattr_name(), a new inode is created from the reply and, when + * the inode ends up with striping data, its attributes are refreshed through + * llu_inode_getattr(). Returns a negative errno on failure. The request + * and the temporary NUL-terminated name copy are released on every path + * taken after the name copy has been allocated. + */ +static int llu_iop_lookup(struct pnode *pnode, + struct inode **inop, + struct intent *intnt __IS_UNUSED, + const char *path __IS_UNUSED) +{ + struct pnode_base *pb_dir = pnode->p_parent->p_base; + struct ptlrpc_request *request = NULL; + struct llu_sb_info *sbi = llu_i2sbi(pb_dir->pb_ino); + struct ll_fid *fid = &llu_i2info(pb_dir->pb_ino)->lli_fid; + struct qstr *name = &pnode->p_base->pb_name; + struct mds_body *body; + unsigned long valid; + char *pname; + int rc, easize; + struct ll_read_inode2_cookie lic = {.lic_body = NULL, .lic_lsm = NULL}; + + /* the mount root inode have no name, so don't call + * remote in this case. but probably we need revalidate + * it here? FIXME */ + if (pnode->p_mount->mnt_root == pnode) { + struct inode *i = pnode->p_base->pb_ino; + I_REF(i); + *inop = i; + return 0; + } + + if (!name->len) + return -EINVAL; + + /* mdc_getattr_name require NULL-terminated name */ + OBD_ALLOC(pname, name->len + 1); + if (!pname) + return -ENOMEM; + memcpy(pname, name->name, name->len); + pname[name->len] = 0; + + valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE; + + /* FIXME before getattr_name, we don't know whether + * the inode we are finding is regular or not, so here + * we blindly require server feed in EA data */ + easize = obd_size_diskmd(&sbi->ll_osc_conn, NULL); + valid |= OBD_MD_FLEASIZE; + + rc = mdc_getattr_name(&sbi->ll_mdc_conn, fid, + pname, name->len + 1, + valid, easize, &request); + if (rc < 0) { + CERROR("mdc_getattr_name: %d\n", rc); + rc = -ENOENT; + goto out; + } + body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*body)); + + *inop = llu_new_inode(pnode->p_mount->mnt_fs, body->ino, body->mode); + /* test the new inode, not the (never NULL) out-parameter itself */ + if (!*inop) { + rc = -ENOMEM; + goto out; + } + + lic.lic_body = lustre_msg_buf(request->rq_repmsg, 0, 
sizeof(*lic.lic_body)); + LASSERT (lic.lic_body != NULL); + LASSERT_REPSWABBED (request, 0); + + if (S_ISREG(lic.lic_body->mode) && + lic.lic_body->valid & OBD_MD_FLEASIZE) { + struct lov_mds_md *lmm; + int lmm_size; + int unpack_rc; /* kept apart from the function-level rc */ + + /* NOTE(review): on the two error exits below *inop still points + * at the new inode, exactly as before - confirm who releases it */ + lmm_size = lic.lic_body->eadatasize; + if (lmm_size == 0) { + CERROR ("OBD_MD_FLEASIZE set but eadatasize 0\n"); + rc = -EPROTO; + goto out; + } + lmm = lustre_msg_buf(request->rq_repmsg, 0 + 1, lmm_size); + LASSERT(lmm != NULL); + LASSERT_REPSWABBED (request, 0 + 1); + + unpack_rc = obd_unpackmd (&sbi->ll_osc_conn, + &lic.lic_lsm, lmm, lmm_size); + if (unpack_rc < 0) { + CERROR ("Error %d unpacking eadata\n", unpack_rc); + rc = unpack_rc; + goto out; + } + LASSERT (unpack_rc >= sizeof (*lic.lic_lsm)); + + } else { + lic.lic_lsm = NULL; + } + + llu_update_inode(*inop, body, lic.lic_lsm); + + if (llu_i2info(*inop)->lli_smd) { + rc = llu_inode_getattr(*inop, llu_i2info(*inop)->lli_smd, NULL); + if (rc) + _sysio_i_gone(*inop); + } + +out: + ptlrpc_req_finished(request); + OBD_FREE(pname, name->len + 1); + + return rc; +} + +static int llu_iop_getattr(struct pnode *pno, + struct inode *ino, + struct intnl_stat *b) +{ + struct llu_inode_info *lli = llu_i2info(ino); + + b->st_dev = lli->lli_st_dev; + b->st_ino = lli->lli_st_ino; + b->st_mode = lli->lli_st_mode; + b->st_nlink = lli->lli_st_nlink; + b->st_uid = lli->lli_st_uid; + b->st_gid = lli->lli_st_gid; + b->st_rdev = lli->lli_st_rdev; + b->st_size = lli->lli_st_size; + b->st_blksize = lli->lli_st_blksize; + b->st_blocks = lli->lli_st_blocks; + b->st_atime = lli->lli_st_atime; + b->st_mtime = lli->lli_st_mtime; + b->st_ctime = lli->lli_st_ctime; + + return 0; +} + +int llu_mdc_cancel_unused(struct lustre_handle *conn, + struct llu_inode_info *lli, + int flags) +{ + struct ldlm_res_id res_id = + { .name = {lli->lli_st_ino, lli->lli_st_generation} }; + struct obd_device *obddev = class_conn2obd(conn); + ENTRY; + RETURN(ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags)); +} + +static void llu_clear_inode(struct inode *inode) +{ + 
struct llu_sb_info *sbi = llu_i2sbi(inode); + struct llu_inode_info *lli = llu_i2info(inode); + int rc; + ENTRY; + + CDEBUG(D_INODE, "clear inode: %lu\n", lli->lli_st_ino); + rc = llu_mdc_cancel_unused(&sbi->ll_mdc_conn, lli, + LDLM_FL_NO_CALLBACK); + if (rc < 0) { + CERROR("ll_mdc_cancel_unused: %d\n", rc); + /* XXX FIXME do something dramatic */ + } + + if (lli->lli_smd) { + rc = obd_cancel_unused(&sbi->ll_osc_conn, lli->lli_smd, 0); + if (rc < 0) { + CERROR("obd_cancel_unused: %d\n", rc); + /* XXX FIXME do something dramatic */ + } + } + + if (lli->lli_smd) + obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd); + + if (lli->lli_symlink_name) { + OBD_FREE(lli->lli_symlink_name, + strlen(lli->lli_symlink_name) + 1); + lli->lli_symlink_name = NULL; + } + + EXIT; +} + +void llu_iop_gone(struct inode *inode) +{ + struct llu_inode_info *lli = llu_i2info(inode); + + llu_clear_inode(inode); + + OBD_FREE(lli, sizeof(*lli)); +} + +static int llu_setattr_raw(struct inode *inode, struct iattr *attr) +{ + struct ptlrpc_request *request = NULL; + struct llu_sb_info *sbi = llu_i2sbi(inode); + struct llu_inode_info *lli = llu_i2info(inode); + struct mdc_op_data op_data; + int err = 0; + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", lli->lli_st_ino); + + /* if need truncate, do it at first */ + if (attr->ia_valid & ATTR_SIZE) { + printf("************* don't support truncate now !!!!!!!!\n"); + LBUG(); + } + + /* Don't send size changes to MDS to avoid "fast EA" problems, and + * also avoid a pointless RPC (we get file size from OST anyways). 
+ */ + attr->ia_valid &= ~ATTR_SIZE; + if (!attr->ia_valid) + RETURN(0); + + llu_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); + + err = mdc_setattr(&sbi->ll_mdc_conn, &op_data, + attr, NULL, 0, &request); + if (err) + CERROR("mdc_setattr fails: err = %d\n", err); + + ptlrpc_req_finished(request); + + if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_MTIME_SET) { + struct lov_stripe_md *lsm = lli->lli_smd; + struct obdo oa; + int err2; + + CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n", + lli->lli_st_ino, attr->ia_mtime); + oa.o_id = lsm->lsm_object_id; + oa.o_mode = S_IFREG; + oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMTIME; + oa.o_mtime = attr->ia_mtime; + err2 = obd_setattr(&sbi->ll_osc_conn, &oa, lsm, NULL); + if (err2) { + CERROR("obd_setattr fails: rc=%d\n", err); + if (!err) + err = err2; + } + } + RETURN(err); +} + +/* FIXME here we simply act as a thin layer to glue it with + * llu_setattr_raw(), which is copy from kernel + */ +static int llu_iop_setattr(struct pnode *pno, + struct inode *ino, + unsigned mask, + struct intnl_stat *stbuf) +{ + struct iattr iattr; + + memset(&iattr, 0, sizeof(iattr)); + + if (mask & SETATTR_MODE) { + iattr.ia_mode = stbuf->st_mode; + iattr.ia_valid |= ATTR_MODE; + } + if (mask & SETATTR_MTIME) { + iattr.ia_mtime = stbuf->st_mtime; + iattr.ia_valid |= ATTR_MTIME; + } + if (mask & SETATTR_ATIME) { + iattr.ia_atime = stbuf->st_atime; + iattr.ia_valid |= ATTR_ATIME; + } + if (mask & SETATTR_UID) { + iattr.ia_uid = stbuf->st_uid; + iattr.ia_valid |= ATTR_UID; + } + if (mask & SETATTR_GID) { + iattr.ia_gid = stbuf->st_gid; + iattr.ia_valid |= ATTR_GID; + } + if (mask & SETATTR_LEN) { + iattr.ia_size = stbuf->st_size; /* FIXME signed expansion problem */ + iattr.ia_valid |= ATTR_SIZE; + } + + iattr.ia_valid |= ATTR_RAW; + /* FIXME FIXME FIXME FIXME FIXME FIXME FIXME + * without ATTR_FROM_OPEN, mds_reint_setattr will call + * mds_fid2locked_dentry() and deadlocked at completion_ast call. 
+ * Here we workaround it and avoid any locking. + * FIXME FIXME FIXME FIXME FIXME FIXME FIXME + */ + iattr.ia_valid |= ATTR_FROM_OPEN; + + return llu_setattr_raw(ino, &iattr); +} + + +static int llu_mkdir2(struct inode *dir, const char *name, int len, int mode) +{ + struct ptlrpc_request *request = NULL; + time_t curtime = CURRENT_TIME; + struct llu_sb_info *sbi = llu_i2sbi(dir); + struct llu_inode_info *lli = llu_i2info(dir); + struct mdc_op_data op_data; + int err = -EMLINK; + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu\n", + name, lli->lli_st_ino); + + /* FIXME check this later */ +#if 0 + if (dir->i_nlink >= EXT2_LINK_MAX) + RETURN(err); + mode = (mode & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR; +#endif + mode |= S_IFDIR; + llu_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0); + err = mdc_create(&sbi->ll_mdc_conn, &op_data, NULL, 0, mode, + current->fsuid, current->fsgid, + curtime, 0, &request); + ptlrpc_req_finished(request); + RETURN(err); +} + +static int llu_iop_mkdir(struct pnode *pno, mode_t mode) +{ + struct inode *dir = pno->p_base->pb_parent->pb_ino; + struct qstr *qstr = &pno->p_base->pb_name; + int rc; + + LASSERT(dir); + + rc = llu_mkdir2(dir, qstr->name, qstr->len, mode); + + return rc; +} + +#ifndef S_IRWXUGO +#define S_IRWXUGO (S_IRWXU|S_IRWXG|S_IRWXO) +#endif + +static int llu_symlink2(struct inode *dir, const char *name, int len, + const char *tgt) +{ + struct ptlrpc_request *request = NULL; + time_t curtime = CURRENT_TIME; + struct llu_sb_info *sbi = llu_i2sbi(dir); + struct llu_inode_info *lli = llu_i2info(dir); + struct mdc_op_data op_data; + int err = -EMLINK; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu,target=%s\n", + name, lli->lli_st_ino, tgt); + +#if 0 + if (dir->i_nlink >= EXT2_LINK_MAX) + RETURN(err); +#endif + llu_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0); + err = mdc_create(&sbi->ll_mdc_conn, &op_data, + tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO, + current->fsuid, current->fsgid, 
curtime, 0, &request); + ptlrpc_req_finished(request); + RETURN(err); +} + +static int llu_iop_symlink(struct pnode *pno, const char *data) +{ + struct inode *dir = pno->p_base->pb_parent->pb_ino; + struct qstr *qstr = &pno->p_base->pb_name; + int rc; + + LASSERT(dir); + + rc = llu_symlink2(dir, qstr->name, qstr->len, data); + + return rc; +} + +struct filesys_ops llu_filesys_ops = +{ + fsop_gone: llu_fsop_gone, +}; + + +static struct inode_ops llu_inode_ops = { + inop_lookup: llu_iop_lookup, + inop_getattr: llu_iop_getattr, + inop_setattr: llu_iop_setattr, + inop_getdirentries: NULL, + inop_mkdir: llu_iop_mkdir, + inop_rmdir: NULL, + inop_symlink: llu_iop_symlink, + inop_readlink: NULL, + inop_open: llu_iop_open, + inop_close: llu_iop_close, + inop_unlink: NULL, + inop_ipreadv: llu_iop_ipreadv, + inop_ipwritev: llu_iop_ipwritev, + inop_iodone: llu_iop_iodone, + inop_fcntl: NULL, + inop_sync: NULL, + inop_datasync: NULL, + inop_ioctl: NULL, + inop_mknod: NULL, + inop_statvfs: NULL, + inop_gone: llu_iop_gone, +}; + + +static int +llu_fsswop_mount(const char *source, + unsigned flags, + const void *data __IS_UNUSED, + struct pnode *tocover, + struct mount **mntp) +{ + struct filesys *fs; + struct inode *root; + struct pnode_base *rootpb; + static struct qstr noname = { NULL, 0, 0 }; + struct ll_fid rootfid; + + struct llu_sb_info *sbi; + struct ptlrpc_connection *mdc_conn; + struct ptlrpc_request *request = NULL; + struct mds_body *root_body; + struct obd_uuid param_uuid; + class_uuid_t uuid; + struct obd_device *obd; + char *osc=mount_option.osc_uuid; + char *mdc=mount_option.mdc_uuid; + int err = -EINVAL; + + ENTRY; + + OBD_ALLOC(sbi, sizeof(*sbi)); + if (!sbi) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&sbi->ll_conn_chain); + generate_random_uuid(uuid); + class_uuid_unparse(uuid, &sbi->ll_sb_uuid); + + fs = _sysio_fs_new(&llu_filesys_ops, flags, sbi); + if (!fs) { + err = -ENOMEM; + goto out_free; + } + + strncpy(param_uuid.uuid, mdc, sizeof(param_uuid.uuid)); + obd = 
class_uuid2obd(¶m_uuid); + if (!obd) { + CERROR("MDC %s: not setup or attached\n", mdc); + err = -EINVAL; + goto out_free; + } + + /* setup mdc */ + /* FIXME need recover stuff */ + err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid); + if (err) { + CERROR("cannot connect to %s: rc = %d\n", mdc, err); + goto out_free; + } + + mdc_conn = sbi2mdc(sbi)->cl_import->imp_connection; + + /* setup osc */ + strncpy(param_uuid.uuid, osc, sizeof(param_uuid.uuid)); + obd = class_uuid2obd(¶m_uuid); + if (!obd) { + CERROR("OSC %s: not setup or attached\n", osc); + err = -EINVAL; + goto out_mdc; + } + + err = obd_connect(&sbi->ll_osc_conn, obd, &sbi->ll_sb_uuid); + if (err) { + CERROR("cannot connect to %s: rc = %d\n", osc, err); + goto out_mdc; + } + + err = mdc_getstatus(&sbi->ll_mdc_conn, &rootfid); + if (err) { + CERROR("cannot mds_connect: rc = %d\n", err); + goto out_osc; + } + CDEBUG(D_SUPER, "rootfid "LPU64"\n", rootfid.id); + sbi->ll_rootino = rootfid.id; + +/* XXX do we need this?? + memset(&osfs, 0, sizeof(osfs)); + rc = obd_statfs(&sbi->ll_mdc_conn, &osfs); +*/ + /* fetch attr of root inode */ + err = mdc_getattr(&sbi->ll_mdc_conn, &rootfid, + OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, 0, &request); + if (err) { + CERROR("mdc_getattr failed for root: rc = %d\n", err); + goto out_request; + } + + root_body = lustre_msg_buf(request->rq_repmsg, 0, sizeof(*root_body)); + LASSERT(sbi->ll_rootino != 0); + + root = llu_new_inode(fs, root_body->ino, root_body->mode); + if (!root) { + err = -ENOMEM; + goto out_request; + } + + llu_update_inode(root, root_body, NULL); + + /* + * Generate base path-node for root. 
+ */ + rootpb = _sysio_pb_new(&noname, NULL, root); + if (!rootpb) { + err = -ENOMEM; + goto out_inode; + } + + err = _sysio_do_mount(fs, rootpb, flags, NULL, mntp); + if (err) { + _sysio_pb_gone(rootpb); + goto out_inode; + } + + ptlrpc_req_finished(request); + request = NULL; + + printf("************************************************\n"); + printf("* Mount successfully!!!!!!! *\n"); + printf("************************************************\n"); + + return 0; + +out_inode: + _sysio_i_gone(root); +out_request: + ptlrpc_req_finished(request); +out_osc: + obd_disconnect(&sbi->ll_osc_conn); +out_mdc: + obd_disconnect(&sbi->ll_mdc_conn); +out_free: + OBD_FREE(sbi, sizeof(*sbi)); + return err; +} + +struct fssw_ops llu_fssw_ops = { + llu_fsswop_mount +}; + diff --git a/lustre/llite/Makefile.am b/lustre/llite/Makefile.am index 309088b..ddb9657 100644 --- a/lustre/llite/Makefile.am +++ b/lustre/llite/Makefile.am @@ -11,6 +11,6 @@ EXTRA_PROGRAMS = llite llite_SOURCES = dcache.c commit_callback.c super.c rw.c iod.c super25.c llite_SOURCES += file.c dir.c sysctl.c symlink.c -llite_SOURCES += recover.c namei.c lproc_llite.c +llite_SOURCES += namei.c lproc_llite.c include $(top_srcdir)/Rules diff --git a/lustre/llite/commit_callback.c b/lustre/llite/commit_callback.c index f8b7e70..ee49bb8 100644 --- a/lustre/llite/commit_callback.c +++ b/lustre/llite/commit_callback.c @@ -34,6 +34,7 @@ #include <linux/lustre_lite.h> #include <linux/lustre_lib.h> +#include <linux/lustre_compat25.h> static int ll_commitcbd_check_event(struct ll_sb_info *sbi) { @@ -57,26 +58,17 @@ static int ll_commitcbd_main(void *arg) ENTRY; lock_kernel(); - daemonize(); -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - spin_lock_irqsave(¤t->sigmask_lock, flags); - sigfillset(¤t->blocked); - our_recalc_sigpending(current); - spin_unlock_irqrestore(¤t->sigmask_lock, flags); -#else + kportal_daemonize("lustre_commitcbd"); + + SIGNAL_MASK_LOCK(current, flags); sigfillset(¤t->blocked); - 
our_recalc_sigpending(current); -#endif + RECALC_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, flags); - sprintf(current->comm, "lustre_commitcbd"); unlock_kernel(); /* Record that the thread is running */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - sbi->ll_commitcbd_waketime = CURRENT_TIME; -#else - sbi->ll_commitcbd_waketime = CURRENT_TIME.tv_sec; -#endif + sbi->ll_commitcbd_waketime = LTIME_S(CURRENT_TIME); sbi->ll_commitcbd_timeout = 10 * HZ; sbi->ll_commitcbd_thread = current; sbi->ll_commitcbd_flags = LL_COMMITCBD_RUNNING; diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c index 41c68d9..0c9fcf7 100644 --- a/lustre/llite/dcache.c +++ b/lustre/llite/dcache.c @@ -39,6 +39,16 @@ void ll_release(struct dentry *de) EXIT; } +int ll_delete(struct dentry *de) +{ + if (de->d_it != 0) { + CERROR("%s put dentry %p+%p with d_it %p\n", current->comm, + de, de->d_fsdata, de->d_it); + LBUG(); + } + return 0; +} + void ll_set_dd(struct dentry *de) { ENTRY; @@ -61,8 +71,6 @@ void ll_intent_release(struct dentry *de, struct lookup_intent *it) struct lustre_handle *handle; ENTRY; - LASSERT(ll_d2d(de) != NULL); - if (it->it_lock_mode) { handle = (struct lustre_handle *)it->it_lock_handle; ldlm_lock_decref(handle, it->it_lock_mode); @@ -80,8 +88,9 @@ void ll_intent_release(struct dentry *de, struct lookup_intent *it) if (de->d_it == it) LL_GET_INTENT(de, it); - else - CERROR("STRANGE intent release: %p %p\n", de->d_it, it); + else + CDEBUG(D_INODE, "STRANGE intent release: %p %p\n", + de->d_it, it); EXIT; } @@ -89,25 +98,66 @@ void ll_intent_release(struct dentry *de, struct lookup_intent *it) extern struct dentry *ll_find_alias(struct inode *, struct dentry *); static int revalidate2_finish(int flag, struct ptlrpc_request *request, - struct dentry **de, struct lookup_intent *it, - int offset, obd_id ino) + struct inode *parent, struct dentry **de, + struct lookup_intent *it, int offset, obd_id ino) { - struct mds_body *body; - struct lov_mds_md *lmm = NULL; - int rc 
= 0; + struct ll_sb_info *sbi = ll_i2sbi(parent); + struct mds_body *body; + struct lov_stripe_md *lsm = NULL; + struct lov_mds_md *lmm; + int lmmsize; + int rc = 0; ENTRY; - if (!(flag & LL_LOOKUP_NEGATIVE)) { - body = lustre_msg_buf(request->rq_repmsg, offset); - if (body->valid & OBD_MD_FLEASIZE) - lmm = lustre_msg_buf(request->rq_repmsg, offset + 1); - ll_update_inode((*de)->d_inode, body, lmm); - mdc_lock_set_inode((struct lustre_handle *)it->it_lock_handle, - (*de)->d_inode); - } else - rc = -ENOENT; - - ptlrpc_req_finished(request); + /* NB 1 request reference will be taken away by ll_intent_lock() + * when I return */ + + if ((flag & LL_LOOKUP_NEGATIVE) != 0) + GOTO (out, rc = -ENOENT); + + /* We only get called if the mdc_enqueue() called from + * ll_intent_lock() was successful. Therefore the mds_body is + * present and correct, and the eadata is present (but still + * opaque, so only obd_unpackmd() can check the size) */ + body = lustre_msg_buf(request->rq_repmsg, offset, sizeof (*body)); + LASSERT (body != NULL); + LASSERT_REPSWABBED (request, offset); + + if (body->valid & OBD_MD_FLEASIZE) { + /* Only bother with this if inodes's LSM not set? */ + + if (body->eadatasize == 0) { + CERROR ("OBD_MD_FLEASIZE set, but eadatasize 0\n"); + GOTO (out, rc = -EPROTO); + } + lmmsize = body->eadatasize; + lmm = lustre_msg_buf (request->rq_repmsg, offset + 1, lmmsize); + LASSERT (lmm != NULL); + LASSERT_REPSWABBED (request, offset + 1); + + rc = obd_unpackmd (&sbi->ll_osc_conn, + &lsm, lmm, lmmsize); + if (rc < 0) { + CERROR ("Error %d unpacking eadata\n", rc); + LBUG(); + /* XXX don't know if I should do this... 
*/ + GOTO (out, rc); + /* or skip the ll_update_inode but still do + * mdc_lock_set_inode() */ + } + LASSERT (rc >= sizeof (*lsm)); + rc = 0; + } + + ll_update_inode((*de)->d_inode, body, lsm); + + if (lsm != NULL && + ll_i2info((*de)->d_inode)->lli_smd != lsm) + obd_free_memmd (&sbi->ll_osc_conn, &lsm); + + ll_mdc_lock_set_inode((struct lustre_handle *)it->it_lock_handle, + (*de)->d_inode); + out: RETURN(rc); } @@ -146,6 +196,8 @@ int ll_revalidate2(struct dentry *de, int flags, struct lookup_intent *it) { int rc; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,intent=%s\n", de->d_name.name, + LL_IT2STR(it)); /* We don't want to cache negative dentries, so return 0 immediately. * We believe that this is safe, that negative dentries cannot be @@ -221,4 +273,5 @@ struct dentry_operations ll_d_ops = { .d_revalidate2 = ll_revalidate2, .d_intent_release = ll_intent_release, .d_release = ll_release, + .d_delete = ll_delete, }; diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index 21192aa..8759598 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -58,7 +58,7 @@ typedef struct ext2_dir_entry_2 ext2_dirent; static int ll_dir_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:\n"); return 0; } @@ -67,17 +67,18 @@ static int ll_dir_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; struct ll_sb_info *sbi = ll_i2sbi(inode); - char *buf; __u64 offset; int rc = 0; struct ptlrpc_request *request; struct lustre_handle lockh; struct mds_body *body; struct lookup_intent it = { .it_op = IT_READDIR }; + struct mdc_op_data data; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); if ((inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT <= page->index){ /* XXX why do we need this exactly, and why do we think that * an all-zero directory page is useful? 
@@ -89,8 +90,11 @@ static int ll_dir_readpage(struct file *file, struct page *page) GOTO(readpage_out, rc); } - rc = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, &it, LCK_PR, inode, - NULL, &lockh, NULL, 0, inode, sizeof(*inode)); + ll_prepare_mdc_op_data(&data, inode, NULL, NULL, 0, 0); + + rc = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, &it, LCK_PR, + &data, &lockh, NULL, 0, + ldlm_completion_ast, ll_mdc_blocking_ast, inode); request = (struct ptlrpc_request *)it.it_data; if (request) ptlrpc_req_finished(request); @@ -107,16 +111,14 @@ static int ll_dir_readpage(struct file *file, struct page *page) } offset = page->index << PAGE_SHIFT; - buf = kmap(page); rc = mdc_readpage(&sbi->ll_mdc_conn, inode->i_ino, - S_IFDIR, offset, buf, &request); - kunmap(page); + S_IFDIR, offset, page, &request); if (!rc) { - body = lustre_msg_buf(request->rq_repmsg, 0); - if (!body) - rc = -EINVAL; - else - inode->i_size = body->size; + body = lustre_msg_buf(request->rq_repmsg, 0, sizeof (*body)); + LASSERT (body != NULL); /* checked by mdc_readpage() */ + LASSERT_REPSWABBED (request, 0); /* swabbed by mdc_readpage() */ + + inode->i_size = body->size; } ptlrpc_req_finished(request); EXIT; @@ -398,7 +400,8 @@ int ll_readdir(struct file * filp, void * dirent, filldir_t filldir) int need_revalidate = (filp->f_version != inode->i_version); ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) GOTO(done, 0); @@ -764,15 +767,17 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, struct ll_sb_info *sbi = ll_i2sbi(inode); struct obd_ioctl_data *data; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%u\n", inode->i_ino, + inode->i_generation, inode, cmd); switch(cmd) { case IOC_MDC_LOOKUP: { struct ptlrpc_request *request = NULL; + struct ll_fid fid; char *buf = NULL; + struct mds_body *body; char *filename; int 
namelen, rc, err, len = 0; - int ea_size = 0; // obd_size_wiremd(&sbi->ll_osc_conn, NULL); unsigned long valid; rc = obd_ioctl_getdata(&buf, &len, (void *)arg); @@ -789,29 +794,32 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, } valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE; - rc = mdc_getattr_name(&sbi->ll_mdc_conn, inode, filename, - namelen, valid, ea_size, &request); + ll_inode2fid(&fid, inode); + rc = mdc_getattr_name(&sbi->ll_mdc_conn, &fid, + filename, namelen, valid, 0, &request); if (rc < 0) { CERROR("mdc_getattr_name: %d\n", rc); GOTO(out, rc); - } else { - struct mds_body *body; - body = lustre_msg_buf(request->rq_repmsg, 0); - /* surely there's a better way -phik */ - data->ioc_obdo1.o_mode = body->mode; - data->ioc_obdo1.o_uid = body->uid; - data->ioc_obdo1.o_gid = body->gid; } + body = lustre_msg_buf(request->rq_repmsg, 0, sizeof (*body)); + LASSERT (body != NULL); /* checked by mdc_getattr_name() */ + LASSERT_REPSWABBED (request, 0); /* swabbed by mdc_getattr_name() */ + + /* surely there's a better way -phik */ + data->ioc_obdo1.o_mode = body->mode; + data->ioc_obdo1.o_uid = body->uid; + data->ioc_obdo1.o_gid = body->gid; + + ptlrpc_req_finished(request); + err = copy_to_user((void *)arg, buf, len); if (err) - GOTO(out_req, rc = -EFAULT); + GOTO(out, rc = -EFAULT); EXIT; - out_req: - ptlrpc_req_finished(request); out: - OBD_FREE(buf, len); + obd_ioctl_freedata(buf, len); return rc; } default: @@ -820,8 +828,21 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, } } +int ll_dir_open(struct inode *inode, struct file *file) +{ + return ll_file_open(inode, file); +} + +int ll_dir_release(struct inode *inode, struct file *file) +{ + return ll_file_release(inode, file); +} + struct file_operations ll_dir_operations = { + open: ll_dir_open, + release: ll_dir_release, read: generic_read_dir, readdir: ll_readdir, ioctl: ll_dir_ioctl }; + diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 4c16e1c..3429b28 
100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -23,11 +23,14 @@ */ #define DEBUG_SUBSYSTEM S_LLITE - #include <linux/lustre_dlm.h> #include <linux/lustre_lite.h> #include <linux/obd_lov.h> /* for lov_mds_md_size() in lov_setstripe() */ #include <linux/random.h> +#include <linux/pagemap.h> +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#include <linux/lustre_compat25.h> +#endif int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc); extern int ll_setattr(struct dentry *de, struct iattr *attr); @@ -44,22 +47,25 @@ static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode, /* Complete the open request and remove it from replay list */ rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, inode->i_ino, - inode->i_mode, &fd->fd_mdshandle, &req); + inode->i_mode, &fd->fd_mds_och.och_fh, &req); if (rc) CERROR("inode %lu close failed: rc = %d\n", inode->i_ino, rc); - imp = fd->fd_req->rq_import; + imp = fd->fd_mds_och.och_req->rq_import; LASSERT(imp != NULL); spin_lock_irqsave(&imp->imp_lock, flags); - DEBUG_REQ(D_HA, fd->fd_req, "matched open req %p", fd->fd_req); + DEBUG_REQ(D_HA, fd->fd_mds_och.och_req, "matched open req %p", + fd->fd_mds_och.och_req); /* We held on to the request for replay until we saw a close for that * file. Now that we've closed it, it gets replayed on the basis of * its transno only. */ - fd->fd_req->rq_flags &= ~PTL_RPC_FL_REPLAY; + spin_lock (&fd->fd_mds_och.och_req->rq_lock); + fd->fd_mds_och.och_req->rq_replay = 0; + spin_unlock (&fd->fd_mds_och.och_req->rq_lock); - if (fd->fd_req->rq_transno) { + if (fd->fd_mds_och.och_req->rq_transno) { /* This open created a file, so it needs replay as a * normal transaction now. Our reference to it now * effectively owned by the imp_replay_list, and it'll @@ -78,7 +84,7 @@ static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode, * the basis of that and we don't need to do anything * magical here. 
*/ if (!req->rq_transno) { - req->rq_transno = fd->fd_req->rq_transno; + req->rq_transno = fd->fd_mds_och.och_req->rq_transno; ptlrpc_retain_replayable_request(req, imp); } spin_unlock_irqrestore(&imp->imp_lock, flags); @@ -92,14 +98,14 @@ static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode, /* No transno means that we can just drop our ref. */ spin_unlock_irqrestore(&imp->imp_lock, flags); } - ptlrpc_req_finished(fd->fd_req); + ptlrpc_req_finished(fd->fd_mds_och.och_req); /* Do this after the fd_req->rq_transno check, because we don't want * to bounce off zero references. */ ptlrpc_req_finished(req); - fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC; + fd->fd_mds_och.och_fh.cookie = DEAD_HANDLE_MAGIC; file->private_data = NULL; - kmem_cache_free(ll_file_data_slab, fd); + OBD_SLAB_FREE(fd, ll_file_data_slab, sizeof *fd); RETURN(-abs(rc)); } @@ -109,7 +115,7 @@ static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode, * rarely check close errors and even if an error is returned they will not * re-try the close call. */ -static int ll_file_release(struct inode *inode, struct file *file) +int ll_file_release(struct inode *inode, struct file *file) { struct ll_file_data *fd; struct obdo oa; @@ -119,6 +125,12 @@ static int ll_file_release(struct inode *inode, struct file *file) int rc = 0, rc2; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + + /* don't do anything for / */ + if (inode->i_sb->s_root == file->f_dentry) + RETURN(0); fd = (struct ll_file_data *)file->private_data; if (!fd) /* no process opened the file after an mcreate */ @@ -126,22 +138,24 @@ static int ll_file_release(struct inode *inode, struct file *file) /* we might not be able to get a valid handle on this file * again so we really want to flush our write cache.. 
*/ - filemap_fdatasync(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + if (S_ISREG(inode->i_mode)) { + filemap_fdatasync(inode->i_mapping); + filemap_fdatawait(inode->i_mapping); - if (lsm != NULL) { - memset(&oa, 0, sizeof(oa)); - oa.o_id = lsm->lsm_object_id; - oa.o_mode = S_IFREG; - oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID; + if (lsm != NULL) { + memset(&oa, 0, sizeof(oa)); + oa.o_id = lsm->lsm_object_id; + oa.o_mode = S_IFREG; + oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID; - memcpy(&oa.o_inline, fd->fd_ostdata, FD_OSTDATA_SIZE); - oa.o_valid |= OBD_MD_FLHANDLE; + memcpy(&oa.o_inline, &fd->fd_ost_och, FD_OSTDATA_SIZE); + oa.o_valid |= OBD_MD_FLHANDLE; - rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL); - if (rc) - CERROR("inode %lu object close failed: rc = %d\n", - inode->i_ino, rc); + rc = obd_close(&sbi->ll_osc_conn, &oa, lsm, NULL); + if (rc) + CERROR("inode %lu object close failed: rc = " + "%d\n", inode->i_ino, rc); + } } rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file); @@ -155,20 +169,24 @@ static int ll_local_open(struct file *file, struct lookup_intent *it) { struct ptlrpc_request *req = it->it_data; struct ll_file_data *fd; - struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1); + struct mds_body *body; ENTRY; + body = lustre_msg_buf (req->rq_repmsg, 1, sizeof (*body)); + LASSERT (body != NULL); /* reply already checked out */ + LASSERT_REPSWABBED (req, 1); /* and swabbed down */ + LASSERT(!file->private_data); - fd = kmem_cache_alloc(ll_file_data_slab, SLAB_KERNEL); + OBD_SLAB_ALLOC(fd, ll_file_data_slab, SLAB_KERNEL, sizeof *fd); /* We can't handle this well without reorganizing ll_file_open and * ll_mdc_close, so don't even try right now. 
*/ LASSERT(fd != NULL); memset(fd, 0, sizeof(*fd)); - memcpy(&fd->fd_mdshandle, &body->handle, sizeof(body->handle)); - fd->fd_req = it->it_data; + memcpy(&fd->fd_mds_och.och_fh, &body->handle, sizeof(body->handle)); + fd->fd_mds_och.och_req = it->it_data; file->private_data = fd; RETURN(0); @@ -189,16 +207,13 @@ static int ll_osc_open(struct lustre_handle *conn, struct inode *inode, oa->o_mode = S_IFREG; oa->o_valid = (OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME); - rc = obd_open(conn, oa, lsm, NULL); + rc = obd_open(conn, oa, lsm, NULL, &fd->fd_ost_och); if (rc) GOTO(out, rc); file->f_flags &= ~O_LOV_DELAY_CREATE; - obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | - OBD_MD_FLCTIME); - - if (oa->o_valid & OBD_MD_FLHANDLE) - memcpy(fd->fd_ostdata, obdo_handle(oa), FD_OSTDATA_SIZE); + obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | + OBD_MD_FLMTIME | OBD_MD_FLCTIME); EXIT; out: @@ -219,6 +234,7 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode, struct lov_mds_md *lmm = NULL; struct obdo *oa; struct iattr iattr; + struct mdc_op_data op_data; int rc, err, lmm_size = 0;; ENTRY; @@ -230,8 +246,7 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode, oa->o_id = inode->i_ino; /* Keep these 0 for now, because chown/chgrp does not change the * ownership on the OST, and we don't want to allow BA OST NFS - * users to access these objects by mistake. - */ + * users to access these objects by mistake. 
*/ oa->o_uid = 0; oa->o_gid = 0; oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE | @@ -247,6 +262,7 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode, } GOTO(out_oa, rc); } + obdo_to_inode(inode, oa, OBD_MD_FLBLKSZ); LASSERT(lsm && lsm->lsm_object_id); rc = obd_packmd(conn, &lmm, lsm); @@ -258,11 +274,14 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode, /* Save the stripe MD with this file on the MDS */ memset(&iattr, 0, sizeof(iattr)); iattr.ia_valid = ATTR_FROM_OPEN; - rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, inode, &iattr, - lmm, lmm_size, &req); + + ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); + + rc = mdc_setattr(&ll_i2sbi(inode)->ll_mdc_conn, &op_data, + &iattr, lmm, lmm_size, &req); ptlrpc_req_finished(req); - obd_free_wiremd(conn, &lmm); + obd_free_diskmd (conn, &lmm); /* If we couldn't complete mdc_open() and store the stripe MD on the * MDS, we need to destroy the objects now or they will be leaked. 
@@ -273,6 +292,7 @@ static int ll_create_obj(struct lustre_handle *conn, struct inode *inode, GOTO(out_destroy, rc); } lli->lli_smd = lsm; + lli->lli_maxbytes = lsm->lsm_maxbytes; EXIT; out_oa: @@ -308,7 +328,7 @@ out_destroy: */ extern int ll_it_open_error(int phase, struct lookup_intent *it); -static int ll_file_open(struct inode *inode, struct file *file) +int ll_file_open(struct inode *inode, struct file *file) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ll_inode_info *lli = ll_i2info(inode); @@ -318,7 +338,13 @@ static int ll_file_open(struct inode *inode, struct file *file) int rc = 0; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + + /* don't do anything for / */ + if (inode->i_sb->s_root == file->f_dentry) + RETURN(0); + LL_GET_INTENT(file->f_dentry, it); rc = ll_it_open_error(IT_OPEN_OPEN, it); if (rc) @@ -328,7 +354,10 @@ static int ll_file_open(struct inode *inode, struct file *file) if (rc) LBUG(); - mdc_set_open_replay_data((struct ll_file_data *)file->private_data); + mdc_set_open_replay_data(&((struct ll_file_data *) + file->private_data)->fd_mds_och); + if (!S_ISREG(inode->i_mode)) + RETURN(0); lsm = lli->lli_smd; if (lsm == NULL) { @@ -364,69 +393,86 @@ static int ll_file_open(struct inode *inode, struct file *file) * really does the getattr on the inode and updates its fields */ int ll_inode_getattr(struct inode *inode, struct lov_stripe_md *lsm, - char *ostdata) + void *ostdata) { struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + struct ptlrpc_request_set *set; struct obdo oa; + int bef, aft; + unsigned long before, after; int rc; ENTRY; LASSERT(lsm); LASSERT(sbi); + LASSERT(lli); memset(&oa, 0, sizeof oa); oa.o_id = lsm->lsm_object_id; oa.o_mode = S_IFREG; oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | - OBD_MD_FLBLOCKS | OBD_MD_FLMTIME | OBD_MD_FLCTIME; + OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | 
OBD_MD_FLMTIME | + OBD_MD_FLCTIME; if (ostdata != NULL) { memcpy(&oa.o_inline, ostdata, FD_OSTDATA_SIZE); oa.o_valid |= OBD_MD_FLHANDLE; } - rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm); - if (rc) - RETURN(rc); - - obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | - OBD_MD_FLMTIME | OBD_MD_FLCTIME); - - CDEBUG(D_INODE, "objid "LPX64" size %Lu/%Lu\n", lsm->lsm_object_id, - inode->i_size, inode->i_size); - RETURN(0); -} - -/* - * we've acquired a lock and need to see if we should perform a getattr - * to update the file size that may have been updated by others that had - * their locks canceled. - */ -static int ll_size_validate(struct inode *inode, struct lov_stripe_md *lsm, - char *ostdata, struct ldlm_extent *extent) -{ - struct ll_inode_info *lli = ll_i2info(inode); - int rc = 0; - ENTRY; - - if (test_bit(LLI_F_DID_GETATTR, &lli->lli_flags)) + /* getattr can race with writeback. we don't want to trust a getattr + * that doesn't include the writeback of our farthest cached pages + * that it raced with. 
*/ + do { + bef = ll_farthest_dirty(&lli->lli_dirty, &before); +#if 0 + rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm); +#else + set = ptlrpc_prep_set (); + if (set == NULL) { + CERROR ("ENOMEM allocing request set\n"); + rc = -ENOMEM; + } else { + rc = obd_getattr_async(&sbi->ll_osc_conn, &oa, lsm, set); + if (rc == 0) + rc = ptlrpc_set_wait (set); + ptlrpc_set_destroy (set); + } +#endif + if (rc) + RETURN(rc); + + aft = ll_farthest_dirty(&lli->lli_dirty, &after); + CDEBUG(D_INODE, " %d,%lu -> %d,%lu\n", bef, before, aft, after); + } while (bef == 0 && + (aft != 0 || after < before) && + oa.o_size < ((u64)before + 1) << PAGE_CACHE_SHIFT); + + obdo_to_inode(inode, &oa, (OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | + OBD_MD_FLMTIME | OBD_MD_FLCTIME)); + if (inode->i_blksize < PAGE_CACHE_SIZE) + inode->i_blksize = PAGE_CACHE_SIZE; + + /* make sure getattr doesn't return a size that causes writeback + * to forget about cached writes */ + if ((aft == 0) && oa.o_size < ((u64)after + 1) << PAGE_CACHE_SHIFT) { + CDEBUG(D_INODE, "cached at %lu, keeping %llu i_size instead " + "of oa "LPU64"\n", after, inode->i_size, + oa.o_size); RETURN(0); - - down(&lli->lli_getattr_sem); - - if (!test_bit(LLI_F_DID_GETATTR, &lli->lli_flags)) { - rc = ll_inode_getattr(inode, lsm, ostdata); - if ( rc == 0 ) - set_bit(LLI_F_DID_GETATTR, &lli->lli_flags); } - up(&lli->lli_getattr_sem); - RETURN(rc); + obdo_to_inode(inode, &oa, OBD_MD_FLSIZE); + + CDEBUG(D_INODE, "objid "LPX64" size %Lu/%Lu blksize %lu\n", + lsm->lsm_object_id, inode->i_size, inode->i_size, + inode->i_blksize); + RETURN(0); } /* * some callers, notably truncate, really don't want i_size set based - * on the the size returned by the getattr, or lock acquisition in + * on the the size returned by the getattr, or lock acquisition in * the future. 
*/ int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode, @@ -438,14 +484,14 @@ int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode, int rc, flags = 0; ENTRY; - LASSERT(lockh->addr == 0 && lockh->cookie == 0); + LASSERT(lockh->cookie == 0); /* XXX phil: can we do this? won't it screw the file size up? */ if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) || (sbi->ll_flags & LL_SBI_NOLCK)) RETURN(0); - CDEBUG(D_INFO, "Locking inode %lu, start "LPU64" end "LPU64"\n", + CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n", inode->i_ino, extent->start, extent->end); rc = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, extent, @@ -454,30 +500,53 @@ int ll_extent_lock_no_validate(struct ll_file_data *fd, struct inode *inode, RETURN(rc); } + /* - * this grabs a lock and manually implements behaviour that makes it look - * like the OST is returning the file size with each lock acquisition + * this grabs a lock and manually implements behaviour that makes it look like + * the OST is returning the file size with each lock acquisition. */ int ll_extent_lock(struct ll_file_data *fd, struct inode *inode, struct lov_stripe_md *lsm, int mode, struct ldlm_extent *extent, struct lustre_handle *lockh) { - int rc; + struct ll_inode_info *lli = ll_i2info(inode); + struct ldlm_extent size_lock; + struct lustre_handle match_lockh = {0}; + int flags = LDLM_FL_CBPENDING | LDLM_FL_BLOCK_GRANTED; + int rc, matched; ENTRY; rc = ll_extent_lock_no_validate(fd, inode, lsm, mode, extent, lockh); + if (rc != ELDLM_OK) + RETURN(rc); - if (rc == ELDLM_OK) { - rc = ll_size_validate(inode, lsm, fd ? fd->fd_ostdata : NULL, - extent); - if ( rc != 0 ) { - ll_extent_unlock(fd, inode, lsm, mode, lockh); - rc = ELDLM_GETATTR_ERROR; - } + if (test_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags)) + RETURN(0); + + rc = ll_inode_getattr(inode, lsm, fd ? 
&fd->fd_ost_och : NULL); + if (rc) { + ll_extent_unlock(fd, inode, lsm, mode, lockh); + RETURN(rc); } - RETURN(rc); + size_lock.start = inode->i_size; + size_lock.end = OBD_OBJECT_EOF; + + /* XXX I bet we should be checking the lock ignore flags.. */ + matched = obd_match(&ll_i2sbi(inode)->ll_osc_conn, lsm, LDLM_EXTENT, + &size_lock, sizeof(size_lock), LCK_PR, &flags, + &match_lockh); + + /* hey, alright, we hold a size lock that covers the size we + * just found, its not going to change for a while.. */ + if (matched == 1) { + set_bit(LLI_F_HAVE_SIZE_LOCK, &lli->lli_flags); + obd_cancel(&ll_i2sbi(inode)->ll_osc_conn, lsm, LCK_PR, + &match_lockh); + } + + RETURN(0); } int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode, @@ -513,16 +582,13 @@ static inline void ll_remove_suid(struct inode *inode) } } +#if 0 static void ll_update_atime(struct inode *inode) { #ifdef USE_ATIME struct iattr attr; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - attr.ia_atime = CURRENT_TIME; -#else - attr.ia_atime = CURRENT_TIME.tv_sec; -#endif + attr.ia_atime = LTIME_S(CURRENT_TIME); attr.ia_valid = ATTR_ATIME; if (inode->i_atime == attr.ia_atime) return; @@ -536,19 +602,170 @@ static void ll_update_atime(struct inode *inode) inode->i_atime = CURRENT_TIME; #endif } +#endif + +/* + * flush the page cache for an extent as its canceled. when we're on an + * lov we get a lock cancelation for each of the obd locks under the lov + * so we have to map the obd's region back onto the stripes in the file + * that it held. + * + * no one can dirty the extent until we've finished our work and they + * can enqueue another lock. 
+ * + * XXX this could be asking the inode's dirty tree for info + */ +void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm, + struct ldlm_lock *lock) +{ + struct ldlm_extent *extent = &lock->l_extent; + unsigned long start, end, count, skip, i, j; + struct page *page; + int ret; + ENTRY; + + CDEBUG(D_INODE, "obdo %lu inode %p ["LPU64"->"LPU64"] size: %llu\n", + inode->i_ino, inode, extent->start, extent->end, inode->i_size); + + start = extent->start >> PAGE_CACHE_SHIFT; + count = ~0; + skip = 0; + end = (extent->end >> PAGE_CACHE_SHIFT) + 1; + if ((end << PAGE_CACHE_SHIFT) < extent->end) + end = ~0; + if (lsm->lsm_stripe_count > 1) { + struct { + char name[16]; + struct ldlm_lock *lock; + struct lov_stripe_md *lsm; + } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm }; + __u32 stripe; + __u32 vallen = sizeof(stripe); + int rc; + + /* get our offset in the lov */ + rc = obd_get_info(ll_i2obdconn(inode), sizeof(key), + &key, &vallen, &stripe); + if (rc != 0) { + CERROR("obd_get_info: rc = %d\n", rc); + LBUG(); + } + LASSERT(stripe < lsm->lsm_stripe_count); + + count = lsm->lsm_stripe_size >> PAGE_CACHE_SHIFT; + skip = (lsm->lsm_stripe_count - 1) * count; + start += (start/count * skip) + (stripe * count); + if (end != ~0) + end += (end/count * skip) + (stripe * count); + } + + i = (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + if (end >= i) + clear_bit(LLI_F_HAVE_SIZE_LOCK, &(ll_i2info(inode)->lli_flags)); + if (i < end) + end = i; + + CDEBUG(D_INODE, "start: %lu j: %lu count: %lu skip: %lu end: %lu\n", + start, start % count, count, skip, end); + + /* start writeback on dirty pages in the extent when its PW */ + for (i = start, j = start % count; + lock->l_granted_mode == LCK_PW && i < end; j++, i++) { + if (j == count) { + i += skip; + j = 0; + } + /* its unlikely, but give us a chance to bail when we're out */ + PGCACHE_WRLOCK(inode->i_mapping); + if (list_empty(&inode->i_mapping->dirty_pages)) { + CDEBUG(D_INODE, 
"dirty list empty\n"); + PGCACHE_WRUNLOCK(inode->i_mapping); + break; + } + PGCACHE_WRUNLOCK(inode->i_mapping); + + if (need_resched()) + schedule(); + + page = find_get_page(inode->i_mapping, i); + if (page == NULL) + continue; + if (!PageDirty(page) || TryLockPage(page)) { + page_cache_release(page); + continue; + } + if (PageDirty(page)) { + CDEBUG(D_INODE, "writing page %p\n", page); + PGCACHE_WRLOCK(inode->i_mapping); + list_del(&page->list); + list_add(&page->list, &inode->i_mapping->locked_pages); + PGCACHE_WRUNLOCK(inode->i_mapping); + + /* this writepage might write out pages outside + * this extent, but that's ok, the pages are only + * still dirty because a lock still covers them */ + ClearPageDirty(page); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + ret = inode->i_mapping->a_ops->writepage(page); +#else + ret = inode->i_mapping->a_ops->writepage(page, NULL); +#endif + if (ret != 0) + unlock_page(page); + } else { + unlock_page(page); + } + page_cache_release(page); + + } + + /* our locks are page granular thanks to osc_enqueue, we invalidate the + * whole page. 
*/ + LASSERT((extent->start & ~PAGE_CACHE_MASK) == 0); + LASSERT(((extent->end+1) & ~PAGE_CACHE_MASK) == 0); + for (i = start, j = start % count ; i < end ; j++, i++) { + if ( j == count ) { + i += skip; + j = 0; + } + PGCACHE_WRLOCK(inode->i_mapping); + if (list_empty(&inode->i_mapping->dirty_pages) && + list_empty(&inode->i_mapping->clean_pages) && + list_empty(&inode->i_mapping->locked_pages)) { + CDEBUG(D_INODE, "nothing left\n"); + PGCACHE_WRUNLOCK(inode->i_mapping); + break; + } + PGCACHE_WRUNLOCK(inode->i_mapping); + if (need_resched()) + schedule(); + page = find_get_page(inode->i_mapping, i); + if (page == NULL) + continue; + CDEBUG(D_INODE, "dropping page %p at %lu\n", page, page->index); + lock_page(page); + if (page->mapping) /* might have raced */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + truncate_complete_page(page); +#else + truncate_complete_page(page->mapping, page); +#endif + unlock_page(page); + page_cache_release(page); + } + EXIT; +} int ll_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new, void *data, int flag) { struct inode *inode = data; struct ll_inode_info *lli = ll_i2info(inode); - struct lustre_handle lockh = { 0, 0 }; + struct lustre_handle lockh = { 0 }; int rc; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); - if (inode == NULL) - LBUG(); + LASSERT(inode != NULL); switch (flag) { case LDLM_CB_BLOCKING: @@ -562,11 +779,10 @@ int ll_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new, * could know to write-back or simply throw away the pages * based on if the cancel comes from a desire to, say, * read or truncate.. 
*/ - CDEBUG(D_INODE, "invalidating obdo/inode %lu\n", inode->i_ino); - filemap_fdatasync(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); - clear_bit(LLI_F_DID_GETATTR, &lli->lli_flags); - truncate_inode_pages(inode->i_mapping, 0); + LASSERT((unsigned long)inode > 0x1000); + LASSERT((unsigned long)lli > 0x1000); + LASSERT((unsigned long)lli->lli_smd > 0x1000); + ll_pgcache_remove_extent(inode, lli->lli_smd, lock); break; default: LBUG(); @@ -582,27 +798,29 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, struct inode *inode = filp->f_dentry->d_inode; struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; - struct lustre_handle lockh = { 0, 0 }; + struct lustre_handle lockh = { 0 }; struct ll_read_extent rextent; ldlm_error_t err; ssize_t retval; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n", + inode->i_ino, inode->i_generation, inode, count, *ppos); /* "If nbyte is 0, read() will return 0 and have no other results." * -- Single Unix Spec */ if (count == 0) RETURN(0); + /* grab a -> eof extent to push extending writes out of node's caches + * so we can see them at the getattr after lock acquisition. this will + * turn into a seperate [*ppos + count, EOF] 'size intent' lock attempt + * in the future. */ rextent.re_extent.start = *ppos; - rextent.re_extent.end = *ppos + count - 1; + rextent.re_extent.end = OBD_OBJECT_EOF; - err = ll_extent_lock(fd, inode, lsm, - LCK_PR, &rextent.re_extent, &lockh); - if (err != ELDLM_OK && err != ELDLM_LOCK_MATCHED) { - retval = -ENOLCK; - RETURN(retval); - } + err = ll_extent_lock(fd, inode, lsm, LCK_PR, &rextent.re_extent,&lockh); + if (err != ELDLM_OK) + RETURN(-ENOLCK); /* XXX tell ll_readpage what pages have a PR lock.. 
*/ rextent.re_task = current; @@ -618,9 +836,6 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, list_del(&rextent.re_lli_item); spin_unlock(&lli->lli_read_extent_lock); - if (retval > 0) - ll_update_atime(inode); - /* XXX errors? */ ll_extent_unlock(fd, inode, lsm, LCK_PR, &lockh); RETURN(retval); @@ -634,40 +849,72 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) { struct ll_file_data *fd = file->private_data; struct inode *inode = file->f_dentry->d_inode; - struct lustre_handle lockh = { 0, 0 }; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; + struct lustre_handle lockh = { 0 }; struct ldlm_extent extent; + loff_t maxbytes = ll_file_maxbytes(inode); ldlm_error_t err; ssize_t retval; + char should_validate = 1; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n", + inode->i_ino, inode->i_generation, inode, count, *ppos); + + /* + * sleep doing some writeback work of this mount's dirty data + * if the VM thinks we're low on memory.. other dirtying code + * paths should think about doing this, too, but they should be + * careful not to hold locked pages while they do so. like + * ll_prepare_write. 
*cough* + */ + LL_CHECK_DIRTY(inode->i_sb); /* POSIX, but surprised the VFS doesn't check this already */ if (count == 0) RETURN(0); - CDEBUG(D_VFSTRACE, "VFS Op\n"); - if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) { + if (file->f_flags & O_APPEND) { extent.start = 0; extent.end = OBD_OBJECT_EOF; } else { extent.start = *ppos; extent.end = *ppos + count - 1; + /* we really don't care what i_size is if we're doing + * fully page aligned writes */ + if ((*ppos & ~PAGE_CACHE_MASK) == 0 && + (count & ~PAGE_CACHE_MASK) == 0) + should_validate = 0; } - err = ll_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh); - if (err != ELDLM_OK && err != ELDLM_LOCK_MATCHED) { - retval = -ENOLCK; - RETURN(retval); - } - - if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) + if (should_validate) + err = ll_extent_lock(fd, inode, lsm, LCK_PW, &extent, &lockh); + else + err = ll_extent_lock_no_validate(fd, inode, lsm, LCK_PW, + &extent, &lockh); + if (err != ELDLM_OK) + RETURN(-ENOLCK); + + /* this is ok, g_f_w will overwrite this under i_sem if it races + * with a local truncate, it just makes our maxbyte checking easier */ + if (file->f_flags & O_APPEND) *ppos = inode->i_size; + if (*ppos >= maxbytes) { + if (count || *ppos > maxbytes) { + send_sig(SIGXFSZ, current, 0); + GOTO(out, retval = -EFBIG); + } + } + if (*ppos + count > maxbytes) + count = maxbytes - *ppos; + CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n", inode->i_ino, count, *ppos); + /* generic_file_write handles O_APPEND after getting i_sem */ retval = generic_file_write(file, buf, count, ppos); +out: /* XXX errors? 
*/ ll_extent_unlock(fd, inode, lsm, LCK_PW, &lockh); RETURN(retval); @@ -686,7 +933,7 @@ static int ll_lov_setstripe(struct inode *inode, struct file *file, lsm = lli->lli_smd; if (lsm) { up(&lli->lli_open_sem); - CERROR("stripe already set for ino %lu\n", inode->i_ino); + CERROR("stripe already exists for ino %lu\n", inode->i_ino); /* If we haven't already done the open, do so now */ if (file->f_flags & O_LOV_DELAY_CREATE) { int rc2 = ll_osc_open(conn, inode, file, lsm); @@ -694,7 +941,7 @@ static int ll_lov_setstripe(struct inode *inode, struct file *file, RETURN(rc2); } - RETURN(-EALREADY); + RETURN(-EEXIST); } rc = obd_iocontrol(LL_IOC_LOV_SETSTRIPE, conn, 0, &lsm, (void *)arg); @@ -730,8 +977,8 @@ int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd, struct ll_file_data *fd = file->private_data; struct lustre_handle *conn; int flags; - - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),cmd=%u\n", inode->i_ino, + inode->i_generation, inode, cmd); if ((cmd & 0xffffff00) == ((int)'T') << 8) /* tty ioctls */ return -ENOTTY; @@ -780,19 +1027,19 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin) struct inode *inode = file->f_dentry->d_inode; struct ll_file_data *fd = file->private_data; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; - struct lustre_handle lockh = {0, 0}; + struct lustre_handle lockh = {0}; loff_t retval; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),to=%llu\n", inode->i_ino, + inode->i_generation, inode, + offset + ((origin==2) ? 
inode->i_size : file->f_pos)); - CDEBUG(D_VFSTRACE, "VFS Op\n"); if (origin == 2) { /* SEEK_END */ ldlm_error_t err; struct ldlm_extent extent = {0, OBD_OBJECT_EOF}; err = ll_extent_lock(fd, inode, lsm, LCK_PR, &extent, &lockh); - if (err != ELDLM_OK && err != ELDLM_LOCK_MATCHED) { - retval = -ENOLCK; - RETURN(retval); - } + if (err != ELDLM_OK) + RETURN(-ENOLCK); offset += inode->i_size; } else if (origin == 1) { /* SEEK_CUR */ @@ -800,7 +1047,7 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin) } retval = -EINVAL; - if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + if (offset >= 0 && offset <= ll_file_maxbytes(inode)) { if (offset != file->f_pos) { file->f_pos = offset; #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) @@ -819,7 +1066,10 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin) int ll_fsync(struct file *file, struct dentry *dentry, int data) { int ret; + struct inode *inode = dentry->d_inode; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); /* * filemap_fdata{sync,wait} are also called at PW lock cancelation so @@ -837,14 +1087,15 @@ int ll_fsync(struct file *file, struct dentry *dentry, int data) int ll_inode_revalidate(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - struct lov_stripe_md *lsm; + struct lov_stripe_md *lsm = NULL; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); if (!inode) { CERROR("REPORT THIS LINE TO PETER\n"); RETURN(0); } + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),name=%s\n", + inode->i_ino, inode->i_generation, inode, dentry->d_name.name); /* this is very tricky. 
it is unsafe to call ll_have_md_lock when we have a referenced lock: because it may cause an RPC @@ -855,37 +1106,67 @@ int ll_inode_revalidate(struct dentry *dentry) !ll_have_md_lock(dentry)) { struct ptlrpc_request *req = NULL; struct ll_sb_info *sbi = ll_i2sbi(dentry->d_inode); + struct ll_fid fid; struct mds_body *body; + struct lov_mds_md *lmm; unsigned long valid = 0; - int datalen = 0, rc; + int eadatalen = 0, rc; /* Why don't we update all valid MDS fields here, if we're * doing an RPC anyways? -phil */ if (S_ISREG(inode->i_mode)) { - datalen = obd_size_wiremd(&sbi->ll_osc_conn, NULL); + eadatalen = obd_size_diskmd(&sbi->ll_osc_conn, NULL); valid |= OBD_MD_FLEASIZE; } - rc = mdc_getattr(&sbi->ll_mdc_conn, inode->i_ino, - inode->i_mode, valid, datalen, &req); + ll_inode2fid(&fid, inode); + rc = mdc_getattr(&sbi->ll_mdc_conn, &fid, + valid, eadatalen, &req); if (rc) { CERROR("failure %d inode %lu\n", rc, inode->i_ino); - ptlrpc_req_finished(req); RETURN(-abs(rc)); } - body = lustre_msg_buf(req->rq_repmsg, 0); + body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body)); + LASSERT (body != NULL); /* checked by mdc_getattr() */ + LASSERT_REPSWABBED (req, 0); /* swabbed by mdc_getattr() */ if (S_ISREG(inode->i_mode) && - body->valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) { + (body->valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS))) { CERROR("MDS sent back size for regular file\n"); body->valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); } - if (body->valid & OBD_MD_FLEASIZE) - ll_update_inode(inode, body, - lustre_msg_buf(req->rq_repmsg, 1)); - else - ll_update_inode(inode, body, NULL); + /* XXX Too paranoid? */ + if ((body->valid ^ valid) & OBD_MD_FLEASIZE) + CERROR("Asked for %s eadata but got %s\n", + (valid & OBD_MD_FLEASIZE) ? "some" : "no", + (body->valid & OBD_MD_FLEASIZE) ? 
"some":"none"); + + if (S_ISREG(inode->i_mode) && + (body->valid & OBD_MD_FLEASIZE)) { + if (body->eadatasize == 0) { /* no EA data */ + CERROR("OBD_MD_FLEASIZE set but no data\n"); + RETURN(-EPROTO); + } + /* Only bother with this if inode's lsm not set? */ + lmm = lustre_msg_buf(req->rq_repmsg,1,body->eadatasize); + LASSERT(lmm != NULL); /* mdc_getattr() checked */ + LASSERT_REPSWABBED(req, 1); /* mdc_getattr() swabbed */ + + rc = obd_unpackmd (&sbi->ll_osc_conn, + &lsm, lmm, body->eadatasize); + if (rc < 0) { + CERROR("Error %d unpacking eadata\n", rc); + ptlrpc_req_finished(req); + RETURN(rc); + } + LASSERT(rc >= sizeof (*lsm)); + } + + ll_update_inode(inode, body, lsm); + if (lsm != NULL && ll_i2info(inode)->lli_smd != lsm) + obd_free_memmd(&sbi->ll_osc_conn, &lsm); + ptlrpc_req_finished(req); } @@ -901,12 +1182,12 @@ int ll_inode_revalidate(struct dentry *dentry) */ { struct ldlm_extent extent = {0, OBD_OBJECT_EOF}; - struct lustre_handle lockh = {0, 0}; + struct lustre_handle lockh = {0}; ldlm_error_t err; err = ll_extent_lock(NULL, inode, lsm, LCK_PR, &extent, &lockh); - if (err != ELDLM_OK && err != ELDLM_LOCK_MATCHED ) - RETURN(-abs(err)); /* XXX can't be right */ + if (err != ELDLM_OK) + RETURN(err); ll_extent_unlock(NULL, inode, lsm, LCK_PR, &lockh); } diff --git a/lustre/llite/iod.c b/lustre/llite/iod.c index 3a045f4..f88ed87 100644 --- a/lustre/llite/iod.c +++ b/lustre/llite/iod.c @@ -24,6 +24,7 @@ * to force writeback.. the throttling in prepare_write and kupdate's usual * writeback pressure got rid of our thread, but the file name remains. */ + #include <linux/version.h> #include <linux/config.h> #include <linux/module.h> @@ -34,6 +35,9 @@ #include <linux/kmod.h> #include <linux/pagemap.h> #include <linux/mm.h> +#include <linux/rbtree.h> +#include <linux/seq_file.h> +#include <linux/time.h> /* PG_inactive_clean is shorthand for rmap, we want free_high/low here.. 
*/ #ifdef PG_inactive_clean @@ -51,53 +55,15 @@ extern spinlock_t inode_lock; -#define LLWP_MAX_PAGES (PTL_MD_MAX_IOV) struct ll_writeback_pages { - unsigned has_whole_pages:1, - num_frags:2, - num_pages:29; - struct brw_page pgs[LLWP_MAX_PAGES]; + obd_count npgs, max; + struct brw_page *pga; }; - -/* - * ugh, we want disk allocation on the target to happen in offset order. we'll - * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do - * fine for our small page arrays and doesn't require allocation. its an - * insertion sort that swaps elements that are strides apart, shrinking the - * stride down until its '1' and the array is sorted. - */ -void sort_brw_pages(struct brw_page *array, int num) -{ - int stride, i, j; - struct brw_page tmp; - - if ( num == 1 ) - return; - - for( stride = 1; stride < num ; stride = (stride*3) +1 ) - ; - - do { - stride /= 3; - for ( i = stride ; i < num ; i++ ) { - tmp = array[i]; - j = i; - while ( j >= stride && - array[j - stride].off > tmp.off ) { - array[j] = array[j - stride]; - j -= stride; - } - array[j] = tmp; - } - } while ( stride > 1 ); -} - /* - * returns 0 if the page was inserted in the array because it was - * within i_size. if we raced with truncate and i_size was less - * than the page we can unlock the page because truncate_inode_pages will - * be waiting to cleanup the page + * check to see if we're racing with truncate and put the page in + * the brw_page array. returns 0 if there is more room and 1 + * if the array is full. */ static int llwp_consume_page(struct ll_writeback_pages *llwp, struct inode *inode, struct page *page) @@ -107,31 +73,24 @@ static int llwp_consume_page(struct ll_writeback_pages *llwp, /* we raced with truncate? 
*/ if ( off >= inode->i_size ) { + ll_remove_dirty(inode, page->index, page->index); unlock_page(page); - goto out; + return 0; } page_cache_get(page); - pg = &llwp->pgs[llwp->num_pages]; - llwp->num_pages++; + pg = &llwp->pga[llwp->npgs]; + llwp->npgs++; + LASSERT(llwp->npgs <= llwp->max); pg->pg = page; pg->off = off; pg->flag = OBD_BRW_CREATE; - pg->count = PAGE_SIZE; + pg->count = PAGE_CACHE_SIZE; /* catch partial writes for files that end mid-page */ - if ( pg->off + pg->count > inode->i_size ) - pg->count = inode->i_size & ~PAGE_MASK; - - if ( pg->count == PAGE_SIZE ) { - if ( ! llwp->has_whole_pages ) { - llwp->has_whole_pages = 1; - llwp->num_frags++; - } - } else { - llwp->num_frags++; - } + if (pg->off + pg->count > inode->i_size) + pg->count = inode->i_size & ~PAGE_CACHE_MASK; /* * matches ptlrpc_bulk_get assert that trickles down @@ -141,14 +100,10 @@ static int llwp_consume_page(struct ll_writeback_pages *llwp, LASSERT(pg->count >= 0); CDEBUG(D_CACHE, "brw_page %p: off "LPU64" cnt %d, page %p: ind %ld" - " i_size: "LPU64"\n", pg, pg->off, pg->count, page, + " i_size: %llu\n", pg, pg->off, pg->count, page, page->index, inode->i_size); - if ( llwp->num_frags == 3 || llwp->num_pages == LLWP_MAX_PAGES ) - return -1; - -out: - return 0; + return llwp->npgs == llwp->max; } /* @@ -165,7 +120,7 @@ static void ll_get_dirty_pages(struct inode *inode, struct list_head *pos, *n; ENTRY; - spin_lock(&pagecache_lock); + PGCACHE_WRLOCK(mapping); list_for_each_prev_safe(pos, n, &mapping->dirty_pages) { page = list_entry(pos, struct page, list); @@ -186,46 +141,51 @@ static void ll_get_dirty_pages(struct inode *inode, break; } - spin_unlock(&pagecache_lock); + PGCACHE_WRUNLOCK(mapping); EXIT; } -static void ll_brw_pages_unlock( struct inode *inode, - struct ll_writeback_pages *llwp) +static void ll_writeback(struct inode *inode, struct ll_writeback_pages *llwp) { int rc, i; - struct obd_brw_set *set; + struct ptlrpc_request_set *set; ENTRY; - sort_brw_pages(llwp->pgs, 
llwp->num_pages); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),bytes=%u\n", + inode->i_ino, inode->i_generation, inode, + ((llwp->npgs-1) << PAGE_SHIFT) + llwp->pga[llwp->npgs-1].count); - set = obd_brw_set_new(); + set = ptlrpc_prep_set(); if (set == NULL) { - EXIT; - return; + CERROR ("Can't create request set\n"); + rc = -ENOMEM; + } else { + rc = obd_brw_async(OBD_BRW_WRITE, ll_i2obdconn(inode), + ll_i2info(inode)->lli_smd, llwp->npgs, + llwp->pga, set, NULL); + if (rc == 0) + rc = ptlrpc_set_wait (set); + ptlrpc_set_destroy (set); } - set->brw_callback = ll_brw_sync_wait; - - rc = obd_brw(OBD_BRW_WRITE, ll_i2obdconn(inode), - ll_i2info(inode)->lli_smd, llwp->num_pages, llwp->pgs, - set, NULL); + /* + * b=1038, we need to pass _brw errors up so that writeback + * doesn't get stuck in recovery leaving processes stuck in + * D waiting for pages + */ if (rc) { - CERROR("error from obd_brw: rc = %d\n", rc); + CERROR("error from obd_brw_async: rc = %d\n", rc); + INODE_IO_STAT_ADD(inode, wb_fail, llwp->npgs); } else { - rc = ll_brw_sync_wait(set, CB_PHASE_START); - if (rc) - CERROR("error from callback: rc = %d\n", rc); + INODE_IO_STAT_ADD(inode, wb_ok, llwp->npgs); } - obd_brw_set_decref(set); - /* XXX this doesn't make sense to me */ - rc = 0; + for (i = 0 ; i < llwp->npgs ; i++) { + struct page *page = llwp->pga[i].pg; - for ( i = 0 ; i < llwp->num_pages ; i++) { - struct page *page = llwp->pgs[i].pg; - - CDEBUG(D_CACHE, "cleaning page %p\n", page); + CDEBUG(D_CACHE, "finished page %p at index %lu\n", page, + page->index); LASSERT(PageLocked(page)); + ll_remove_dirty(inode, page->index, page->index); unlock_page(page); page_cache_release(page); } @@ -233,10 +193,13 @@ static void ll_brw_pages_unlock( struct inode *inode, EXIT; } +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + #ifndef PG_inactive_clean #ifdef CONFIG_DISCONTIGMEM #error "sorry, we don't support DISCONTIGMEM yet" #endif + /* * __alloc_pages marks a zone as needing balancing if an allocation is * 
performed when the zone has fewer free pages than its 'low' water @@ -280,24 +243,35 @@ static int should_writeback(void) return 0; } -int ll_check_dirty( struct super_block *sb) +static int ll_alloc_brw(struct inode *inode, struct ll_writeback_pages *llwp) +{ + memset(llwp, 0, sizeof(struct ll_writeback_pages)); + + llwp->max = inode->i_blksize >> PAGE_CACHE_SHIFT; + if (llwp->max == 0) { + CERROR("forcing llwp->max to 1. blksize: %lu\n", + inode->i_blksize); + llwp->max = 1; + } + llwp->pga = kmalloc(llwp->max * sizeof(*llwp->pga), GFP_ATOMIC); + if (llwp->pga == NULL) + RETURN(-ENOMEM); + RETURN(0); +} + +int ll_check_dirty(struct super_block *sb) { unsigned long old_flags; /* hack? */ int making_progress; - struct ll_writeback_pages *llwp; struct inode *inode; int rc = 0; ENTRY; - if ( ! should_writeback() ) + if (!should_writeback()) return 0; old_flags = current->flags; current->flags |= PF_MEMALLOC; - llwp = kmalloc(sizeof(struct ll_writeback_pages), GFP_ATOMIC); - if ( llwp == NULL ) - GOTO(cleanup, rc = -ENOMEM); - memset(llwp, 0, offsetof(struct ll_writeback_pages, pgs)); spin_lock(&inode_lock); @@ -306,6 +280,7 @@ int ll_check_dirty( struct super_block *sb) * until the VM thinkgs we're ok again.. */ do { + struct ll_writeback_pages llwp; struct list_head *pos; inode = NULL; making_progress = 0; @@ -313,14 +288,14 @@ int ll_check_dirty( struct super_block *sb) list_for_each_prev(pos, &sb->s_dirty) { inode = list_entry(pos, struct inode, i_list); - if ( ! 
(inode->i_state & I_DIRTY_PAGES) ) { + if (!(inode->i_state & I_DIRTY_PAGES)) { inode = NULL; continue; } break; } - if ( inode == NULL ) + if (inode == NULL) break; /* duplicate __sync_one, *sigh* */ @@ -331,19 +306,25 @@ int ll_check_dirty( struct super_block *sb) spin_unlock(&inode_lock); - do { - memset(llwp, 0, sizeof(*llwp)); - ll_get_dirty_pages(inode, llwp); - if ( llwp->num_pages ) { - ll_brw_pages_unlock(inode, llwp); - rc += llwp->num_pages; + rc = ll_alloc_brw(inode, &llwp); + if (rc != 0) + GOTO(cleanup, rc); + + do { + llwp.npgs = 0; + ll_get_dirty_pages(inode, &llwp); + if (llwp.npgs) { + INODE_IO_STAT_ADD(inode, wb_from_pressure, + llwp.npgs); + ll_writeback(inode, &llwp); + rc += llwp.npgs; making_progress = 1; } - } while (llwp->num_pages && should_writeback() ); + } while (llwp.npgs && should_writeback()); spin_lock(&inode_lock); - if ( ! list_empty(&inode->i_mapping->dirty_pages) ) + if (!list_empty(&inode->i_mapping->dirty_pages)) inode->i_state |= I_DIRTY_PAGES; inode->i_state &= ~I_LOCK; @@ -356,19 +337,19 @@ int ll_check_dirty( struct super_block *sb) list_add(&inode->i_list, &inode->i_sb->s_dirty); } wake_up(&inode->i_wait); - - } while ( making_progress && should_writeback() ); + kfree(llwp.pga); + } while (making_progress && should_writeback()); /* * and if that didn't work, we sleep on any data that might * be under writeback.. */ - while ( should_writeback() ) { - if ( list_empty(&sb->s_locked_inodes) ) + while (should_writeback()) { + if (list_empty(&sb->s_locked_inodes)) break; - inode = list_entry(sb->s_locked_inodes.next, struct inode, - i_list); + inode = list_entry(sb->s_locked_inodes.next, struct inode, + i_list); atomic_inc(&inode->i_count); /* XXX hack? 
*/ spin_unlock(&inode_lock); @@ -380,36 +361,339 @@ int ll_check_dirty( struct super_block *sb) spin_unlock(&inode_lock); cleanup: - if ( llwp != NULL ) - kfree(llwp); current->flags = old_flags; RETURN(rc); } +#endif /* linux 2.5 */ -int ll_batch_writepage( struct inode *inode, struct page *page ) +int ll_batch_writepage(struct inode *inode, struct page *page) { unsigned long old_flags; /* hack? */ - struct ll_writeback_pages *llwp; + struct ll_writeback_pages llwp; int rc = 0; ENTRY; old_flags = current->flags; current->flags |= PF_MEMALLOC; - llwp = kmalloc(sizeof(struct ll_writeback_pages), GFP_ATOMIC); - if ( llwp == NULL ) - GOTO(cleanup, rc = -ENOMEM); - memset(llwp, 0, offsetof(struct ll_writeback_pages, pgs)); + rc = ll_alloc_brw(inode, &llwp); + if (rc != 0) + GOTO(cleanup, rc); - llwp_consume_page(llwp, inode, page); + if (llwp_consume_page(&llwp, inode, page) == 0) + ll_get_dirty_pages(inode, &llwp); - ll_get_dirty_pages(inode, llwp); - if ( llwp->num_pages ) - ll_brw_pages_unlock(inode, llwp); + if (llwp.npgs) { + INODE_IO_STAT_ADD(inode, wb_from_writepage, llwp.npgs); + ll_writeback(inode, &llwp); + } + kfree(llwp.pga); cleanup: - if ( llwp != NULL ) - kfree(llwp); current->flags = old_flags; RETURN(rc); } + +/* + * we aggressively track offsets of pages that have been dirtied. we need this + * to make file size decisions around lock acquisition and cancelation. all + * extents include the offsets at their endpoints. 
+ */ +struct offset_extent { + rb_node_t oe_node; + unsigned long oe_start, oe_end; +}; + +static struct offset_extent *ll_find_oe(rb_root_t *root, + struct offset_extent *needle) +{ + struct rb_node_s *node = root->rb_node; + struct offset_extent *oe; + ENTRY; + + CDEBUG(D_INODE, "searching [%lu -> %lu]\n", needle->oe_start, + needle->oe_end); + + while (node) { + oe = rb_entry(node, struct offset_extent, oe_node); + if (needle->oe_end < oe->oe_start) + node = node->rb_left; + else if (needle->oe_start > oe->oe_end) + node = node->rb_right; + else { + CDEBUG(D_INODE, "returning [%lu -> %lu]\n", + oe->oe_start, oe->oe_end); + RETURN(oe); + } + } + RETURN(NULL); +} + +/* do the rbtree mechanics to insert a node, callers are responsible + * for making sure that this new node doesn't overlap with existing + * nodes */ +static void ll_insert_oe(rb_root_t *root, struct offset_extent *new_oe) +{ + rb_node_t ** p = &root->rb_node; + rb_node_t * parent = NULL; + struct offset_extent *oe; + ENTRY; + + LASSERT(new_oe->oe_start <= new_oe->oe_end); + + while (*p) { + parent = *p; + oe = rb_entry(parent, struct offset_extent, oe_node); + if ( new_oe->oe_end < oe->oe_start ) + p = &(*p)->rb_left; + else if ( new_oe->oe_start > oe->oe_end ) + p = &(*p)->rb_right; + else + LBUG(); + } + rb_link_node(&new_oe->oe_node, parent, p); + rb_insert_color(&new_oe->oe_node, root); + EXIT; +} + +static inline void lldo_dirty_add(struct inode *inode, + struct ll_dirty_offsets *lldo, + long val) +{ + lldo->do_num_dirty += val; + INODE_IO_STAT_ADD(inode, dirty_pages, val); +} + +void ll_record_dirty(struct inode *inode, unsigned long offset) +{ + struct ll_dirty_offsets *lldo = &ll_i2info(inode)->lli_dirty; + struct offset_extent needle, *oe, *new_oe; + int rc; + ENTRY; + + /* will allocate more intelligently later */ + OBD_ALLOC(new_oe, sizeof(*new_oe)); + LASSERT(new_oe); /* will have to do for now :/ */ + + spin_lock(&lldo->do_lock); + + /* find neighbours that we might glom on to */ + 
needle.oe_start = (offset > 0) ? offset - 1 : offset; + needle.oe_end = (offset < ~0) ? offset + 1 : offset; + oe = ll_find_oe(&lldo->do_root, &needle); + if ( oe == NULL ) { + new_oe->oe_start = offset; + new_oe->oe_end = offset; + ll_insert_oe(&lldo->do_root, new_oe); + lldo_dirty_add(inode, lldo, 1); + new_oe = NULL; + GOTO(out, rc = 1); + } + + /* already recorded */ + if ( offset >= oe->oe_start && offset <= oe->oe_end ) + GOTO(out, rc = 2); + + /* ok, need to check for adjacent neighbours */ + needle.oe_start = offset; + needle.oe_end = offset; + if (ll_find_oe(&lldo->do_root, &needle)) + GOTO(out, rc = 3); + + /* ok, its safe to extend the oe we found */ + if ( offset == oe->oe_start - 1 ) + oe->oe_start--; + else if ( offset == oe->oe_end + 1 ) + oe->oe_end++; + else + LBUG(); + lldo_dirty_add(inode, lldo, 1); + +out: + CDEBUG(D_INODE, "%lu now dirty\n", lldo->do_num_dirty); + spin_unlock(&lldo->do_lock); + if ( new_oe ) + OBD_FREE(new_oe, sizeof(*new_oe)); + EXIT; + return; +} + +void ll_remove_dirty(struct inode *inode, unsigned long start, + unsigned long end) +{ + struct ll_dirty_offsets *lldo = &ll_i2info(inode)->lli_dirty; + struct offset_extent needle, *oe, *new_oe; + ENTRY; + + /* will allocate more intelligently later */ + OBD_ALLOC(new_oe, sizeof(*new_oe)); + LASSERT(new_oe); /* will have to do for now :/ */ + + needle.oe_start = start; + needle.oe_end = end; + + spin_lock(&lldo->do_lock); + for ( ; (oe = ll_find_oe(&lldo->do_root, &needle)) ; ) { + + /* see if we're punching a hole and need to create a node */ + if (oe->oe_start < start && oe->oe_end > end) { + new_oe->oe_start = end + 1; + new_oe->oe_end = oe->oe_end; + oe->oe_end = start - 1; + ll_insert_oe(&lldo->do_root, new_oe); + new_oe = NULL; + lldo_dirty_add(inode, lldo, -(end - start + 1)); + break; + } + + /* overlapping edges */ + if (oe->oe_start < start && oe->oe_end <= end) { + lldo_dirty_add(inode, lldo, -(oe->oe_end - start + 1)); + oe->oe_end = start - 1; + oe = NULL; + 
continue; + } + if (oe->oe_end > end && oe->oe_start >= start) { + lldo_dirty_add(inode, lldo, -(end - oe->oe_start + 1)); + oe->oe_start = end + 1; + oe = NULL; + continue; + } + + /* an extent entirely within the one we're clearing */ + rb_erase(&oe->oe_node, &lldo->do_root); + lldo_dirty_add(inode, lldo, -(oe->oe_end - oe->oe_start + 1)); + spin_unlock(&lldo->do_lock); + OBD_FREE(oe, sizeof(*oe)); + spin_lock(&lldo->do_lock); + } + CDEBUG(D_INODE, "%lu now dirty\n", lldo->do_num_dirty); + spin_unlock(&lldo->do_lock); + if (new_oe) + OBD_FREE(new_oe, sizeof(*new_oe)); + EXIT; +} + +int ll_find_dirty(struct ll_dirty_offsets *lldo, unsigned long *start, + unsigned long *end) +{ + struct offset_extent needle, *oe; + int rc = -ENOENT; + ENTRY; + + needle.oe_start = *start; + needle.oe_end = *end; + + spin_lock(&lldo->do_lock); + oe = ll_find_oe(&lldo->do_root, &needle); + if (oe) { + *start = oe->oe_start; + *end = oe->oe_end; + rc = 0; + } + spin_unlock(&lldo->do_lock); + + RETURN(rc); +} + +int ll_farthest_dirty(struct ll_dirty_offsets *lldo, unsigned long *farthest) +{ + struct rb_node_s *last, *node; + struct offset_extent *oe; + int rc = -1; + ENTRY; + + spin_lock(&lldo->do_lock); + for (node = lldo->do_root.rb_node, last = NULL; + node; + last = node, node = node->rb_right) + ; + + if (last) { + oe = rb_entry(last, struct offset_extent, oe_node); + *farthest = oe->oe_end; + rc = 0; + } + spin_unlock(&lldo->do_lock); + RETURN(rc); +} + +void ll_lldo_init(struct ll_dirty_offsets *lldo) +{ + spin_lock_init(&lldo->do_lock); + lldo->do_num_dirty = 0; + lldo->do_root.rb_node = NULL; +} + +/* seq file export of some page cache tracking stats */ +static int ll_pgcache_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct ll_sb_info *sbi = seq->private; + do_gettimeofday(&now); + + seq_printf(seq, "snapshot_time: %lu:%lu (secs:usecs)\n", + now.tv_sec, now.tv_usec); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + seq_printf(seq, 
"VM_under_pressure: %s\n", + should_writeback() ? "yes" : "no"); +#endif + seq_printf(seq, "dirty_pages: "LPU64"\n", + sbi->ll_iostats.fis_dirty_pages); + seq_printf(seq, "dirty_page_hits: "LPU64"\n", + sbi->ll_iostats.fis_dirty_hits); + seq_printf(seq, "dirty_page_misses: "LPU64"\n", + sbi->ll_iostats.fis_dirty_misses); + seq_printf(seq, "writeback_from_writepage: "LPU64"\n", + sbi->ll_iostats.fis_wb_from_writepage); + seq_printf(seq, "writeback_from_pressure: "LPU64"\n", + sbi->ll_iostats.fis_wb_from_pressure); + seq_printf(seq, "writeback_ok_pages: "LPU64"\n", + sbi->ll_iostats.fis_wb_ok); + seq_printf(seq, "writeback_failed_pages: "LPU64"\n", + sbi->ll_iostats.fis_wb_fail); + return 0; +} + +static void *ll_pgcache_seq_start(struct seq_file *p, loff_t *pos) +{ + if (*pos == 0) + return (void *)1; + return NULL; +} +static void *ll_pgcache_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + return NULL; +} +static void ll_pgcache_seq_stop(struct seq_file *p, void *v) +{ +} + +struct seq_operations ll_pgcache_seq_sops = { + .start = ll_pgcache_seq_start, + .stop = ll_pgcache_seq_stop, + .next = ll_pgcache_seq_next, + .show = ll_pgcache_seq_show, +}; + +static int ll_pgcache_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = inode->u.generic_ip; + struct seq_file *seq; + int rc; + + rc = seq_open(file, &ll_pgcache_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = dp->data; + return 0; +} + +struct file_operations ll_pgcache_seq_fops = { + .open = ll_pgcache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h new file mode 100644 index 0000000..e53b605 --- /dev/null +++ b/lustre/llite/llite_internal.h @@ -0,0 +1,2 @@ +int ll_mdc_cancel_unused(struct lustre_handle *conn, struct inode *inode, + int flags, void *opaque); diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c 
index b5e6620..59cec1f 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -106,6 +106,7 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent, struct lprocfs_vars lvars[2]; struct ll_sb_info *sbi = ll_s2sbi(sb); struct obd_device *obd; + struct proc_dir_entry *entry; char name[MAX_STRING_SIZE + 1]; struct obd_uuid uuid; int err; @@ -135,6 +136,13 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent, if (err) RETURN(err); + /* llite page cache stats */ + entry = create_proc_entry("pgcache", 0444, sbi->ll_proc_root); + if (entry == NULL) + RETURN(-ENOMEM); + entry->proc_fops = &ll_pgcache_seq_fops; + entry->data = sbi; + /* MDC info */ strncpy(uuid.uuid, mdc, sizeof(uuid.uuid)); obd = class_uuid2obd(&uuid); diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index 449cac7..5e37d55 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -98,12 +98,17 @@ static int ll_test_inode(struct inode *inode, void *opaque) struct ll_read_inode2_cookie *lic = opaque; struct mds_body *body = lic->lic_body; + if (!(lic->lic_body->valid & (OBD_MD_FLGENER | OBD_MD_FLID))) + CERROR("invalid generation\n"); + CDEBUG(D_VFSTRACE, "comparing inode %p ino %lu/%u to body %lu/%u\n", + inode, inode->i_ino, inode->i_generation, ino, + lic->lic_body->generation); + if (inode->i_generation != lic->lic_body->generation) return 0; /* Apply the attributes in 'opaque' to this inode */ - ll_update_inode(inode, body, lic->lic_lmm); - + ll_update_inode(inode, body, lic->lic_lsm); return 1; } @@ -118,6 +123,9 @@ int ll_unlock(__u32 mode, struct lustre_handle *lockh) RETURN(0); } +/* Get an inode by inode number (already instantiated by the intent lookup). 
+ * Returns inode or NULL + */ #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) extern int ll_read_inode2(struct inode *inode, void *opaque); struct inode *ll_iget(struct super_block *sb, ino_t hash, @@ -127,9 +135,8 @@ struct inode *ll_iget(struct super_block *sb, ino_t hash, LASSERT(hash != 0); inode = iget5_locked(sb, hash, ll_test_inode, ll_read_inode2, lic); - - if (!inode) - return ERR_PTR(-ENOMEM); + if (inode == NULL) + return NULL; /* removed ERR_PTR(-ENOMEM) -eeb */ if (inode->i_state & I_NEW) unlock_new_inode(inode); @@ -144,6 +151,8 @@ struct inode *ll_iget(struct super_block *sb, ino_t hash, struct inode *inode; LASSERT(hash != 0); inode = iget4(sb, hash, ll_find_inode, lic); + CDEBUG(D_VFSTRACE, "inode: %lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); return inode; } #endif @@ -186,18 +195,112 @@ int ll_it_open_error(int phase, struct lookup_intent *it) return 0; } +int ll_mdc_blocking_ast(struct ldlm_lock *lock, + struct ldlm_lock_desc *desc, + void *data, int flag) +{ + int rc; + struct lustre_handle lockh; + ENTRY; + + switch (flag) { + case LDLM_CB_BLOCKING: + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh); + if (rc < 0) { + CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc); + RETURN(rc); + } + break; + case LDLM_CB_CANCELING: { + /* Invalidate all dentries associated with this inode */ + struct inode *inode = lock->l_data; + LASSERT(inode != NULL); + + //if (inode->i_state & I_FREEING) + // break; + + if (S_ISDIR(inode->i_mode)) { + CDEBUG(D_INODE, "invalidating inode %lu\n", + inode->i_ino); + + ll_invalidate_inode_pages(inode); + } + + if (inode->i_sb->s_root && + inode != inode->i_sb->s_root->d_inode) + d_unhash_aliases(inode); + break; + } + default: + LBUG(); + } + + RETURN(0); +} + +void ll_mdc_lock_set_inode(struct lustre_handle *lockh, struct inode *inode) +{ + struct ldlm_lock *lock = ldlm_handle2lock(lockh); + ENTRY; + + LASSERT(lock != NULL); + lock->l_data = inode; + LDLM_LOCK_PUT(lock); + EXIT; +} + +int 
ll_mdc_cancel_unused(struct lustre_handle *conn, struct inode *inode, + int flags, void *opaque) +{ + struct ldlm_res_id res_id = + { .name = {inode->i_ino, inode->i_generation} }; + struct obd_device *obddev = class_conn2obd(conn); + ENTRY; + RETURN(ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags, + opaque)); +} + +void ll_prepare_mdc_op_data(struct mdc_op_data *data, + struct inode *i1, + struct inode *i2, + const char *name, + int namelen, + int mode) +{ + LASSERT(i1); + + data->ino1 = i1->i_ino; + data->gen1 = i1->i_generation; + data->typ1 = i1->i_mode & S_IFMT; + data->gid1 = i1->i_gid; + + if (i2) { + data->ino2 = i2->i_ino; + data->gen2 = i2->i_generation; + data->typ2 = i2->i_mode & S_IFMT; + data->gid2 = i2->i_gid; + } else { + data->ino2 = 0; + } + + data->name = name; + data->namelen = namelen; + data->mode = mode; +} + #define IT_ENQ_COMPLETE (1<<16) int ll_intent_lock(struct inode *parent, struct dentry **de, struct lookup_intent *it, intent_finish_cb intent_finish) { struct dentry *dentry = *de; + struct inode *inode = dentry->d_inode; struct ll_sb_info *sbi = ll_i2sbi(parent); struct lustre_handle lockh; struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; struct ptlrpc_request *request = NULL; - char *data = NULL; - int rc = 0, datalen = 0, offset, flag = 0; + int rc = 0, offset, flag = 0; obd_id ino = 0; ENTRY; @@ -208,17 +311,23 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, if (it == NULL) it = &lookup_it; - CDEBUG(D_INFO, "name: %*s, intent: %s\n", dentry->d_name.len, + CDEBUG(D_DLMTRACE, "name: %*s, intent: %s\n", dentry->d_name.len, dentry->d_name.name, ldlm_it2str(it->it_op)); if (dentry->d_name.len > EXT2_NAME_LEN) RETURN(-ENAMETOOLONG); if (!(it->it_disposition & IT_ENQ_COMPLETE)) { + struct mdc_op_data op_data; + + ll_prepare_mdc_op_data(&op_data, parent, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len, + 0); + rc = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, it, - ll_intent_to_lock_mode(it), 
parent, dentry, - &lockh, data, datalen, parent, - sizeof(*parent)); + ll_intent_to_lock_mode(it), &op_data, + &lockh, NULL, 0, ldlm_completion_ast, + ll_mdc_blocking_ast, parent); if (rc < 0) RETURN(rc); memcpy(it->it_lock_handle, &lockh, sizeof(lockh)); @@ -256,14 +365,17 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, */ offset = 1; - mds_body = lustre_msg_buf(request->rq_repmsg, offset); + mds_body = lustre_msg_buf(request->rq_repmsg, offset, + sizeof(*mds_body)); + LASSERT (mds_body != NULL); /* mdc_enqueue checked */ + LASSERT_REPSWABBED (request, offset); /* mdc_enqueue swabbed */ + ino = mds_body->fid1.id; mode = mds_body->mode; /*We were called from revalidate2: did we find the same inode?*/ - if ((*de)->d_inode && - (ino != (*de)->d_inode->i_ino || - mds_body->fid1.generation != (*de)->d_inode->i_generation)) { + if (inode && (ino != inode->i_ino || + mds_body->fid1.generation != inode->i_generation)) { it->it_disposition |= IT_ENQ_COMPLETE; RETURN(-ESTALE); } @@ -273,8 +385,13 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, * this request for unconditional replay. 
*/ if (it->it_op & IT_OPEN && (!(it->it_disposition & IT_OPEN_OPEN) || - it->it_status != 0)) - request->rq_flags &= ~PTL_RPC_FL_REPLAY; + it->it_status != 0)) { + unsigned long flags; + + spin_lock_irqsave (&request->rq_lock, flags); + request->rq_replay = 0; + spin_unlock_irqrestore (&request->rq_lock, flags); + } if (it->it_op & IT_CREAT) { mdc_store_inode_generation(request, 2, 1); @@ -329,7 +446,9 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, } else LBUG(); } else { + struct ll_fid fid; obd_flag valid; + int eadatalen; int mode; LBUG(); /* For the moment, no non-intent locks */ @@ -351,32 +470,44 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, valid = OBD_MD_FLNOTOBD; if (S_ISREG(mode)) { - datalen = obd_size_wiremd(&sbi->ll_osc_conn, NULL), + eadatalen = obd_size_diskmd(&sbi->ll_osc_conn, NULL), valid |= OBD_MD_FLEASIZE; } else { + eadatalen = 0; valid |= OBD_MD_FLBLOCKS; } - rc = mdc_getattr(&sbi->ll_mdc_conn, ino, mode, valid, - datalen, &request); + fid.id = ino; + fid.generation = 0; + fid.f_type = mode; + rc = mdc_getattr(&sbi->ll_mdc_conn, &fid, valid, + eadatalen, &request); if (rc) { CERROR("failure %d inode "LPX64"\n", rc, ino); - GOTO(drop_req, rc = -abs(rc)); + GOTO(drop_lock, rc = -abs(rc)); } } + LASSERT (request != NULL); + if (intent_finish != NULL) { - rc = intent_finish(flag, request, de, it, offset, ino); + rc = intent_finish(flag, request, parent, de, it, offset, ino); dentry = *de; /* intent_finish may change *de */ - } else { - ptlrpc_req_finished(request); + inode = dentry->d_inode; + if (rc != 0) + GOTO(drop_lock, rc); } + ptlrpc_req_finished(request); /* This places the intent in the dentry so that the vfs_xxx * operation can lay its hands on it; but that is not always * needed... 
(we need to save it in the GETATTR case for the * benefit of ll_inode_revalidate -phil) */ - if (it->it_op & (IT_OPEN | IT_GETATTR)) + /* Ignore trying to save the intent for "special" inodes as + * they have special semantics that can cause deadlocks on + * the intent semaphore. -mmex */ + if ((!inode || S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || + S_ISLNK(inode->i_mode)) && (it->it_op & (IT_OPEN | IT_GETATTR))) LL_SAVE_INTENT(dentry, it); else CDEBUG(D_DENTRY, @@ -389,10 +520,10 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, RETURN(rc); + drop_lock: + ll_intent_release(dentry, it); drop_req: ptlrpc_req_finished(request); - drop_lock: -#warning FIXME: must release lock here RETURN(rc); } @@ -440,32 +571,87 @@ struct dentry *ll_find_alias(struct inode *inode, struct dentry *de) } static int -lookup2_finish(int flag, struct ptlrpc_request *request, struct dentry **de, +lookup2_finish(int flag, struct ptlrpc_request *request, + struct inode *parent, struct dentry **de, struct lookup_intent *it, int offset, obd_id ino) { + struct ll_sb_info *sbi = ll_i2sbi(parent); struct dentry *dentry = *de, *saved = *de; struct inode *inode = NULL; - struct ll_read_inode2_cookie lic = {.lic_body = NULL, .lic_lmm = NULL}; + struct ll_read_inode2_cookie lic = {.lic_body = NULL, .lic_lsm = NULL}; + + /* NB 1 request reference will be taken away by ll_intent_lock() + * when I return */ if (!(flag & LL_LOOKUP_NEGATIVE)) { ENTRY; - lic.lic_body = lustre_msg_buf(request->rq_repmsg, offset); + + /* We only get called if the mdc_enqueue() called from + * ll_intent_lock() was successful. 
Therefore the mds_body + * is present and correct, and the eadata is present if + * body->eadatasize != 0 (but still opaque, so only + * obd_unpackmd() can check the size) */ + lic.lic_body = lustre_msg_buf(request->rq_repmsg, offset, + sizeof (*lic.lic_body)); + LASSERT(lic.lic_body != NULL); + LASSERT_REPSWABBED(request, offset); if (S_ISREG(lic.lic_body->mode) && - lic.lic_body->valid & OBD_MD_FLEASIZE) { - LASSERT(request->rq_repmsg->bufcount > offset); - lic.lic_lmm = lustre_msg_buf(request->rq_repmsg, - offset + 1); - } else { - lic.lic_lmm = NULL; + (lic.lic_body->valid & OBD_MD_FLEASIZE)) { + struct lov_mds_md *lmm; + int lmm_size; + int rc; + + lmm_size = lic.lic_body->eadatasize; + if (lmm_size == 0) { + CERROR("OBD_MD_FLEASIZE set but " + "eadatasize 0\n"); + RETURN(-EPROTO); + } + lmm = lustre_msg_buf(request->rq_repmsg, offset + 1, + lmm_size); + LASSERT(lmm != NULL); + LASSERT_REPSWABBED(request, offset + 1); + + rc = obd_unpackmd(&sbi->ll_osc_conn, + &lic.lic_lsm, lmm, lmm_size); + if (rc < 0) { + CERROR("Error %d unpacking eadata\n", rc); + RETURN(rc); + } + LASSERT(rc >= sizeof(*lic.lic_lsm)); } - /* No rpc's happen during iget4, -ENOMEM's are possible */ + /* Both ENOMEM and an RPC timeout are possible in ll_iget; which + * to pick? A more generic EIO? -phik */ inode = ll_iget(dentry->d_sb, ino, &lic); if (!inode) { - /* XXX make sure that request is freed in this case; - * I think it is, but double-check refcounts. 
-phil */ + /* free the lsm if we allocated one above */ + if (lic.lic_lsm != NULL) + obd_free_memmd(&sbi->ll_osc_conn, &lic.lic_lsm); RETURN(-ENOMEM); + } else if (lic.lic_lsm != NULL && + ll_i2info(inode)->lli_smd != lic.lic_lsm) { + obd_free_memmd(&sbi->ll_osc_conn, &lic.lic_lsm); + } + + /* If this is a stat, get the authoritative file size */ + if (it->it_op == IT_GETATTR && S_ISREG(inode->i_mode) && + ll_i2info(inode)->lli_smd != NULL) { + struct ldlm_extent extent = {0, OBD_OBJECT_EOF}; + struct lustre_handle lockh = {0}; + struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; + ldlm_error_t rc; + + LASSERT(lsm->lsm_object_id != 0); + + rc = ll_extent_lock(NULL, inode, lsm, LCK_PR, &extent, + &lockh); + if (rc != ELDLM_OK) { + iput(inode); + RETURN(-EIO); + } + ll_extent_unlock(NULL, inode, lsm, LCK_PR, &lockh); } dentry = *de = ll_find_alias(inode, dentry); @@ -473,14 +659,12 @@ lookup2_finish(int flag, struct ptlrpc_request *request, struct dentry **de, /* We asked for a lock on the directory, and may have been * granted a lock on the inode. Just in case, fixup the data * pointer. 
*/ - mdc_lock_set_inode((struct lustre_handle *)it->it_lock_handle, - inode); + ll_mdc_lock_set_inode((struct lustre_handle*)it->it_lock_handle, + inode); } else { ENTRY; } - ptlrpc_req_finished(request); - dentry->d_op = &ll_d_ops; ll_set_dd(dentry); @@ -493,21 +677,26 @@ lookup2_finish(int flag, struct ptlrpc_request *request, struct dentry **de, static struct dentry *ll_lookup2(struct inode *parent, struct dentry *dentry, struct lookup_intent *it) { - struct dentry *save = dentry; + struct dentry *save = dentry, *retval; int rc; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n", + dentry->d_name.name, parent->i_ino, parent->i_generation, + parent, LL_IT2STR(it)); + rc = ll_intent_lock(parent, &dentry, it, lookup2_finish); if (rc < 0) { CDEBUG(D_INFO, "ll_intent_lock: %d\n", rc); - RETURN(ERR_PTR(rc)); + GOTO(out, retval = ERR_PTR(rc)); } if (dentry == save) - RETURN(NULL); + GOTO(out, retval = NULL); else - RETURN(dentry); + GOTO(out, retval = dentry); + out: + return retval; } /* We depend on "mode" being set with the proper file type/umask by now */ @@ -519,20 +708,19 @@ static struct inode *ll_create_node(struct inode *dir, const char *name, struct inode *inode; struct ptlrpc_request *request = NULL; struct mds_body *body; -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) - time_t time = CURRENT_TIME.tv_sec; -#else - time_t time = CURRENT_TIME; -#endif + time_t time = LTIME_S(CURRENT_TIME); struct ll_sb_info *sbi = ll_i2sbi(dir); - struct ll_read_inode2_cookie lic = { .lic_lmm = NULL, }; + struct ll_read_inode2_cookie lic; ENTRY; if (it && it->it_disposition) { ll_invalidate_inode_pages(dir); request = it->it_data; - body = lustre_msg_buf(request->rq_repmsg, 1); + body = lustre_msg_buf(request->rq_repmsg, 1, sizeof (*body)); + LASSERT (body != NULL); /* checked already */ + LASSERT_REPSWABBED (request, 1); /* swabbed already */ } else { + struct mdc_op_data op_data; int gid = current->fsgid; int rc; @@ 
-542,21 +730,29 @@ static struct inode *ll_create_node(struct inode *dir, const char *name, mode |= S_ISGID; } - rc = mdc_create(&sbi->ll_mdc_conn, dir, name, namelen, + ll_prepare_mdc_op_data(&op_data, dir, NULL, name, namelen, 0); + rc = mdc_create(&sbi->ll_mdc_conn, &op_data, data, datalen, mode, current->fsuid, gid, time, extra, &request); if (rc) { inode = ERR_PTR(rc); GOTO(out, rc); } - body = lustre_msg_buf(request->rq_repmsg, 0); + body = lustre_swab_repbuf(request, 0, sizeof (*body), + lustre_swab_mds_body); + if (body == NULL) { + CERROR ("Can't unpack mds_body\n"); + GOTO (out, inode = ERR_PTR(-EPROTO)); + } } lic.lic_body = body; + lic.lic_lsm = NULL; inode = ll_iget(dir->i_sb, body->ino, &lic); - if (IS_ERR(inode)) { - int rc = PTR_ERR(inode); + if (!inode || is_bad_inode(inode)) { + /* XXX might need iput() for bad inode */ + int rc = -EIO; CERROR("new_inode -fatal: rc %d\n", rc); LBUG(); GOTO(out, rc); @@ -576,8 +772,8 @@ static struct inode *ll_create_node(struct inode *dir, const char *name, /* We asked for a lock on the directory, but were * granted a lock on the inode. Since we finally have * an inode pointer, stuff it in the lock. 
*/ - mdc_lock_set_inode((struct lustre_handle *)it->it_lock_handle, - inode); + ll_mdc_lock_set_inode((struct lustre_handle*)it->it_lock_handle, + inode); } EXIT; @@ -592,22 +788,21 @@ static int ll_mdc_unlink(struct inode *dir, struct inode *child, __u32 mode, struct ptlrpc_request *request = NULL; struct ll_sb_info *sbi = ll_i2sbi(dir); struct mds_body *body; + struct lov_mds_md *eadata; struct lov_stripe_md *lsm = NULL; struct lustre_handle lockh; struct lookup_intent it = { .it_op = IT_UNLINK }; struct obdo *oa; int err; - struct mdc_unlink_data data; + struct mdc_op_data op_data; ENTRY; - data.unl_dir = dir; - data.unl_de = child; - data.unl_mode = mode; - data.unl_name = name; - data.unl_len = len; + ll_prepare_mdc_op_data(&op_data, dir, child, name, len, mode); - err = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, &it, LCK_EX, dir, - NULL, &lockh, NULL, 0, &data, sizeof(data)); + err = mdc_enqueue(&sbi->ll_mdc_conn, LDLM_PLAIN, &it, LCK_EX, + &op_data, &lockh, NULL, 0, + ldlm_completion_ast, ll_mdc_blocking_ast, + dir); request = (struct ptlrpc_request *)it.it_data; if (err < 0) GOTO(out, err); @@ -615,21 +810,39 @@ static int ll_mdc_unlink(struct inode *dir, struct inode *child, __u32 mode, GOTO(out, err = it.it_status); err = 0; - body = lustre_msg_buf(request->rq_repmsg, 1); - LASSERT(body != NULL); + body = lustre_msg_buf (request->rq_repmsg, 1, sizeof (*body)); + LASSERT (body != NULL); /* checked by mdc_enqueue() */ + LASSERT_REPSWABBED (request, 1); /* swabbed by mdc_enqueue() */ + if (!(body->valid & OBD_MD_FLEASIZE)) GOTO(out, 0); + if (body->eadatasize == 0) { + CERROR ("OBD_MD_FLEASIZE set but eadatasize zero\n"); + GOTO (out, err = -EPROTO); + } + /* The MDS sent back the EA because we unlinked the last reference - * to this file. Use this EA to unlink the objects on the OST */ - err = obd_unpackmd(ll_i2obdconn(dir), &lsm, - lustre_msg_buf(request->rq_repmsg, 2)); - if (err < 0) + * to this file. Use this EA to unlink the objects on the OST. 
+ * Note that mdc_enqueue() has already checked there _is_ some EA + * data, but this data is opaque to both mdc_enqueue() and the MDS. + * We have to leave it to obd_unpackmd() to check it is complete + * and sensible. */ + eadata = lustre_msg_buf (request->rq_repmsg, 2, body->eadatasize); + LASSERT (eadata != NULL); + LASSERT_REPSWABBED (request, 2); + + err = obd_unpackmd(ll_i2obdconn(dir), &lsm, eadata, + body->eadatasize); + if (err < 0) { CERROR("obd_unpackmd: %d\n", err); + GOTO (out_unlock, err); + } + LASSERT (err >= sizeof (*lsm)); oa = obdo_alloc(); if (oa == NULL) - GOTO(out_unlock, err = -ENOMEM); + GOTO(out_free_memmd, err = -ENOMEM); oa->o_id = lsm->lsm_object_id; oa->o_mode = body->mode & S_IFMT; @@ -640,7 +853,7 @@ static int ll_mdc_unlink(struct inode *dir, struct inode *child, __u32 mode, if (err) CERROR("obd destroy objid 0x"LPX64" error %d\n", lsm->lsm_object_id, err); - + out_free_memmd: obd_free_memmd(ll_i2obdconn(dir), &lsm); out_unlock: ldlm_lock_decref_and_cancel(&lockh, LCK_EX); @@ -670,7 +883,10 @@ static int ll_create(struct inode *dir, struct dentry *dentry, int mode) int rc = 0; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n", + dentry->d_name.name, dir->i_ino, dir->i_generation, dir, + LL_IT2STR(dentry->d_it)); + it = dentry->d_it; rc = ll_it_open_error(IT_OPEN_CREATE, it); @@ -702,16 +918,15 @@ static int ll_mknod2(struct inode *dir, const char *name, int len, int mode, int rdev) { struct ptlrpc_request *request = NULL; -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) - time_t time = CURRENT_TIME.tv_sec; -#else - time_t time = CURRENT_TIME; -#endif + time_t time = LTIME_S(CURRENT_TIME); struct ll_sb_info *sbi = ll_i2sbi(dir); + struct mdc_op_data op_data; int err = -EMLINK; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n", + name, dir->i_ino, dir->i_generation, dir); + if (dir->i_nlink >= EXT2_LINK_MAX) RETURN(err); @@ -722,8 
+937,9 @@ static int ll_mknod2(struct inode *dir, const char *name, int len, int mode, mode |= S_IFREG; /* for mode = 0 case, fallthrough */ case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: - err = mdc_create(&sbi->ll_mdc_conn, dir, name, len, NULL, 0, - mode, current->fsuid, current->fsgid, time, + ll_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0); + err = mdc_create(&sbi->ll_mdc_conn, &op_data, NULL, 0, mode, + current->fsuid, current->fsgid, time, rdev, &request); ptlrpc_req_finished(request); break; @@ -743,7 +959,10 @@ static int ll_mknod(struct inode *dir, struct dentry *dentry, int mode, struct inode *inode; int rc = 0; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n", + dentry->d_name.name, dir->i_ino, dir->i_generation, dir, + LL_IT2STR(dentry->d_it)); + LL_GET_INTENT(dentry, it); if ((mode & S_IFMT) == 0) @@ -767,20 +986,20 @@ static int ll_symlink2(struct inode *dir, const char *name, int len, const char *tgt) { struct ptlrpc_request *request = NULL; -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) - time_t time = CURRENT_TIME.tv_sec; -#else - time_t time = CURRENT_TIME; -#endif + time_t time = LTIME_S(CURRENT_TIME); struct ll_sb_info *sbi = ll_i2sbi(dir); + struct mdc_op_data op_data; int err = -EMLINK; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),target=%s\n", + name, dir->i_ino, dir->i_generation, dir, tgt); + if (dir->i_nlink >= EXT2_LINK_MAX) RETURN(err); - err = mdc_create(&sbi->ll_mdc_conn, dir, name, len, + ll_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0); + err = mdc_create(&sbi->ll_mdc_conn, &op_data, tgt, strlen(tgt) + 1, S_IFLNK | S_IRWXUGO, current->fsuid, current->fsgid, time, 0, &request); ptlrpc_req_finished(request); @@ -797,7 +1016,10 @@ static int ll_symlink(struct inode *dir, struct dentry *dentry, int err = 0; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS 
Op:name=%s,dir=%lu/%u(%p),intent=%s\n", + dentry->d_name.name, dir->i_ino, dir->i_generation, dir, + LL_IT2STR(dentry->d_it)); + LL_GET_INTENT(dentry, it); inode = ll_create_node(dir, dentry->d_name.name, dentry->d_name.len, @@ -830,13 +1052,17 @@ static int ll_link2(struct inode *src, struct inode *dir, const char *name, int len) { struct ptlrpc_request *request = NULL; + struct mdc_op_data op_data; int err; struct ll_sb_info *sbi = ll_i2sbi(dir); ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),dir=%lu/%u(%p),target=%s\n", + src->i_ino, src->i_generation, src, + dir->i_ino, dir->i_generation, dir, name); - CDEBUG(D_VFSTRACE, "VFS Op\n"); - err = mdc_link(&sbi->ll_mdc_conn, src, dir, name, len, &request); + ll_prepare_mdc_op_data(&op_data, src, dir, name, len, 0); + err = mdc_link(&sbi->ll_mdc_conn, &op_data, &request); ptlrpc_req_finished(request); RETURN(err); @@ -848,18 +1074,18 @@ static int ll_link(struct dentry *old_dentry, struct inode * dir, struct lookup_intent *it; struct inode *inode = old_dentry->d_inode; int rc; + CDEBUG(D_VFSTRACE, + "VFS Op:inode=%lu/%u(%p),dir=%lu/%u(%p),target=%s,intent=%s\n", + inode->i_ino, inode->i_generation, inode, dir->i_ino, + dir->i_generation, dir, dentry->d_name.name, + LL_IT2STR(dentry->d_it)); - CDEBUG(D_VFSTRACE, "VFS Op\n"); LL_GET_INTENT(dentry, it); if (it && it->it_disposition) { if (it->it_status) RETURN(it->it_status); -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) - inode->i_ctime.tv_sec = CURRENT_TIME.tv_sec; -#else - inode->i_ctime = CURRENT_TIME; -#endif + LTIME_S(inode->i_ctime) = LTIME_S(CURRENT_TIME); ext2_inc_count(inode); atomic_inc(&inode->i_count); d_instantiate(dentry, inode); @@ -878,11 +1104,7 @@ static int ll_link(struct dentry *old_dentry, struct inode * dir, if (rc) RETURN(rc); -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) - inode->i_ctime.tv_sec = CURRENT_TIME.tv_sec; -#else - inode->i_ctime = CURRENT_TIME; -#endif + LTIME_S(inode->i_ctime) = LTIME_S(CURRENT_TIME); 
ext2_inc_count(inode); atomic_inc(&inode->i_count); @@ -892,22 +1114,21 @@ static int ll_link(struct dentry *old_dentry, struct inode * dir, static int ll_mkdir2(struct inode *dir, const char *name, int len, int mode) { struct ptlrpc_request *request = NULL; -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) - time_t time = CURRENT_TIME.tv_sec; -#else - time_t time = CURRENT_TIME; -#endif + time_t time = LTIME_S(CURRENT_TIME); struct ll_sb_info *sbi = ll_i2sbi(dir); + struct mdc_op_data op_data; int err = -EMLINK; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n", + name, dir->i_ino, dir->i_generation, dir); - CDEBUG(D_VFSTRACE, "VFS Op\n"); if (dir->i_nlink >= EXT2_LINK_MAX) RETURN(err); mode = (mode & (S_IRWXUGO|S_ISVTX) & ~current->fs->umask) | S_IFDIR; - err = mdc_create(&sbi->ll_mdc_conn, dir, name, len, NULL, 0, - mode, current->fsuid, current->fsgid, + ll_prepare_mdc_op_data(&op_data, dir, NULL, name, len, 0); + err = mdc_create(&sbi->ll_mdc_conn, &op_data, NULL, 0, mode, + current->fsuid, current->fsgid, time, 0, &request); ptlrpc_req_finished(request); RETURN(err); @@ -920,8 +1141,10 @@ static int ll_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct inode * inode; int err = -EMLINK; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n", + dentry->d_name.name, dir->i_ino, dir->i_generation, dir, + LL_IT2STR(dentry->d_it)); - CDEBUG(D_VFSTRACE, "VFS Op\n"); LL_GET_INTENT(dentry, it); if (dir->i_nlink >= EXT2_LINK_MAX) @@ -967,8 +1190,9 @@ static int ll_rmdir2(struct inode *dir, const char *name, int len) { int rc; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n", + name, dir->i_ino, dir->i_generation, dir); - CDEBUG(D_VFSTRACE, "VFS Op\n"); rc = ll_mdc_unlink(dir, NULL, S_IFDIR, name, len); RETURN(rc); } @@ -977,8 +1201,9 @@ static int ll_unlink2(struct inode *dir, const char *name, int len) { int rc; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p)\n", + name, dir->i_ino, dir->i_generation, 
dir); - CDEBUG(D_VFSTRACE, "VFS Op\n"); rc = ll_mdc_unlink(dir, NULL, S_IFREG, name, len); RETURN(rc); } @@ -1029,8 +1254,10 @@ static int ll_unlink(struct inode *dir, struct dentry *dentry) { struct lookup_intent * it; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n", + dentry->d_name.name, dir->i_ino, dir->i_generation, dir, + LL_IT2STR(dentry->d_it)); - CDEBUG(D_VFSTRACE, "VFS Op\n"); LL_GET_INTENT(dentry, it); RETURN(ll_common_unlink(dir, dentry, it, S_IFREG)); @@ -1042,8 +1269,10 @@ static int ll_rmdir(struct inode *dir, struct dentry *dentry) struct lookup_intent *it; int rc; ENTRY; - - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:name=%s,dir=%lu/%u(%p),intent=%s\n", + dentry->d_name.name, dir->i_ino, dir->i_generation, dir, + LL_IT2STR(dentry->d_it)); + LL_GET_INTENT(dentry, it); if ((!it || !it->it_disposition) && !ext2_empty_dir(inode)) @@ -1065,11 +1294,15 @@ static int ll_rename2(struct inode *src, struct inode *tgt, { struct ptlrpc_request *request = NULL; struct ll_sb_info *sbi = ll_i2sbi(src); + struct mdc_op_data op_data; int err; ENTRY; - - CDEBUG(D_VFSTRACE, "VFS Op\n"); - err = mdc_rename(&sbi->ll_mdc_conn, src, tgt, + CDEBUG(D_VFSTRACE, "VFS Op:oldname=%s,src_dir=%lu/%u(%p),newname=%s," + "tgt_dir=%lu/%u(%p)\n", oldname, src->i_ino, src->i_generation, + src, newname, tgt->i_ino, tgt->i_generation, tgt); + + ll_prepare_mdc_op_data(&op_data, src, tgt, NULL, 0, 0); + err = mdc_rename(&sbi->ll_mdc_conn, &op_data, oldname, oldlen, newname, newlen, &request); ptlrpc_req_finished(request); @@ -1089,8 +1322,12 @@ static int ll_rename(struct inode * old_dir, struct dentry * old_dentry, struct ext2_dir_entry_2 * old_de; struct page * old_page; int err; + CDEBUG(D_VFSTRACE, "VFS Op:oldname=%s,src_dir=%lu/%u(%p),newname=%s," + "tgt_dir=%lu/%u(%p),intent=%s\n", + old_dentry->d_name.name, old_dir->i_ino, old_dir->i_generation, + old_dir, new_dentry->d_name.name, new_dir->i_ino, + new_dir->i_generation, new_dir, 
LL_IT2STR(new_dentry->d_it)); - CDEBUG(D_VFSTRACE, "VFS Op\n"); LL_GET_INTENT(new_dentry, it); if (it && it->it_disposition) { diff --git a/lustre/llite/recover.c b/lustre/llite/recover.c deleted file mode 100644 index 4c7ad42..0000000 --- a/lustre/llite/recover.c +++ /dev/null @@ -1,56 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Lustre Lite recovery infrastructure. - * - * Copyright (C) 2002 Cluster File Systems Inc. - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include <linux/lustre_lite.h> -#include <linux/lustre_ha.h> -#include <linux/lustre_dlm.h> -#include <linux/lustre_idl.h> - -static int ll_retry_recovery(struct ptlrpc_connection *conn) -{ - ENTRY; - RETURN(0); -} - -int ll_recover(struct recovd_data *rd, int phase) -{ - struct ptlrpc_connection *conn = class_rd2conn(rd); - struct list_head *tmp; - - LASSERT(conn); - ENTRY; - - switch (phase) { - case PTLRPC_RECOVD_PHASE_PREPARE: - case PTLRPC_RECOVD_PHASE_RECOVER: - list_for_each(tmp, &conn->c_imports) { - struct obd_import *imp = - list_entry(tmp, struct obd_import, imp_chain); - - if (phase == PTLRPC_RECOVD_PHASE_PREPARE) { - unsigned long flags; - spin_lock_irqsave(&imp->imp_lock, flags); - imp->imp_level = LUSTRE_CONN_RECOVD; - spin_unlock_irqrestore(&imp->imp_lock, flags); - } - imp->imp_recover(imp, phase); - } - - if (phase == PTLRPC_RECOVD_PHASE_PREPARE) - RETURN(ptlrpc_run_recovery_upcall(conn)); - RETURN(0); - - case PTLRPC_RECOVD_PHASE_FAILURE: - RETURN(ll_retry_recovery(conn)); - } - - LBUG(); - RETURN(-ENOSYS); -} diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 409f308..cd1fa90 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -52,6 +52,7 @@ #include <linux/lustre_mds.h> #include <linux/lustre_lite.h> #include <linux/lustre_lib.h> +#include <linux/lustre_compat25.h> /* * Remove page from dirty list @@ -64,9 +65,7 @@ static void __set_page_clean(struct page *page) if (!mapping) return; -#if 
(LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0)) - spin_lock(&pagecache_lock); -#endif + PGCACHE_WRLOCK(mapping); list_del(&page->list); list_add(&page->list, &mapping->clean_pages); @@ -77,9 +76,8 @@ static void __set_page_clean(struct page *page) CDEBUG(D_INODE, "inode clean\n"); inode->i_state &= ~I_DIRTY_PAGES; } -#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,5,0)) - spin_unlock(&pagecache_lock); -#endif + + PGCACHE_WRUNLOCK(mapping); EXIT; } @@ -96,15 +94,10 @@ static int ll_brw(int cmd, struct inode *inode, struct page *page, int flags) { struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; - struct obd_brw_set *set; struct brw_page pg; int rc; ENTRY; - set = obd_brw_set_new(); - if (set == NULL) - RETURN(-ENOMEM); - pg.pg = page; pg.off = ((obd_off)page->index) << PAGE_SHIFT; @@ -125,22 +118,14 @@ static int ll_brw(int cmd, struct inode *inode, struct page *page, int flags) pg.flag = flags; - set->brw_callback = ll_brw_sync_wait; - rc = obd_brw(cmd, ll_i2obdconn(inode), lsm, 1, &pg, set, NULL); - if (rc) { - if (rc != -EIO) - CERROR("error from obd_brw: rc = %d\n", rc); - } else { - rc = ll_brw_sync_wait(set, CB_PHASE_START); - if (rc) - CERROR("error from callback: rc = %d\n", rc); - } - obd_brw_set_decref(set); + rc = obd_brw(cmd, ll_i2obdconn(inode), lsm, 1, &pg, NULL); + if (rc) + CERROR("error from obd_brw: rc = %d\n", rc); RETURN(rc); } -/* +/* * we were asked to read a single page but we're going to try and read a batch * of pages all at once. this vaguely simulates 2.5's readpages. 
*/ @@ -151,14 +136,17 @@ static int ll_readpage(struct file *file, struct page *first_page) struct page *page = first_page; struct list_head *pos; struct brw_page *pgs; - struct obd_brw_set *set; unsigned long end_index, extent_end = 0; - int npgs = 0, rc = 0; + struct ptlrpc_request_set *set; + int npgs = 0, rc = 0, max_pages; ENTRY; LASSERT(PageLocked(page)); LASSERT(!PageUptodate(page)); - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),offset="LPX64"\n", + inode->i_ino, inode->i_generation, inode, + (((obd_off)page->index) << PAGE_SHIFT)); + LASSERT(atomic_read(&file->f_dentry->d_inode->i_count) > 0); if (inode->i_size <= ((obd_off)page->index) << PAGE_SHIFT) { CERROR("reading beyond EOF\n"); @@ -169,56 +157,58 @@ static int ll_readpage(struct file *file, struct page *first_page) RETURN(rc); } - pgs = kmalloc(PTL_MD_MAX_IOV * sizeof(*pgs), GFP_USER); - if ( pgs == NULL ) - RETURN(-ENOMEM); - set = obd_brw_set_new(); - if ( set == NULL ) - GOTO(out_pgs, rc = -ENOMEM); - - /* arbitrarily try to read-ahead 8 times what we can pass on - * the wire at once, clamped to file size */ - end_index = first_page->index + - 8 * ((PTL_MD_MAX_IOV * PAGE_SIZE)>>PAGE_CACHE_SHIFT); - if ( end_index > inode->i_size >> PAGE_CACHE_SHIFT ) + /* try to read the file's preferred block size in a one-er */ + end_index = first_page->index + + (inode->i_blksize >> PAGE_CACHE_SHIFT); + if (end_index > (inode->i_size >> PAGE_CACHE_SHIFT)) end_index = inode->i_size >> PAGE_CACHE_SHIFT; + max_pages = ((end_index - first_page->index) << PAGE_CACHE_SHIFT) >> + PAGE_SHIFT; + pgs = kmalloc(max_pages * sizeof(*pgs), GFP_USER); + if (pgs == NULL) + RETURN(-ENOMEM); + /* * find how far we're allowed to read under the extent ll_file_read - * is passing us.. + * is passing us.. 
*/ spin_lock(&lli->lli_read_extent_lock); list_for_each(pos, &lli->lli_read_extents) { struct ll_read_extent *rextent; rextent = list_entry(pos, struct ll_read_extent, re_lli_item); - if ( rextent->re_task != current ) + if (rextent->re_task != current) continue; if (rextent->re_extent.end + PAGE_SIZE < rextent->re_extent.end) /* extent wrapping */ extent_end = ~0; - else { - extent_end = ( rextent->re_extent.end + PAGE_SIZE ) + else { + extent_end = (rextent->re_extent.end + PAGE_SIZE) << PAGE_CACHE_SHIFT; /* 32bit indexes, 64bit extents.. */ - if ( ((u64)extent_end >> PAGE_CACHE_SHIFT ) < - rextent->re_extent.end ) + if (((u64)extent_end >> PAGE_CACHE_SHIFT) < + rextent->re_extent.end) extent_end = ~0; } break; } spin_unlock(&lli->lli_read_extent_lock); - if ( extent_end == 0 ) { - CERROR("readpage outside ll_file_read, no lock held?\n"); + if (extent_end == 0) { + static long next_print; + if (time_after(jiffies, next_print)) { + next_print = jiffies + 30 * HZ; + CDEBUG(D_INODE, "mmap readpage - check locks\n"); + } end_index = page->index + 1; - } else if ( extent_end < end_index ) + } else if (extent_end < end_index) end_index = extent_end; /* to balance the find_get_page ref the other pages get that is * decrefed on teardown.. */ page_cache_get(page); - do { + do { unsigned long index ; pgs[npgs].pg = page; @@ -240,32 +230,32 @@ static int ll_readpage(struct file *file, struct page *first_page) } npgs++; - if ( npgs == PTL_MD_MAX_IOV ) + if (npgs == max_pages) break; /* - * find pages ahead of us that we can read in. + * find pages ahead of us that we can read in. * grab_cache_page waits on pages that are locked so * we first try find_get_page, which doesn't. this stops - * the worst case behaviour of racing threads waiting on + * the worst case behaviour of racing threads waiting on * each other, but doesn't remove it entirely. 
*/ - for ( index = page->index + 1, page = NULL ; - page == NULL && index < end_index ; index++ ) { + for (index = page->index + 1, page = NULL; + page == NULL && index < end_index; index++) { /* see if the page already exists and needs updating */ page = find_get_page(inode->i_mapping, index); - if ( page ) { - if ( Page_Uptodate(page) || TryLockPage(page) ) + if (page) { + if (Page_Uptodate(page) || TryLockPage(page)) goto out_release; - if ( !page->mapping || Page_Uptodate(page)) + if (!page->mapping || Page_Uptodate(page)) goto out_unlock; } else { /* ok, we have to create it.. */ page = grab_cache_page(inode->i_mapping, index); - if ( page == NULL ) + if (page == NULL) continue; - if ( Page_Uptodate(page) ) + if (Page_Uptodate(page)) goto out_unlock; } @@ -280,39 +270,45 @@ static int ll_readpage(struct file *file, struct page *first_page) } while (page); - set->brw_callback = ll_brw_sync_wait; - rc = obd_brw(OBD_BRW_READ, ll_i2obdconn(inode), - ll_i2info(inode)->lli_smd, npgs, pgs, set, NULL); - if (rc) { - CERROR("error from obd_brw: rc = %d\n", rc); + set = ptlrpc_prep_set(); + if (set == NULL) { + CERROR("ENOMEM allocing request set\n"); + rc = -ENOMEM; } else { - rc = ll_brw_sync_wait(set, CB_PHASE_START); - if (rc) - CERROR("error from callback: rc = %d\n", rc); + rc = obd_brw_async(OBD_BRW_READ, ll_i2obdconn(inode), + ll_i2info(inode)->lli_smd, npgs, pgs, + set, NULL); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + if (rc && rc != -EIO) + CERROR("error from obd_brw_async: rc = %d\n", rc); } - obd_brw_set_decref(set); - while ( --npgs > -1 ) { + while (npgs-- > 0) { page = pgs[npgs].pg; - if ( rc == 0 ) + if (rc == 0) SetPageUptodate(page); unlock_page(page); page_cache_release(page); } -out_pgs: + kfree(pgs); RETURN(rc); } /* ll_readpage */ +/* this isn't where truncate starts. roughly: + * sys_truncate->ll_setattr_raw->vmtruncate->ll_truncate + * we grab the lock back in setattr_raw to avoid races. 
*/ void ll_truncate(struct inode *inode) { - struct obdo oa = {0}; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; - struct lustre_handle lockh = { 0, 0 }; - struct ldlm_extent extent = {inode->i_size, OBD_OBJECT_EOF}; + struct obdo oa = {0}; int err; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); if (!lsm) { /* object not yet allocated */ @@ -321,22 +317,20 @@ void ll_truncate(struct inode *inode) return; } + /* vmtruncate just threw away our dirty pages, make sure + * we don't think they're still dirty, being careful to round + * i_size to the first whole page that was tossed */ + ll_remove_dirty(inode, + (inode->i_size + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT, + ~0); + oa.o_id = lsm->lsm_object_id; oa.o_mode = inode->i_mode; oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE; - CDEBUG(D_VFSTRACE, "VFS Op\n"); CDEBUG(D_INFO, "calling punch for "LPX64" (all bytes after %Lu)\n", oa.o_id, inode->i_size); - /* i_size has already been set to the new size */ - err = ll_extent_lock_no_validate(NULL, inode, lsm, LCK_PW, - &extent, &lockh); - if (err != ELDLM_OK && err != ELDLM_LOCK_MATCHED) { - EXIT; - return; - } - /* truncate == punch from new size to absolute end of file */ err = obd_punch(ll_i2obdconn(inode), &oa, lsm, inode->i_size, OBD_OBJECT_EOF, NULL); @@ -345,10 +339,6 @@ void ll_truncate(struct inode *inode) else obdo_to_inode(inode, &oa, oa.o_valid); - err = ll_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh); - if (err) - CERROR("ll_extent_unlock failed: %d\n", err); - EXIT; return; } /* ll_truncate */ @@ -359,12 +349,13 @@ static int ll_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { struct inode *inode = page->mapping->host; + struct ll_inode_info *lli = ll_i2info(inode); + struct lov_stripe_md *lsm = lli->lli_smd; obd_off offset = ((obd_off)page->index) << PAGE_SHIFT; + struct brw_page pg; int rc = 0; ENTRY; - ll_check_dirty(inode->i_sb); - if 
(!PageLocked(page)) LBUG(); @@ -373,11 +364,19 @@ static int ll_prepare_write(struct file *file, struct page *page, unsigned from, //POISON(addr + from, 0xca, to - from); + /* Check to see if we should return -EIO right away */ + pg.pg = page; + pg.off = offset; + pg.count = PAGE_SIZE; + pg.flag = 0; + rc = obd_brw(OBD_BRW_CHECK, ll_i2obdconn(inode), lsm, 1, &pg, NULL); + if (rc) + RETURN(rc); + /* We're completely overwriting an existing page, so _don't_ set it up * to date until commit_write */ if (from == 0 && to == PAGE_SIZE) RETURN(0); - CDEBUG(D_VFSTRACE, "VFS Op\n"); /* If are writing to a new page, no need to read old data. * the extent locking and getattr procedures in ll_file_write have @@ -411,6 +410,7 @@ static int ll_prepare_write(struct file *file, struct page *page, unsigned from, * free some more pages that our allocating writeback may need, but it isn't * yet. */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) static int ll_writepage(struct page *page) { struct inode *inode = page->mapping->host; @@ -418,7 +418,6 @@ static int ll_writepage(struct page *page) CDEBUG(D_CACHE, "page %p [lau %d] inode %p\n", page, PageLaunder(page), inode); - CDEBUG(D_VFSTRACE, "VFS Op\n"); LASSERT(PageLocked(page)); /* XXX should obd_brw errors trickle up? 
*/ @@ -440,23 +439,50 @@ static int ll_commit_write(struct file *file, struct page *page, LASSERT(inode == file->f_dentry->d_inode); LASSERT(PageLocked(page)); - CDEBUG(D_VFSTRACE, "VFS Op\n"); CDEBUG(D_INODE, "inode %p is writing page %p from %d to %d at %lu\n", inode, page, from, to, page->index); - /* to match full page case in prepare_write */ SetPageUptodate(page); /* mark the page dirty, put it on mapping->dirty, * mark the inode PAGES_DIRTY, put it on sb->dirty */ - set_page_dirty(page); + if (!PageDirty(page)) + INODE_IO_STAT_ADD(inode, dirty_misses, 1); + else + INODE_IO_STAT_ADD(inode, dirty_hits, 1); - /* this is matched by a hack in obdo_to_inode at the moment */ size = (((obd_off)page->index) << PAGE_SHIFT) + to; if (size > inode->i_size) inode->i_size = size; + /* XXX temporary, bug 1286 */ + { + struct ll_dirty_offsets *lldo = &ll_i2info(inode)->lli_dirty; + int rc; + if ((lldo->do_num_dirty * PAGE_CACHE_SIZE) > 10 * 1024 * 1024) { + rc = ll_batch_writepage(inode, page); + lock_page(page); /* caller expects to unlock */ + RETURN(rc); + } + } + + set_page_dirty(page); + ll_record_dirty(inode, page->index); + RETURN(0); } /* ll_commit_write */ +#else +static int ll_writepage(struct page *page, + struct writeback_control *wbc) +{ + + return 0; +} +static int ll_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + return 0; +} +#endif #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) static int ll_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf, @@ -465,12 +491,11 @@ static int ll_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf, struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; struct brw_page *pga; - struct obd_brw_set *set; - loff_t offset; + struct ptlrpc_request_set *set; int length, i, flags, rc = 0; + loff_t offset; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); if (!lsm || !lsm->lsm_object_id) RETURN(-ENOMEM); @@ -478,26 +503,18 @@ static int ll_direct_IO(int 
rw, struct inode *inode, struct kiobuf *iobuf, (iobuf->length & (blocksize - 1))) RETURN(-EINVAL); -#if 0 - /* XXX Keep here until we find ia64 problem, it crashes otherwise */ - if (blocksize != PAGE_SIZE) { - CERROR("direct_IO blocksize != PAGE_SIZE\n"); - RETURN(-EINVAL); - } -#endif - - set = obd_brw_set_new(); + set = ptlrpc_prep_set(); if (set == NULL) RETURN(-ENOMEM); OBD_ALLOC(pga, sizeof(*pga) * iobuf->nr_pages); if (!pga) { - obd_brw_set_decref(set); + ptlrpc_set_destroy(set); RETURN(-ENOMEM); } flags = (rw == WRITE ? OBD_BRW_CREATE : 0) /* | OBD_BRW_DIRECTIO */; - offset = (blocknr << inode->i_blkbits); + offset = ((obd_off)blocknr << inode->i_blkbits); length = iobuf->length; for (i = 0, length = iobuf->length; length > 0; @@ -514,18 +531,18 @@ static int ll_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf, } } - set->brw_callback = ll_brw_sync_wait; - rc = obd_brw(rw == WRITE ? OBD_BRW_WRITE : OBD_BRW_READ, - ll_i2obdconn(inode), lsm, iobuf->nr_pages, pga, set, NULL); + rc = obd_brw_async(rw == WRITE ? OBD_BRW_WRITE : OBD_BRW_READ, + ll_i2obdconn(inode), lsm, iobuf->nr_pages, pga, set, + NULL); if (rc) { CDEBUG(rc == -ENOSPC ? 
D_INODE : D_ERROR, - "error from obd_brw: rc = %d\n", rc); + "error from obd_brw_async: rc = %d\n", rc); } else { - rc = ll_brw_sync_wait(set, CB_PHASE_START); + rc = ptlrpc_set_wait(set); if (rc) CERROR("error from callback: rc = %d\n", rc); } - obd_brw_set_decref(set); + ptlrpc_set_destroy(set); if (rc == 0) rc = iobuf->length; diff --git a/lustre/llite/super.c b/lustre/llite/super.c index ff754a0..66563c7 100644 --- a/lustre/llite/super.c +++ b/lustre/llite/super.c @@ -32,6 +32,7 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/lprocfs_status.h> +#include "llite_internal.h" #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) kmem_cache_t *ll_file_data_slab; @@ -132,7 +133,7 @@ static struct super_block *ll_read_super(struct super_block *sb, ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); OBD_ALLOC(sbi, sizeof(*sbi)); if (!sbi) RETURN(NULL); @@ -140,6 +141,7 @@ static struct super_block *ll_read_super(struct super_block *sb, INIT_LIST_HEAD(&sbi->ll_conn_chain); INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list); generate_random_uuid(uuid); + spin_lock_init(&sbi->ll_iostats.fis_lock); class_uuid_unparse(uuid, &sbi->ll_sb_uuid); sb->u.generic_sbp = sbi; @@ -163,15 +165,13 @@ static struct super_block *ll_read_super(struct super_block *sb, GOTO(out_free, sb = NULL); } - err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid, - ptlrpc_recovd, ll_recover); + err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid); if (err) { CERROR("cannot connect to %s: rc = %d\n", mdc, err); GOTO(out_free, sb = NULL); } - mdc_conn = sbi2mdc(sbi)->cl_import.imp_connection; - list_add(&mdc_conn->c_sb_chain, &sbi->ll_conn_chain); + mdc_conn = sbi2mdc(sbi)->cl_import->imp_connection; strncpy(param_uuid.uuid, osc, sizeof(param_uuid.uuid)); obd = class_uuid2obd(¶m_uuid); @@ -180,8 +180,7 @@ static struct super_block *ll_read_super(struct super_block *sb, GOTO(out_mdc, sb = NULL); } - err = obd_connect(&sbi->ll_osc_conn, obd, 
&sbi->ll_sb_uuid, - ptlrpc_recovd, ll_recover); + err = obd_connect(&sbi->ll_osc_conn, obd, &sbi->ll_sb_uuid); if (err) { CERROR("cannot connect to %s: rc = %d\n", osc, err); GOTO(out_mdc, sb = NULL); @@ -190,7 +189,7 @@ static struct super_block *ll_read_super(struct super_block *sb, err = mdc_getstatus(&sbi->ll_mdc_conn, &rootfid); if (err) { CERROR("cannot mds_connect: rc = %d\n", err); - GOTO(out_mdc, sb = NULL); + GOTO(out_osc, sb = NULL); } CDEBUG(D_SUPER, "rootfid "LPU64"\n", rootfid.id); sbi->ll_rootino = rootfid.id; @@ -200,16 +199,17 @@ static struct super_block *ll_read_super(struct super_block *sb, sb->s_blocksize = osfs.os_bsize; sb->s_blocksize_bits = log2(osfs.os_bsize); sb->s_magic = LL_SUPER_MAGIC; - sb->s_maxbytes = (1ULL << (32 + 9)) - osfs.os_bsize; + sb->s_maxbytes = PAGE_CACHE_MAXBYTES; sb->s_op = &ll_super_operations; - /* make root inode */ - err = mdc_getattr(&sbi->ll_mdc_conn, sbi->ll_rootino, S_IFDIR, + /* make root inode + * XXX: move this to after cbd setup? 
*/ + err = mdc_getattr(&sbi->ll_mdc_conn, &rootfid, OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, 0, &request); if (err) { CERROR("mdc_getattr failed for root: rc = %d\n", err); - GOTO(out_request, sb = NULL); + GOTO(out_osc, sb = NULL); } /* initialize committed transaction callback daemon */ @@ -220,23 +220,29 @@ static struct super_block *ll_read_super(struct super_block *sb, err = ll_commitcbd_setup(sbi); if (err) { CERROR("failed to start commit callback daemon: rc = %d\n",err); - GOTO(out_request, sb = NULL); + ptlrpc_req_finished (request); + GOTO(out_osc, sb = NULL); } - lic.lic_body = lustre_msg_buf(request->rq_repmsg, 0); - lic.lic_lmm = NULL; + lic.lic_body = lustre_msg_buf(request->rq_repmsg, 0, + sizeof(*lic.lic_body)); + LASSERT (lic.lic_body != NULL); /* checked by mdc_getattr() */ + LASSERT_REPSWABBED (request, 0); /* swabbed by mdc_getattr() */ + + lic.lic_lsm = NULL; + LASSERT(sbi->ll_rootino != 0); root = iget4(sb, sbi->ll_rootino, NULL, &lic); - if (root) { - sb->s_root = d_alloc_root(root); - } else { + ptlrpc_req_finished(request); + + if (root == NULL || is_bad_inode(root)) { + /* XXX might need iput() for bad inode */ CERROR("lustre_lite: bad iget4 for root\n"); - GOTO(out_cdb, sb = NULL); + GOTO(out_cbd, sb = NULL); } - ptlrpc_req_finished(request); - request = NULL; + sb->s_root = d_alloc_root(root); if (proc_lustre_fs_root) { err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb, @@ -253,13 +259,12 @@ out_dev: RETURN(sb); -out_cdb: +out_cbd: ll_commitcbd_cleanup(sbi); -out_request: - ptlrpc_req_finished(request); - obd_disconnect(&sbi->ll_osc_conn); +out_osc: + obd_disconnect(&sbi->ll_osc_conn, 0); out_mdc: - obd_disconnect(&sbi->ll_mdc_conn); + obd_disconnect(&sbi->ll_mdc_conn, 0); out_free: OBD_FREE(sbi, sizeof(*sbi)); @@ -271,12 +276,13 @@ static void ll_put_super(struct super_block *sb) struct ll_sb_info *sbi = ll_s2sbi(sb); struct list_head *tmp, *next; struct ll_fid rootfid; + struct obd_device *obd = class_conn2obd(&sbi->ll_mdc_conn); 
ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); list_del(&sbi->ll_conn_chain); ll_commitcbd_cleanup(sbi); - obd_disconnect(&sbi->ll_osc_conn); + obd_disconnect(&sbi->ll_osc_conn, 0); /* NULL request to force sync on the MDS, and get the last_committed * value to flush remaining RPCs from the sending queue on client. @@ -284,14 +290,15 @@ static void ll_put_super(struct super_block *sb) * XXX This should be an mdc_sync() call to sync the whole MDS fs, * which we can call for other reasons as well. */ - mdc_getstatus(&sbi->ll_mdc_conn, &rootfid); + if (!obd->obd_no_recov) + mdc_getstatus(&sbi->ll_mdc_conn, &rootfid); if (sbi->ll_proc_root) { lprocfs_remove(sbi->ll_proc_root); sbi->ll_proc_root = NULL; } - obd_disconnect(&sbi->ll_mdc_conn); + obd_disconnect(&sbi->ll_mdc_conn, 0); spin_lock(&dcache_lock); list_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list) { @@ -312,27 +319,29 @@ static void ll_clear_inode(struct inode *inode) int rc; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); - rc = mdc_cancel_unused(&sbi->ll_mdc_conn, inode, LDLM_FL_NO_CALLBACK); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); + rc = ll_mdc_cancel_unused(&sbi->ll_mdc_conn, inode, + LDLM_FL_NO_CALLBACK, inode); if (rc < 0) { - CERROR("mdc_cancel_unused: %d\n", rc); + CERROR("ll_mdc_cancel_unused: %d\n", rc); /* XXX FIXME do something dramatic */ } + if (atomic_read(&inode->i_count) != 0) + CERROR("clearing in-use inode %lu: count = %d\n", + inode->i_ino, atomic_read(&inode->i_count)); + if (lli->lli_smd) { - rc = obd_cancel_unused(&sbi->ll_osc_conn, lli->lli_smd, 0); + rc = obd_cancel_unused(&sbi->ll_osc_conn, lli->lli_smd, + LDLM_FL_WARN, inode); if (rc < 0) { CERROR("obd_cancel_unused: %d\n", rc); /* XXX FIXME do something dramatic */ } - } - - if (atomic_read(&inode->i_count) != 0) - CERROR("clearing in-use inode %lu: count = %d\n", - inode->i_ino, atomic_read(&inode->i_count)); - - if (lli->lli_smd) 
obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd); + lli->lli_smd = NULL; + } if (lli->lli_symlink_name) { OBD_FREE(lli->lli_symlink_name, @@ -347,7 +356,8 @@ static void ll_clear_inode(struct inode *inode) static void ll_delete_inode(struct inode *inode) { ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); if (S_ISREG(inode->i_mode)) { int err; struct obdo *oa; @@ -390,6 +400,10 @@ static int ll_attr2inode(struct inode *inode, struct iattr *attr, int trunc) int error = 0; if ((ia_valid & ATTR_SIZE) && trunc) { + if (attr->ia_size > ll_file_maxbytes(inode)) { + error = -EFBIG; + goto out; + } error = vmtruncate(inode, attr->ia_size); if (error) goto out; @@ -423,15 +437,20 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc) ENTRY; /* change incore inode */ - ll_attr2inode(inode, attr, do_trunc); + err = ll_attr2inode(inode, attr, do_trunc); + if (err) + RETURN(err); /* Don't send size changes to MDS to avoid "fast EA" problems, and * also avoid a pointless RPC (we get file size from OST anyways). 
*/ attr->ia_valid &= ~ATTR_SIZE; if (attr->ia_valid) { - err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, NULL, 0, - &request); + struct mdc_op_data op_data; + + ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); + err = mdc_setattr(&sbi->ll_mdc_conn, &op_data, + attr, NULL, 0, &request); if (err) CERROR("mdc_setattr fails: err = %d\n", err); @@ -461,31 +480,63 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc) int ll_setattr_raw(struct inode *inode, struct iattr *attr) { - struct ptlrpc_request *request = NULL; + struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; struct ll_sb_info *sbi = ll_i2sbi(inode); - int err = 0; + struct ptlrpc_request *request = NULL; + struct mdc_op_data op_data; + int rc = 0, err; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + inode->i_generation, inode); if ((attr->ia_valid & ATTR_SIZE)) { + struct ldlm_extent extent = {attr->ia_size, OBD_OBJECT_EOF}; + struct lustre_handle lockh = { 0 }; + + if (attr->ia_size > ll_file_maxbytes(inode)) + RETURN(-EFBIG); + /* writeback uses inode->i_size to determine how far out * its cached pages go. ll_truncate gets a PW lock, canceling * our lock, _after_ it has updated i_size. this can confuse - * us into zero extending the file to the newly truncated - * size, and this has bad implications for a racing o_append. - * if we're extending our size we need to flush the pages - * with the correct i_size before vmtruncate stomps on - * the new i_size. again, this can only find pages to - * purge if the PW lock that generated them is still held. - */ - if ( attr->ia_size > inode->i_size ) { - filemap_fdatasync(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + * + * If this file doesn't have stripes yet, it is already, + * by definition, truncated. 
*/ + if ((attr->ia_valid & ATTR_FROM_OPEN) && lsm == NULL) { + LASSERT(attr->ia_size == 0); + GOTO(skip_extent_lock, rc = 0); + } + + /* we really need to get our PW lock before we change + * inode->i_size. if we don't we can race with other + * i_size updaters on our node, like ll_file_read. we + * can also race with i_size propogation to other + * nodes through dirtying and writeback of final cached + * pages. this last one is especially bad for racing + * o_append users on other nodes. */ + rc = ll_extent_lock_no_validate(NULL, inode, lsm, LCK_PW, + &extent, &lockh); + if (rc != ELDLM_OK) { + if (rc > 0) + RETURN(-ENOLCK); + RETURN(rc); } - err = vmtruncate(inode, attr->ia_size); + + rc = vmtruncate(inode, attr->ia_size); + if (rc == 0) + set_bit(LLI_F_HAVE_SIZE_LOCK, + &ll_i2info(inode)->lli_flags); + + /* unlock now as we don't mind others file lockers racing with + * the mds updates below? */ + err = ll_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh); if (err) - RETURN(err); + CERROR("ll_extent_unlock failed: %d\n", err); + if (rc) + RETURN(rc); } +skip_extent_lock: /* Don't send size changes to MDS to avoid "fast EA" problems, and * also avoid a pointless RPC (we get file size from OST anyways). 
*/ @@ -493,18 +544,25 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) if (!attr->ia_valid) RETURN(0); - err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, NULL, 0, - &request); + ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); + + err = mdc_setattr(&sbi->ll_mdc_conn, &op_data, + attr, NULL, 0, &request); if (err) CERROR("mdc_setattr fails: err = %d\n", err); ptlrpc_req_finished(request); - if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_MTIME_SET) { + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_MTIME_SET)) { struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; struct obdo oa; int err2; + if (lsm == NULL) { + CDEBUG(D_INODE, "no lsm: not setting mtime on OSTs\n"); + RETURN(err); + } + CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n", inode->i_ino, attr->ia_mtime); oa.o_id = lsm->lsm_object_id; @@ -524,8 +582,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) int ll_setattr(struct dentry *de, struct iattr *attr) { int rc = inode_change_ok(de->d_inode, attr); - - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:name=%s\n", de->d_name.name); if (rc) return rc; @@ -539,7 +596,7 @@ static int ll_statfs(struct super_block *sb, struct statfs *sfs) int rc; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op\n"); + CDEBUG(D_VFSTRACE, "VFS Op:\n"); memset(sfs, 0, sizeof(*sfs)); rc = obd_statfs(&sbi->ll_mdc_conn, &osfs); statfs_unpack(sfs, &osfs); @@ -570,33 +627,68 @@ static int ll_statfs(struct super_block *sb, struct statfs *sfs) osfs.os_bfree >>= 1; osfs.os_bavail >>= 1; } + sfs->f_blocks = osfs.os_blocks; sfs->f_bfree = osfs.os_bfree; sfs->f_bavail = osfs.os_bavail; - if (osfs.os_ffree < (__u64)sfs->f_ffree) + + /* If we don't have as many objects free on the OST as inodes + * on the MDS, we reduce the total number of inodes to + * compensate, so that the "inodes in use" number is correct. 
+ */ + if (osfs.os_ffree < (__u64)sfs->f_ffree) { + sfs->f_files = (sfs->f_files - sfs->f_ffree) + + osfs.os_ffree; sfs->f_ffree = osfs.os_ffree; + } } out: RETURN(rc); } +void dump_lsm(int level, struct lov_stripe_md *lsm) +{ + CDEBUG(level, "objid "LPX64", maxbytes "LPX64", magic %#08x, " + "stripe_size %#08x, offset %u, stripe_count %u\n", + lsm->lsm_object_id, lsm->lsm_maxbytes, lsm->lsm_magic, + lsm->lsm_stripe_size, lsm->lsm_stripe_offset, + lsm->lsm_stripe_count); +} + void ll_update_inode(struct inode *inode, struct mds_body *body, - struct lov_mds_md *lmm) + struct lov_stripe_md *lsm) { struct ll_inode_info *lli = ll_i2info(inode); - if (lmm != NULL) - obd_unpackmd(ll_i2obdconn(inode), &lli->lli_smd, lmm); + LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); + if (lsm != NULL) { + if (lli->lli_smd == NULL) { + lli->lli_maxbytes = lsm->lsm_maxbytes; + if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES) + lli->lli_maxbytes = PAGE_CACHE_MAXBYTES; + lli->lli_smd = lsm; + } else { + if (memcmp(lli->lli_smd, lsm, sizeof(*lsm))) { + CERROR("lsm mismatch for inode %ld\n", + inode->i_ino); + CERROR("lli_smd:\n"); + dump_lsm(D_ERROR, lli->lli_smd); + CERROR("lsm:\n"); + dump_lsm(D_ERROR, lsm); + LBUG(); + } + } + } if (body->valid & OBD_MD_FLID) inode->i_ino = body->ino; if (body->valid & OBD_MD_FLATIME) - inode->i_atime = body->atime; + LTIME_S(inode->i_atime) = body->atime; if (body->valid & OBD_MD_FLMTIME) - inode->i_mtime = body->mtime; + LTIME_S(inode->i_mtime) = body->mtime; if (body->valid & OBD_MD_FLCTIME) - inode->i_ctime = body->ctime; + LTIME_S(inode->i_ctime) = body->ctime; if (body->valid & OBD_MD_FLMODE) inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT); if (body->valid & OBD_MD_FLTYPE) @@ -625,37 +717,22 @@ static void ll_read_inode2(struct inode *inode, void *opaque) struct mds_body *body = lic->lic_body; struct ll_inode_info *lli = ll_i2info(inode); ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p)\n", inode->i_ino, + 
inode->i_generation, inode); - CDEBUG(D_VFSTRACE, "VFS Op\n"); sema_init(&lli->lli_open_sem, 1); - atomic_set(&lli->lli_open_count, 0); - lli->lli_flags = 0; - init_MUTEX(&lli->lli_getattr_sem); spin_lock_init(&lli->lli_read_extent_lock); INIT_LIST_HEAD(&lli->lli_read_extents); + ll_lldo_init(&lli->lli_dirty); + lli->lli_flags = 0; + /* We default to 2T-4k until the LSM is created/read, at which point + * it'll be updated. */ + lli->lli_maxbytes = LUSTRE_STRIPE_MAXBYTES; LASSERT(!lli->lli_smd); /* core attributes from the MDS first */ - ll_update_inode(inode, body, lic ? lic->lic_lmm : NULL); - - /* Get the authoritative file size */ - if (lli->lli_smd && (inode->i_mode & S_IFREG)) { - struct ldlm_extent extent = {0, OBD_OBJECT_EOF}; - struct lustre_handle lockh = {0, 0}; - struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; - ldlm_error_t rc; - - LASSERT(lli->lli_smd->lsm_object_id != 0); - - rc = ll_extent_lock(NULL, inode, lsm, LCK_PR, &extent, &lockh); - if (rc != ELDLM_OK && rc != ELDLM_LOCK_MATCHED) { - ll_clear_inode(inode); - make_bad_inode(inode); - } else { - ll_extent_unlock(NULL, inode, lsm, LCK_PR, &lockh); - } - } + ll_update_inode(inode, body, lic->lic_lsm); /* OIDEBUG(inode); */ @@ -679,41 +756,30 @@ static void ll_read_inode2(struct inode *inode, void *opaque) } } -static inline void invalidate_request_list(struct list_head *req_list) -{ - struct list_head *tmp, *n; - list_for_each_safe(tmp, n, req_list) { - struct ptlrpc_request *req = - list_entry(tmp, struct ptlrpc_request, rq_list); - CERROR("invalidating req xid "LPU64" op %d to %s:%d\n", - req->rq_xid, req->rq_reqmsg->opc, - req->rq_connection->c_remote_uuid.uuid, - req->rq_import->imp_client->cli_request_portal); - req->rq_flags |= PTL_RPC_FL_ERR; - wake_up(&req->rq_wait_for_rep); - } -} - void ll_umount_begin(struct super_block *sb) { struct ll_sb_info *sbi = ll_s2sbi(sb); - struct list_head *ctmp; + struct obd_device *obd; + struct obd_ioctl_data ioc_data = { 0 }; ENTRY; - 
CDEBUG(D_VFSTRACE, "VFS Op\n"); - - list_for_each(ctmp, &sbi->ll_conn_chain) { - struct ptlrpc_connection *conn; - conn = list_entry(ctmp, struct ptlrpc_connection, c_sb_chain); - - spin_lock(&conn->c_lock); - /* XXX should just be dealing with imports, probably through - * XXX iocontrol, need next-gen recovery! */ - conn->c_flags |= CONN_INVALID; - /* invalidate_request_list(&conn->c_sending_head); */ - invalidate_request_list(&conn->c_delayed_head); - spin_unlock(&conn->c_lock); - } + CDEBUG(D_VFSTRACE, "VFS Op:\n"); + + obd = class_conn2obd(&sbi->ll_mdc_conn); + obd->obd_no_recov = 1; + obd_iocontrol(IOC_OSC_SET_ACTIVE, &sbi->ll_mdc_conn, sizeof ioc_data, + &ioc_data, NULL); + + obd = class_conn2obd(&sbi->ll_osc_conn); + obd->obd_no_recov = 1; + obd_iocontrol(IOC_OSC_SET_ACTIVE, &sbi->ll_osc_conn, sizeof ioc_data, + &ioc_data, NULL); + + /* Really, we'd like to wait until there are no requests outstanding, + * and then continue. For now, we just invalidate the requests, + * schedule, and hope. 
+ */ + schedule(); EXIT; } diff --git a/lustre/llite/super25.c b/lustre/llite/super25.c index f296d10..680c47f 100644 --- a/lustre/llite/super25.c +++ b/lustre/llite/super25.c @@ -32,6 +32,7 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/lprocfs_status.h> +#include "llite_internal.h" #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) #include <asm/statfs.h> @@ -136,6 +137,7 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent) struct obd_uuid param_uuid; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:\n"); OBD_ALLOC(sbi, sizeof(*sbi)); if (!sbi) @@ -167,24 +169,22 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent) GOTO(out_free, sb = NULL); } - err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid, - ptlrpc_recovd, ll_recover); + err = obd_connect(&sbi->ll_mdc_conn, obd, &sbi->ll_sb_uuid); if (err) { CERROR("cannot connect to %s: rc = %d\n", mdc, err); GOTO(out_free, sb = NULL); } - mdc_conn = sbi2mdc(sbi)->cl_import.imp_connection; - list_add(&mdc_conn->c_sb_chain, &sbi->ll_conn_chain); + mdc_conn = sbi2mdc(sbi)->cl_import->imp_connection; + strncpy(param_uuid.uuid, osc, sizeof(param_uuid.uuid)); - obd = class_uuid2obd(osc); + obd = class_uuid2obd(¶m_uuid); if (!obd) { CERROR("OSC %s: not setup or attached\n", osc); GOTO(out_mdc, sb = NULL); } - err = obd_connect(&sbi->ll_osc_conn, obd, &sbi->ll_sb_uuid, - ptlrpc_recovd, ll_recover); + err = obd_connect(&sbi->ll_osc_conn, obd, &sbi->ll_sb_uuid); if (err) { CERROR("cannot connect to %s: rc = %d\n", osc, err); GOTO(out_mdc, sb = NULL); @@ -193,7 +193,7 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent) err = mdc_getstatus(&sbi->ll_mdc_conn, &rootfid); if (err) { CERROR("cannot mds_connect: rc = %d\n", err); - GOTO(out_mdc, sb = NULL); + GOTO(out_osc, sb = NULL); } CDEBUG(D_SUPER, "rootfid "LPU64"\n", rootfid.id); sbi->ll_rootino = rootfid.id; @@ -203,16 +203,17 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent) 
sb->s_blocksize = osfs.os_bsize; sb->s_blocksize_bits = log2(osfs.os_bsize); sb->s_magic = LL_SUPER_MAGIC; - sb->s_maxbytes = (1ULL << (32 + 9)) - osfs.os_bsize; + sb->s_maxbytes = PAGE_CACHE_MAXBYTES; sb->s_op = &ll_super_operations; - /* make root inode */ - err = mdc_getattr(&sbi->ll_mdc_conn, sbi->ll_rootino, S_IFDIR, + /* make root inode + * XXX: move this to after cbd setup? */ + err = mdc_getattr(&sbi->ll_mdc_conn, &rootfid, OBD_MD_FLNOTOBD|OBD_MD_FLBLOCKS, 0, &request); if (err) { CERROR("mdc_getattr failed for root: rc = %d\n", err); - GOTO(out_request, sb = NULL); + GOTO(out_osc, sb = NULL); } /* initialize committed transaction callback daemon */ @@ -223,25 +224,30 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent) err = ll_commitcbd_setup(sbi); if (err) { CERROR("failed to start commit callback daemon: rc = %d\n",err); - GOTO(out_request, sb = NULL); + ptlrpc_req_finished (request); + GOTO(out_osc, sb = NULL); } - lic.lic_body = lustre_msg_buf(request->rq_repmsg, 0); - lic.lic_lmm = NULL; + lic.lic_body = lustre_msg_buf(request->rq_repmsg, 0, sizeof (*lic.lic_body)); + LASSERT (lic.lic_body != NULL); /* checked by mdc_getattr() */ + LASSERT_REPSWABBED (request, 0); /* swabbed by mdc_getattr() */ + + lic.lic_lsm = NULL; + root = iget5_locked(sb, sbi->ll_rootino, NULL, ll_read_inode2, &lic); - if (root) { - sb->s_root = d_alloc_root(root); - root->i_state &= ~(I_LOCK | I_NEW); - } else { - CERROR("lustre_lite: bad iget4 for root\n"); - GOTO(out_cdb, sb = NULL); - } - ptlrpc_req_finished(request); - request = NULL; + if (root == NULL || is_bad_inode(root)) { + /* XXX might need iput() for bad inode */ + CERROR("lustre_lite: bad iget5 for root\n"); + GOTO(out_cbd, sb = NULL); + } + + sb->s_root = d_alloc_root(root); + root->i_state &= ~(I_LOCK | I_NEW); + printk("AMRUT 1\n"); if (proc_lustre_fs_root) { err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb, osc, mdc); @@ -254,22 +260,88 @@ out_dev: OBD_FREE(mdc, strlen(mdc) + 1); 
if (osc) OBD_FREE(osc, strlen(osc) + 1); + printk("AMRUT 2\n"); RETURN(0); -out_cdb: +out_cbd: ll_commitcbd_cleanup(sbi); -out_request: - ptlrpc_req_finished(request); - obd_disconnect(&sbi->ll_osc_conn); +out_osc: + obd_disconnect(&sbi->ll_osc_conn, 0); out_mdc: - obd_disconnect(&sbi->ll_mdc_conn); + obd_disconnect(&sbi->ll_mdc_conn, 0); out_free: OBD_FREE(sbi, sizeof(*sbi)); goto out_dev; } /* ll_fill_super */ + +int ll_setattr_raw(struct inode *inode, struct iattr *attr) +{ + struct ptlrpc_request *request = NULL; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct mdc_op_data op_data; + int err = 0; + ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino); + + if ((attr->ia_valid & ATTR_SIZE)) { + /* writeback uses inode->i_size to determine how far out + * its cached pages go. ll_truncate gets a PW lock, canceling + * our lock, _after_ it has updated i_size. this can confuse + * us into zero extending the file to the newly truncated + * size, and this has bad implications for a racing o_append. + * if we're extending our size we need to flush the pages + * with the correct i_size before vmtruncate stomps on + * the new i_size. again, this can only find pages to + * purge if the PW lock that generated them is still held. + */ + if ( attr->ia_size > inode->i_size ) { + filemap_fdatasync(inode->i_mapping); + filemap_fdatawait(inode->i_mapping); + } + err = vmtruncate(inode, attr->ia_size); + if (err) + RETURN(err); + } + + /* Don't send size changes to MDS to avoid "fast EA" problems, and + * also avoid a pointless RPC (we get file size from OST anyways). 
+ */ + attr->ia_valid &= ~ATTR_SIZE; + if (!attr->ia_valid) + RETURN(0); + + ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); + + err = mdc_setattr(&sbi->ll_mdc_conn, &op_data, + attr, NULL, 0, &request); + if (err) + CERROR("mdc_setattr fails: err = %d\n", err); + + ptlrpc_req_finished(request); + + if (S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_MTIME_SET) { + struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; + struct obdo oa; + int err2; + + CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n", + inode->i_ino, attr->ia_mtime); + oa.o_id = lsm->lsm_object_id; + oa.o_mode = S_IFREG; + oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMTIME; + oa.o_mtime = LTIME_S(attr->ia_mtime); + err2 = obd_setattr(&sbi->ll_osc_conn, &oa, lsm, NULL); + if (err2) { + CERROR("obd_setattr fails: rc=%d\n", err); + if (!err) + err = err2; + } + } + RETURN(err); +} struct super_block * ll_get_sb(struct file_system_type *fs_type, int flags, char *devname, void * data) { @@ -282,10 +354,11 @@ static void ll_put_super(struct super_block *sb) struct list_head *tmp, *next; struct ll_fid rootfid; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:\n"); list_del(&sbi->ll_conn_chain); ll_commitcbd_cleanup(sbi); - obd_disconnect(&sbi->ll_osc_conn); + obd_disconnect(&sbi->ll_osc_conn, 0); /* NULL request to force sync on the MDS, and get the last_committed * value to flush remaining RPCs from the pending queue on client. @@ -300,7 +373,7 @@ static void ll_put_super(struct super_block *sb) sbi->ll_proc_root = NULL; } - obd_disconnect(&sbi->ll_mdc_conn); + obd_disconnect(&sbi->ll_mdc_conn, 0); spin_lock(&dcache_lock); list_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list){ @@ -320,12 +393,13 @@ static void ll_clear_inode(struct inode *inode) struct ll_inode_info *lli = ll_i2info(inode); int rc; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino); #warning "Is there a reason we don't do this in 2.5, but we do in 2.4?" 
#if 0 - rc = mdc_cancel_unused(&sbi->ll_mdc_conn, inode, LDLM_FL_NO_CALLBACK); + rc = ll_mdc_cancel_unused(&sbi->ll_mdc_conn, inode, LDLM_FL_NO_CALLBACK); if (rc < 0) { - CERROR("mdc_cancel_unused: %d\n", rc); + CERROR("ll_mdc_cancel_unused: %d\n", rc); /* XXX FIXME do something dramatic */ } @@ -342,8 +416,10 @@ static void ll_clear_inode(struct inode *inode) CERROR("clearing in-use inode %lu: count = %d\n", inode->i_ino, atomic_read(&inode->i_count)); - if (lli->lli_smd) + if (lli->lli_smd) { obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd); + lli->lli_smd = NULL; + } if (lli->lli_symlink_name) { OBD_FREE(lli->lli_symlink_name,strlen(lli->lli_symlink_name)+1); @@ -357,6 +433,7 @@ static void ll_clear_inode(struct inode *inode) static void ll_delete_inode(struct inode *inode) { ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino); if (S_ISREG(inode->i_mode)) { int err; struct obdo *oa; @@ -399,6 +476,10 @@ static int ll_attr2inode(struct inode * inode, struct iattr * attr, int trunc) int error = 0; if ((ia_valid & ATTR_SIZE) && trunc) { + if (attr->ia_size > ll_file_maxbytes(inode)) { + error = -EFBIG; + goto out; + } error = vmtruncate(inode, attr->ia_size); if (error) goto out; @@ -433,15 +514,21 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc) ENTRY; /* change incore inode */ - ll_attr2inode(inode, attr, do_trunc); + err = ll_attr2inode(inode, attr, do_trunc); + if (err) + RETURN(err); /* Don't send size changes to MDS to avoid "fast EA" problems, and * also avoid a pointless RPC (we get file size from OST anyways). 
*/ attr->ia_valid &= ~ATTR_SIZE; if (attr->ia_valid) { - err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, NULL, 0, - &request); + struct mdc_op_data op_data; + + ll_prepare_mdc_op_data(&op_data, inode, NULL, NULL, 0, 0); + + err = mdc_setattr(&sbi->ll_mdc_conn, &op_data, + attr, NULL, 0, &request); if (err) CERROR("mdc_setattr fails: err = %d\n", err); @@ -455,7 +542,7 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc) oa.o_id = lsm->lsm_object_id; oa.o_mode = S_IFREG; oa.o_valid = OBD_MD_FLID |OBD_MD_FLTYPE |OBD_MD_FLMTIME; - oa.o_mtime = attr->ia_mtime.tv_sec; + oa.o_mtime = LTIME_S(attr->ia_mtime); err2 = obd_setattr(&sbi->ll_osc_conn, &oa, lsm, NULL); if (err2) { CERROR("obd_setattr fails: rc=%d\n", err); @@ -471,7 +558,7 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc) int ll_setattr(struct dentry *de, struct iattr *attr) { int rc = inode_change_ok(de->d_inode, attr); - + CDEBUG(D_VFSTRACE, "VFS Op:name=%s\n", de->d_name.name); if (rc) return rc; @@ -484,6 +571,7 @@ static int ll_statfs(struct super_block *sb, struct statfs *sfs) struct obd_statfs osfs; int rc; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:\n"); memset(sfs, 0, sizeof(*sfs)); rc = obd_statfs(&sbi->ll_mdc_conn, &osfs); @@ -518,8 +606,11 @@ static int ll_statfs(struct super_block *sb, struct statfs *sfs) sfs->f_blocks = osfs.os_blocks; sfs->f_bfree = osfs.os_bfree; sfs->f_bavail = osfs.os_bavail; - if (osfs.os_ffree < (__u64)sfs->f_ffree) + if (osfs.os_ffree < (__u64)sfs->f_ffree) { + sfs->f_files = (sfs->f_files - sfs->f_ffree) + + osfs.os_ffree; sfs->f_ffree = osfs.os_ffree; + } } out: @@ -527,21 +618,30 @@ out: } void ll_update_inode(struct inode *inode, struct mds_body *body, - struct lov_mds_md *lmm) + struct lov_stripe_md *lsm) { struct ll_inode_info *lli = ll_i2info(inode); - if (lmm != NULL) - obd_unpackmd(ll_i2obdconn(inode), &lli->lli_smd, lmm); + LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); + if (lsm != NULL) { + if 
(lli->lli_smd == NULL) { + lli->lli_smd = lsm; + lli->lli_maxbytes = lsm->lsm_maxbytes; + if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES) + lli->lli_maxbytes = PAGE_CACHE_MAXBYTES; + } else { + LASSERT (!memcmp (lli->lli_smd, lsm, sizeof (*lsm))); + } + } if (body->valid & OBD_MD_FLID) inode->i_ino = body->ino; if (body->valid & OBD_MD_FLATIME) - inode->i_atime.tv_sec = body->atime; + LTIME_S(inode->i_atime) = body->atime; if (body->valid & OBD_MD_FLMTIME) - inode->i_mtime.tv_sec = body->mtime; + LTIME_S(inode->i_mtime) = body->mtime; if (body->valid & OBD_MD_FLCTIME) - inode->i_ctime.tv_sec = body->ctime; + LTIME_S(inode->i_ctime) = body->ctime; if (body->valid & OBD_MD_FLMODE) inode->i_mode = (inode->i_mode & S_IFMT)|(body->mode & ~S_IFMT); if (body->valid & OBD_MD_FLTYPE) @@ -571,36 +671,20 @@ int ll_read_inode2(struct inode *inode, void *opaque) struct ll_inode_info *lli = ll_i2info(inode); int rc = 0; ENTRY; + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu\n", inode->i_ino); sema_init(&lli->lli_open_sem, 1); - lli->flags = 0; - init_MUTEX(&lli->lli_getattr_sem); /* these are 2.4 only, but putting them here for consistency.. */ spin_lock_init(&lli->lli_read_extent_lock); INIT_LIST_HEAD(&lli->lli_read_extents); + ll_lldo_init(&lli->lli_dirty); + lli->lli_flags = 0; + lli->lli_maxbytes = LUSTRE_STRIPE_MAXBYTES; LASSERT(!lli->lli_smd); /* core attributes first */ - ll_update_inode(inode, body, lic ? lic->lic_lmm : NULL); - - /* Get the authoritative file size */ - if (lli->lli_smd && S_ISREG(inode->i_mode)) { - struct ll_file_data *fd = file->private_data; - struct ldlm_extent extent = {0, OBD_OBJECT_EOF}; - struct lustre_handle lockh = {0, 0}; - - LASSERT(lli->lli_smd->lsm_object_id != 0); - - rc = ll_extent_lock(fd, inode, lsm, LCK_PR, &extent, &lockh); - if (err != ELDLM_OK && err != ELDLM_MATCHED) { - ll_clear_inode(inode); - make_bad_inode(inode); - } else { - l_extent_unlock(fd, inode, lsm, LCK_PR, &extent, - &lockh); - } - } + ll_update_inode(inode, body, lic ? 
lic->lic_lsm : NULL); /* OIDEBUG(inode); */ @@ -618,6 +702,7 @@ int ll_read_inode2(struct inode *inode, void *opaque) inode->i_op = &ll_fast_symlink_inode_operations; EXIT; } else { + inode->i_op = &ll_special_inode_operations; init_special_inode(inode, inode->i_mode, kdev_t_to_nr(inode->i_rdev)); EXIT; @@ -626,62 +711,56 @@ int ll_read_inode2(struct inode *inode, void *opaque) return rc; } -static inline void invalidate_request_list(struct list_head *req_list) -{ - struct list_head *tmp, *n; - list_for_each_safe(tmp, n, req_list) { - struct ptlrpc_request *req = - list_entry(tmp, struct ptlrpc_request, rq_list); - CERROR("invalidating req xid %d op %d to %s:%d\n", - (unsigned long long)req->rq_xid, req->rq_reqmsg->opc, - req->rq_connection->c_remote_uuid, - req->rq_import->imp_client->cli_request_portal); - req->rq_flags |= PTL_RPC_FL_ERR; - wake_up(&req->rq_wait_for_rep); - } -} void ll_umount_begin(struct super_block *sb) { struct ll_sb_info *sbi = ll_s2sbi(sb); - struct list_head *ctmp; + struct obd_device *obd; + struct obd_ioctl_data ioc_data = { 0 }; ENTRY; - - list_for_each(ctmp, &sbi->ll_conn_chain) { - struct ptlrpc_connection *conn; - conn = list_entry(ctmp, struct ptlrpc_connection, c_sb_chain); - - spin_lock(&conn->c_lock); - conn->c_flags |= CONN_INVALID; - /*invalidate_request_list(&conn->c_sending_head);*/ - invalidate_request_list(&conn->c_delayed_head); - spin_unlock(&conn->c_lock); - } + CDEBUG(D_VFSTRACE, "VFS Op:\n"); + + obd = class_conn2obd(&sbi->ll_mdc_conn); + obd->obd_no_recov = 1; + obd_iocontrol(IOC_OSC_SET_ACTIVE, &sbi->ll_mdc_conn, sizeof ioc_data, + &ioc_data, NULL); + + obd = class_conn2obd(&sbi->ll_osc_conn); + obd->obd_no_recov = 1; + obd_iocontrol(IOC_OSC_SET_ACTIVE, &sbi->ll_osc_conn, sizeof ioc_data, + &ioc_data, NULL); + + /* Really, we'd like to wait until there are no requests outstanding, + * and then continue. For now, we just invalidate the requests, + * schedule, and hope. 
+ */ + schedule(); EXIT; } - static kmem_cache_t *ll_inode_cachep; static struct inode *ll_alloc_inode(struct super_block *sb) { struct ll_inode_info *lli; - lli = kmem_cache_alloc(ll_inode_cachep, SLAB_KERNEL); - if (!lli) + OBD_SLAB_ALLOC(lli, ll_inode_cachep, SLAB_KERNEL, sizeof *lli); + if (lli == NULL) return NULL; memset(lli, 0, (char *)&lli->lli_vfs_inode - (char *)lli); sema_init(&lli->lli_open_sem, 1); init_MUTEX(&lli->lli_size_valid_sem); + lli->lli_maxbytes = LUSTRE_STRIPE_MAXBYTES; return &lli->lli_vfs_inode; } static void ll_destroy_inode(struct inode *inode) { - kmem_cache_free(ll_inode_cachep, ll_i2info(inode)); + OBD_SLAB_FREE(ll_inode_cachep, ll_i2info(inode), + sizeof(struct ll_inode_info)); } static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) diff --git a/lustre/llite/symlink.c b/lustre/llite/symlink.c index 6ebe7de..19d234e 100644 --- a/lustre/llite/symlink.c +++ b/lustre/llite/symlink.c @@ -36,6 +36,8 @@ static int ll_readlink_internal(struct inode *inode, { struct ll_inode_info *lli = ll_i2info(inode); struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_fid fid; + struct mds_body *body; int rc, symlen = inode->i_size + 1; ENTRY; @@ -47,14 +49,38 @@ static int ll_readlink_internal(struct inode *inode, RETURN(0); } - rc = mdc_getattr(&sbi->ll_mdc_conn, inode->i_ino, S_IFLNK, + ll_inode2fid(&fid, inode); + rc = mdc_getattr(&sbi->ll_mdc_conn, &fid, OBD_MD_LINKNAME, symlen, request); if (rc) { CERROR("inode %lu: rc = %d\n", inode->i_ino, rc); RETURN(rc); } - *symname = lustre_msg_buf((*request)->rq_repmsg, 1); + body = lustre_msg_buf ((*request)->rq_repmsg, 0, sizeof (*body)); + LASSERT (body != NULL); + LASSERT_REPSWABBED (*request, 0); + + if ((body->valid & OBD_MD_LINKNAME) == 0) { + CERROR ("OBD_MD_LINKNAME not set on reply\n"); + GOTO (failed, rc = -EPROTO); + } + + LASSERT (symlen != 0); + if (body->eadatasize != symlen) { + CERROR ("inode %lu: symlink length %d not expected %d\n", + inode->i_ino, body->eadatasize 
- 1, symlen - 1); + GOTO (failed, rc = -EPROTO); + } + + *symname = lustre_msg_buf ((*request)->rq_repmsg, 1, symlen); + if (*symname == NULL || + strnlen (*symname, symlen) != symlen - 1) { + /* not full/NULL terminated */ + CERROR ("inode %lu: symlink not NULL terminated string" + "of length %d\n", inode->i_ino, symlen - 1); + GOTO (failed, rc = -EPROTO); + } OBD_ALLOC(lli->lli_symlink_name, symlen); /* do not return an error if we cannot cache the symlink locally */ @@ -62,6 +88,10 @@ static int ll_readlink_internal(struct inode *inode, memcpy(lli->lli_symlink_name, *symname, symlen); RETURN(0); + + failed: + ptlrpc_req_finished (*request); + RETURN (-EPROTO); } static int ll_readlink(struct dentry *dentry, char *buffer, int buflen) @@ -81,10 +111,9 @@ static int ll_readlink(struct dentry *dentry, char *buffer, int buflen) GOTO(out, rc); rc = vfs_readlink(dentry, buffer, buflen, symname); + ptlrpc_req_finished(request); out: up(&lli->lli_open_sem); - ptlrpc_req_finished(request); - RETURN(rc); } @@ -119,9 +148,8 @@ static int ll_follow_link(struct dentry *dentry, struct nameidata *nd, } rc = vfs_follow_link_it(nd, symname, it); - out: ptlrpc_req_finished(request); - + out: RETURN(rc); } #else @@ -149,9 +177,9 @@ static int ll_follow_link(struct dentry *dentry, struct nameidata *nd) nd->it.it_mode = mode; rc = vfs_follow_link(nd, symname); + ptlrpc_req_finished(request); out: up(&lli->lli_open_sem); - ptlrpc_req_finished(request); RETURN(rc); } diff --git a/lustre/lov/Makefile.am b/lustre/lov/Makefile.am index 6b647b4..879e44d 100644 --- a/lustre/lov/Makefile.am +++ b/lustre/lov/Makefile.am @@ -7,18 +7,12 @@ DEFS= if LIBLUSTRE lib_LIBRARIES = liblov.a -LINX=client.c -liblov_a_SOURCES = lov_obd.c lov_pack.c $(LINX) +liblov_a_SOURCES = lov_obd.c lov_pack.c else MODULE = lov modulefs_DATA = lov.o EXTRA_PROGRAMS = lov -LINX=client.c -lov_SOURCES = lov_obd.c lov_pack.c lproc_lov.c $(LINX) +lov_SOURCES = lov_obd.c lov_pack.c lproc_lov.c endif - -client.c: - test -e 
client.c || ln -sf $(top_srcdir)/lib/client.c - include $(top_srcdir)/Rules diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 19738b9..1a4f6c4 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -45,29 +45,125 @@ #include <linux/obd_lov.h> #include <linux/lprocfs_status.h> -static kmem_cache_t *lov_file_cache; - struct lov_file_handles { + struct portals_handle lfh_handle; + atomic_t lfh_refcount; struct list_head lfh_list; - __u64 lfh_cookie; int lfh_count; - char *lfh_data; /* an array of opaque data saved on behalf of - * each osc, FD_OSTDATA_SIZE bytes for each */ + struct obd_client_handle *lfh_och; }; struct lov_lock_handles { - __u64 llh_cookie; + struct portals_handle llh_handle; + atomic_t llh_refcount; + int llh_stripe_count; struct lustre_handle llh_handles[0]; }; -extern int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmm, - struct lov_stripe_md *lsm); -extern int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsm, - struct lov_mds_md *lmm); -extern int lov_setstripe(struct lustre_handle *conn, - struct lov_stripe_md **lsmp, struct lov_mds_md *lmmu); -extern int lov_getstripe(struct lustre_handle *conn, struct lov_mds_md *lmmu, - struct lov_stripe_md *lsm); +/* lov_file_handles helpers */ +static void lov_lfh_addref(void *lfhp) +{ + struct lov_file_handles *lfh = lfhp; + + atomic_inc(&lfh->lfh_refcount); + CDEBUG(D_INFO, "GETting lfh %p : new refcount %d\n", lfh, + atomic_read(&lfh->lfh_refcount)); +} + +static struct lov_file_handles *lov_lfh_new(void) +{ + struct lov_file_handles *lfh; + + OBD_ALLOC(lfh, sizeof *lfh); + if (lfh == NULL) { + CERROR("out of memory\n"); + return NULL; + } + + atomic_set(&lfh->lfh_refcount, 2); + + INIT_LIST_HEAD(&lfh->lfh_handle.h_link); + class_handle_hash(&lfh->lfh_handle, lov_lfh_addref); + + return lfh; +} + +static struct lov_file_handles *lov_handle2lfh(struct lustre_handle *handle) +{ + ENTRY; + LASSERT(handle != NULL); + 
RETURN(class_handle2object(handle->cookie)); +} + +static void lov_lfh_put(struct lov_file_handles *lfh) +{ + CDEBUG(D_INFO, "PUTting lfh %p : new refcount %d\n", lfh, + atomic_read(&lfh->lfh_refcount) - 1); + LASSERT(atomic_read(&lfh->lfh_refcount) > 0 && + atomic_read(&lfh->lfh_refcount) < 0x5a5a); + if (atomic_dec_and_test(&lfh->lfh_refcount)) { + LASSERT(list_empty(&lfh->lfh_handle.h_link)); + OBD_FREE(lfh, sizeof *lfh); + } +} + +static void lov_lfh_destroy(struct lov_file_handles *lfh) +{ + class_handle_unhash(&lfh->lfh_handle); + lov_lfh_put(lfh); +} + +static void lov_llh_addref(void *llhp) +{ + struct lov_lock_handles *llh = llhp; + + atomic_inc(&llh->llh_refcount); + CDEBUG(D_INFO, "GETting llh %p : new refcount %d\n", llh, + atomic_read(&llh->llh_refcount)); +} + +static struct lov_lock_handles *lov_llh_new(struct lov_stripe_md *lsm) +{ + struct lov_lock_handles *llh; + + OBD_ALLOC(llh, sizeof *llh + + sizeof(*llh->llh_handles) * lsm->lsm_stripe_count); + if (llh == NULL) { + CERROR("out of memory\n"); + return NULL; + } + atomic_set(&llh->llh_refcount, 2); + llh->llh_stripe_count = lsm->lsm_stripe_count; + INIT_LIST_HEAD(&llh->llh_handle.h_link); + class_handle_hash(&llh->llh_handle, lov_llh_addref); + return llh; +} + +static struct lov_lock_handles *lov_handle2llh(struct lustre_handle *handle) +{ + ENTRY; + LASSERT(handle != NULL); + RETURN(class_handle2object(handle->cookie)); +} + +static void lov_llh_put(struct lov_lock_handles *llh) +{ + CDEBUG(D_INFO, "PUTting llh %p : new refcount %d\n", llh, + atomic_read(&llh->llh_refcount) - 1); + LASSERT(atomic_read(&llh->llh_refcount) > 0 && + atomic_read(&llh->llh_refcount) < 0x5a5a); + if (atomic_dec_and_test(&llh->llh_refcount)) { + LASSERT(list_empty(&llh->llh_handle.h_link)); + OBD_FREE(llh, sizeof *llh + + sizeof(*llh->llh_handles) * llh->llh_stripe_count); + } +} + +static void lov_llh_destroy(struct lov_lock_handles *llh) +{ + class_handle_unhash(&llh->llh_handle); + lov_llh_put(llh); +} /* obd 
methods */ int lov_attach(struct obd_device *dev, obd_count len, void *data) @@ -84,18 +180,18 @@ int lov_detach(struct obd_device *dev) } static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, - struct obd_uuid *cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) + struct obd_uuid *cluuid) { struct ptlrpc_request *req = NULL; struct lov_obd *lov = &obd->u.lov; struct client_obd *mdc = &lov->mdcobd->u.cli; struct lov_desc *desc = &lov->desc; + struct lov_desc *mdesc; struct lov_tgt_desc *tgts; struct obd_export *exp; struct lustre_handle mdc_conn; struct obd_uuid lov_mds_uuid = {"LOV_MDS_UUID"}; - char *tmp; + struct obd_uuid *uuids; int rc, rc2, i; ENTRY; @@ -114,14 +210,14 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, INIT_LIST_HEAD(&exp->exp_lov_data.led_open_head); /* retrieve LOV metadata from MDS */ - rc = obd_connect(&mdc_conn, lov->mdcobd, &lov_mds_uuid, recovd,recover); + rc = obd_connect(&mdc_conn, lov->mdcobd, &lov_mds_uuid); if (rc) { CERROR("cannot connect to mdc: rc = %d\n", rc); GOTO(out_conn, rc); } rc = mdc_getlovinfo(obd, &mdc_conn, &req); - rc2 = obd_disconnect(&mdc_conn); + rc2 = obd_disconnect(&mdc_conn, 0); if (rc) { CERROR("cannot get lov info %d\n", rc); GOTO(out_conn, rc); @@ -129,36 +225,24 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, if (rc2) { CERROR("error disconnecting from MDS %d\n", rc2); - GOTO(out_conn, rc = rc2); - } - - /* sanity... */ - if (req->rq_repmsg->bufcount < 2 || - req->rq_repmsg->buflens[0] < sizeof(*desc)) { - CERROR("LOV desc: invalid descriptor returned\n"); - GOTO(out_conn, rc = -EINVAL); + GOTO(out_req, rc = rc2); } - memcpy(desc, lustre_msg_buf(req->rq_repmsg, 0), sizeof(*desc)); - lov_unpackdesc(desc); + /* mdc_getlovinfo() has checked and swabbed the reply. It has also + * done some simple checks (e.g. 
#uuids consistent with desc, uuid + * array fits in LOV_MAX_UUID_BUFFER_SIZE and all uuids are + * terminated), but I still need to verify it makes overall + * sense */ + mdesc = lustre_msg_buf (req->rq_repmsg, 0, sizeof (*mdesc)); + LASSERT (mdesc != NULL); + LASSERT_REPSWABBED (req, 0); - if (req->rq_repmsg->buflens[1] < - sizeof(desc->ld_uuid.uuid) * desc->ld_tgt_count){ - CERROR("LOV desc: invalid uuid array returned\n"); - GOTO(out_conn, rc = -EINVAL); - } + *desc = *mdesc; - if (memcmp(obd->obd_uuid.uuid, desc->ld_uuid.uuid, - sizeof(desc->ld_uuid.uuid))) { + if (!obd_uuid_equals(&obd->obd_uuid, &desc->ld_uuid)) { CERROR("LOV desc: uuid %s not on mds device (%s)\n", obd->obd_uuid.uuid, desc->ld_uuid.uuid); - GOTO(out_conn, rc = -EINVAL); - } - - if (desc->ld_tgt_count > 1000) { - CERROR("LOV desc: target count > 1000 (%d)\n", - desc->ld_tgt_count); - GOTO(out_conn, rc = -EINVAL); + GOTO(out_req, rc = -EINVAL); } /* Because of 64-bit divide/mod operations only work with a 32-bit @@ -172,38 +256,45 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, desc->ld_default_stripe_size, desc->ld_default_stripe_count ? desc->ld_default_stripe_count : desc->ld_tgt_count,~0UL); - GOTO(out_conn, rc = -EINVAL); + GOTO(out_req, rc = -EINVAL); } + /* We know ld_tgt_count is reasonable (the array of UUIDS fits in + * the maximum buffer size, so we won't be making outrageous + * demands on memory here. 
*/ lov->bufsize = sizeof(struct lov_tgt_desc) * desc->ld_tgt_count; OBD_ALLOC(lov->tgts, lov->bufsize); if (!lov->tgts) { CERROR("Out of memory\n"); - GOTO(out_conn, rc = -ENOMEM); + GOTO(out_req, rc = -ENOMEM); } - tmp = lustre_msg_buf(req->rq_repmsg, 1); + uuids = lustre_msg_buf(req->rq_repmsg, 1, + sizeof(*uuids) * desc->ld_tgt_count); + LASSERT (uuids != NULL); + LASSERT_REPSWABBED (req, 1); + for (i = 0, tgts = lov->tgts; i < desc->ld_tgt_count; i++, tgts++) { struct obd_uuid *uuid = &tgts->uuid; struct obd_device *tgt_obd; struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" }; - obd_str2uuid(uuid, tmp); + /* NULL termination already checked */ + *uuid = uuids[i]; + tgt_obd = client_tgtuuid2obd(uuid); - tmp += sizeof(uuid->uuid); if (!tgt_obd) { CERROR("Target %s not attached\n", uuid->uuid); GOTO(out_disc, rc = -EINVAL); } - if (!(tgt_obd->obd_flags & OBD_SET_UP)) { + if (!tgt_obd->obd_set_up) { CERROR("Target %s not set up\n", uuid->uuid); GOTO(out_disc, rc = -EINVAL); } - rc = obd_connect(&tgts->conn, tgt_obd, &lov_osc_uuid, recovd, - recover); + rc = obd_connect(&tgts->conn, tgt_obd, &lov_osc_uuid); if (rc) { CERROR("Target %s connect error %d\n", uuid->uuid, rc); @@ -215,7 +306,7 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, if (rc) { CERROR("Target %s REGISTER_LOV error %d\n", uuid->uuid, rc); - obd_disconnect(&tgts->conn); + obd_disconnect(&tgts->conn, 0); GOTO(out_disc, rc); } @@ -223,11 +314,10 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, tgts->active = 1; } - mdc->cl_max_mds_easize = obd_size_wiremd(conn, NULL); - - out: - ptlrpc_req_finished(req); - RETURN(rc); + mdc->cl_max_mds_easize = obd_size_diskmd(conn, NULL); + ptlrpc_req_finished (req); + class_export_put(exp); + RETURN (0); out_disc: while (i-- > 0) { @@ -235,25 +325,30 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, --tgts; --desc->ld_active_tgt_count; tgts->active = 0; - obd_str2uuid(&uuid, 
tgts->uuid.uuid); - rc2 = obd_disconnect(&tgts->conn); + /* save for CERROR below; (we know it's terminated) */ + uuid = tgts->uuid; + rc2 = obd_disconnect(&tgts->conn, 0); if (rc2) CERROR("error: LOV target %s disconnect on OST idx %d: " "rc = %d\n", uuid.uuid, i, rc2); } OBD_FREE(lov->tgts, lov->bufsize); + out_req: + ptlrpc_req_finished (req); out_conn: - class_disconnect(conn); - goto out; + class_export_put(exp); + class_disconnect(conn, 0); + RETURN (rc); } -static int lov_disconnect(struct lustre_handle *conn) +static int lov_disconnect(struct lustre_handle *conn, int failover) { struct obd_device *obd = class_conn2obd(conn); struct lov_obd *lov = &obd->u.lov; struct obd_export *exp; struct list_head *p, *n; int rc, i; + ENTRY; if (!lov->tgts) goto out_local; @@ -264,7 +359,16 @@ static int lov_disconnect(struct lustre_handle *conn) goto out_local; for (i = 0; i < lov->desc.ld_tgt_count; i++) { - rc = obd_disconnect(&lov->tgts[i].conn); + if (obd->obd_no_recov) { + /* Pass it on to our clients. + * XXX This should be an argument to disconnect, + * XXX not a back-door flag on the OBD. Ah well. + */ + struct obd_device *osc_obd = + class_conn2obd(&lov->tgts[i].conn); + osc_obd->obd_no_recov = 1; + } + rc = obd_disconnect(&lov->tgts[i].conn, failover); if (rc) { if (lov->tgts[i].active) { CERROR("Target %s disconnect error %d\n", @@ -282,22 +386,29 @@ static int lov_disconnect(struct lustre_handle *conn) lov->tgts = NULL; exp = class_conn2export(conn); + if (exp == NULL) { + CERROR("export handle "LPU64" invalid! If you can reproduce, " + "please send a full debug log to phik\n", conn->cookie); + RETURN(0); + } spin_lock(&exp->exp_lov_data.led_lock); list_for_each_safe(p, n, &exp->exp_lov_data.led_open_head) { /* XXX close these, instead of just discarding them? 
*/ struct lov_file_handles *lfh; lfh = list_entry(p, typeof(*lfh), lfh_list); CERROR("discarding open LOV handle %p:"LPX64"\n", - lfh, lfh->lfh_cookie); + lfh, lfh->lfh_handle.h_cookie); list_del(&lfh->lfh_list); - OBD_FREE(lfh->lfh_data, lfh->lfh_count * FD_OSTDATA_SIZE); - PORTAL_SLAB_FREE(lfh, lov_file_cache, sizeof(*lfh)); + OBD_FREE(lfh->lfh_och, lfh->lfh_count * FD_OSTDATA_SIZE); + lov_lfh_destroy(lfh); + lov_lfh_put(lfh); } spin_unlock(&exp->exp_lov_data.led_lock); + class_export_put(exp); out_local: - rc = class_disconnect(conn); - return rc; + rc = class_disconnect(conn, 0); + RETURN(rc); } /* Error codes: @@ -305,7 +416,6 @@ static int lov_disconnect(struct lustre_handle *conn) * -EINVAL : UUID can't be found in the LOV's target list * -ENOTCONN: The UUID is found, but the target connection is bad (!) * -EBADF : The UUID is found, but the OBD is the wrong type (!) - * -EALREADY: The OSC is already marked (in)active */ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid, int activate) @@ -321,8 +431,8 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid, spin_lock(&lov->lov_lock); for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { CDEBUG(D_INFO, "lov idx %d is %s conn "LPX64"\n", - i, tgt->uuid.uuid, tgt->conn.addr); - if (strncmp(uuid->uuid, tgt->uuid.uuid, sizeof(uuid->uuid)) == 0) + i, tgt->uuid.uuid, tgt->conn.cookie); + if (strncmp(uuid->uuid, tgt->uuid.uuid, sizeof uuid->uuid) == 0) break; } @@ -331,22 +441,19 @@ static int lov_set_osc_active(struct lov_obd *lov, struct obd_uuid *uuid, obd = class_conn2obd(&tgt->conn); if (obd == NULL) { - LBUG(); + /* This can happen if OST failure races with node shutdown */ GOTO(out, rc = -ENOTCONN); } CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LOV idx %d\n", obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd, obd->obd_type->typ_name, i); - if (strcmp(obd->obd_type->typ_name, "osc") != 0) { - LBUG(); - GOTO(out, rc = -EBADF); - } + 
LASSERT(strcmp(obd->obd_type->typ_name, "osc") == 0); if (tgt->active == activate) { CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd, activate ? "" : "in"); - GOTO(out, rc = -EALREADY); + GOTO(out, rc); } CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? "" : "in"); @@ -407,21 +514,55 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf) RETURN(rc); } -static struct lov_file_handles *lov_handle2lfh(struct lustre_handle *handle) +/* compute object size given "stripeno" and the ost size */ +static obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size, + int stripeno) { - struct lov_file_handles *lfh = NULL; + unsigned long ssize = lsm->lsm_stripe_size; + unsigned long swidth = ssize * lsm->lsm_stripe_count; + unsigned long stripe_size; + obd_size lov_size; + + if (ost_size == 0) + return 0; + + /* do_div(a, b) returns a % b, and a = a / b */ + stripe_size = do_div(ost_size, ssize); - if (!handle || !handle->addr) - RETURN(NULL); + if (stripe_size) + lov_size = ost_size * swidth + stripeno * ssize + stripe_size; + else + lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; - lfh = (struct lov_file_handles *)(unsigned long)(handle->addr); - if (!kmem_cache_validate(lov_file_cache, lfh)) - RETURN(NULL); + return lov_size; +} - if (lfh->lfh_cookie != handle->cookie) - RETURN(NULL); +static void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_flag valid, + struct lov_stripe_md *lsm, int stripeno, int *set) +{ + if (*set) { + if (valid & OBD_MD_FLSIZE) { + /* this handles sparse files properly */ + obd_size lov_size; - return lfh; + lov_size = lov_stripe_size(lsm, src->o_size, stripeno); + if (lov_size > tgt->o_size) + tgt->o_size = lov_size; + } + if (valid & OBD_MD_FLBLOCKS) + tgt->o_blocks += src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + tgt->o_blksize += src->o_blksize; + if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime) + tgt->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLMTIME && tgt->o_mtime < 
src->o_mtime) + tgt->o_mtime = src->o_mtime; + } else { + obdo_cpy_md(tgt, src, valid); + if (valid & OBD_MD_FLSIZE) + tgt->o_size = lov_stripe_size(lsm,src->o_size,stripeno); + *set = 1; + } } /* the LOV expects oa->o_id to be set to the LOV object id */ @@ -433,24 +574,24 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa, struct lov_stripe_md *lsm; struct lov_oinfo *loi; struct obdo *tmp; - int ost_count, ost_idx; - int first = 1, obj_alloc = 0; + unsigned ost_count, ost_idx; + int set = 0, obj_alloc = 0; int rc = 0, i; ENTRY; LASSERT(ea); if (!export) - RETURN(-EINVAL); + GOTO(out_exp, rc = -EINVAL); lov = &export->exp_obd->u.lov; if (!lov->desc.ld_active_tgt_count) - RETURN(-EIO); + GOTO(out_exp, rc = -EIO); tmp = obdo_alloc(); if (!tmp) - RETURN(-ENOMEM); + GOTO(out_exp, rc = -ENOMEM); lsm = *ea; @@ -471,11 +612,8 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa, lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size; if (!*ea || lsm->lsm_stripe_offset >= ost_count) { - int mult = lsm->lsm_object_id * lsm->lsm_stripe_count; - int stripe_offset = mult % ost_count; - int sub_offset = (mult / ost_count); - - ost_idx = (stripe_offset + sub_offset) % ost_count; + get_random_bytes(&ost_idx, 2); + ost_idx %= ost_count; } else ost_idx = lsm->lsm_stripe_offset; @@ -517,10 +655,9 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa, CDEBUG(D_INODE, "objid "LPX64" has subobj "LPX64" at idx %d\n", lsm->lsm_object_id, loi->loi_id, ost_idx); - if (first) { + if (!set) lsm->lsm_stripe_offset = ost_idx; - first = 0; - } + lov_merge_attrs(oa, tmp, OBD_MD_FLBLKSZ, lsm, obj_alloc, &set); ++obj_alloc; ++loi; @@ -532,13 +669,15 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa, } } - if (*ea) + if (*ea != NULL) { GOTO(out_cleanup, rc); - else { + } else { struct lov_stripe_md *lsm_new; /* XXX LOV STACKING call into osc for sizes */ - int size = lov_stripe_md_size(obj_alloc); + unsigned size = 
lov_stripe_md_size(obj_alloc); + CERROR("reallocating LSM for objid "LPX64": old %u new %u\n", + lsm->lsm_object_id, obj_alloc, lsm->lsm_stripe_count); OBD_ALLOC(lsm_new, size); if (!lsm_new) GOTO(out_cleanup, rc = -ENOMEM); @@ -554,6 +693,8 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa, out_tmp: obdo_free(tmp); + out_exp: + class_export_put(export); return rc; out_cleanup: @@ -564,14 +705,15 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa, /* destroy already created objects here */ memcpy(tmp, oa, sizeof(*tmp)); tmp->o_id = loi->loi_id; - err = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL, NULL); + err = obd_destroy(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL, + NULL); if (err) CERROR("Failed to uncreate objid "LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", oa->o_id, loi->loi_id, loi->loi_ost_idx, err); } - if (!*ea) + if (*ea == NULL) obd_free_memmd(conn, &lsm); goto out_tmp; } @@ -589,17 +731,17 @@ static int lov_destroy(struct lustre_handle *conn, struct obdo *oa, if (!lsm) { CERROR("LOV requires striping ea for destruction\n"); - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } if (lsm->lsm_magic != LOV_MAGIC) { CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } if (!export || !export->exp_obd) - RETURN(-ENODEV); + GOTO(out, rc = -ENODEV); if (oa->o_valid & OBD_MD_FLHANDLE) lfh = lov_handle2lfh(obdo_handle(oa)); @@ -616,8 +758,7 @@ static int lov_destroy(struct lustre_handle *conn, struct obdo *oa, memcpy(&tmp, oa, sizeof(tmp)); tmp.o_id = loi->loi_id; if (lfh) - memcpy(obdo_handle(&tmp), - lfh->lfh_data + i * FD_OSTDATA_SIZE, + memcpy(obdo_handle(&tmp), lfh->lfh_och + i, FD_OSTDATA_SIZE); else tmp.o_valid &= ~OBD_MD_FLHANDLE; @@ -625,62 +766,18 @@ static int lov_destroy(struct lustre_handle *conn, struct obdo *oa, NULL, NULL); if (err && lov->tgts[loi->loi_ost_idx].active) { CERROR("error: destroying objid "LPX64" subobj " - LPX64" 
on OST idx %d\n: rc = %d", + LPX64" on OST idx %d: rc = %d\n", oa->o_id, loi->loi_id, loi->loi_ost_idx, err); if (!rc) rc = err; } } - RETURN(rc); -} - -/* compute object size given "stripeno" and the ost size */ -static obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size, - int stripeno) -{ - unsigned long ssize = lsm->lsm_stripe_size; - unsigned long swidth = ssize * lsm->lsm_stripe_count; - unsigned long stripe_size; - obd_size lov_size; - - if (ost_size == 0) - return 0; - - /* do_div(a, b) returns a % b, and a = a / b */ - stripe_size = do_div(ost_size, ssize); - - if (stripe_size) - lov_size = ost_size * swidth + stripeno * ssize + stripe_size; - else - lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; - - return lov_size; -} - -static void lov_merge_attrs(struct obdo *tgt, struct obdo *src, obd_flag valid, - struct lov_stripe_md *lsm, int stripeno, int *set) -{ - if (*set) { - if (valid & OBD_MD_FLSIZE) { - /* this handles sparse files properly */ - obd_size lov_size; - - lov_size = lov_stripe_size(lsm, src->o_size, stripeno); - if (lov_size > tgt->o_size) - tgt->o_size = lov_size; - } - if (valid & OBD_MD_FLBLOCKS) - tgt->o_blocks += src->o_blocks; - if (valid & OBD_MD_FLCTIME && tgt->o_ctime < src->o_ctime) - tgt->o_ctime = src->o_ctime; - if (valid & OBD_MD_FLMTIME && tgt->o_mtime < src->o_mtime) - tgt->o_mtime = src->o_mtime; - } else { - obdo_cpy_md(tgt, src, valid); - if (valid & OBD_MD_FLSIZE) - tgt->o_size = lov_stripe_size(lsm,src->o_size,stripeno); - *set = 1; - } + if (lfh != NULL) + lov_lfh_put(lfh); + EXIT; + out: + class_export_put(export); + return rc; } static int lov_getattr(struct lustre_handle *conn, struct obdo *oa, @@ -691,23 +788,22 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa, struct lov_obd *lov; struct lov_oinfo *loi; struct lov_file_handles *lfh = NULL; - int i; - int set = 0; + int i, rc = 0, set = 0; ENTRY; if (!lsm) { CERROR("LOV requires striping ea\n"); - RETURN(-EINVAL); + 
GOTO(out, rc = -EINVAL); } if (lsm->lsm_magic != LOV_MAGIC) { CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } if (!export || !export->exp_obd) - RETURN(-ENODEV); + GOTO(out, rc = -ENODEV); lov = &export->exp_obd->u.lov; @@ -730,8 +826,7 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa, memcpy(&tmp, oa, sizeof(tmp)); tmp.o_id = loi->loi_id; if (lfh) - memcpy(obdo_handle(&tmp), - lfh->lfh_data + i * FD_OSTDATA_SIZE, + memcpy(obdo_handle(&tmp), lfh->lfh_och + i, FD_OSTDATA_SIZE); else tmp.o_valid &= ~OBD_MD_FLHANDLE; @@ -743,14 +838,145 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa, LPX64" on OST idx %d: rc = %d\n", oa->o_id, loi->loi_id, loi->loi_ost_idx, err); - RETURN(err); + GOTO(out, rc = err); } } else { lov_merge_attrs(oa, &tmp, tmp.o_valid, lsm, i, &set); } } + if (!set) + rc = -EIO; + GOTO(out, rc); + out: + if (lfh != NULL) + lov_lfh_put(lfh); + class_export_put(export); + return rc; +} + +static int lov_getattr_interpret(struct ptlrpc_request_set *rqset, + struct lov_getattr_async_args *aa, int rc) +{ + struct lov_stripe_md *lsm = aa->aa_lsm; + struct obdo *oa = aa->aa_oa; + struct obdo *obdos = aa->aa_stripe_oas; + struct lov_oinfo *loi; + int i; + int set = 0; + ENTRY; + + if (rc == 0) { + /* NB all stripe requests succeeded to get here */ + + for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; + i++,loi++) { + if (obdos[i].o_valid == 0) /* inactive stripe */ + continue; + + lov_merge_attrs(oa, &obdos[i], obdos[i].o_valid, lsm, + i, &set); + } + + if (!set) { + CERROR ("No stripes had valid attrs\n"); + rc = -EIO; + } + } + + OBD_FREE (obdos, lsm->lsm_stripe_count * sizeof (*obdos)); + RETURN (rc); +} + +static int lov_getattr_async (struct lustre_handle *conn, struct obdo *oa, + struct lov_stripe_md *lsm, + struct ptlrpc_request_set *rqset) +{ + struct obdo *obdos; + struct obd_export *export = class_conn2export(conn); + struct 
lov_obd *lov; + struct lov_oinfo *loi; + struct lov_file_handles *lfh = NULL; + struct lov_getattr_async_args *aa; + int i; + int set = 0; + int rc = 0; + ENTRY; + + if (!lsm) { + CERROR("LOV requires striping ea\n"); + GOTO(out, rc = -EINVAL); + } + + if (lsm->lsm_magic != LOV_MAGIC) { + CERROR("LOV striping magic bad %#x != %#x\n", + lsm->lsm_magic, LOV_MAGIC); + GOTO(out, rc = -EINVAL); + } + + if (!export || !export->exp_obd) + GOTO(out, rc = -ENODEV); + + lov = &export->exp_obd->u.lov; + + OBD_ALLOC (obdos, lsm->lsm_stripe_count * sizeof (*obdos)); + if (obdos == NULL) + GOTO (out, rc = -ENOMEM); + + if (oa->o_valid & OBD_MD_FLHANDLE) + lfh = lov_handle2lfh(obdo_handle(oa)); + + CDEBUG(D_INFO, "objid "LPX64": %ux%u byte stripes\n", + lsm->lsm_object_id, lsm->lsm_stripe_count, lsm->lsm_stripe_size); + for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { + int err; + + if (lov->tgts[loi->loi_ost_idx].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + /* leaves obdos[i].obd_valid unset */ + continue; + } + + CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx " + "%u\n", oa->o_id, i, loi->loi_id, loi->loi_ost_idx); + /* create data objects with "parent" OA */ + memcpy(&obdos[i], oa, sizeof(obdos[i])); + obdos[i].o_id = loi->loi_id; + if (lfh) + memcpy(obdo_handle(&obdos[i]), lfh->lfh_och + i, + FD_OSTDATA_SIZE); + else + obdos[i].o_valid &= ~OBD_MD_FLHANDLE; - RETURN(set ? 
0 : -EIO); + err = obd_getattr_async (&lov->tgts[loi->loi_ost_idx].conn, + &obdos[i], NULL, rqset); + if (err) { + CERROR("error: getattr objid "LPX64" subobj " + LPX64" on OST idx %d: rc = %d\n", + oa->o_id, loi->loi_id, loi->loi_ost_idx, + err); + GOTO(out_obdos, rc = err); + } + set = 1; + } + if (!set) + GOTO (out_obdos, rc = -EIO); + + LASSERT (rqset->set_interpret == NULL); + rqset->set_interpret = lov_getattr_interpret; + LASSERT (sizeof (rqset->set_args) >= sizeof (*aa)); + aa = (struct lov_getattr_async_args *)&rqset->set_args; + aa->aa_lsm = lsm; + aa->aa_oa = oa; + aa->aa_stripe_oas = obdos; + GOTO (out, rc = 0); + + out_obdos: + OBD_FREE (obdos, lsm->lsm_stripe_count * sizeof (*obdos)); + out: + if (lfh != NULL) + lov_lfh_put(lfh); + class_export_put(export); + RETURN (rc); } static int lov_setattr(struct lustre_handle *conn, struct obdo *oa, @@ -766,17 +992,17 @@ static int lov_setattr(struct lustre_handle *conn, struct obdo *oa, if (!lsm) { CERROR("LOV requires striping ea\n"); - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } if (lsm->lsm_magic != LOV_MAGIC) { CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } if (!export || !export->exp_obd) - RETURN(-ENODEV); + GOTO(out, rc = -ENODEV); /* size changes should go through punch and not setattr */ LASSERT(!(oa->o_valid & OBD_MD_FLSIZE)); @@ -786,7 +1012,7 @@ static int lov_setattr(struct lustre_handle *conn, struct obdo *oa, tmp = obdo_alloc(); if (!tmp) - RETURN(-ENOMEM); + GOTO(out, rc = -ENOMEM); if (oa->o_valid & OBD_MD_FLHANDLE) lfh = lov_handle2lfh(obdo_handle(oa)); @@ -803,8 +1029,7 @@ static int lov_setattr(struct lustre_handle *conn, struct obdo *oa, obdo_cpy_md(tmp, oa, oa->o_valid); if (lfh) - memcpy(obdo_handle(tmp), - lfh->lfh_data + i * FD_OSTDATA_SIZE, + memcpy(obdo_handle(tmp), lfh->lfh_och + i, FD_OSTDATA_SIZE); else tmp->o_valid &= ~OBD_MD_FLHANDLE; @@ -828,45 +1053,50 @@ static int lov_setattr(struct lustre_handle 
*conn, struct obdo *oa, obdo_free(tmp); if (!set && !rc) rc = -EIO; - RETURN(rc); + if (lfh != NULL) + lov_lfh_put(lfh); + GOTO(out, rc); + out: + class_export_put(export); + return rc; } static int lov_open(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *lsm, struct obd_trans_info *oti) + struct lov_stripe_md *lsm, struct obd_trans_info *oti, + struct obd_client_handle *och) { struct obdo *tmp; /* on the heap here, on the stack in lov_close? */ struct obd_export *export = class_conn2export(conn); struct lov_obd *lov; struct lov_oinfo *loi; struct lov_file_handles *lfh = NULL; - struct lustre_handle *handle; - int set = 0; - int rc = 0, i; + int set = 0, rc = 0, i; ENTRY; + LASSERT(och != NULL); if (!lsm) { CERROR("LOV requires striping ea for opening\n"); - RETURN(-EINVAL); + GOTO(out_exp, rc = -EINVAL); } if (lsm->lsm_magic != LOV_MAGIC) { CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); - RETURN(-EINVAL); + GOTO(out_exp, rc = -EINVAL); } if (!export || !export->exp_obd) - RETURN(-ENODEV); + GOTO(out_exp, rc = -ENODEV); tmp = obdo_alloc(); if (!tmp) - RETURN(-ENOMEM); + GOTO(out_exp, rc = -ENOMEM); - PORTAL_SLAB_ALLOC(lfh, lov_file_cache, sizeof(*lfh)); - if (!lfh) + lfh = lov_lfh_new(); + if (lfh == NULL) GOTO(out_tmp, rc = -ENOMEM); - OBD_ALLOC(lfh->lfh_data, lsm->lsm_stripe_count * FD_OSTDATA_SIZE); - if (!lfh->lfh_data) + OBD_ALLOC(lfh->lfh_och, lsm->lsm_stripe_count * sizeof *och); + if (!lfh->lfh_och) GOTO(out_lfh, rc = -ENOMEM); lov = &export->exp_obd->u.lov; @@ -883,10 +1113,12 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa, tmp->o_id = loi->loi_id; rc = obd_open(&lov->tgts[loi->loi_ost_idx].conn, tmp, - NULL, NULL); + NULL, NULL, lfh->lfh_och + i); if (rc) { - if (!lov->tgts[loi->loi_ost_idx].active) + if (!lov->tgts[loi->loi_ost_idx].active) { + rc = 0; continue; + } CERROR("error: open objid "LPX64" subobj "LPX64 " on OST idx %d: rc = %d\n", oa->o_id, lsm->lsm_oinfo[i].loi_id, @@ -895,31 
+1127,26 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa, } lov_merge_attrs(oa, tmp, tmp->o_valid, lsm, i, &set); - - if (tmp->o_valid & OBD_MD_FLHANDLE) - memcpy(lfh->lfh_data + i * FD_OSTDATA_SIZE, - obdo_handle(tmp), FD_OSTDATA_SIZE); } - handle = obdo_handle(oa); - lfh->lfh_count = lsm->lsm_stripe_count; - get_random_bytes(&lfh->lfh_cookie, sizeof(lfh->lfh_cookie)); - - handle->addr = (__u64)(unsigned long)lfh; - handle->cookie = lfh->lfh_cookie; + och->och_fh.cookie = lfh->lfh_handle.h_cookie; + obdo_handle(oa)->cookie = lfh->lfh_handle.h_cookie; oa->o_valid |= OBD_MD_FLHANDLE; + + /* llfh refcount transfers to list */ spin_lock(&export->exp_lov_data.led_lock); list_add(&lfh->lfh_list, &export->exp_lov_data.led_open_head); spin_unlock(&export->exp_lov_data.led_lock); - if (!set && !rc) - rc = -EIO; -out_tmp: + GOTO(out_tmp, rc); + out_tmp: obdo_free(tmp); - RETURN(rc); + out_exp: + class_export_put(export); + return rc; -out_handles: + out_handles: for (i--, loi = &lsm->lsm_oinfo[i]; i >= 0; i--, loi--) { int err; @@ -928,8 +1155,7 @@ out_handles: memcpy(tmp, oa, sizeof(*tmp)); tmp->o_id = loi->loi_id; - memcpy(obdo_handle(tmp), lfh->lfh_data + i * FD_OSTDATA_SIZE, - FD_OSTDATA_SIZE); + memcpy(obdo_handle(tmp), lfh->lfh_och + i, FD_OSTDATA_SIZE); err = obd_close(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL, NULL); @@ -940,9 +1166,10 @@ out_handles: } } - OBD_FREE(lfh->lfh_data, lsm->lsm_stripe_count * FD_OSTDATA_SIZE); -out_lfh: - PORTAL_SLAB_FREE(lfh, lov_file_cache, sizeof(*lfh)); + OBD_FREE(lfh->lfh_och, lsm->lsm_stripe_count * FD_OSTDATA_SIZE); + out_lfh: + lov_lfh_destroy(lfh); + lov_lfh_put(lfh); goto out_tmp; } @@ -959,17 +1186,17 @@ static int lov_close(struct lustre_handle *conn, struct obdo *oa, if (!lsm) { CERROR("LOV requires striping ea\n"); - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } if (lsm->lsm_magic != LOV_MAGIC) { CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); - RETURN(-EINVAL); + GOTO(out, rc 
= -EINVAL); } if (!export || !export->exp_obd) - RETURN(-ENODEV); + GOTO(out, rc = -ENODEV); if (oa->o_valid & OBD_MD_FLHANDLE) lfh = lov_handle2lfh(obdo_handle(oa)); @@ -978,17 +1205,11 @@ static int lov_close(struct lustre_handle *conn, struct obdo *oa, for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { int err; - if (lov->tgts[loi->loi_ost_idx].active == 0) { - CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); - continue; - } - /* create data objects with "parent" OA */ memcpy(&tmp, oa, sizeof(tmp)); tmp.o_id = loi->loi_id; if (lfh) - memcpy(obdo_handle(&tmp), - lfh->lfh_data + i * FD_OSTDATA_SIZE, + memcpy(obdo_handle(&tmp), lfh->lfh_och + i, FD_OSTDATA_SIZE); else tmp.o_valid &= ~OBD_MD_FLHANDLE; @@ -1005,50 +1226,140 @@ static int lov_close(struct lustre_handle *conn, struct obdo *oa, rc = err; } } - if (lfh) { + if (lfh != NULL) { spin_lock(&export->exp_lov_data.led_lock); list_del(&lfh->lfh_list); spin_unlock(&export->exp_lov_data.led_lock); + lov_lfh_put(lfh); /* drop the reference owned by the list */ - OBD_FREE(lfh->lfh_data, lsm->lsm_stripe_count*FD_OSTDATA_SIZE); - PORTAL_SLAB_FREE(lfh, lov_file_cache, sizeof(*lfh)); + OBD_FREE(lfh->lfh_och, lsm->lsm_stripe_count * FD_OSTDATA_SIZE); + lov_lfh_destroy(lfh); + lov_lfh_put(lfh); /* balance handle2lfh above */ } - - RETURN(rc); + GOTO(out, rc); + out: + class_export_put(export); + return rc; } #ifndef log2 #define log2(n) ffz(~(n)) #endif -#warning FIXME: merge these two functions now that they are nearly the same - -/* compute ost offset in stripe "stripeno" corresponding to offset "lov_off" */ -static obd_off lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off, - int stripeno) +/* we have an offset in file backed by an lov and want to find out where + * that offset lands in our given stripe of the file. for the easy + * case where the offset is within the stripe, we just have to scale the + * offset down to make it relative to the stripe instead of the lov. 
+ * + * the harder case is what to do when the offset doesn't intersect the + * stripe. callers will want start offsets clamped ahead to the start + * of the nearest stripe in the file. end offsets similarly clamped to the + * nearest ending byte of a stripe in the file: + * + * all this function does is move offsets to the nearest region of the + * stripe, and it does its work "mod" the full length of all the stripes. + * consider a file with 3 stripes: + * + * S E + * --------------------------------------------------------------------- + * | 0 | 1 | 2 | 0 | 1 | 2 | + * --------------------------------------------------------------------- + * + * to find stripe 1's offsets for S and E, it divides by the full stripe + * width and does its math in the context of a single set of stripes: + * + * S E + * ----------------------------------- + * | 0 | 1 | 2 | + * ----------------------------------- + * + * it'll notice that E is outside stripe 1 and clamp it to the end of the + * stripe, then multiply it back out by lov_off to give the real offsets in + * the stripe: + * + * S E + * --------------------------------------------------------------------- + * | 1 | 1 | 1 | 1 | 1 | 1 | + * --------------------------------------------------------------------- + * + * it would have done similarly and pulled S forward to the start of a 1 + * stripe if, say, S had landed in a 0 stripe. + * + * this rounding isn't always correct. consider an E lov offset that lands + * on a 0 stripe, the "mod stripe width" math will pull it forward to the + * start of a 1 stripe, when in fact it wanted to be rounded back to the end + * of a previous 1 stripe. this logic is handled by callers and this is why: + * + * this function returns < 0 when the offset was "before" the stripe and + * was moved forward to the start of the stripe in question; 0 when it + * falls in the stripe and no shifting was done; > 0 when the offset + * was outside the stripe and was pulled back to its final byte. 
*/ +static int lov_stripe_offset(struct lov_stripe_md *lsm, obd_off lov_off, + int stripeno, obd_off *obd_off) { unsigned long ssize = lsm->lsm_stripe_size; unsigned long swidth = ssize * lsm->lsm_stripe_count; unsigned long stripe_off, this_stripe; + int ret = 0; - if (lov_off == OBD_OBJECT_EOF || lov_off == 0) - return lov_off; + if (lov_off == OBD_OBJECT_EOF) { + *obd_off = OBD_OBJECT_EOF; + return 0; + } /* do_div(a, b) returns a % b, and a = a / b */ stripe_off = do_div(lov_off, swidth); this_stripe = stripeno * ssize; - if (stripe_off <= this_stripe) + if (stripe_off < this_stripe) { stripe_off = 0; - else { + ret = -1; + } else { stripe_off -= this_stripe; - if (stripe_off > ssize) + if (stripe_off >= ssize) { stripe_off = ssize; + ret = 1; + } } + *obd_off = lov_off * ssize + stripe_off; + return ret; +} + +/* given an extent in an lov and a stripe, calculate the extent of the stripe + * that is contained within the lov extent. this returns true if the given + * stripe does intersect with the lov extent. */ +static int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno, + obd_off start, obd_off end, + obd_off *obd_start, obd_off *obd_end) +{ + int start_side, end_side; + + start_side = lov_stripe_offset(lsm, start, stripeno, obd_start); + end_side = lov_stripe_offset(lsm, end, stripeno, obd_end); + + CDEBUG(D_INODE, "["LPU64"->"LPU64"] -> [(%d) "LPU64"->"LPU64" (%d)]\n", + start, end, start_side, *obd_start, *obd_end, end_side); - return lov_off * ssize + stripe_off; + /* this stripe doesn't intersect the file extent when neither + * start nor the end intersected the stripe and obd_start and + * obd_end got rounded up to the same value. */ + if (start_side != 0 && end_side != 0 && *obd_start == *obd_end) + return 0; + + /* as mentioned in the lov_stripe_offset commentary, end + * might have been shifted in the wrong direction. This + * happens when an end offset is before the stripe when viewed + * through the "mod stripe size" math. 
we detect it being shifted + * in the wrong direction and touch it up. + * interestingly, this can't underflow since end must be > start + * if we passed through the previous check. + * (should we assert for that somewhere?) */ + if (end_side != 0) + (*obd_end)--; + + return 1; } /* compute which stripe number "lov_off" will be written into */ @@ -1063,7 +1374,6 @@ static int lov_stripe_number(struct lov_stripe_md *lsm, obd_off lov_off) return stripe_off / ssize; } - /* FIXME: maybe we'll just make one node the authoritative attribute node, then * we can send this 'punch' to just the authoritative node and the nodes * that the punch will affect. */ @@ -1081,36 +1391,39 @@ static int lov_punch(struct lustre_handle *conn, struct obdo *oa, if (!lsm) { CERROR("LOV requires striping ea\n"); - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } if (lsm->lsm_magic != LOV_MAGIC) { CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } if (!export || !export->exp_obd) - RETURN(-ENODEV); + GOTO(out, rc = -ENODEV); if (oa->o_valid & OBD_MD_FLHANDLE) lfh = lov_handle2lfh(obdo_handle(oa)); lov = &export->exp_obd->u.lov; for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { - obd_off starti = lov_stripe_offset(lsm, start, i); - obd_off endi = lov_stripe_offset(lsm, end, i); + obd_off starti, endi; int err; - if (starti == endi) + if (lov->tgts[loi->loi_ost_idx].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + continue; + } + + if (!lov_stripe_intersects(lsm, i, start, end, &starti, &endi)) continue; /* create data objects with "parent" OA */ memcpy(&tmp, oa, sizeof(tmp)); tmp.o_id = loi->loi_id; if (lfh) - memcpy(obdo_handle(&tmp), - lfh->lfh_data + i * FD_OSTDATA_SIZE, + memcpy(obdo_handle(&tmp), lfh->lfh_och + i, FD_OSTDATA_SIZE); else tmp.o_valid &= ~OBD_MD_FLHANDLE; @@ -1127,13 +1440,43 @@ static int lov_punch(struct lustre_handle *conn, struct obdo *oa, rc = err; } } - 
RETURN(rc); + if (lfh != NULL) + lov_lfh_put(lfh); + GOTO(out, rc); + out: + class_export_put(export); + return rc; +} + +static int lov_brw_check(struct lov_obd *lov, struct lov_stripe_md *lsm, + obd_count oa_bufs, struct brw_page *pga) +{ + int i; + + /* The caller just wants to know if there's a chance that this + * I/O can succeed */ + for (i = 0; i < oa_bufs; i++) { + int stripe = lov_stripe_number(lsm, pga[i].off); + int ost = lsm->lsm_oinfo[stripe].loi_ost_idx; + struct ldlm_extent ext, subext; + ext.start = pga[i].off; + ext.end = pga[i].off + pga[i].count; + + if (!lov_stripe_intersects(lsm, stripe, ext.start, ext.end, + &subext.start, &subext.end)) + continue; + + if (lov->tgts[ost].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", ost); + return -EIO; + } + } + return 0; } -static inline int lov_brw(int cmd, struct lustre_handle *conn, - struct lov_stripe_md *lsm, obd_count oa_bufs, - struct brw_page *pga, struct obd_brw_set *set, - struct obd_trans_info *oti) +static int lov_brw(int cmd, struct lustre_handle *conn, + struct lov_stripe_md *lsm, obd_count oa_bufs, + struct brw_page *pga, struct obd_trans_info *oti) { struct { int bufct; @@ -1151,20 +1494,25 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn, if (!lsm) { CERROR("LOV requires striping ea\n"); - RETURN(-EINVAL); + GOTO(out_exp, rc = -EINVAL); } if (lsm->lsm_magic != LOV_MAGIC) { CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); - RETURN(-EINVAL); + GOTO(out_exp, rc = -EINVAL); } lov = &export->exp_obd->u.lov; + if (cmd == OBD_BRW_CHECK) { + rc = lov_brw_check(lov, lsm, oa_bufs, pga); + GOTO(out_exp, rc); + } + OBD_ALLOC(stripeinfo, stripe_count * sizeof(*stripeinfo)); if (!stripeinfo) - GOTO(out_cbdata, rc = -ENOMEM); + GOTO(out_exp, rc = -ENOMEM); OBD_ALLOC(where, sizeof(*where) * oa_bufs); if (!where) @@ -1194,65 +1542,159 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn, shift = stripeinfo[which].index + stripeinfo[which].subcount; 
LASSERT(shift < oa_bufs); ioarr[shift] = pga[i]; - ioarr[shift].off = lov_stripe_offset(lsm, pga[i].off, which); + lov_stripe_offset(lsm, pga[i].off, which, &ioarr[shift].off); stripeinfo[which].subcount++; } for (i = 0, si = stripeinfo; i < stripe_count; i++, si++) { int shift = si->index; + if (si->bufct && lov->tgts[si->ost_idx].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", si->ost_idx); + GOTO(out_ioarr, rc = -EIO); + } + if (si->bufct) { LASSERT(shift < oa_bufs); rc = obd_brw(cmd, &lov->tgts[si->ost_idx].conn, &si->lsm, si->bufct, &ioarr[shift], - set, oti); + oti); if (rc) GOTO(out_ioarr, rc); } } - + GOTO(out_ioarr, rc); out_ioarr: OBD_FREE(ioarr, sizeof(*ioarr) * oa_bufs); out_where: OBD_FREE(where, sizeof(*where) * oa_bufs); out_sinfo: OBD_FREE(stripeinfo, stripe_count * sizeof(*stripeinfo)); - out_cbdata: - RETURN(rc); + out_exp: + class_export_put(export); + return rc; } -static struct lov_lock_handles *lov_newlockh(struct lov_stripe_md *lsm) +static int lov_brw_interpret (struct ptlrpc_request_set *set, + struct lov_brw_async_args *aa, int rc) { - struct lov_lock_handles *lov_lockh; - - OBD_ALLOC(lov_lockh, sizeof(*lov_lockh) + - sizeof(*lov_lockh->llh_handles) * lsm->lsm_stripe_count); - if (!lov_lockh) - return NULL; - - get_random_bytes(&lov_lockh->llh_cookie, sizeof(lov_lockh->llh_cookie)); + obd_count oa_bufs = aa->aa_oa_bufs; + struct brw_page *ioarr = aa->aa_ioarr; + ENTRY; - return lov_lockh; + OBD_FREE (ioarr, sizeof (*ioarr) * oa_bufs); + RETURN (rc); } -/* We are only ever passed local lock handles here, so we do not need to - * validate (and we can't really because these structs are variable sized - * and therefore alloced, and not from a private slab). - * - * We just check because we can... 
- */ -static struct lov_lock_handles *lov_h2lovlockh(struct lustre_handle *handle) +static int lov_brw_async(int cmd, struct lustre_handle *conn, + struct lov_stripe_md *lsm, obd_count oa_bufs, + struct brw_page *pga, struct ptlrpc_request_set *set, + struct obd_trans_info *oti) { - struct lov_lock_handles *lov_lockh = NULL; + struct { + int bufct; + int index; + int subcount; + struct lov_stripe_md lsm; + int ost_idx; + } *stripeinfo, *si, *si_last; + struct obd_export *export = class_conn2export(conn); + struct lov_obd *lov; + struct brw_page *ioarr; + struct lov_oinfo *loi; + struct lov_brw_async_args *aa; + int rc = 0, i, *where, stripe_count = lsm->lsm_stripe_count; + ENTRY; + + if (!lsm) { + CERROR("LOV requires striping ea\n"); + GOTO(out_exp, rc = -EINVAL); + } + + if (lsm->lsm_magic != LOV_MAGIC) { + CERROR("LOV striping magic bad %#x != %#x\n", + lsm->lsm_magic, LOV_MAGIC); + GOTO(out_exp, rc = -EINVAL); + } + + lov = &export->exp_obd->u.lov; + + if (cmd == OBD_BRW_CHECK) { + rc = lov_brw_check(lov, lsm, oa_bufs, pga); + GOTO(out_exp, rc); + } + + OBD_ALLOC(stripeinfo, stripe_count * sizeof(*stripeinfo)); + if (!stripeinfo) + GOTO(out_exp, rc = -ENOMEM); + + OBD_ALLOC(where, sizeof(*where) * oa_bufs); + if (!where) + GOTO(out_sinfo, rc = -ENOMEM); + + OBD_ALLOC(ioarr, sizeof(*ioarr) * oa_bufs); + if (!ioarr) + GOTO(out_where, rc = -ENOMEM); + + for (i = 0; i < oa_bufs; i++) { + where[i] = lov_stripe_number(lsm, pga[i].off); + stripeinfo[where[i]].bufct++; + } + + for (i = 0, loi = lsm->lsm_oinfo, si_last = si = stripeinfo; + i < stripe_count; i++, loi++, si_last = si, si++) { + if (i > 0) + si->index = si_last->index + si_last->bufct; + si->lsm.lsm_object_id = loi->loi_id; + si->ost_idx = loi->loi_ost_idx; + } + + for (i = 0; i < oa_bufs; i++) { + int which = where[i]; + int shift; + + shift = stripeinfo[which].index + stripeinfo[which].subcount; + LASSERT(shift < oa_bufs); + ioarr[shift] = pga[i]; + lov_stripe_offset(lsm, pga[i].off, which, 
&ioarr[shift].off); + stripeinfo[which].subcount++; + } - if (!handle || !handle->addr) - RETURN(NULL); + for (i = 0, si = stripeinfo; i < stripe_count; i++, si++) { + int shift = si->index; + + if (si->bufct == 0) + continue; - lov_lockh = (struct lov_lock_handles *)(unsigned long)(handle->addr); - if (lov_lockh->llh_cookie != handle->cookie) - RETURN(NULL); + if (lov->tgts[si->ost_idx].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", si->ost_idx); + GOTO(out_ioarr, rc = -EIO); + } - return lov_lockh; + LASSERT(shift < oa_bufs); + rc = obd_brw_async(cmd, &lov->tgts[si->ost_idx].conn, + &si->lsm, si->bufct, &ioarr[shift], + set, oti); + if (rc) + GOTO(out_ioarr, rc); + } + LASSERT (rc == 0); + LASSERT (set->set_interpret == NULL); + set->set_interpret = lov_brw_interpret; + LASSERT (sizeof (set->set_args) >= sizeof (struct lov_brw_async_args)); + aa = (struct lov_brw_async_args *)&set->set_args; + aa->aa_oa_bufs = oa_bufs; + aa->aa_ioarr = ioarr; + GOTO(out_where, rc); + out_ioarr: + OBD_FREE(ioarr, sizeof(*ioarr) * oa_bufs); + out_where: + OBD_FREE(where, sizeof(*where) * oa_bufs); + out_sinfo: + OBD_FREE(stripeinfo, stripe_count * sizeof(*stripeinfo)); + out_exp: + class_export_put(export); + return rc; } static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm, @@ -1267,35 +1709,33 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm, struct lov_obd *lov; struct lov_oinfo *loi; struct lov_stripe_md submd; - ldlm_error_t rc = ELDLM_LOCK_MATCHED, err; + ldlm_error_t rc; int i; ENTRY; if (!lsm) { CERROR("LOV requires striping ea\n"); - RETURN(-EINVAL); + GOTO(out_exp, rc = -EINVAL); } if (lsm->lsm_magic != LOV_MAGIC) { CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); - RETURN(-EINVAL); + GOTO(out_exp, rc = -EINVAL); } - /* we should never be asked to replay a lock. */ - + /* we should never be asked to replay a lock this way. 
*/ LASSERT((*flags & LDLM_FL_REPLAY) == 0); if (!export || !export->exp_obd) - RETURN(-ENODEV); + GOTO(out_exp, rc = -ENODEV); if (lsm->lsm_stripe_count > 1) { - lov_lockh = lov_newlockh(lsm); - if (!lov_lockh) - RETURN(-ENOMEM); + lov_lockh = lov_llh_new(lsm); + if (lov_lockh == NULL) + GOTO(out_exp, rc = -ENOMEM); - lockh->addr = (__u64)(unsigned long)lov_lockh; - lockh->cookie = lov_lockh->llh_cookie; + lockh->cookie = lov_lockh->llh_handle.h_cookie; lov_lockhp = lov_lockh->llh_handles; } else { lov_lockhp = lockh; @@ -1307,32 +1747,27 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm, struct ldlm_extent *extent = (struct ldlm_extent *)cookie; struct ldlm_extent sub_ext; + *flags = 0; + if (!lov_stripe_intersects(lsm, i, extent->start, extent->end, + &sub_ext.start, &sub_ext.end)) + continue; + if (lov->tgts[loi->loi_ost_idx].active == 0) { CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); continue; } - *flags = 0; - sub_ext.start = lov_stripe_offset(lsm, extent->start, i); - sub_ext.end = lov_stripe_offset(lsm, extent->end, i); - if (sub_ext.start == sub_ext.end /* || !active */) - continue; - /* XXX LOV STACKING: submd should be from the subobj */ submd.lsm_object_id = loi->loi_id; submd.lsm_stripe_count = 0; /* XXX submd is not fully initialized here */ *flags = 0; - err = obd_enqueue(&(lov->tgts[loi->loi_ost_idx].conn), &submd, + rc = obd_enqueue(&(lov->tgts[loi->loi_ost_idx].conn), &submd, parent_lock, type, &sub_ext, sizeof(sub_ext), mode, flags, cb, data, datalen, lov_lockhp); // XXX add a lock debug statement here - /* return _MATCHED only when all locks matched.. 
*/ - if (err == ELDLM_OK) { - rc = ELDLM_OK; - } else if (err != ELDLM_LOCK_MATCHED) { - rc = err; + if (rc != ELDLM_OK) { memset(lov_lockhp, 0, sizeof(*lov_lockhp)); if (lov->tgts[loi->loi_ost_idx].active) { CERROR("error: enqueue objid "LPX64" subobj " @@ -1343,15 +1778,16 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm, } } } - RETURN(rc); + if (lsm->lsm_stripe_count > 1) + lov_llh_put(lov_lockh); + GOTO(out_exp, rc = ELDLM_OK); -out_locks: + out_locks: while (loi--, lov_lockhp--, i-- > 0) { struct lov_stripe_md submd; int err; - if (lov_lockhp->cookie == 0 || - lov->tgts[loi->loi_ost_idx].active == 0) + if (lov_lockhp->cookie == 0) continue; /* XXX LOV STACKING: submd should be from the subobj */ @@ -1367,13 +1803,112 @@ out_locks: } if (lsm->lsm_stripe_count > 1) { - lov_lockh->llh_cookie = DEAD_HANDLE_MAGIC; - OBD_FREE(lov_lockh, sizeof(*lov_lockh) + - sizeof(*lov_lockh->llh_handles) * - lsm->lsm_stripe_count); + lov_llh_destroy(lov_lockh); + lov_llh_put(lov_lockh); + } + out_exp: + class_export_put(export); + RETURN(rc); +} + +static int lov_match(struct lustre_handle *conn, struct lov_stripe_md *lsm, + __u32 type, void *cookie, int cookielen, __u32 mode, + int *flags, struct lustre_handle *lockh) +{ + struct obd_export *export = class_conn2export(conn); + struct lov_lock_handles *lov_lockh = NULL; + struct lustre_handle *lov_lockhp; + struct lov_obd *lov; + struct lov_oinfo *loi; + struct lov_stripe_md submd; + ldlm_error_t rc = 0; + int i; + ENTRY; + + if (!lsm) { + CERROR("LOV requires striping ea\n"); + GOTO(out_exp, rc = -EINVAL); + } + + if (lsm->lsm_magic != LOV_MAGIC) { + CERROR("LOV striping magic bad %#x != %#x\n", + lsm->lsm_magic, LOV_MAGIC); + GOTO(out_exp, rc = -EINVAL); + } + + if (!export || !export->exp_obd) + GOTO(out_exp, rc = -ENODEV); + + if (lsm->lsm_stripe_count > 1) { + lov_lockh = lov_llh_new(lsm); + if (lov_lockh == NULL) + GOTO(out_exp, rc = -ENOMEM); + + lockh->cookie = 
lov_lockh->llh_handle.h_cookie; + lov_lockhp = lov_lockh->llh_handles; + } else { + lov_lockhp = lockh; } - lockh->cookie = DEAD_HANDLE_MAGIC; + lov = &export->exp_obd->u.lov; + for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; + i++, loi++, lov_lockhp++) { + struct ldlm_extent *extent = (struct ldlm_extent *)cookie; + struct ldlm_extent sub_ext; + int lov_flags; + + if (!lov_stripe_intersects(lsm, i, extent->start, extent->end, + &sub_ext.start, &sub_ext.end)) + continue; + + if (lov->tgts[loi->loi_ost_idx].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + rc = -EIO; + break; + } + + /* XXX LOV STACKING: submd should be from the subobj */ + submd.lsm_object_id = loi->loi_id; + submd.lsm_stripe_count = 0; + lov_flags = *flags; + /* XXX submd is not fully initialized here */ + rc = obd_match(&(lov->tgts[loi->loi_ost_idx].conn), &submd, + type, &sub_ext, sizeof(sub_ext), mode, + &lov_flags, lov_lockhp); + if (rc != 1) + break; + } + if (rc == 1) { + if (lsm->lsm_stripe_count > 1) + lov_llh_put(lov_lockh); + GOTO(out_exp, 1); + } + + while (loi--, lov_lockhp--, i-- > 0) { + struct lov_stripe_md submd; + int err; + + if (lov_lockhp->cookie == 0) + continue; + + /* XXX LOV STACKING: submd should be from the subobj */ + submd.lsm_object_id = loi->loi_id; + submd.lsm_stripe_count = 0; + err = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd, + mode, lov_lockhp); + if (err && lov->tgts[loi->loi_ost_idx].active) { + CERROR("error: cancelling objid "LPX64" on OST " + "idx %d after match failure: rc = %d\n", + loi->loi_id, loi->loi_ost_idx, err); + } + } + + if (lsm->lsm_stripe_count > 1) { + lov_llh_destroy(lov_lockh); + lov_llh_put(lov_lockh); + } + out_exp: + class_export_put(export); RETURN(rc); } @@ -1390,29 +1925,30 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm, if (!lsm) { CERROR("LOV requires striping ea\n"); - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } if (lsm->lsm_magic != LOV_MAGIC) { 
CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } if (!export || !export->exp_obd) - RETURN(-ENODEV); + GOTO(out, rc = -ENODEV); LASSERT(lockh); if (lsm->lsm_stripe_count > 1) { - lov_lockh = lov_h2lovlockh(lockh); + lov_lockh = lov_handle2llh(lockh); if (!lov_lockh) { CERROR("LOV: invalid lov lock handle %p\n", lockh); - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } lov_lockhp = lov_lockh->llh_handles; - } else + } else { lov_lockhp = lockh; + } lov = &export->exp_obd->u.lov; for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; @@ -1421,7 +1957,8 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm, int err; if (lov_lockhp->cookie == 0) { - CDEBUG(D_HA, "lov idx %d no lock?\n", loi->loi_ost_idx); + CDEBUG(D_HA, "lov idx %d subobj "LPX64" no lock?\n", + loi->loi_ost_idx, loi->loi_id); continue; } @@ -1442,19 +1979,18 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm, } } - if (lsm->lsm_stripe_count > 1) { - lov_lockh->llh_cookie = DEAD_HANDLE_MAGIC; - OBD_FREE(lov_lockh, sizeof(*lov_lockh) + - sizeof(*lov_lockh->llh_handles) * - lsm->lsm_stripe_count); - } - lockh->cookie = DEAD_HANDLE_MAGIC; - - RETURN(rc); + if (lsm->lsm_stripe_count > 1) + lov_llh_destroy(lov_lockh); + if (lov_lockh != NULL) + lov_llh_put(lov_lockh); + GOTO(out, rc); + out: + class_export_put(export); + return rc; } static int lov_cancel_unused(struct lustre_handle *conn, - struct lov_stripe_md *lsm, int flags) + struct lov_stripe_md *lsm, int flags, void *opaque) { struct obd_export *export = class_conn2export(conn); struct lov_obd *lov; @@ -1464,21 +2000,24 @@ static int lov_cancel_unused(struct lustre_handle *conn, if (!lsm) { CERROR("LOV requires striping ea for lock cancellation\n"); - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } if (!export || !export->exp_obd) - RETURN(-ENODEV); + GOTO(out, rc = -ENODEV); lov = &export->exp_obd->u.lov; for (i = 0,loi = 
lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { struct lov_stripe_md submd; int err; + if (lov->tgts[loi->loi_ost_idx].active == 0) + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); + submd.lsm_object_id = loi->loi_id; submd.lsm_stripe_count = 0; err = obd_cancel_unused(&lov->tgts[loi->loi_ost_idx].conn, - &submd, flags); + &submd, flags, opaque); if (err && lov->tgts[loi->loi_ost_idx].active) { CERROR("error: cancel unused objid "LPX64" subobj "LPX64 " on OST idx %d: rc = %d\n", lsm->lsm_object_id, @@ -1487,10 +2026,21 @@ static int lov_cancel_unused(struct lustre_handle *conn, rc = err; } } - - RETURN(rc); + GOTO(out, rc); + out: + class_export_put(export); + return rc; } +#define LOV_U64_MAX ((__u64)~0ULL) +#define LOV_SUM_MAX(tot, add) \ + do { \ + if ((tot) + (add) < (tot)) \ + (tot) = LOV_U64_MAX; \ + else \ + (tot) += (add); \ + } while(0) + static int lov_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) { struct obd_export *export = class_conn2export(conn); @@ -1502,7 +2052,7 @@ static int lov_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) ENTRY; if (!export || !export->exp_obd) - RETURN(-ENODEV); + GOTO(out, rc = -ENODEV); lov = &export->exp_obd->u.lov; @@ -1539,14 +2089,30 @@ static int lov_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) * if one of the OBDs has no more objects left) * - could be sum if we stripe whole objects * - could be average, just to give a nice number - * - we just pick first OST and hope it is enough - sfs->f_ffree += lov_sfs.f_ffree; + * + * To give a "reasonable" (if not wholly accurate) + * number, we divide the total number of free objects + * by expected stripe count (watch out for overflow). */ + LOV_SUM_MAX(osfs->os_files, lov_sfs.os_files); + LOV_SUM_MAX(osfs->os_ffree, lov_sfs.os_ffree); } } - if (!set && !rc) + if (set) { + __u32 expected_stripes = lov->desc.ld_default_stripe_count ? 
+ lov->desc.ld_default_stripe_count : + lov->desc.ld_active_tgt_count; + + if (osfs->os_files != LOV_U64_MAX) + do_div(osfs->os_files, expected_stripes); + if (osfs->os_ffree != LOV_U64_MAX) + do_div(osfs->os_ffree, expected_stripes); + } else if (!rc) rc = -EIO; - RETURN(rc); + GOTO(out, rc); + out: + class_export_put(export); + return rc; } static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, @@ -1601,7 +2167,7 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, rc = copy_to_user((void *)uarg, buf, len); if (rc) rc = -EFAULT; - OBD_FREE(buf, len); + obd_ioctl_freedata(buf, len); break; } case LL_IOC_LOV_SETSTRIPE: @@ -1639,6 +2205,49 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, RETURN(rc); } +static int lov_get_info(struct lustre_handle *conn, __u32 keylen, + void *key, __u32 *vallen, void *val) +{ + struct obd_device *obddev = class_conn2obd(conn); + struct lov_obd *lov = &obddev->u.lov; + int i; + ENTRY; + + if (!vallen || !val) + RETURN(-EFAULT); + + if (keylen > strlen("lock_to_stripe") && + strcmp(key, "lock_to_stripe") == 0) { + struct { + char name[16]; + struct ldlm_lock *lock; + struct lov_stripe_md *lsm; + } *data = key; + __u32 *stripe = val; + struct lov_oinfo *loi; + + if (*vallen < sizeof(*stripe)) + RETURN(-EFAULT); + *vallen = sizeof(*stripe); + + /* XXX This is another one of those bits that will need to + * change if we ever actually support nested LOVs. It uses + * the lock's connection to find out which stripe it is. 
*/ + for (i = 0, loi = data->lsm->lsm_oinfo; + i < data->lsm->lsm_stripe_count; + i++, loi++) { + if (lov->tgts[loi->loi_ost_idx].conn.cookie == + data->lock->l_connh->cookie) { + *stripe = i; + RETURN(0); + } + } + RETURN(-ENXIO); + } + + RETURN(-EINVAL); +} + struct obd_ops lov_obd_ops = { o_owner: THIS_MODULE, o_attach: lov_attach, @@ -1652,15 +2261,19 @@ struct obd_ops lov_obd_ops = { o_create: lov_create, o_destroy: lov_destroy, o_getattr: lov_getattr, + o_getattr_async: lov_getattr_async, o_setattr: lov_setattr, o_open: lov_open, o_close: lov_close, o_brw: lov_brw, + o_brw_async: lov_brw_async, o_punch: lov_punch, o_enqueue: lov_enqueue, + o_match: lov_match, o_cancel: lov_cancel, o_cancel_unused: lov_cancel_unused, - o_iocontrol: lov_iocontrol + o_iocontrol: lov_iocontrol, + o_get_info: lov_get_info }; int __init lov_init(void) @@ -1670,12 +2283,6 @@ int __init lov_init(void) printk(KERN_INFO "Lustre Logical Object Volume driver; " "info@clusterfs.com\n"); - lov_file_cache = kmem_cache_create("ll_lov_file_data", - sizeof(struct lov_file_handles), - 0, 0, NULL, NULL); - if (!lov_file_cache) - RETURN(-ENOMEM); - lprocfs_init_vars(&lvars); rc = class_register_type(&lov_obd_ops, lvars.module_vars, OBD_LOV_DEVICENAME); @@ -1684,8 +2291,6 @@ int __init lov_init(void) static void __exit lov_exit(void) { - if (kmem_cache_destroy(lov_file_cache)) - CERROR("couldn't free LOV open cache\n"); class_unregister_type(OBD_LOV_DEVICENAME); } diff --git a/lustre/lov/lov_pack.c b/lustre/lov/lov_pack.c index 463dd72..620dd5c 100644 --- a/lustre/lov/lov_pack.c +++ b/lustre/lov/lov_pack.c @@ -34,15 +34,6 @@ #include <linux/obd_class.h> #include <linux/obd_support.h> -/* lov_packdesc() is in mds/mds_lov.c */ -void lov_unpackdesc(struct lov_desc *ld) -{ - ld->ld_tgt_count = NTOH__u32(ld->ld_tgt_count); - ld->ld_default_stripe_count = HTON__u32(ld->ld_default_stripe_count); - ld->ld_default_stripe_size = HTON__u32(ld->ld_default_stripe_size); - ld->ld_pattern = 
HTON__u32(ld->ld_pattern); -} - void lov_dump_lmm(int level, struct lov_mds_md *lmm) { struct lov_object_id *loi; @@ -65,7 +56,8 @@ do { \ LASSERT(test); /* so we know what assertion failed */ \ } while(0) -/* Pack LOV object metadata for shipment to the MDS. +/* Pack LOV object metadata for disk storage. It is packed in LE byte + * order and is opaque to the networking layer. * * XXX In the future, this will be enhanced to get the EA size from the * underlying OSC device(s) to get their EA sizes so we can stack @@ -108,8 +100,7 @@ int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp, RETURN(lmm_size); if (*lmmp && !lsm) { - /* endianness */ - ost_count = ((*lmmp)->lmm_ost_count); + ost_count = le16_to_cpu ((*lmmp)->lmm_ost_count); /* stored via cpu_to_le16 */ OBD_FREE(*lmmp, lov_mds_md_size(ost_count)); *lmmp = NULL; RETURN(0); @@ -122,25 +113,24 @@ int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp, } lmm = *lmmp; + lmm->lmm_magic = cpu_to_le32 (LOV_MAGIC); + lmm->lmm_ost_count = cpu_to_le16 (ost_count); - lmm->lmm_stripe_count = (stripe_count); if (!lsm) RETURN(lmm_size); - /* XXX endianness */ - lmm->lmm_magic = (lsm->lsm_magic); - lmm->lmm_object_id = (lsm->lsm_object_id); - LASSERT(lsm->lsm_object_id); - lmm->lmm_stripe_size = (lsm->lsm_stripe_size); - lmm->lmm_stripe_offset = (lsm->lsm_stripe_offset); - lmm->lmm_ost_count = (ost_count); + lmm->lmm_object_id = cpu_to_le64 (lsm->lsm_object_id); + lmm->lmm_stripe_count = cpu_to_le16 (stripe_count); + lmm->lmm_stripe_size = cpu_to_le32 (lsm->lsm_stripe_size); + lmm->lmm_stripe_offset = cpu_to_le32 (lsm->lsm_stripe_offset); /* Only fill in the object ids which we are actually using. * Assumes lmm_objects is otherwise zero-filled.
*/ for (i = 0, loi = lsm->lsm_oinfo; i < stripe_count; i++, loi++) { /* XXX call down to osc_packmd() to do the packing */ - LASSERT(loi->loi_id); - lmm->lmm_objects[loi->loi_ost_idx].l_object_id = (loi->loi_id); + LASSERT (loi->loi_id); + lmm->lmm_objects[loi->loi_ost_idx].l_object_id = + cpu_to_le64 (loi->loi_id); } RETURN(lmm_size); @@ -156,14 +146,17 @@ static int lov_get_stripecnt(struct lov_obd *lov, int stripe_count) return stripe_count; } +/* Unpack LOV object metadata from disk storage. It is packed in LE byte + * order and is opaque to the networking layer. + */ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp, - struct lov_mds_md *lmm) + struct lov_mds_md *lmm, int lmm_bytes) { struct obd_device *obd = class_conn2obd(conn); struct lov_obd *lov = &obd->u.lov; struct lov_stripe_md *lsm; struct lov_oinfo *loi; - int ost_count; + int ost_count = 0; int ost_offset = 0; int stripe_count; int lsm_size; @@ -171,14 +164,31 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp, ENTRY; if (lmm) { - /* endianness */ - if (lmm->lmm_magic != LOV_MAGIC) { - CERROR("bad wire LOV MAGIC: %#08x != %#08x\n", - lmm->lmm_magic, LOV_MAGIC); + if (lmm_bytes < sizeof (*lmm)) { + CERROR("lov_mds_md too small: %d, need %d\n", + lmm_bytes, (int)sizeof(*lmm)); + RETURN(-EINVAL); + } + if (le32_to_cpu (lmm->lmm_magic) != LOV_MAGIC) { + CERROR("bad disk LOV MAGIC: %#08x != %#08x\n", + le32_to_cpu (lmm->lmm_magic), LOV_MAGIC); RETURN(-EINVAL); } - stripe_count = (lmm->lmm_stripe_count); - LASSERT(stripe_count); + + ost_count = le16_to_cpu (lmm->lmm_ost_count); + stripe_count = le16_to_cpu (lmm->lmm_stripe_count); + + if (ost_count == 0 || stripe_count == 0) { + CERROR ("zero ost %d or stripe %d count\n", + ost_count, stripe_count); + RETURN (-EINVAL); + } + + if (lmm_bytes < lov_mds_md_size (ost_count)) { + CERROR ("lov_mds_md too small: %d, need %d\n", + lmm_bytes, lov_mds_md_size (ost_count)); + RETURN (-EINVAL); + } } else stripe_count = 
lov_get_stripecnt(lov, 0); @@ -202,18 +212,16 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp, } lsm = *lsmp; - + lsm->lsm_magic = LOV_MAGIC; lsm->lsm_stripe_count = stripe_count; + lsm->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count; + if (!lmm) RETURN(lsm_size); - /* XXX endianness */ - ost_offset = lsm->lsm_stripe_offset = (lmm->lmm_stripe_offset); - lsm->lsm_magic = (lmm->lmm_magic); - lsm->lsm_object_id = (lmm->lmm_object_id); - lsm->lsm_stripe_size = (lmm->lmm_stripe_size); - - ost_count = (lmm->lmm_ost_count); + lsm->lsm_object_id = le64_to_cpu (lmm->lmm_object_id); + lsm->lsm_stripe_size = le32_to_cpu (lmm->lmm_stripe_size); + ost_offset = lsm->lsm_stripe_offset = le32_to_cpu (lmm->lmm_stripe_offset); LMM_ASSERT(lsm->lsm_object_id); LMM_ASSERT(ost_count); @@ -226,7 +234,7 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp, LMM_ASSERT(loi - lsm->lsm_oinfo < stripe_count); /* XXX LOV STACKING call down to osc_unpackmd() */ - loi->loi_id = (lmm->lmm_objects[ost_offset].l_object_id); + loi->loi_id = le64_to_cpu (lmm->lmm_objects[ost_offset].l_object_id); loi->loi_ost_idx = ost_offset; loi++; } @@ -258,8 +266,10 @@ int lov_setstripe(struct lustre_handle *conn, struct lov_stripe_md **lsmp, if (rc) RETURN(-EFAULT); + /* Bug 1185 FIXME: struct lov_mds_md is little-endian everywhere else */ + if (lmm.lmm_magic != LOV_MAGIC) { - CERROR("bad wire LOV MAGIC: %#08x != %#08x\n", + CERROR("bad userland LOV MAGIC: %#08x != %#08x\n", lmm.lmm_magic, LOV_MAGIC); RETURN(-EINVAL); } @@ -299,6 +309,7 @@ int lov_setstripe(struct lustre_handle *conn, struct lov_stripe_md **lsmp, lsm->lsm_stripe_count = stripe_count; lsm->lsm_stripe_offset = lmm.lmm_stripe_offset; lsm->lsm_stripe_size = lmm.lmm_stripe_size; + lsm->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES * stripe_count; *lsmp = lsm; @@ -314,10 +325,8 @@ int lov_setstripe(struct lustre_handle *conn, struct lov_stripe_md **lsmp, int lov_getstripe(struct lustre_handle *conn, 
struct lov_stripe_md *lsm, struct lov_mds_md *lmmu) { - struct obd_device *obd = class_conn2obd(conn); - struct lov_obd *lov = &obd->u.lov; struct lov_mds_md lmm, *lmmk = NULL; - int ost_count, rc, lmm_size; + int rc, lmm_size; ENTRY; if (!lsm) @@ -330,23 +339,20 @@ int lov_getstripe(struct lustre_handle *conn, struct lov_stripe_md *lsm, if (lmm.lmm_magic != LOV_MAGIC) RETURN(-EINVAL); - ost_count = lov->desc.ld_tgt_count; - - /* XXX we _could_ check if indices > user lmm_ost_count are zero */ - if (lmm.lmm_ost_count < ost_count) - RETURN(-EOVERFLOW); - rc = lov_packmd(conn, &lmmk, lsm); if (rc < 0) RETURN(rc); - + /* Bug 1185 FIXME: convert lmmk to big-endian before copy to userspace */ lmm_size = rc; rc = 0; - if (lmm_size && copy_to_user(lmmu, lmmk, lmm_size)) + /* User wasn't expecting this many OST entries */ + if (lmm.lmm_ost_count < lmmk->lmm_ost_count) + rc = -EOVERFLOW; + else if (copy_to_user(lmmu, lmmk, lmm_size)) rc = -EFAULT; - obd_free_wiremd(conn, &lmmk); + obd_free_diskmd (conn, &lmmk); RETURN(rc); } diff --git a/lustre/mdc/Makefile.am b/lustre/mdc/Makefile.am index 1d9c099..f4f0218 100644 --- a/lustre/mdc/Makefile.am +++ b/lustre/mdc/Makefile.am @@ -5,16 +5,15 @@ DEFS= +if LIBLUSTRE +lib_LIBRARIES = libmdc.a +libmdc_a_SOURCES = mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h +else MODULE = mdc modulefs_DATA = mdc.o EXTRA_PROGRAMS = mdc -LINX= mds_updates.c client.c -mdc_SOURCES = mdc_request.c mdc_reint.c lproc_mdc.c $(LINX) - -mds_updates.c: - test -e mds_updates.c || ln -sf $(top_srcdir)/lib/mds_updates.c . -client.c: - test -e client.c || ln -sf $(top_srcdir)/lib/client.c . 
+mdc_SOURCES = mdc_request.c mdc_reint.c lproc_mdc.c mdc_lib.c mdc_internal.h +endif include $(top_srcdir)/Rules diff --git a/lustre/mdc/mdc_internal.h b/lustre/mdc/mdc_internal.h new file mode 100644 index 0000000..e39a0aa --- /dev/null +++ b/lustre/mdc/mdc_internal.h @@ -0,0 +1,24 @@ +void mds_pack_req_body(struct ptlrpc_request *); +void mds_pack_rep_body(struct ptlrpc_request *); +void mds_readdir_pack(struct ptlrpc_request *req, __u64 offset, __u32 size, + obd_id ino, int type); +void mds_getattr_pack(struct ptlrpc_request *req, int valid, int offset, + int flags, struct mdc_op_data *data); +void mds_setattr_pack(struct ptlrpc_request *req, + struct mdc_op_data *data, + struct iattr *iattr, void *ea, int ealen); +void mds_create_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, + __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time, + const void *data, int datalen); +void mds_open_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, + __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time, + __u32 flags, const void *data, int datalen); +void mds_unlink_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data); +void mds_link_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data); +void mds_rename_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data, + const char *old, int oldlen, const char *new, int newlen); diff --git a/lustre/mdc/mdc_lib.c b/lustre/mdc/mdc_lib.c new file mode 100644 index 0000000..1396f8d --- /dev/null +++ b/lustre/mdc/mdc_lib.c @@ -0,0 +1,282 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. 
+ * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_MDS +#ifndef __KERNEL__ +# include <liblustre.h> +#endif +#include <linux/lustre_idl.h> +#include <linux/lustre_net.h> +#include <linux/lustre_mds.h> +#include <linux/lustre_lite.h> + +void mds_readdir_pack(struct ptlrpc_request *req, __u64 offset, __u32 size, + obd_id ino, int type, __u64 xid) +{ + struct mds_body *b; + + b = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*b)); + b->fsuid = current->fsuid; + b->fsgid = current->fsgid; + b->capability = current->cap_effective; + b->fid1.id = ino; + b->fid1.f_type = type; + b->size = offset; /* !! */ + b->suppgid = -1; + b->blocks = xid; /* !! */ + b->nlink = size; /* !! 
*/ +} + +static void mds_pack_body(struct mds_body *b) +{ + LASSERT (b != NULL); + + b->fsuid = current->fsuid; + b->fsgid = current->fsgid; + b->capability = current->cap_effective; +} + +void mds_pack_req_body(struct ptlrpc_request *req) +{ + struct mds_body *b = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*b)); + mds_pack_body(b); +} + +/* packing of MDS records */ +void mds_create_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, + __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time, + const void *data, int datalen) +{ + struct mds_rec_create *rec; + char *tmp; + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); + + rec->cr_opcode = REINT_CREATE; + rec->cr_fsuid = current->fsuid; + rec->cr_fsgid = current->fsgid; + rec->cr_cap = current->cap_effective; + ll_ino2fid(&rec->cr_fid, op_data->ino1, op_data->gen1, op_data->typ1); + memset(&rec->cr_replayfid, 0, sizeof(rec->cr_replayfid)); + rec->cr_mode = mode; + rec->cr_rdev = rdev; + rec->cr_uid = uid; + rec->cr_gid = gid; + rec->cr_time = time; + if (in_group_p(op_data->gid1)) + rec->cr_suppgid = op_data->gid1; + else + rec->cr_suppgid = -1; + + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, op_data->namelen + 1); + LOGL0(op_data->name, op_data->namelen, tmp); + + if (data) { + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, datalen); + memcpy (tmp, data, datalen); + } +} +/* packing of MDS records */ +void mds_open_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, + __u32 mode, __u64 rdev, __u32 uid, __u32 gid, __u64 time, + __u32 flags, + const void *data, int datalen) +{ + struct mds_rec_create *rec; + char *tmp; + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); + + /* XXX do something about time, uid, gid */ + rec->cr_opcode = REINT_OPEN; + rec->cr_fsuid = current->fsuid; + rec->cr_fsgid = current->fsgid; + rec->cr_cap = current->cap_effective; + ll_ino2fid(&rec->cr_fid, op_data->ino1, + op_data->gen1, op_data->typ1); + 
memset(&rec->cr_replayfid, 0, sizeof(rec->cr_replayfid)); + rec->cr_mode = mode; + rec->cr_flags = flags; + rec->cr_rdev = rdev; + rec->cr_uid = uid; + rec->cr_gid = gid; + rec->cr_time = time; + if (in_group_p(op_data->gid1)) + rec->cr_suppgid = op_data->gid1; + else + rec->cr_suppgid = -1; + + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, op_data->namelen + 1); + LOGL0(op_data->name, op_data->namelen, tmp); + + if (data) { + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, datalen); + memcpy (tmp, data, datalen); + } +} +void mds_setattr_pack(struct ptlrpc_request *req, + struct mdc_op_data *data, + struct iattr *iattr, void *ea, int ealen) +{ + struct mds_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, 0, + sizeof (*rec)); + rec->sa_opcode = REINT_SETATTR; + rec->sa_fsuid = current->fsuid; + rec->sa_fsgid = current->fsgid; + rec->sa_cap = current->cap_effective; + ll_ino2fid(&rec->sa_fid, data->ino1, data->gen1, data->typ1); + + if (iattr) { + rec->sa_valid = iattr->ia_valid; + rec->sa_mode = iattr->ia_mode; + rec->sa_uid = iattr->ia_uid; + rec->sa_gid = iattr->ia_gid; + rec->sa_size = iattr->ia_size; + rec->sa_atime = LTIME_S(iattr->ia_atime); + rec->sa_mtime = LTIME_S(iattr->ia_mtime); + rec->sa_ctime = LTIME_S(iattr->ia_ctime); + rec->sa_attr_flags = iattr->ia_attr_flags; + + if ((iattr->ia_valid & ATTR_GID) && in_group_p(iattr->ia_gid)) + rec->sa_suppgid = iattr->ia_gid; + else if ((iattr->ia_valid & ATTR_MODE) && + in_group_p(data->gid1)) + rec->sa_suppgid = data->gid1; + else + rec->sa_suppgid = -1; + } + + if (ealen != 0) + memcpy(lustre_msg_buf(req->rq_reqmsg, 1, ealen), ea, ealen); +} + +void mds_unlink_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data) +{ + struct mds_rec_unlink *rec; + char *tmp; + + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); + LASSERT (rec != NULL); + + rec->ul_opcode = REINT_UNLINK; + rec->ul_fsuid = current->fsuid; + rec->ul_fsgid = current->fsgid; + rec->ul_cap = current->cap_effective; 
+ rec->ul_mode = data->mode; + if (in_group_p(data->gid1)) + rec->ul_suppgid = data->gid1; + else + rec->ul_suppgid = -1; + ll_ino2fid(&rec->ul_fid1, data->ino1, data->gen1, data->typ1); + if (data->ino2) + ll_ino2fid(&rec->ul_fid2, data->ino2, data->gen2, data->typ2); + + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, data->namelen + 1); + LASSERT (tmp != NULL); + LOGL0(data->name, data->namelen, tmp); +} + +void mds_link_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data) +{ + struct mds_rec_link *rec; + char *tmp; + + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); + + rec->lk_opcode = REINT_LINK; + rec->lk_fsuid = current->fsuid; + rec->lk_fsgid = current->fsgid; + rec->lk_cap = current->cap_effective; + if (in_group_p(data->gid1)) + rec->lk_suppgid1 = data->gid1; + else + rec->lk_suppgid1 = -1; + if (in_group_p(data->gid2)) + rec->lk_suppgid2 = data->gid2; + else + rec->lk_suppgid2 = -1; + ll_ino2fid(&rec->lk_fid1, data->ino1, data->gen1, data->typ1); + ll_ino2fid(&rec->lk_fid2, data->ino2, data->gen2, data->typ2); + + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, data->namelen + 1); + LOGL0(data->name, data->namelen, tmp); +} + +void mds_rename_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data, + const char *old, int oldlen, const char *new, int newlen) +{ + struct mds_rec_rename *rec; + char *tmp; + + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); + + /* XXX do something about time, uid, gid */ + rec->rn_opcode = REINT_RENAME; + rec->rn_fsuid = current->fsuid; + rec->rn_fsgid = current->fsgid; + rec->rn_cap = current->cap_effective; + if (in_group_p(data->gid1)) + rec->rn_suppgid1 = data->gid1; + else + rec->rn_suppgid1 = -1; + if (in_group_p(data->gid2)) + rec->rn_suppgid2 = data->gid2; + else + rec->rn_suppgid2 = -1; + ll_ino2fid(&rec->rn_fid1, data->ino1, data->gen1, data->typ1); + ll_ino2fid(&rec->rn_fid2, data->ino2, data->gen2, data->typ2); + + tmp = 
lustre_msg_buf(req->rq_reqmsg, offset + 1, oldlen + 1); + LOGL0(old, oldlen, tmp); + + if (new) { + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, newlen + 1); + LOGL0(new, newlen, tmp); + } +} + +void mds_getattr_pack(struct ptlrpc_request *req, int valid, int offset, + int flags, struct mdc_op_data *data) +{ + struct mds_body *b; + b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*b)); + + b->fsuid = current->fsuid; + b->fsgid = current->fsgid; + b->capability = current->cap_effective; + b->valid = valid; + b->flags = flags; + if (in_group_p(data->gid1)) + b->suppgid = data->gid1; + else + b->suppgid = -1; + + ll_ino2fid(&b->fid1, data->ino1, data->gen1, data->typ1); + if (data->name) { + char *tmp; + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, + data->namelen + 1); + LOGL0(data->name, data->namelen, tmp); + } +} diff --git a/lustre/mdc/mdc_reint.c b/lustre/mdc/mdc_reint.c index 3553a45..68d7f0d 100644 --- a/lustre/mdc/mdc_reint.c +++ b/lustre/mdc/mdc_reint.c @@ -20,22 +20,27 @@ */ #define EXPORT_SYMTAB - -#include <linux/config.h> -#include <linux/module.h> -#include <linux/kernel.h> - #define DEBUG_SUBSYSTEM S_MDC +#ifdef __KERNEL__ +# include <linux/config.h> +# include <linux/module.h> +# include <linux/kernel.h> +#else +# include <liblustre.h> +#endif + #include <linux/obd_class.h> #include <linux/lustre_mds.h> +#include "mdc_internal.h" /* mdc_setattr does its own semaphore handling */ static int mdc_reint(struct ptlrpc_request *request, int level) { int rc; - __u32 *opcodeptr = lustre_msg_buf(request->rq_reqmsg, 0); + __u32 *opcodeptr; + opcodeptr = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*opcodeptr)); request->rq_level = level; if (!(*opcodeptr == REINT_SETATTR)) @@ -44,12 +49,8 @@ static int mdc_reint(struct ptlrpc_request *request, int level) if (!(*opcodeptr == REINT_SETATTR)) mdc_put_rpc_lock(&mdc_rpc_lock, NULL); - if (rc) { + if (rc) CDEBUG(D_INFO, "error in handling %d\n", rc); - } else { - /* For future resend/replays. 
*/ - *opcodeptr |= REINT_REPLAYING; - } return rc; } @@ -59,7 +60,8 @@ static int mdc_reint(struct ptlrpc_request *request, int level) * If it is called with iattr->ia_valid & ATTR_FROM_OPEN, then it is a * magic open-path setattr that should take the setattr semaphore and * go to the setattr portal. */ -int mdc_setattr(struct lustre_handle *conn, struct inode *inode, +int mdc_setattr(struct lustre_handle *conn, + struct mdc_op_data *data, struct iattr *iattr, void *ea, int ealen, struct ptlrpc_request **request) { @@ -85,7 +87,7 @@ int mdc_setattr(struct lustre_handle *conn, struct inode *inode, } else rpc_lock = &mdc_rpc_lock; - mds_setattr_pack(req, inode, iattr, ea, ealen); + mds_setattr_pack(req, data, iattr, ea, ealen); size[0] = sizeof(struct mds_body); req->rq_replen = lustre_msg_size(1, size); @@ -101,15 +103,17 @@ int mdc_setattr(struct lustre_handle *conn, struct inode *inode, RETURN(rc); } -int mdc_create(struct lustre_handle *conn, struct inode *dir, - const char *name, int namelen, const void *data, int datalen, +int mdc_create(struct lustre_handle *conn, + struct mdc_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, __u32 gid, __u64 time, __u64 rdev, struct ptlrpc_request **request) { struct ptlrpc_request *req; - int rc, size[3] = {sizeof(struct mds_rec_create), namelen + 1, 0}; + int rc, size[3] = {sizeof(struct mds_rec_create), + op_data->namelen + 1, 0}; int level, bufcount = 2; - ENTRY; +// ENTRY; if (data && datalen) { size[bufcount] = datalen; @@ -119,12 +123,14 @@ int mdc_create(struct lustre_handle *conn, struct inode *dir, req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_REINT, bufcount, size, NULL); if (!req) - RETURN(-ENOMEM); + return -ENOMEM; +// RETURN(-ENOMEM); /* mds_create_pack fills msg->bufs[1] with name * and msg->bufs[2] with tgt, for symlinks or lov MD data */ - mds_create_pack(req, 0, dir, mode, rdev, uid, gid, time, - name, namelen, data, datalen); + mds_create_pack(req, 0, op_data, + mode, rdev, uid, 
gid, time, + data, datalen); size[0] = sizeof(struct mds_body); req->rq_replen = lustre_msg_size(1, size); @@ -135,7 +141,6 @@ int mdc_create(struct lustre_handle *conn, struct inode *dir, /* Resend if we were told to. */ if (rc == -ERESTARTSYS) { level = LUSTRE_CONN_RECOVD; - req->rq_flags = 0; goto resend; } @@ -143,16 +148,17 @@ int mdc_create(struct lustre_handle *conn, struct inode *dir, mdc_store_inode_generation(req, 0, 0); *request = req; - RETURN(rc); + return rc; +// RETURN(rc); } -int mdc_unlink(struct lustre_handle *conn, struct inode *dir, - struct inode *child, __u32 mode, const char *name, int namelen, +int mdc_unlink(struct lustre_handle *conn, + struct mdc_op_data *data, struct ptlrpc_request **request) { struct obd_device *obddev = class_conn2obd(conn); struct ptlrpc_request *req = *request; - int rc, size[2] = {sizeof(struct mds_rec_unlink), namelen + 1}; + int rc, size[2] = {sizeof(struct mds_rec_unlink), data->namelen + 1}; ENTRY; LASSERT(req == NULL); @@ -167,7 +173,7 @@ int mdc_unlink(struct lustre_handle *conn, struct inode *dir, size[1] = obddev->u.cli.cl_max_mds_easize; req->rq_replen = lustre_msg_size(2, size); - mds_unlink_pack(req, 0, dir, child, mode, name, namelen); + mds_unlink_pack(req, 0, data); rc = mdc_reint(req, LUSTRE_CONN_FULL); if (rc == -ERESTARTSYS) @@ -176,11 +182,11 @@ int mdc_unlink(struct lustre_handle *conn, struct inode *dir, } int mdc_link(struct lustre_handle *conn, - struct inode *src, struct inode *dir, const char *name, - int namelen, struct ptlrpc_request **request) + struct mdc_op_data *data, + struct ptlrpc_request **request) { struct ptlrpc_request *req; - int rc, size[2] = {sizeof(struct mds_rec_link), namelen + 1}; + int rc, size[2] = {sizeof(struct mds_rec_link), data->namelen + 1}; ENTRY; req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_REINT, 2, size, @@ -188,7 +194,7 @@ int mdc_link(struct lustre_handle *conn, if (!req) RETURN(-ENOMEM); - mds_link_pack(req, 0, src, dir, name, namelen); + 
mds_link_pack(req, 0, data); size[0] = sizeof(struct mds_body); req->rq_replen = lustre_msg_size(1, size); @@ -202,8 +208,9 @@ int mdc_link(struct lustre_handle *conn, } int mdc_rename(struct lustre_handle *conn, - struct inode *src, struct inode *tgt, const char *old, - int oldlen, const char *new, int newlen, + struct mdc_op_data *data, + const char *old, int oldlen, + const char *new, int newlen, struct ptlrpc_request **request) { struct ptlrpc_request *req; @@ -216,7 +223,7 @@ int mdc_rename(struct lustre_handle *conn, if (!req) RETURN(-ENOMEM); - mds_rename_pack(req, 0, src, tgt, old, oldlen, new, newlen); + mds_rename_pack(req, 0, data, old, oldlen, new, newlen); size[0] = sizeof(struct mds_body); req->rq_replen = lustre_msg_size(1, size); diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index 68075f5..dfcd7af 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -22,14 +22,21 @@ #define EXPORT_SYMTAB #define DEBUG_SUBSYSTEM S_MDC -#include <linux/module.h> -#include <linux/pagemap.h> -#include <linux/miscdevice.h> +#ifdef __KERNEL__ +# include <linux/module.h> +# include <linux/pagemap.h> +# include <linux/miscdevice.h> +# include <linux/init.h> +#else +# include <liblustre.h> +# include <linux/obd_class.h> +#endif + #include <linux/lustre_mds.h> #include <linux/lustre_lite.h> #include <linux/lustre_dlm.h> -#include <linux/init.h> #include <linux/lprocfs_status.h> +#include "mdc_internal.h" #define REQUEST_MINOR 244 @@ -51,19 +58,22 @@ static int send_getstatus(struct obd_import *imp, struct ll_fid *rootfid, if (!req) GOTO(out, rc = -ENOMEM); - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); req->rq_level = level; req->rq_replen = lustre_msg_size(1, &size); mds_pack_req_body(req); req->rq_reqmsg->flags |= msg_flags; - mdc_get_rpc_lock(&mdc_rpc_lock, NULL); rc = ptlrpc_queue_wait(req); - mdc_put_rpc_lock(&mdc_rpc_lock, NULL); if (!rc) { - body = 
lustre_msg_buf(req->rq_repmsg, 0); - mds_unpack_body(body); + body = lustre_swab_repbuf (req, 0, sizeof (*body), + lustre_swab_mds_body); + if (body == NULL) { + CERROR ("Can't extract mds_body\n"); + GOTO (out, rc = -EPROTO); + } + memcpy(rootfid, &body->fid1, sizeof(*rootfid)); CDEBUG(D_NET, "root ino="LPU64", last_committed="LPU64 @@ -90,85 +100,158 @@ int mdc_getlovinfo(struct obd_device *obd, struct lustre_handle *mdc_connh, { struct ptlrpc_request *req; struct mds_status_req *streq; + struct lov_desc *desc; + struct obd_uuid *uuids; int rc, size[2] = {sizeof(*streq)}; + int i; ENTRY; req = ptlrpc_prep_req(class_conn2cliimp(mdc_connh), MDS_GETLOVINFO, 1, size, NULL); if (!req) - GOTO(out, rc = -ENOMEM); + RETURN (-ENOMEM); *request = req; - streq = lustre_msg_buf(req->rq_reqmsg, 0); - streq->flags = HTON__u32(MDS_STATUS_LOV); - streq->repbuf = HTON__u32(8192); + streq = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*streq)); + streq->flags = MDS_STATUS_LOV; + streq->repbuf = LOV_MAX_UUID_BUFFER_SIZE; /* prepare for reply */ req->rq_level = LUSTRE_CONN_CON; - size[0] = 512; - size[1] = 8192; + size[0] = sizeof (*desc); + size[1] = LOV_MAX_UUID_BUFFER_SIZE; req->rq_replen = lustre_msg_size(2, size); + mdc_get_rpc_lock(&mdc_rpc_lock, NULL); rc = ptlrpc_queue_wait(req); mdc_put_rpc_lock(&mdc_rpc_lock, NULL); - out: - RETURN(rc); + if (rc != 0) { + CERROR ("rcp failed\n"); + GOTO (failed, rc); + } + + desc = lustre_swab_repbuf (req, 0, sizeof (*desc), + lustre_swab_lov_desc); + if (desc == NULL) { + CERROR ("Can't unpack lov_desc\n"); + GOTO (failed, rc = -EPROTO); + } + + LASSERT_REPSWAB (req, 1); + /* array of uuids byte-sex insensitive; just verify they are all + * there and terminated */ + uuids = lustre_msg_buf (req->rq_repmsg, 1, + desc->ld_tgt_count * sizeof (*uuids)); + if (uuids == NULL) { + CERROR ("Can't unpack %d uuids\n", desc->ld_tgt_count); + GOTO (failed, rc = -EPROTO); + } + + for (i = 0; i < desc->ld_tgt_count; i++) { + int uid_len = strnlen 
(uuids[i].uuid, sizeof (uuids[i].uuid)); + + if (uid_len == sizeof (uuids[i].uuid)) { + CERROR ("Unterminated uuid %d:%*s\n", + i, (int)sizeof (uuids[i].uuid), uuids[i].uuid); + GOTO (failed, rc = -EPROTO); + } + } + RETURN(0); + + failed: + ptlrpc_req_finished (req); + RETURN (rc); } -int mdc_getattr(struct lustre_handle *conn, - obd_id ino, int type, unsigned long valid, unsigned int ea_size, +int mdc_getattr_common (struct lustre_handle *conn, + unsigned int ea_size, struct ptlrpc_request *req) +{ + struct mds_body *body; + void *eadata; + int rc; + int size[2] = {sizeof(*body), 0}; + int bufcount = 1; + ENTRY; + + /* request message already built */ + + if (ea_size != 0) { + size[bufcount++] = ea_size; + CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n", + ea_size); + } + req->rq_replen = lustre_msg_size(bufcount, size); + + mdc_get_rpc_lock(&mdc_rpc_lock, NULL); + rc = ptlrpc_queue_wait(req); + mdc_put_rpc_lock(&mdc_rpc_lock, NULL); + if (rc != 0) + RETURN (rc); + + body = lustre_swab_repbuf (req, 0, sizeof (*body), + lustre_swab_mds_body); + if (body == NULL) { + CERROR ("Can't unpack mds_body\n"); + RETURN (-EPROTO); + } + + CDEBUG(D_NET, "mode: %o\n", body->mode); + + LASSERT_REPSWAB (req, 1); + if (body->eadatasize != 0) { + /* reply indicates presence of eadata; check it's there... */ + eadata = lustre_msg_buf (req->rq_repmsg, 1, body->eadatasize); + if (eadata == NULL) { + CERROR ("Missing/short eadata\n"); + RETURN (-EPROTO); + } + } + + RETURN (0); +} + +int mdc_getattr(struct lustre_handle *conn, struct ll_fid *fid, + unsigned long valid, unsigned int ea_size, struct ptlrpc_request **request) { struct ptlrpc_request *req; struct mds_body *body; - int rc, size[2] = {sizeof(*body), 0}, bufcount = 1; + int size = sizeof(*body); + int rc; ENTRY; /* XXX do we need to make another request here? We just did a getattr * to do the lookup in the first place. 
*/ - req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_GETATTR, 1, size, + req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_GETATTR, 1, &size, NULL); if (!req) GOTO(out, rc = -ENOMEM); - body = lustre_msg_buf(req->rq_reqmsg, 0); - ll_ino2fid(&body->fid1, ino, 0, type); + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); + memcpy(&body->fid1, fid, sizeof(*fid)); body->valid = valid; - - if (ea_size) { - size[bufcount] = ea_size; - bufcount++; - body->size = ea_size; - CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n", - ea_size); - } - req->rq_replen = lustre_msg_size(bufcount, size); + body->eadatasize = ea_size; mds_pack_req_body(req); - mdc_get_rpc_lock(&mdc_rpc_lock, NULL); - rc = ptlrpc_queue_wait(req); - mdc_put_rpc_lock(&mdc_rpc_lock, NULL); - if (!rc) { - body = lustre_msg_buf(req->rq_repmsg, 0); - mds_unpack_body(body); - CDEBUG(D_NET, "mode: %o\n", body->mode); + rc = mdc_getattr_common (conn, ea_size, req); + if (rc != 0) { + ptlrpc_req_finished (req); + req = NULL; } - - GOTO(out, rc); out: *request = req; - return rc; + RETURN (rc); } -int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent, +int mdc_getattr_name(struct lustre_handle *conn, struct ll_fid *fid, char *filename, int namelen, unsigned long valid, unsigned int ea_size, struct ptlrpc_request **request) { struct ptlrpc_request *req; struct mds_body *body; - int rc, size[2] = {sizeof(*body), namelen}, bufcount = 1; + int rc, size[2] = {sizeof(*body), namelen}; ENTRY; req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_GETATTR_NAME, 2, @@ -176,32 +259,20 @@ int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent, if (!req) GOTO(out, rc = -ENOMEM); - body = lustre_msg_buf(req->rq_reqmsg, 0); - ll_inode2fid(&body->fid1, parent); + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); + memcpy(&body->fid1, fid, sizeof(*fid)); body->valid = valid; - memcpy(lustre_msg_buf(req->rq_reqmsg, 1), filename, namelen); - - if (ea_size) { - size[1] = 
ea_size; - bufcount++; - body->size = ea_size; - CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n", - ea_size); - valid |= OBD_MD_FLEASIZE; - } - - req->rq_replen = lustre_msg_size(bufcount, size); + body->eadatasize = ea_size; mds_pack_req_body(req); - mdc_get_rpc_lock(&mdc_rpc_lock, NULL); - rc = ptlrpc_queue_wait(req); - mdc_put_rpc_lock(&mdc_rpc_lock, NULL); - if (!rc) { - body = lustre_msg_buf(req->rq_repmsg, 0); - mds_unpack_body(body); - } + LASSERT (strnlen (filename, namelen) == namelen - 1); + memcpy(lustre_msg_buf(req->rq_reqmsg, 1, namelen), filename, namelen); - EXIT; + rc = mdc_getattr_common (conn, ea_size, req); + if (rc != 0) { + ptlrpc_req_finished (req); + req = NULL; + } out: *request = req; return rc; @@ -211,143 +282,104 @@ int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent, void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff, int repoff) { - struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, reqoff); - struct mds_body *body = lustre_msg_buf(req->rq_repmsg, repoff); - + struct mds_rec_create *rec = + lustre_msg_buf(req->rq_reqmsg, reqoff, sizeof (*rec)); + struct mds_body *body = + lustre_msg_buf(req->rq_repmsg, repoff, sizeof (*body)); + + LASSERT (rec != NULL); + LASSERT (body != NULL); + memcpy(&rec->cr_replayfid, &body->fid1, sizeof rec->cr_replayfid); DEBUG_REQ(D_HA, req, "storing generation %x for ino "LPD64, rec->cr_replayfid.generation, rec->cr_replayfid.id); } -static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, - void *data, int flag) -{ - int rc; - struct lustre_handle lockh; - ENTRY; - - - switch (flag) { - case LDLM_CB_BLOCKING: - ldlm_lock2handle(lock, &lockh); - rc = ldlm_cli_cancel(&lockh); - if (rc < 0) { - CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc); - RETURN(rc); - } - break; - case LDLM_CB_CANCELING: { - /* Invalidate all dentries associated with this inode */ - struct inode *inode = lock->l_data; - - LASSERT(data != NULL); - - /* XXX 
what tells us that 'data' is a valid inode at all? - * we should probably validate the lock handle first? - */ - inode = igrab(inode); - - if (inode == NULL) /* inode->i_state & I_FREEING */ - break; - - if (S_ISDIR(inode->i_mode)) { - CDEBUG(D_INODE, "invalidating inode %lu\n", - inode->i_ino); - - ll_invalidate_inode_pages(inode); - } - - if (inode->i_sb->s_root && - inode != inode->i_sb->s_root->d_inode) - d_unhash_aliases(inode); - - iput(inode); - break; - } - default: - LBUG(); - } - - RETURN(0); -} - /* We always reserve enough space in the reply packet for a stripe MD, because * we don't know in advance the file type. * * XXX we could get that from ext2_dir_entry_2 file_type */ -int mdc_enqueue(struct lustre_handle *conn, int lock_type, - struct lookup_intent *it, int lock_mode, struct inode *dir, - struct dentry *de, struct lustre_handle *lockh, - char *tgt, int tgtlen, void *data, int datalen) +int mdc_enqueue(struct lustre_handle *conn, + int lock_type, + struct lookup_intent *it, + int lock_mode, + struct mdc_op_data *data, + struct lustre_handle *lockh, + char *tgt, + int tgtlen, + ldlm_completion_callback cb_completion, + ldlm_blocking_callback cb_blocking, + void *cb_data) { struct ptlrpc_request *req; struct obd_device *obddev = class_conn2obd(conn); struct ldlm_res_id res_id = - { .name = {dir->i_ino, dir->i_generation} }; + { .name = {data->ino1, data->gen1} }; int size[6] = {sizeof(struct ldlm_request), sizeof(struct ldlm_intent)}; int rc, flags = LDLM_FL_HAS_INTENT; int repsize[3] = {sizeof(struct ldlm_reply), sizeof(struct mds_body), obddev->u.cli.cl_max_mds_easize}; - struct mdc_unlink_data *d = data; struct ldlm_reply *dlm_rep; struct ldlm_intent *lit; struct ldlm_request *lockreq; + void *eadata; + unsigned long irqflags; + int reply_buffers = 0; ENTRY; - LDLM_DEBUG_NOLOCK("mdsintent %s parent dir %lu", - ldlm_it2str(it->it_op), dir->i_ino); +// LDLM_DEBUG_NOLOCK("mdsintent=%s,name=%s,dir=%lu", +// ldlm_it2str(it->it_op), it_name, 
it_inode->i_ino); if (it->it_op & IT_OPEN) { it->it_mode |= S_IFREG; it->it_mode &= ~current->fs->umask; size[2] = sizeof(struct mds_rec_create); - size[3] = de->d_name.len + 1; + size[3] = data->namelen + 1; req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 4, size, NULL); if (!req) RETURN(-ENOMEM); - req->rq_flags |= PTL_RPC_FL_REPLAY; + spin_lock_irqsave (&req->rq_lock, irqflags); + req->rq_replay = 1; + spin_unlock_irqrestore (&req->rq_lock, irqflags); /* pack the intent */ - lit = lustre_msg_buf(req->rq_reqmsg, 1); - lit->opc = NTOH__u64((__u64)it->it_op); + lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit)); + lit->opc = (__u64)it->it_op; /* pack the intended request */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - mds_open_pack(req, 2, dir, it->it_mode, 0, current->fsuid, - current->fsgid, CURRENT_TIME, it->it_flags, - de->d_name.name, de->d_name.len, tgt, tgtlen); -#else - mds_open_pack(req, 2, dir, it->it_mode, 0, current->fsuid, - current->fsgid, CURRENT_TIME.tv_sec, it->it_flags, - de->d_name.name, de->d_name.len, tgt, tgtlen); -#endif + mds_open_pack(req, 2, data, it->it_mode, 0, + current->fsuid, current->fsgid, + LTIME_S(CURRENT_TIME), it->it_flags, + tgt, tgtlen); + /* get ready for the reply */ + reply_buffers = 3; req->rq_replen = lustre_msg_size(3, repsize); } else if (it->it_op & IT_UNLINK) { size[2] = sizeof(struct mds_rec_unlink); - size[3] = d->unl_len + 1; + size[3] = data->namelen + 1; req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 4, size, NULL); if (!req) RETURN(-ENOMEM); /* pack the intent */ - lit = lustre_msg_buf(req->rq_reqmsg, 1); - lit->opc = NTOH__u64((__u64)it->it_op); + lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit)); + lit->opc = (__u64)it->it_op; /* pack the intended request */ - mds_unlink_pack(req, 2, d->unl_dir, - d->unl_de, d->unl_mode, - d->unl_name, d->unl_len); + mds_unlink_pack(req, 2, data); + /* get ready for the reply */ + reply_buffers = 3; req->rq_replen = lustre_msg_size(3, 
repsize); } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { int valid = OBD_MD_FLNOTOBD | OBD_MD_FLEASIZE; size[2] = sizeof(struct mds_body); - size[3] = de->d_name.len + 1; + size[3] = data->namelen + 1; req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 4, size, NULL); @@ -355,13 +387,13 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, RETURN(-ENOMEM); /* pack the intent */ - lit = lustre_msg_buf(req->rq_reqmsg, 1); - lit->opc = NTOH__u64((__u64)it->it_op); + lit = lustre_msg_buf(req->rq_reqmsg, 1, sizeof (*lit)); + lit->opc = (__u64)it->it_op; /* pack the intended request */ - mds_getattr_pack(req, valid, 2, it->it_flags, dir, - de->d_name.name, de->d_name.len); + mds_getattr_pack(req, valid, 2, it->it_flags, data); /* get ready for the reply */ + reply_buffers = 3; req->rq_replen = lustre_msg_size(3, repsize); } else if (it->it_op == IT_READDIR) { req = ptlrpc_prep_req(class_conn2cliimp(conn), LDLM_ENQUEUE, 1, @@ -370,6 +402,7 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, RETURN(-ENOMEM); /* get ready for the reply */ + reply_buffers = 1; req->rq_replen = lustre_msg_size(1, repsize); } else { LBUG(); @@ -379,20 +412,13 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, mdc_get_rpc_lock(&mdc_rpc_lock, it); rc = ldlm_cli_enqueue(conn, req, obddev->obd_namespace, NULL, res_id, lock_type, NULL, 0, lock_mode, &flags, - ldlm_completion_ast, mdc_blocking_ast, dir, NULL, - lockh); + cb_completion, cb_blocking, cb_data, lockh); mdc_put_rpc_lock(&mdc_rpc_lock, it); - /* If we successfully created, mark the request so that replay will - * do the right thing */ - if (req->rq_transno) { - struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, 2); - rec->cr_opcode |= REINT_REPLAYING; - } /* Similarly, if we're going to replay this request, we don't want to * actually get a lock, just perform the intent. 
*/ - if (req->rq_transno || (req->rq_flags & PTL_RPC_FL_REPLAY)) { - lockreq = lustre_msg_buf(req->rq_reqmsg, 0); + if (req->rq_transno || req->rq_replay) { + lockreq = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*lockreq)); lockreq->lock_flags |= LDLM_FL_INTENT_ONLY; } @@ -403,6 +429,8 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, memset(lockh, 0, sizeof(*lockh)); } else if (rc != 0) { CERROR("ldlm_cli_enqueue: %d\n", rc); + LASSERT (rc < 0); + ptlrpc_req_finished(req); RETURN(rc); } else { /* rc = 0 */ struct ldlm_lock *lock = ldlm_handle2lock(lockh); @@ -432,47 +460,57 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, LDLM_LOCK_PUT(lock); } - dlm_rep = lustre_msg_buf(req->rq_repmsg, 0); + dlm_rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*dlm_rep)); + LASSERT (dlm_rep != NULL); /* checked by ldlm_cli_enqueue() */ + LASSERT_REPSWABBED (req, 0); /* swabbed by ldlm_cli_enqueue() */ + it->it_disposition = (int) dlm_rep->lock_policy_res1; it->it_status = (int) dlm_rep->lock_policy_res2; it->it_lock_mode = lock_mode; it->it_data = req; - RETURN(rc); -} - -void mdc_lock_set_inode(struct lustre_handle *lockh, struct inode *inode) -{ - struct ldlm_lock *lock = ldlm_handle2lock(lockh); - ENTRY; + /* We know what to expect, so we do any byte flipping required here */ + LASSERT (reply_buffers == 3 || reply_buffers == 1); + if (reply_buffers == 3) { + struct mds_body *body; - LASSERT(lock != NULL); - lock->l_data = inode; - LDLM_LOCK_PUT(lock); - EXIT; -} + body = lustre_swab_repbuf (req, 1, sizeof (*body), + lustre_swab_mds_body); + if (body == NULL) { + CERROR ("Can't swab mds_body\n"); + RETURN (-EPROTO); + } -int mdc_cancel_unused(struct lustre_handle *conn, struct inode *inode, - int flags) -{ - struct ldlm_res_id res_id = - { .name = {inode->i_ino, inode->i_generation} }; - struct obd_device *obddev = class_conn2obd(conn); - ENTRY; - RETURN(ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags)); + if ((body->valid & OBD_MD_FLEASIZE) 
!= 0) { + /* The eadata is opaque; just check that it is + * there. Eventually, obd_unpackmd() will check + * the contents */ + eadata = lustre_swab_repbuf (req, 2, body->eadatasize, + NULL); + if (eadata == NULL) { + CERROR ("Missing/short eadata\n"); + RETURN (-EPROTO); + } + } + } + + RETURN(rc); } static void mdc_replay_open(struct ptlrpc_request *req) { - struct lustre_handle old, *file_fh = req->rq_replay_data; + struct obd_client_handle *och = req->rq_replay_data; + struct lustre_handle old, *file_fh = &och->och_fh; struct list_head *tmp; - struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 0); + struct mds_body *body; - mds_unpack_body(body); + body = lustre_swab_repbuf (req, 1, sizeof (*body), + lustre_swab_mds_body); + LASSERT (body != NULL); + memcpy(&old, file_fh, sizeof(old)); - CDEBUG(D_HA, "updating from "LPD64"/"LPD64" to "LPD64"/"LPD64"\n", - file_fh->addr, file_fh->cookie, body->handle.addr, - body->handle.cookie); + CDEBUG(D_HA, "updating handle from "LPD64" to "LPD64"\n", + file_fh->cookie, body->handle.cookie); memcpy(file_fh, &body->handle, sizeof(body->handle)); /* A few frames up, ptlrpc_replay holds the lock, so this is safe. 
*/ @@ -480,7 +518,7 @@ static void mdc_replay_open(struct ptlrpc_request *req) req = list_entry(tmp, struct ptlrpc_request, rq_list); if (req->rq_reqmsg->opc != MDS_CLOSE) continue; - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); if (memcmp(&body->handle, &old, sizeof(old))) continue; @@ -489,15 +527,23 @@ static void mdc_replay_open(struct ptlrpc_request *req) } } -void mdc_set_open_replay_data(struct ll_file_data *fd) +void mdc_set_open_replay_data(struct obd_client_handle *och) { - struct ptlrpc_request *req = fd->fd_req; - struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, 2); - struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1); + struct ptlrpc_request *req = och->och_req; + struct mds_rec_create *rec = + lustre_msg_buf(req->rq_reqmsg, 2, sizeof (*rec)); + struct mds_body *body = + lustre_msg_buf(req->rq_repmsg, 1, sizeof (*body)); + + LASSERT (rec != NULL); + /* outgoing messages always in my byte order */ + LASSERT (body != NULL); + /* incoming message in my byte order (it's been swabbed) */ + LASSERT_REPSWABBED (req, 1); memcpy(&rec->cr_replayfid, &body->fid1, sizeof rec->cr_replayfid); - fd->fd_req->rq_replay_cb = mdc_replay_open; - fd->fd_req->rq_replay_data = &fd->fd_mdshandle; + req->rq_replay_cb = mdc_replay_open; + req->rq_replay_data = och; } int mdc_close(struct lustre_handle *conn, obd_id ino, int type, @@ -513,7 +559,7 @@ int mdc_close(struct lustre_handle *conn, obd_id ino, int type, if (!req) GOTO(out, rc = -ENOMEM); - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); ll_ino2fid(&body->fid1, ino, 0, type); memcpy(&body->handle, fh, sizeof(body->handle)); @@ -530,72 +576,83 @@ int mdc_close(struct lustre_handle *conn, obd_id ino, int type, } int mdc_readpage(struct lustre_handle *conn, obd_id ino, int type, __u64 offset, - char *addr, struct ptlrpc_request **request) + struct page *page, struct ptlrpc_request **request) { 
struct obd_import *imp = class_conn2cliimp(conn); - struct ptlrpc_connection *connection = - client_conn2cli(conn)->cl_import.imp_connection; struct ptlrpc_request *req = NULL; struct ptlrpc_bulk_desc *desc = NULL; - struct ptlrpc_bulk_page *bulk = NULL; struct mds_body *body; int rc, size = sizeof(*body); ENTRY; CDEBUG(D_INODE, "inode: %ld\n", (long)ino); - desc = ptlrpc_prep_bulk(connection); - if (desc == NULL) - GOTO(out, rc = -ENOMEM); - req = ptlrpc_prep_req(imp, MDS_READPAGE, 1, &size, NULL); if (!req) - GOTO(out2, rc = -ENOMEM); - + GOTO(out, rc = -ENOMEM); /* XXX FIXME bug 249 */ req->rq_request_portal = MDS_READPAGE_PORTAL; - bulk = ptlrpc_prep_bulk_page(desc); - if (bulk == NULL) - GOTO(out2, rc = -ENOMEM); - - bulk->bp_xid = ptlrpc_next_xid(); - bulk->bp_buflen = PAGE_CACHE_SIZE; - bulk->bp_buf = addr; - - desc->bd_ptl_ev_hdlr = NULL; - desc->bd_portal = MDS_BULK_PORTAL; - - rc = ptlrpc_register_bulk_put(desc); - if (rc) { - CERROR("couldn't setup bulk sink: error %d.\n", rc); - GOTO(out2, rc); + desc = ptlrpc_prep_bulk_imp (req, BULK_PUT_SINK, MDS_BULK_PORTAL); + if (desc == NULL) { + GOTO(out, rc = -ENOMEM); } + /* NB req now owns desc and will free it when it gets freed */ - mds_readdir_pack(req, offset, ino, type, bulk->bp_xid); + rc = ptlrpc_prep_bulk_page(desc, page, 0, PAGE_CACHE_SIZE); + if (rc != 0) + GOTO(out, rc); + + mds_readdir_pack(req, offset, PAGE_CACHE_SIZE, ino, type); req->rq_replen = lustre_msg_size(1, &size); rc = ptlrpc_queue_wait(req); - if (rc) { - ptlrpc_abort_bulk(desc); - GOTO(out2, rc); - } else { - body = lustre_msg_buf(req->rq_repmsg, 0); - mds_unpack_body(body); + + if (rc == 0) { + LASSERT (desc->bd_page_count == 1); + body = lustre_swab_repbuf (req, 0, sizeof (*body), + lustre_swab_mds_body); + if (body == NULL) { + CERROR ("Can't unpack mds_body\n"); + GOTO (out, rc = -EPROTO); + } } EXIT; - out2: - ptlrpc_bulk_decref(desc); out: *request = req; return rc; } +static int mdc_iocontrol(unsigned int cmd, struct 
lustre_handle *conn, int len, + void *karg, void *uarg) +{ + struct obd_device *obddev = class_conn2obd(conn); + struct obd_ioctl_data *data = karg; + struct obd_import *imp = obddev->u.cli.cl_import; + ENTRY; + + switch (cmd) { + case OBD_IOC_CLIENT_RECOVER: + RETURN(ptlrpc_recover_import(imp, data->ioc_inlbuf1)); + case IOC_OSC_SET_ACTIVE: + if (data->ioc_offset) { + CERROR("%s: can't reactivate MDC\n", + obddev->obd_uuid.uuid); + RETURN(-ENOTTY); + } + RETURN(ptlrpc_set_import_active(imp, 0)); + default: + CERROR("osc_ioctl(): unrecognised ioctl %#x\n", cmd); + RETURN(-ENOTTY); + } +} + static int mdc_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) { struct ptlrpc_request *req; - int rc, size = sizeof(*osfs); + struct obd_statfs *msfs; + int rc, size = sizeof(*msfs); ENTRY; req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_STATFS, 0, NULL, @@ -612,8 +669,14 @@ static int mdc_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) if (rc) GOTO(out, rc); - obd_statfs_unpack(osfs, lustre_msg_buf(req->rq_repmsg, 0)); - + msfs = lustre_swab_repbuf (req, 0, sizeof (*msfs), + lustre_swab_obd_statfs); + if (msfs == NULL) { + CERROR ("Can't unpack obd_statfs\n"); + GOTO (out, rc = -EPROTO); + } + + memcpy (osfs, msfs, sizeof (*msfs)); EXIT; out: ptlrpc_req_finished(req); @@ -634,122 +697,19 @@ static int mdc_detach(struct obd_device *dev) return lprocfs_obd_detach(dev); } -/* Send a mostly-dummy GETSTATUS request and indicate that we're done replay. 
*/ -static int signal_completed_replay(struct obd_import *imp) -{ - struct ll_fid fid; - - return send_getstatus(imp, &fid, LUSTRE_CONN_RECOVD, MSG_LAST_REPLAY); -} - -static int mdc_recover(struct obd_import *imp, int phase) -{ - int rc; - unsigned long flags; - struct ptlrpc_request *req; - struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; - ENTRY; - - switch(phase) { - case PTLRPC_RECOVD_PHASE_PREPARE: - ldlm_cli_cancel_unused(ns, NULL, LDLM_FL_LOCAL_ONLY); - RETURN(0); - - case PTLRPC_RECOVD_PHASE_NOTCONN: - ldlm_namespace_cleanup(ns, 1); - ptlrpc_abort_inflight(imp, 0); - /* FALL THROUGH */ - case PTLRPC_RECOVD_PHASE_RECOVER: - reconnect: - rc = ptlrpc_reconnect_import(imp, MDS_CONNECT, &req); - - flags = req->rq_repmsg - ? lustre_msg_get_op_flags(req->rq_repmsg) - : 0; - - if (rc == -EBUSY && (flags & MSG_CONNECT_RECOVERING)) - CERROR("reconnect denied by recovery; should retry\n"); - - if (rc) { - if (phase != PTLRPC_RECOVD_PHASE_NOTCONN) { - CERROR("can't reconnect, invalidating\n"); - ldlm_namespace_cleanup(ns, 1); - ptlrpc_abort_inflight(imp, 0); - } - ptlrpc_req_finished(req); - RETURN(rc); - } - - if (flags & MSG_CONNECT_RECOVERING) { - /* Replay if they want it. */ - DEBUG_REQ(D_HA, req, "MDS wants replay"); - rc = ptlrpc_replay(imp); - if (rc) - GOTO(check_rc, rc); - - rc = ldlm_replay_locks(imp); - if (rc) - GOTO(check_rc, rc); - - rc = signal_completed_replay(imp); - if (rc) - GOTO(check_rc, rc); - } else if (flags & MSG_CONNECT_RECONNECT) { - DEBUG_REQ(D_HA, req, "reconnecting to MDS"); - /* Nothing else to do here. */ - } else { - DEBUG_REQ(D_HA, req, "evicted: invalidating"); - /* Otherwise, clean everything up. 
*/ - ldlm_namespace_cleanup(ns, 1); - ptlrpc_abort_inflight(imp, 0); - } - - ptlrpc_req_finished(req); - spin_lock_irqsave(&imp->imp_lock, flags); - imp->imp_level = LUSTRE_CONN_FULL; - spin_unlock_irqrestore(&imp->imp_lock, flags); - - ptlrpc_wake_delayed(imp); - - rc = ptlrpc_resend(imp); - if (rc) - GOTO(check_rc, rc); - - RETURN(0); - check_rc: - /* If we get disconnected in the middle, recovery has probably - * failed. Reconnect and find out. - */ - if (rc == -ENOTCONN) - goto reconnect; - RETURN(rc); - - default: - RETURN(-EINVAL); - } -} - -static int mdc_connect(struct lustre_handle *conn, struct obd_device *obd, - struct obd_uuid *cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) -{ - struct obd_import *imp = &obd->u.cli.cl_import; - imp->imp_recover = mdc_recover; - return client_obd_connect(conn, obd, cluuid, recovd, recover); -} - struct obd_ops mdc_obd_ops = { o_owner: THIS_MODULE, o_attach: mdc_attach, o_detach: mdc_detach, o_setup: client_obd_setup, o_cleanup: client_obd_cleanup, - o_connect: mdc_connect, - o_disconnect: client_obd_disconnect, + o_connect: client_import_connect, + o_disconnect: client_import_disconnect, + o_iocontrol: mdc_iocontrol, o_statfs: mdc_statfs }; -static int __init ptlrpc_request_init(void) +int __init mdc_init(void) { struct lprocfs_static_vars lvars; mdc_init_rpc_lock(&mdc_rpc_lock); @@ -759,11 +719,12 @@ static int __init ptlrpc_request_init(void) LUSTRE_MDC_NAME); } -static void __exit ptlrpc_request_exit(void) +static void __exit mdc_exit(void) { class_unregister_type(LUSTRE_MDC_NAME); } +#ifdef __KERNEL__ MODULE_AUTHOR("Cluster File Systems, Inc. 
<info@clusterfs.com>"); MODULE_DESCRIPTION("Lustre Metadata Client"); MODULE_LICENSE("GPL"); @@ -771,7 +732,6 @@ MODULE_LICENSE("GPL"); EXPORT_SYMBOL(mdc_getstatus); EXPORT_SYMBOL(mdc_getlovinfo); EXPORT_SYMBOL(mdc_enqueue); -EXPORT_SYMBOL(mdc_cancel_unused); EXPORT_SYMBOL(mdc_getattr); EXPORT_SYMBOL(mdc_getattr_name); EXPORT_SYMBOL(mdc_create); @@ -781,10 +741,10 @@ EXPORT_SYMBOL(mdc_link); EXPORT_SYMBOL(mdc_readpage); EXPORT_SYMBOL(mdc_setattr); EXPORT_SYMBOL(mdc_close); -EXPORT_SYMBOL(mdc_lock_set_inode); EXPORT_SYMBOL(mdc_set_open_replay_data); EXPORT_SYMBOL(mdc_store_inode_generation); -module_init(ptlrpc_request_init); -module_exit(ptlrpc_request_exit); +module_init(mdc_init); +module_exit(mdc_exit); +#endif diff --git a/lustre/mds/Makefile.am b/lustre/mds/Makefile.am index f789c22..cb63910 100644 --- a/lustre/mds/Makefile.am +++ b/lustre/mds/Makefile.am @@ -4,21 +4,10 @@ # See the file COPYING in this distribution DEFS= - MODULE = mds - modulefs_DATA = mds.o EXTRA_PROGRAMS = mds - -LINX= mds_updates.c mds_open.c simple.c target.c - -mds_updates.c: - test -e mds_updates.c || ln -sf $(top_srcdir)/lib/mds_updates.c -simple.c: - test -e simple.c || ln -sf $(top_srcdir)/lib/simple.c -target.c: - test -e target.c || ln -sf $(top_srcdir)/lib/target.c - -mds_SOURCES = mds_lov.c handler.c mds_reint.c mds_fs.c lproc_mds.c $(LINX) +mds_SOURCES = mds_lov.c handler.c mds_reint.c mds_fs.c lproc_mds.c mds_open.c \ +mds_lib.c mds_internal.h include $(top_srcdir)/Rules diff --git a/lustre/mds/Makefile.mk b/lustre/mds/Makefile.mk new file mode 100644 index 0000000..6b712fb --- /dev/null +++ b/lustre/mds/Makefile.mk @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include fs/lustre/portals/Kernelenv + +obj-y += mds.o + +mds-objs := mds_lov.o handler.o mds_reint.o mds_fs.o lproc_mds.o mds_internal.h mds_updates.o mds_open.o simple.o target.o diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 58cfa20..259a6bc 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -35,27 +35,29 @@ #include <linux/init.h> #include <linux/obd_class.h> #include <linux/random.h> +#include <linux/fs.h> +#include <linux/jbd.h> +#include <linux/ext3_fs.h> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -#include <linux/smp_lock.h> -#include <linux/buffer_head.h> -#include <linux/workqueue.h> -#include <linux/mount.h> -#else -#include <linux/locks.h> +# include <linux/smp_lock.h> +# include <linux/buffer_head.h> +# include <linux/workqueue.h> +# include <linux/mount.h> +#else +# include <linux/locks.h> #endif #include <linux/obd_lov.h> #include <linux/lustre_mds.h> #include <linux/lustre_fsfilt.h> #include <linux/lprocfs_status.h> - -kmem_cache_t *mds_file_cache; +#include "mds_internal.h" extern int mds_get_lovtgts(struct mds_obd *obd, int tgt_count, struct obd_uuid *uuidarray); extern int mds_get_lovdesc(struct mds_obd *obd, struct lov_desc *desc); int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle, struct ptlrpc_request *req, int rc, int disp); -static int mds_cleanup(struct obd_device * obddev); +static int mds_cleanup(struct obd_device * obddev, int force, int failover); inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req) { @@ -65,9 +67,13 @@ inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req) static int mds_bulk_timeout(void *data) { struct ptlrpc_bulk_desc *desc = data; + struct obd_export *exp = desc->bd_export; - ENTRY; - recovd_conn_fail(desc->bd_connection); + CERROR("bulk send timed out: evicting %s@%s\n", + exp->exp_client_uuid.uuid, + exp->exp_connection->c_remote_uuid.uuid); + ptlrpc_fail_export(exp); + ptlrpc_abort_bulk 
(desc); RETURN(1); } @@ -76,39 +82,35 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, __u64 offset, __u64 xid) { struct ptlrpc_bulk_desc *desc; - struct ptlrpc_bulk_page *bulk; struct l_wait_info lwi; - char *buf; + struct page *page; int rc = 0; ENTRY; - desc = ptlrpc_prep_bulk(req->rq_connection); + LASSERT ((offset & (PAGE_CACHE_SIZE - 1)) == 0); + + desc = ptlrpc_prep_bulk_exp (req, BULK_PUT_SOURCE, MDS_BULK_PORTAL); if (desc == NULL) GOTO(out, rc = -ENOMEM); - bulk = ptlrpc_prep_bulk_page(desc); - if (bulk == NULL) + LASSERT (PAGE_SIZE == PAGE_CACHE_SIZE); + page = alloc_pages (GFP_KERNEL, 0); + if (page == NULL) GOTO(cleanup_bulk, rc = -ENOMEM); - OBD_ALLOC(buf, PAGE_CACHE_SIZE); - if (buf == NULL) - GOTO(cleanup_bulk, rc = -ENOMEM); + rc = ptlrpc_prep_bulk_page(desc, page, 0, PAGE_CACHE_SIZE); + if (rc != 0) + GOTO(cleanup_buf, rc); CDEBUG(D_EXT2, "reading %lu@"LPU64" from dir %lu (size %llu)\n", PAGE_CACHE_SIZE, offset, file->f_dentry->d_inode->i_ino, file->f_dentry->d_inode->i_size); - rc = fsfilt_readpage(req->rq_export->exp_obd, file, buf, + rc = fsfilt_readpage(req->rq_export->exp_obd, file, page_address (page), PAGE_CACHE_SIZE, (loff_t *)&offset); if (rc != PAGE_CACHE_SIZE) GOTO(cleanup_buf, rc = -EIO); - bulk->bp_xid = xid; - bulk->bp_buf = buf; - bulk->bp_buflen = PAGE_CACHE_SIZE; - desc->bd_ptl_ev_hdlr = NULL; - desc->bd_portal = MDS_BULK_PORTAL; - rc = ptlrpc_bulk_put(desc); if (rc) GOTO(cleanup_buf, rc); @@ -121,19 +123,17 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, } lwi = LWI_TIMEOUT(obd_timeout * HZ, mds_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, desc->bd_flags & PTL_BULK_FL_SENT, - &lwi); + rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete (desc), &lwi); if (rc) { - if (rc != -ETIMEDOUT) - LBUG(); + LASSERT (rc == -ETIMEDOUT); GOTO(cleanup_buf, rc); } EXIT; cleanup_buf: - OBD_FREE(buf, PAGE_SIZE); + __free_pages (page, 0); cleanup_bulk: - ptlrpc_bulk_decref(desc); + 
ptlrpc_free_bulk (desc); out: return rc; } @@ -157,7 +157,7 @@ struct dentry *mds_fid2locked_dentry(struct obd_device *obd, struct ll_fid *fid, rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, res_id, LDLM_PLAIN, NULL, 0, lock_mode, &flags, ldlm_completion_ast, - mds_blocking_ast, NULL, NULL, lockh); + mds_blocking_ast, NULL, lockh); if (rc != ELDLM_OK) { l_dput(de); retval = ERR_PTR(-ENOLCK); /* XXX translate ldlm code */ @@ -171,67 +171,52 @@ struct dentry *mds_fid2locked_dentry(struct obd_device *obd, struct ll_fid *fid, #endif - /* Look up an entry by inode number. */ /* this function ONLY returns valid dget'd dentries with an initialized inode or errors */ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid, struct vfsmount **mnt) { - /* stolen from NFS */ - struct super_block *sb = mds->mds_sb; + char fid_name[32]; unsigned long ino = fid->id; __u32 generation = fid->generation; struct inode *inode; - struct list_head *lp; struct dentry *result; if (ino == 0) RETURN(ERR_PTR(-ESTALE)); - inode = iget(sb, ino); - if (inode == NULL) - RETURN(ERR_PTR(-ENOMEM)); + snprintf(fid_name, sizeof(fid_name), "0x%lx", ino); - CDEBUG(D_DENTRY, "--> mds_fid2dentry: sb %p\n", inode->i_sb); + /* under ext3 this is neither supposed to return bad inodes + nor NULL inodes. */ + result = ll_lookup_one_len(fid_name, mds->mds_fid_de, strlen(fid_name)); + if (IS_ERR(result)) + RETURN(result); - if (is_bad_inode(inode) || - (generation && inode->i_generation != generation)) { + inode = result->d_inode; + if (!inode) + RETURN(ERR_PTR(-ENOENT)); + + CDEBUG(D_DENTRY, "--> mds_fid2dentry: ino %lu, gen %u, sb %p\n", + inode->i_ino, inode->i_generation, inode->i_sb); + + if (generation && inode->i_generation != generation) { /* we didn't find the right inode.. 
*/ - CERROR("bad inode %lu, link: %d ct: %d or version %u/%u\n", + CERROR("bad inode %lu, link: %d ct: %d or generation %u/%u\n", inode->i_ino, inode->i_nlink, atomic_read(&inode->i_count), inode->i_generation, generation); - iput(inode); + dput(result); RETURN(ERR_PTR(-ENOENT)); } - /* now to find a dentry. If possible, get a well-connected one */ - if (mnt) + if (mnt) { *mnt = mds->mds_vfsmnt; - spin_lock(&dcache_lock); - list_for_each(lp, &inode->i_dentry) { - result = list_entry(lp, struct dentry, d_alias); - if (!(result->d_flags & DCACHE_DISCONNECTED)) { - dget_locked(result); - result->d_vfs_flags |= DCACHE_REFERENCED; - spin_unlock(&dcache_lock); - iput(inode); - if (mnt) - mntget(*mnt); - return result; - } - } - spin_unlock(&dcache_lock); - result = d_alloc_root(inode); - if (result == NULL) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - if (mnt) mntget(*mnt); - result->d_flags |= DCACHE_DISCONNECTED; - return result; + } + + RETURN(result); } @@ -242,13 +227,12 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid, * on the server, etc. */ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, - struct obd_uuid *cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) + struct obd_uuid *cluuid) { struct obd_export *exp; struct mds_export_data *med; struct mds_client_data *mcd; - int rc; + int rc, abort_recovery; ENTRY; if (!conn || !obd || !cluuid) @@ -256,9 +240,10 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, /* Check for aborted recovery. 
*/ spin_lock_bh(&obd->obd_processing_task_lock); - if (obd->obd_flags & OBD_ABORT_RECOVERY) - target_abort_recovery(obd); + abort_recovery = obd->obd_abort_recovery; spin_unlock_bh(&obd->obd_processing_task_lock); + if (abort_recovery) + target_abort_recovery(obd); /* XXX There is a small race between checking the list and adding a * new connection for the same UUID, but the real threat (list @@ -276,6 +261,7 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, exp = class_conn2export(conn); LASSERT(exp); med = &exp->exp_mds_data; + class_export_put(exp); OBD_ALLOC(mcd, sizeof(*mcd)); if (!mcd) { @@ -289,7 +275,7 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, INIT_LIST_HEAD(&med->med_open_head); spin_lock_init(&med->med_open_lock); - rc = mds_client_add(&obd->u.mds, med, -1); + rc = mds_client_add(obd, &obd->u.mds, med, -1); if (rc) GOTO(out_mcd, rc); @@ -298,42 +284,116 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, out_mcd: OBD_FREE(mcd, sizeof(*mcd)); out_export: - class_disconnect(conn); + class_disconnect(conn, 0); return rc; } +static void mds_mfd_addref(void *mfdp) +{ + struct mds_file_data *mfd = mfdp; + + atomic_inc(&mfd->mfd_refcount); + CDEBUG(D_INFO, "GETting mfd %p : new refcount %d\n", mfd, + atomic_read(&mfd->mfd_refcount)); +} + +struct mds_file_data *mds_mfd_new(void) +{ + struct mds_file_data *mfd; + + OBD_ALLOC(mfd, sizeof *mfd); + if (mfd == NULL) { + CERROR("mds: out of memory\n"); + return NULL; + } + + atomic_set(&mfd->mfd_refcount, 2); + + INIT_LIST_HEAD(&mfd->mfd_handle.h_link); + class_handle_hash(&mfd->mfd_handle, mds_mfd_addref); + + return mfd; +} + +static struct mds_file_data *mds_handle2mfd(struct lustre_handle *handle) +{ + ENTRY; + LASSERT(handle != NULL); + RETURN(class_handle2object(handle->cookie)); +} + +void mds_mfd_put(struct mds_file_data *mfd) +{ + CDEBUG(D_INFO, "PUTting mfd %p : new refcount %d\n", mfd, + atomic_read(&mfd->mfd_refcount) 
- 1); + LASSERT(atomic_read(&mfd->mfd_refcount) > 0 && + atomic_read(&mfd->mfd_refcount) < 0x5a5a); + if (atomic_dec_and_test(&mfd->mfd_refcount)) { + LASSERT(list_empty(&mfd->mfd_handle.h_link)); + OBD_FREE(mfd, sizeof *mfd); + } +} + +void mds_mfd_destroy(struct mds_file_data *mfd) +{ + class_handle_unhash(&mfd->mfd_handle); + mds_mfd_put(mfd); +} + /* Call with med->med_open_lock held, please. */ -inline int mds_close_mfd(struct mds_file_data *mfd, struct mds_export_data *med) +static int mds_close_mfd(struct mds_file_data *mfd, struct mds_export_data *med) { - struct file *file = mfd->mfd_file; - int rc; struct dentry *de = NULL; - LASSERT(file->private_data == mfd); - - LASSERT(mfd->mfd_servercookie != DEAD_HANDLE_MAGIC); +#ifdef CONFIG_SMP + LASSERT(spin_is_locked(&med->med_open_lock)); +#endif list_del(&mfd->mfd_list); - mfd->mfd_servercookie = DEAD_HANDLE_MAGIC; - kmem_cache_free(mds_file_cache, mfd); - if (file->f_dentry->d_parent) { - LASSERT(atomic_read(&file->f_dentry->d_parent->d_count)); - de = dget(file->f_dentry->d_parent); + if (mfd->mfd_dentry->d_parent) { + LASSERT(atomic_read(&mfd->mfd_dentry->d_parent->d_count)); + de = dget(mfd->mfd_dentry->d_parent); } - rc = filp_close(file, 0); + + /* this is the actual "close" */ + l_dput(mfd->mfd_dentry); + if (de) l_dput(de); - RETURN(rc); + + mds_mfd_destroy(mfd); + RETURN(0); } -static int mds_disconnect(struct lustre_handle *conn) +static int mds_disconnect(struct lustre_handle *conn, int failover) { struct obd_export *export = class_conn2export(conn); - struct list_head *tmp, *n; + int rc; + unsigned long flags; + ENTRY; + + ldlm_cancel_locks_for_export(export); + + spin_lock_irqsave(&export->exp_lock, flags); + export->exp_failover = failover; + spin_unlock_irqrestore(&export->exp_lock, flags); + + rc = class_disconnect(conn, failover); + class_export_put(export); + + RETURN(rc); +} + +static void mds_destroy_export(struct obd_export *export) +{ struct mds_export_data *med = &export->exp_mds_data; + 
struct list_head *tmp, *n; int rc; + ENTRY; + LASSERT(!strcmp(export->exp_obd->obd_type->typ_name, + LUSTRE_MDS_NAME)); /* * Close any open files. @@ -342,28 +402,39 @@ static int mds_disconnect(struct lustre_handle *conn) list_for_each_safe(tmp, n, &med->med_open_head) { struct mds_file_data *mfd = list_entry(tmp, struct mds_file_data, mfd_list); - CERROR("force closing client file handle for %*s\n", - mfd->mfd_file->f_dentry->d_name.len, - mfd->mfd_file->f_dentry->d_name.name); +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + struct dentry *dentry = mfd->mfd_dentry; + CERROR("force closing client file handle for %*s (%s:%lu)\n", + dentry->d_name.len, dentry->d_name.name, + kdevname(dentry->d_inode->i_sb->s_dev), + dentry->d_inode->i_ino); +#endif rc = mds_close_mfd(mfd, med); if (rc) CDEBUG(D_INODE, "Error closing file: %d\n", rc); } spin_unlock(&med->med_open_lock); - ldlm_cancel_locks_for_export(export); - if (med->med_outstanding_reply) { + if (export->exp_outstanding_reply) { + struct ptlrpc_request *req = export->exp_outstanding_reply; + unsigned long flags; + /* Fake the ack, so the locks get cancelled. 
*/ - med->med_outstanding_reply->rq_flags &= ~PTL_RPC_FL_WANT_ACK; - med->med_outstanding_reply->rq_flags |= PTL_RPC_FL_ERR; - wake_up(&med->med_outstanding_reply->rq_wait_for_rep); - med->med_outstanding_reply = NULL; - } - mds_client_free(export); + LBUG (); + /* Actually we can't do this because it prevents us knowing + * if the ACK callback ran or not */ + spin_lock_irqsave (&req->rq_lock, flags); + req->rq_want_ack = 0; + req->rq_err = 1; + wake_up(&req->rq_wait_for_rep); + spin_unlock_irqrestore (&req->rq_lock, flags); - rc = class_disconnect(conn); + export->exp_outstanding_reply = NULL; + } - RETURN(rc); + if (!export->exp_failover) + mds_client_free(export); + EXIT; } /* @@ -393,7 +464,7 @@ static int mds_getstatus(struct ptlrpc_request *req) rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg); if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_GETSTATUS_PACK)) { CERROR("mds: out of memory for message: size=%d\n", size); - req->rq_status = -ENOMEM; + req->rq_status = -ENOMEM; /* superfluous? 
*/ RETURN(-ENOMEM); } @@ -404,7 +475,7 @@ static int mds_getstatus(struct ptlrpc_request *req) */ mds_fsync_super(mds->mds_sb); - body = lustre_msg_buf(req->rq_repmsg, 0); + body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body)); memcpy(&body->fid1, &mds->mds_rootfid, sizeof(body->fid1)); /* the last_committed and last_xid fields are filled in for all @@ -418,19 +489,28 @@ static int mds_getlovinfo(struct ptlrpc_request *req) struct mds_obd *mds = mds_req2mds(req); struct mds_status_req *streq; struct lov_desc *desc; + struct obd_uuid *uuid0; int tgt_count; int rc, size[2] = {sizeof(*desc)}; ENTRY; - streq = lustre_msg_buf(req->rq_reqmsg, 0); - streq->flags = NTOH__u32(streq->flags); - streq->repbuf = NTOH__u32(streq->repbuf); + streq = lustre_swab_reqbuf (req, 0, sizeof (*streq), + lustre_swab_mds_status_req); + if (streq == NULL) { + CERROR ("Can't unpack mds_status_req\n"); + RETURN (-EFAULT); + } + + if (streq->repbuf > LOV_MAX_UUID_BUFFER_SIZE) { + CERROR ("Illegal request for uuid array > %d\n", + streq->repbuf); + RETURN (-EINVAL); + } size[1] = streq->repbuf; rc = lustre_pack_msg(2, size, NULL, &req->rq_replen, &req->rq_repmsg); if (rc) { CERROR("mds: out of memory for message: size=%d\n", size[1]); - req->rq_status = -ENOMEM; RETURN(-ENOMEM); } @@ -439,18 +519,21 @@ static int mds_getlovinfo(struct ptlrpc_request *req) RETURN(0); } - desc = lustre_msg_buf(req->rq_repmsg, 0); - memcpy(desc, &mds->mds_lov_desc, sizeof *desc); - lov_packdesc(desc); - tgt_count = le32_to_cpu(desc->ld_tgt_count); - if (tgt_count * sizeof(struct obd_uuid) > streq->repbuf) { + /* XXX We're sending the lov_desc in my byte order. + * Receiver will swab... 
*/ + desc = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*desc)); + memcpy(desc, &mds->mds_lov_desc, sizeof (*desc)); + + tgt_count = mds->mds_lov_desc.ld_tgt_count; + uuid0 = lustre_msg_buf (req->rq_repmsg, 1, + tgt_count * sizeof (*uuid0)); + if (uuid0 == NULL) { CERROR("too many targets, enlarge client buffers\n"); req->rq_status = -ENOSPC; RETURN(0); } - rc = mds_get_lovtgts(mds, tgt_count, - lustre_msg_buf(req->rq_repmsg, 1)); + rc = mds_get_lovtgts(mds, tgt_count, uuid0); if (rc) { CERROR("get_lovtgts error %d\n", rc); req->rq_status = rc; @@ -507,17 +590,19 @@ int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, { struct mds_obd *mds = &obd->u.mds; struct lov_mds_md *lmm; - int lmm_size = msg->buflens[offset]; + int lmm_size; int rc; ENTRY; - if (lmm_size == 0) { + lmm = lustre_msg_buf(msg, offset, 0); + if (lmm == NULL) { + /* Some problem with getting eadata when I sized the reply + * buffer... */ CDEBUG(D_INFO, "no space reserved for inode %lu MD\n", inode->i_ino); RETURN(0); } - - lmm = lustre_msg_buf(msg, offset); + lmm_size = msg->buflens[offset]; /* I don't really like this, but it is a sanity check on the client * MD request. However, if the client doesn't know how much space @@ -529,15 +614,13 @@ int mds_pack_md(struct obd_device *obd, struct lustre_msg *msg, // RETURN(-EINVAL); } - /* We don't need to store the reply size, because this buffer is - * discarded right after unpacking, and the LOV can figure out the - * size itself from the ost count. 
- */ - if ((rc = fsfilt_get_md(obd, inode, lmm, lmm_size)) < 0) { - CDEBUG(D_INFO, "No md for ino %lu: rc = %d\n", - inode->i_ino, rc); + rc = fsfilt_get_md(obd, inode, lmm, lmm_size); + if (rc < 0) { + CERROR ("Error %d reading eadata for ino %lu\n", + rc, inode->i_ino); } else if (rc > 0) { body->valid |= OBD_MD_FLEASIZE; + body->eadatasize = rc; rc = 0; } @@ -556,24 +639,36 @@ static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry, if (inode == NULL) RETURN(-ENOENT); - body = lustre_msg_buf(req->rq_repmsg, reply_off); + body = lustre_msg_buf(req->rq_repmsg, reply_off, sizeof (*body)); + LASSERT (body != NULL); /* caller prepped reply */ mds_pack_inode2fid(&body->fid1, inode); mds_pack_inode2body(body, inode); - if (S_ISREG(inode->i_mode) && reqbody->valid & OBD_MD_FLEASIZE) { + if (S_ISREG(inode->i_mode) && + (reqbody->valid & OBD_MD_FLEASIZE) != 0) { rc = mds_pack_md(obd, req->rq_repmsg, reply_off + 1, body, inode); - } else if (S_ISLNK(inode->i_mode) && reqbody->valid & OBD_MD_LINKNAME) { - char *symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1); - int len = req->rq_repmsg->buflens[reply_off + 1]; + } else if (S_ISLNK(inode->i_mode) && + (reqbody->valid & OBD_MD_LINKNAME) != 0) { + char *symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1, 0); + int len; + + LASSERT (symname != NULL); /* caller prepped reply */ + len = req->rq_repmsg->buflens[reply_off + 1]; rc = inode->i_op->readlink(dentry, symname, len); if (rc < 0) { CERROR("readlink failed: %d\n", rc); + } else if (rc != len - 1) { + CERROR ("Unexpected readlink rc %d: expecting %d\n", + rc, len - 1); + rc = -EINVAL; } else { CDEBUG(D_INODE, "read symlink dest %s\n", symname); body->valid |= OBD_MD_LINKNAME; + body->eadatasize = rc + 1; + symname[rc] = 0; /* NULL terminate */ rc = 0; } } @@ -588,9 +683,12 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode, int rc = 0, size[2] = {sizeof(*body)}, bufcount = 1; ENTRY; - body = 
lustre_msg_buf(req->rq_reqmsg, offset); + body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*body)); + LASSERT (body != NULL); /* checked by caller */ + LASSERT_REQSWABBED (req, offset); /* swabbed by caller */ - if (S_ISREG(inode->i_mode) && body->valid & OBD_MD_FLEASIZE) { + if (S_ISREG(inode->i_mode) && + (body->valid & OBD_MD_FLEASIZE) != 0) { int rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0); CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n", rc, inode->i_ino); @@ -606,11 +704,15 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode, } else size[bufcount] = rc; bufcount++; - } else if (body->valid & OBD_MD_LINKNAME) { - size[bufcount] = MIN(inode->i_size + 1, body->size); + } else if (S_ISLNK (inode->i_mode) && + (body->valid & OBD_MD_LINKNAME) != 0) { + if (inode->i_size + 1 != body->eadatasize) + CERROR ("symlink size: %Lu, reply space: %d\n", + inode->i_size + 1, body->eadatasize); + size[bufcount] = MIN(inode->i_size + 1, body->eadatasize); bufcount++; - CDEBUG(D_INODE, "symlink size: %Lu, reply space: "LPU64"\n", - inode->i_size + 1, body->size); + CDEBUG(D_INODE, "symlink size: %Lu, reply space: %d\n", + inode->i_size + 1, body->eadatasize); } if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) { @@ -636,8 +738,6 @@ static int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode, static void reconstruct_getattr_name(int offset, struct ptlrpc_request *req, struct lustre_handle *client_lockh) { - struct mds_export_data *med = &req->rq_export->exp_mds_data; - struct mds_client_data *mcd = med->med_mcd; struct obd_device *obd = req->rq_export->exp_obd; struct mds_obd *mds = mds_req2mds(req); struct dentry *parent, *child; @@ -648,18 +748,19 @@ static void reconstruct_getattr_name(int offset, struct ptlrpc_request *req, int namelen, rc = 0; char *name; - req->rq_transno = mcd->mcd_last_transno; - req->rq_status = mcd->mcd_last_result; - - if (med->med_outstanding_reply) - mds_steal_ack_locks(med, 
req); + if (req->rq_export->exp_outstanding_reply) + mds_steal_ack_locks(req->rq_export, req); - if (req->rq_status) - return; + body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*body)); + LASSERT (body != NULL); /* checked by caller */ + LASSERT_REQSWABBED (req, offset); /* swabbed by caller */ - body = lustre_msg_buf(req->rq_reqmsg, offset); - name = lustre_msg_buf(req->rq_reqmsg, offset + 1); + name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0); + LASSERT (name != NULL); /* checked by caller */ + LASSERT_REQSWABBED (req, offset + 1); /* swabbed by caller */ namelen = req->rq_reqmsg->buflens[offset + 1]; + + LASSERT (offset == 2 || offset == 0); /* requests were at offset 2, replies go back at 1 */ if (offset) offset = 1; @@ -674,19 +775,17 @@ static void reconstruct_getattr_name(int offset, struct ptlrpc_request *req, LASSERT(!IS_ERR(parent)); dir = parent->d_inode; LASSERT(dir); - child = lookup_one_len(name, parent, namelen - 1); + child = ll_lookup_one_len(name, parent, namelen - 1); LASSERT(!IS_ERR(child)); - if (!med->med_outstanding_reply) { - /* XXX need to enqueue client lock */ - LBUG(); + if (req->rq_repmsg == NULL) { + rc = mds_getattr_pack_msg(req, child->d_inode, offset); + /* XXX need to handle error here */ + LASSERT (rc == 0); } - if (req->rq_repmsg == NULL) - mds_getattr_pack_msg(req, child->d_inode, offset); - rc = mds_getattr_internal(obd, child, req, body, offset); - LASSERT(!rc); + req->rq_status = rc; l_dput(child); l_dput(parent); } @@ -703,24 +802,41 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req, struct obd_ucred uc; struct ldlm_res_id child_res_id = { .name = {0} }; struct lustre_handle parent_lockh; - int namelen, flags = 0, rc = 0, cleanup_phase = 0; + int namesize; + int flags = 0, rc = 0, cleanup_phase = 0, req_was_resent; char *name; ENTRY; LASSERT(!strcmp(obd->obd_type->typ_name, "mds")); - MDS_CHECK_RESENT(req, - reconstruct_getattr_name(offset, req, child_lockh)); + /* Swab now, before anyone 
looks inside the request */ - if (req->rq_reqmsg->bufcount <= offset + 1) { - LBUG(); - GOTO(cleanup, rc = -EINVAL); + body = lustre_swab_reqbuf (req, offset, sizeof (*body), + lustre_swab_mds_body); + if (body == NULL) { + CERROR ("Can't swab mds_body\n"); + GOTO (cleanup, rc = -EFAULT); } - body = lustre_msg_buf(req->rq_reqmsg, offset); - name = lustre_msg_buf(req->rq_reqmsg, offset + 1); - namelen = req->rq_reqmsg->buflens[offset + 1]; - /* requests were at offset 2, replies go back at 1 */ + LASSERT_REQSWAB (req, offset + 1); + name = lustre_msg_string (req->rq_reqmsg, offset + 1, 0); + if (name == NULL) { + CERROR ("Can't unpack name\n"); + GOTO (cleanup, rc = -EFAULT); + } + namesize = req->rq_reqmsg->buflens[offset + 1]; + + req_was_resent = lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT; + if (child_lockh->cookie) { + LASSERT(req_was_resent); + reconstruct_getattr_name(offset, req, child_lockh); + RETURN(0); + } else if (req_was_resent) { + DEBUG_REQ(D_HA, req, "no reply for RESENT req"); + } + + LASSERT (offset == 0 || offset == 2); + /* if requests were at offset 2, replies go back at 1 */ if (offset) offset = 1; @@ -740,10 +856,10 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req, cleanup_phase = 1; /* parent dentry and lock */ - CDEBUG(D_INODE, "parent ino %lu, name %*s\n", dir->i_ino,namelen,name); + CDEBUG(D_INODE, "parent ino %lu, name %s\n", dir->i_ino, name); /* Step 2: Lookup child */ - dchild = lookup_one_len(name, de, namelen - 1); + dchild = ll_lookup_one_len(name, de, namesize - 1); if (IS_ERR(dchild)) { CDEBUG(D_INODE, "child lookup error %ld\n", PTR_ERR(dchild)); GOTO(cleanup, rc = PTR_ERR(dchild)); @@ -761,7 +877,7 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req, rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, child_res_id, LDLM_PLAIN, NULL, 0, LCK_PR, &flags, ldlm_completion_ast, mds_blocking_ast, - NULL, NULL, child_lockh); + NULL, child_lockh); if (rc != ELDLM_OK) { 
CERROR("ldlm_cli_enqueue: %d\n", rc); GOTO(cleanup, rc = -EIO); @@ -769,15 +885,18 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req, cleanup_phase = 3; /* child lock */ - if (req->rq_repmsg == NULL) - mds_getattr_pack_msg(req, dchild->d_inode, offset); + if (req->rq_repmsg == NULL) { + rc = mds_getattr_pack_msg(req, dchild->d_inode, offset); + if (rc != 0) { + CERROR ("mds_getattr_pack_msg: %d\n", rc); + GOTO (cleanup, rc); + } + } rc = mds_getattr_internal(obd, dchild, req, body, offset); GOTO(cleanup, rc); /* returns the lock to the client */ - + cleanup: - rc = mds_finish_transno(mds, dchild ? dchild->d_inode : NULL, NULL, - req, rc, 0); switch (cleanup_phase) { case 3: if (rc) @@ -812,7 +931,13 @@ static int mds_getattr(int offset, struct ptlrpc_request *req) int rc = 0; ENTRY; - body = lustre_msg_buf(req->rq_reqmsg, offset); + body = lustre_swab_reqbuf (req, offset, sizeof (*body), + lustre_swab_mds_body); + if (body == NULL) { + CERROR ("Can't unpack body\n"); + RETURN (-EFAULT); + } + uc.ouc_fsuid = body->fsuid; uc.ouc_fsgid = body->fsgid; uc.ouc_cap = body->capability; @@ -824,6 +949,10 @@ static int mds_getattr(int offset, struct ptlrpc_request *req) } rc = mds_getattr_pack_msg(req, de->d_inode, offset); + if (rc != 0) { + CERROR ("mds_getattr_pack_msg: %d\n", rc); + GOTO (out_pop, rc); + } req->rq_status = mds_getattr_internal(obd, de, req, body, 0); @@ -847,13 +976,12 @@ static int mds_statfs(struct ptlrpc_request *req) GOTO(out, rc); } - osfs = lustre_msg_buf(req->rq_repmsg, 0); + osfs = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*osfs)); rc = fsfilt_statfs(obd, obd->u.mds.mds_sb, osfs); if (rc) { CERROR("mds: statfs failed: rc %d\n", rc); GOTO(out, rc); } - obd_statfs_pack(osfs, osfs); EXIT; out: @@ -861,69 +989,6 @@ out: return 0; } -static struct mds_file_data *mds_handle2mfd(struct lustre_handle *handle) -{ - struct mds_file_data *mfd = NULL; - ENTRY; - - if (!handle || !handle->addr) - RETURN(NULL); - - mfd = (struct 
mds_file_data *)(unsigned long)(handle->addr); - if (!kmem_cache_validate(mds_file_cache, mfd)) - RETURN(NULL); - - if (mfd->mfd_servercookie != handle->cookie) - RETURN(NULL); - - RETURN(mfd); -} - -#if 0 - -static int mds_store_md(struct mds_obd *mds, struct ptlrpc_request *req, - int offset, struct mds_body *body, struct inode *inode) -{ - struct obd_device *obd = req->rq_export->exp_obd; - struct lov_mds_md *lmm = lustre_msg_buf(req->rq_reqmsg, offset); - int lmm_size = req->rq_reqmsg->buflens[offset]; - struct obd_run_ctxt saved; - struct obd_ucred uc; - void *handle; - int rc, rc2; - ENTRY; - - /* I don't really like this, but it is a sanity check on the client - * MD request. - */ - if (lmm_size > mds->mds_max_mdsize) { - CERROR("Saving MD for inode %lu of %d bytes > max %d\n", - inode->i_ino, lmm_size, mds->mds_max_mdsize); - //RETURN(-EINVAL); - } - - CDEBUG(D_INODE, "storing %d bytes MD for inode %lu\n", - lmm_size, inode->i_ino); - uc.ouc_fsuid = body->fsuid; - uc.ouc_fsgid = body->fsgid; - uc.ouc_cap = body->capability; - push_ctxt(&saved, &mds->mds_ctxt, &uc); - handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR); - if (IS_ERR(handle)) { - rc = PTR_ERR(handle); - GOTO(out_ea, rc); - } - - rc = fsfilt_set_md(obd, inode,handle,lmm,lmm_size); - rc = mds_finish_transno(mds, inode, handle, req, rc, 0); -out_ea: - pop_ctxt(&saved, &mds->mds_ctxt, &uc); - - RETURN(rc); -} - -#endif - static void reconstruct_close(struct ptlrpc_request *req) { struct mds_export_data *med = &req->rq_export->exp_mds_data; @@ -948,13 +1013,17 @@ static int mds_close(struct ptlrpc_request *req) MDS_CHECK_RESENT(req, reconstruct_close(req)); - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_swab_reqbuf(req, 0, sizeof (*body), + lustre_swab_mds_body); + if (body == NULL) { + CERROR ("Can't unpack body\n"); + RETURN (-EFAULT); + } mfd = mds_handle2mfd(&body->handle); if (mfd == NULL) { DEBUG_REQ(D_ERROR, req, "no handle for file close "LPD64 - ": addr "LPX64", cookie 
"LPX64"\n", - body->fid1.id, body->handle.addr, + ": cookie "LPX64"\n", body->fid1.id, body->handle.cookie); RETURN(-ESTALE); } @@ -966,6 +1035,7 @@ static int mds_close(struct ptlrpc_request *req) if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CLOSE_PACK)) { CERROR("test case OBD_FAIL_MDS_CLOSE_PACK\n"); req->rq_status = -ENOMEM; + mds_mfd_put(mfd); RETURN(-ENOMEM); } @@ -975,6 +1045,7 @@ static int mds_close(struct ptlrpc_request *req) req->rq_status = rc; } + mds_mfd_put(mfd); RETURN(0); } @@ -986,7 +1057,7 @@ static int mds_readpage(struct ptlrpc_request *req) struct file *file; struct mds_body *body, *repbody; struct obd_run_ctxt saved; - int rc, size = sizeof(*body); + int rc, size = sizeof(*repbody); struct obd_ucred uc; ENTRY; @@ -996,7 +1067,23 @@ static int mds_readpage(struct ptlrpc_request *req) GOTO(out, rc = -ENOMEM); } - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_swab_reqbuf (req, 0, sizeof (*body), + lustre_swab_mds_body); + if (body == NULL) + GOTO (out, rc = -EFAULT); + + /* body->size is actually the offset -eeb */ + if ((body->size & (PAGE_SIZE - 1)) != 0) { + CERROR ("offset "LPU64"not on a page boundary\n", body->size); + GOTO (out, rc = -EFAULT); + } + + /* body->nlink is actually the #bytes to read -eeb */ + if (body->nlink != PAGE_SIZE) { + CERROR ("size %d is not PAGE_SIZE\n", body->nlink); + GOTO (out, rc = -EFAULT); + } + uc.ouc_fsuid = body->fsuid; uc.ouc_fsgid = body->fsgid; uc.ouc_cap = body->capability; @@ -1012,7 +1099,7 @@ static int mds_readpage(struct ptlrpc_request *req) if (IS_ERR(file)) GOTO(out_pop, rc = PTR_ERR(file)); - repbody = lustre_msg_buf(req->rq_repmsg, 0); + repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*repbody)); repbody->size = file->f_dentry->d_inode->i_size; repbody->valid = OBD_MD_FLSIZE; @@ -1020,6 +1107,7 @@ static int mds_readpage(struct ptlrpc_request *req) doesn't send a reply when this function completes. 
Instead a callback function would send the reply */ /* body->blocks is actually the xid -phil */ + /* body->size is actually the offset -eeb */ rc = mds_sendpage(req, file, body->size, body->blocks); filp_close(file, 0); @@ -1057,12 +1145,15 @@ static int filter_recovery_request(struct ptlrpc_request *req, { switch (req->rq_reqmsg->opc) { case MDS_CONNECT: /* This will never get here, but for completeness. */ + case OST_CONNECT: /* This will never get here, but for completeness. */ case MDS_DISCONNECT: + case OST_DISCONNECT: *process = 1; RETURN(0); case MDS_CLOSE: case MDS_GETSTATUS: /* used in unmounting */ + case OBD_PING: case MDS_REINT: case LDLM_ENQUEUE: *process = target_queue_recovery_request(req, obd); @@ -1072,7 +1163,8 @@ static int filter_recovery_request(struct ptlrpc_request *req, DEBUG_REQ(D_ERROR, req, "not permitted during recovery"); *process = 0; /* XXX what should we set rq_status to here? */ - RETURN(ptlrpc_error(req->rq_svc, req)); + req->rq_status = -EAGAIN; + RETURN(ptlrpc_error(req)); } } @@ -1085,106 +1177,42 @@ static char *reint_names[] = { [REINT_OPEN] "open", }; -void mds_steal_ack_locks(struct mds_export_data *med, +void mds_steal_ack_locks(struct obd_export *exp, struct ptlrpc_request *req) { - struct ptlrpc_request *oldrep = med->med_outstanding_reply; + unsigned long flags; + + struct ptlrpc_request *oldrep = exp->exp_outstanding_reply; memcpy(req->rq_ack_locks, oldrep->rq_ack_locks, sizeof req->rq_ack_locks); - oldrep->rq_flags |= PTL_RPC_FL_RESENT; + spin_lock_irqsave (&req->rq_lock, flags); + oldrep->rq_resent = 1; wake_up(&oldrep->rq_wait_for_rep); + spin_unlock_irqrestore (&req->rq_lock, flags); DEBUG_REQ(D_HA, oldrep, "stole locks from"); DEBUG_REQ(D_HA, req, "stole locks for"); } -static void mds_send_reply(struct ptlrpc_request *req, int rc) -{ - int i; - struct ptlrpc_req_ack_lock *ack_lock; - struct l_wait_info lwi; - struct mds_export_data *med = - (req->rq_export && req->rq_ack_locks[0].mode) ? 
- &req->rq_export->exp_mds_data : NULL; - - if (med) { - med->med_outstanding_reply = req; - req->rq_flags |= PTL_RPC_FL_WANT_ACK; - init_waitqueue_head(&req->rq_wait_for_rep); - } - - if (!OBD_FAIL_CHECK(OBD_FAIL_MDS_ALL_REPLY_NET | OBD_FAIL_ONCE)) { - if (rc) { - DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc); - ptlrpc_error(req->rq_svc, req); - } else { - DEBUG_REQ(D_NET, req, "sending reply"); - ptlrpc_reply(req->rq_svc, req); - } - } else { - obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED; - DEBUG_REQ(D_ERROR, req, "dropping reply"); - if (!med && req->rq_repmsg) - OBD_FREE(req->rq_repmsg, req->rq_replen); - } - - if (!med) { - DEBUG_REQ(D_HA, req, "not waiting for ack"); - return; - } - - lwi = LWI_TIMEOUT(obd_timeout / 2 * HZ, NULL, NULL); - rc = l_wait_event(req->rq_wait_for_rep, - (req->rq_flags & PTL_RPC_FL_WANT_ACK) == 0 || - (req->rq_flags & PTL_RPC_FL_RESENT), - &lwi); - - if (req->rq_flags & PTL_RPC_FL_RESENT) { - /* The client resent this request, so abort the - * waiting-ack portals stuff, and don't decref the - * locks. 
- */ - DEBUG_REQ(D_HA, req, "resent: not cancelling locks"); - ptlrpc_abort(req); - return; - } - - if (rc == -ETIMEDOUT) { - ptlrpc_abort(req); - recovd_conn_fail(req->rq_export->exp_connection); - DEBUG_REQ(D_HA, req, "cancelling locks for timeout"); - } else { - DEBUG_REQ(D_HA, req, "cancelling locks for ack"); - } - - med->med_outstanding_reply = NULL; - - for (ack_lock = req->rq_ack_locks, i = 0; i < 4; i++, ack_lock++) { - if (!ack_lock->mode) - break; - ldlm_lock_decref(&ack_lock->lock, ack_lock->mode); - } -} - int mds_handle(struct ptlrpc_request *req) { - int should_process, rc; + int should_process; + int rc = 0; struct mds_obd *mds = NULL; /* quell gcc overwarning */ struct obd_device *obd = NULL; ENTRY; - rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); - if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_HANDLE_UNPACK)) { - DEBUG_REQ(D_ERROR, req, "invalid request (%d)", rc); - GOTO(out, rc); - } - OBD_FAIL_RETURN(OBD_FAIL_MDS_ALL_REQUEST_NET | OBD_FAIL_ONCE, 0); LASSERT(!strcmp(req->rq_obd->obd_type->typ_name, LUSTRE_MDT_NAME)); + /* XXX identical to OST */ if (req->rq_reqmsg->opc != MDS_CONNECT) { struct mds_export_data *med; + int recovering, abort_recovery; + if (req->rq_export == NULL) { + CERROR("lustre_mds: operation %d on unconnected MDS\n", + req->rq_reqmsg->opc); req->rq_status = -ENOTCONN; GOTO(out, rc = -ENOTCONN); } @@ -1192,12 +1220,15 @@ int mds_handle(struct ptlrpc_request *req) med = &req->rq_export->exp_mds_data; obd = req->rq_export->exp_obd; mds = &obd->u.mds; + + /* Check for aborted recovery. 
*/ spin_lock_bh(&obd->obd_processing_task_lock); - if (obd->obd_flags & OBD_ABORT_RECOVERY) - target_abort_recovery(obd); + abort_recovery = obd->obd_abort_recovery; + recovering = obd->obd_recovering; spin_unlock_bh(&obd->obd_processing_task_lock); - - if (obd->obd_flags & OBD_RECOVERING) { + if (abort_recovery) { + target_abort_recovery(obd); + } else if (recovering) { rc = filter_recovery_request(req, obd, &should_process); if (rc || !should_process) RETURN(rc); @@ -1224,7 +1255,7 @@ int mds_handle(struct ptlrpc_request *req) /* Make sure that last_rcvd is correct. */ if (!rc) mds_fsync_super(mds->mds_sb); - req->rq_status = rc; + req->rq_status = rc; /* superfluous? */ break; case MDS_GETSTATUS: @@ -1253,9 +1284,9 @@ int mds_handle(struct ptlrpc_request *req) * acquiring any new locks in mds_getattr_name, so we don't * want to cancel. */ - lockh.addr = 0; + lockh.cookie = 0; rc = mds_getattr_name(0, req, &lockh); - if (rc == 0 && lockh.addr) + if (rc == 0 && lockh.cookie) ldlm_lock_decref(&lockh, LCK_PR); break; } @@ -1275,13 +1306,24 @@ int mds_handle(struct ptlrpc_request *req) break; case MDS_REINT: { - int opc = *(u32 *)lustre_msg_buf(req->rq_reqmsg, 0); + __u32 *opcp = lustre_msg_buf (req->rq_reqmsg, 0, sizeof (*opcp)); + __u32 opc; int size[2] = {sizeof(struct mds_body), mds->mds_max_mdsize}; int bufcount; - DEBUG_REQ(D_INODE, req, "reint (%s%s)", - reint_names[opc & REINT_OPCODE_MASK], - opc & REINT_REPLAYING ? "|REPLAYING" : ""); + /* NB only peek inside req now; mds_reint() will swab it */ + if (opcp == NULL) { + CERROR ("Can't inspect opcode\n"); + rc = -EINVAL; + break; + } + opc = *opcp; + if (lustre_msg_swabbed (req->rq_reqmsg)) + __swab32s (&opc); + + DEBUG_REQ(D_INODE, req, "reint %d (%s)", opc, + (opc < sizeof (reint_names) / sizeof (reint_names[0]) || + reint_names[opc] == NULL) ? 
reint_names[opc] : "unknown opcode"); OBD_FAIL_RETURN(OBD_FAIL_MDS_REINT_NET, 0); @@ -1306,6 +1348,11 @@ int mds_handle(struct ptlrpc_request *req) rc = mds_close(req); break; + case OBD_PING: + DEBUG_REQ(D_INODE, req, "ping"); + rc = target_handle_ping(req); + break; + case LDLM_ENQUEUE: DEBUG_REQ(D_INODE, req, "enqueue"); OBD_FAIL_RETURN(OBD_FAIL_LDLM_ENQUEUE, 0); @@ -1325,7 +1372,8 @@ int mds_handle(struct ptlrpc_request *req) OBD_FAIL_RETURN(OBD_FAIL_LDLM_BL_CALLBACK, 0); break; default: - rc = ptlrpc_error(req->rq_svc, req); + req->rq_status = -ENOTSUPP; + rc = ptlrpc_error(req); RETURN(rc); } @@ -1337,10 +1385,11 @@ int mds_handle(struct ptlrpc_request *req) struct obd_device *obd = list_entry(mds, struct obd_device, u.mds); req->rq_repmsg->last_xid = - HTON__u64(le64_to_cpu(med->med_mcd->mcd_last_xid)); - if ((obd->obd_flags & OBD_NO_TRANSNO) == 0) { + le64_to_cpu (med->med_mcd->mcd_last_xid); + + if (!obd->obd_no_transno) { req->rq_repmsg->last_committed = - HTON__u64(obd->obd_last_committed); + obd->obd_last_committed; } else { DEBUG_REQ(D_IOCTL, req, "not sending last_committed update"); @@ -1348,12 +1397,12 @@ int mds_handle(struct ptlrpc_request *req) CDEBUG(D_INFO, "last_transno "LPU64", last_committed "LPU64 ", xid "LPU64"\n", mds->mds_last_transno, obd->obd_last_committed, - NTOH__u64(req->rq_xid)); + req->rq_xid); } out: if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) { - if (obd && (obd->obd_flags & OBD_RECOVERING)) { + if (obd && obd->obd_recovering) { DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply"); return target_queue_final_reply(req, rc); } @@ -1361,7 +1410,7 @@ int mds_handle(struct ptlrpc_request *req) rc = req->rq_status = -ENOTCONN; } - mds_send_reply(req, rc); + target_send_reply(req, rc, OBD_FAIL_MDS_ALL_REPLY_NET); return 0; } @@ -1414,8 +1463,10 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf) struct mds_obd *mds = &obddev->u.mds; struct vfsmount *mnt; int rc = 0; + unsigned long page; ENTRY; + 
#ifdef CONFIG_DEV_RDONLY dev_clear_rdonly(2); #endif @@ -1426,7 +1477,15 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf) if (IS_ERR(obddev->obd_fsops)) RETURN(rc = PTR_ERR(obddev->obd_fsops)); - mnt = do_kern_mount(data->ioc_inlbuf2, 0, data->ioc_inlbuf1, NULL); + if (!(page = __get_free_page(GFP_KERNEL))) + GOTO(err_ops, rc = -ENOMEM); + + memset((void *)page, 0, PAGE_SIZE); + sprintf((char *)page, "iopen_nopriv"); + + mnt = do_kern_mount(data->ioc_inlbuf2, 0, + data->ioc_inlbuf1, (void *)page); + free_page(page); if (IS_ERR(mnt)) { rc = PTR_ERR(mnt); CERROR("do_kern_mount failed: rc = %d\n", rc); @@ -1449,7 +1508,7 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf) obddev->obd_namespace = ldlm_namespace_new("mds_server", LDLM_NAMESPACE_SERVER); if (obddev->obd_namespace == NULL) { - mds_cleanup(obddev); + mds_cleanup(obddev, 0, 0); GOTO(err_fs, rc = -ENOMEM); } @@ -1461,7 +1520,7 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(0); err_fs: - mds_fs_cleanup(obddev); + mds_fs_cleanup(obddev, 0); err_put: unlock_kernel(); mntput(mds->mds_vfsmnt); @@ -1472,7 +1531,7 @@ err_ops: return rc; } -static int mds_cleanup(struct obd_device *obddev) +static int mds_cleanup(struct obd_device *obddev, int force, int failover) { struct super_block *sb; struct mds_obd *mds = &obddev->u.mds; @@ -1483,14 +1542,25 @@ static int mds_cleanup(struct obd_device *obddev) RETURN(0); mds_update_server_data(mds); - mds_fs_cleanup(obddev); + mds_fs_cleanup(obddev, failover); unlock_kernel(); + + /* 2 seems normal on mds, (may_umount() also expects 2 + fwiw), but we only see 1 at this point in obdfilter. 
*/ + if (atomic_read(&obddev->u.mds.mds_vfsmnt->mnt_count) > 2){ + CERROR("%s: mount point busy, mnt_count: %d\n", + obddev->obd_name, + atomic_read(&obddev->u.mds.mds_vfsmnt->mnt_count)); + } + mntput(mds->mds_vfsmnt); mds->mds_sb = 0; ldlm_namespace_free(obddev->obd_namespace); + if (obddev->obd_recovering) + target_cancel_recovery_timer(obddev); lock_kernel(); #ifdef CONFIG_DEV_RDONLY dev_clear_rdonly(2); @@ -1503,18 +1573,32 @@ static int mds_cleanup(struct obd_device *obddev) inline void fixup_handle_for_resent_req(struct ptlrpc_request *req, struct lustre_handle *lockh) { - struct mds_export_data *med = &req->rq_export->exp_mds_data; - struct mds_client_data *mcd = med->med_mcd; - struct ptlrpc_request *oldrep = med->med_outstanding_reply; - struct ldlm_reply *dlm_rep; + struct obd_export *exp = req->rq_export; + struct obd_device *obd = exp->exp_obd; + struct ldlm_request *dlmreq = + lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*dlmreq)); + struct lustre_handle remote_hdl = dlmreq->lock_handle1; + struct list_head *iter; + + if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT)) + return; + + l_lock(&obd->obd_namespace->ns_lock); + list_for_each(iter, &exp->exp_ldlm_data.led_held_locks) { + struct ldlm_lock *lock; + lock = list_entry(iter, struct ldlm_lock, l_export_chain); + if (lock->l_remote_handle.cookie == remote_hdl.cookie) { + lockh->cookie = lock->l_handle.h_cookie; + DEBUG_REQ(D_HA, req, "restoring lock cookie "LPX64, + lockh->cookie); + l_unlock(&obd->obd_namespace->ns_lock); + return; + } - if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) && - (mcd->mcd_last_xid == req->rq_xid) && (oldrep != NULL)) { - DEBUG_REQ(D_HA, req, "restoring lock handle from %p", oldrep); - dlm_rep = lustre_msg_buf(oldrep->rq_repmsg, 0); - lockh->addr = dlm_rep->lock_handle.addr; - lockh->cookie = dlm_rep->lock_handle.cookie; } + l_unlock(&obd->obd_namespace->ns_lock); + DEBUG_REQ(D_HA, req, "no existing lock with rhandle "LPX64, + remote_hdl.cookie); } static int 
ldlm_intent_policy(struct ldlm_namespace *ns, @@ -1531,17 +1615,23 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns, if (req->rq_reqmsg->bufcount > 1) { /* an intent needs to be considered */ - struct ldlm_intent *it = lustre_msg_buf(req->rq_reqmsg, 1); + struct ldlm_intent *it; struct mds_obd *mds = &req->rq_export->exp_obd->u.mds; struct mds_body *mds_body; struct ldlm_reply *rep; - struct lustre_handle lockh; + struct lustre_handle lockh = { 0 }; struct ldlm_lock *new_lock; int rc, offset = 2, repsize[3] = {sizeof(struct ldlm_reply), sizeof(struct mds_body), mds->mds_max_mdsize}; - it->opc = NTOH__u64(it->opc); + it = lustre_swab_reqbuf (req, 1, sizeof (*it), + lustre_swab_ldlm_intent); + if (it == NULL) { + CERROR ("Intent missing\n"); + rc = req->rq_status = -EFAULT; + RETURN (rc); + } LDLM_DEBUG(lock, "intent policy, opc: %s", ldlm_it2str(it->opc)); @@ -1553,7 +1643,7 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns, RETURN(rc); } - rep = lustre_msg_buf(req->rq_repmsg, 0); + rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rep)); rep->lock_policy_res1 = IT_INTENT_EXEC; fixup_handle_for_resent_req(req, &lockh); @@ -1584,7 +1674,7 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns, rep->lock_policy_res2 = req->rq_status; RETURN(ELDLM_LOCK_ABORTED); } - mds_body = lustre_msg_buf(req->rq_repmsg, 1); + mds_body = lustre_msg_buf(req->rq_repmsg, 1, sizeof (*mds_body)); if (!(mds_body->valid & OBD_MD_FLEASIZE)) { rep->lock_policy_res2 = rc; RETURN(ELDLM_LOCK_ABORTED); @@ -1611,17 +1701,37 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns, LBUG(); } - if (flags & LDLM_FL_INTENT_ONLY) { - LDLM_DEBUG(lock, "INTENT_ONLY, aborting lock"); - RETURN(ELDLM_LOCK_ABORTED); - } - /* By this point, whatever function we called above must have * filled in 'lockh' or returned an error. We want to give the * new lock to the client instead of whatever lock it was about * to get. 
*/ new_lock = ldlm_handle2lock(&lockh); LASSERT(new_lock != NULL); + + /* If we've already given this lock to a client once, then we + * should have no readers or writers. Otherwise, we should + * have one reader _or_ writer ref (which will be zeroed below + * before returning the lock to a client. + */ + if (new_lock->l_export == req->rq_export) + LASSERT(new_lock->l_readers + new_lock->l_writers == 0); + else + LASSERT(new_lock->l_readers + new_lock->l_writers == 1); + + /* If we're running an intent only, we want to abort the new + * lock, and let the client abort the original lock. */ + if (flags & LDLM_FL_INTENT_ONLY) { + LDLM_DEBUG(lock, "INTENT_ONLY, aborting locks"); + l_lock(&new_lock->l_resource->lr_namespace->ns_lock); + if (new_lock->l_readers) + ldlm_lock_decref(&lockh, LCK_PR); + else + ldlm_lock_decref(&lockh, LCK_PW); + l_unlock(&new_lock->l_resource->lr_namespace->ns_lock); + LDLM_LOCK_PUT(new_lock); + RETURN(ELDLM_LOCK_ABORTED); + } + *lockp = new_lock; rep->lock_policy_res2 = req->rq_status; @@ -1629,14 +1739,13 @@ static int ldlm_intent_policy(struct ldlm_namespace *ns, if (new_lock->l_export == req->rq_export) { /* Already gave this to the client, which means that we * reconstructed a reply. 
*/ - LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & + LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT); RETURN(ELDLM_LOCK_REPLACED); } /* Fixup the lock to be given to the client */ l_lock(&new_lock->l_resource->lr_namespace->ns_lock); - LASSERT(new_lock->l_readers + new_lock->l_writers == 1); new_lock->l_readers = 0; new_lock->l_writers = 0; @@ -1706,7 +1815,8 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf) mds->mds_service = ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL, - mds_handle, "mds"); + mds_handle, "mds", obddev); + if (!mds->mds_service) { CERROR("failed to start service\n"); RETURN(rc = -ENOMEM); @@ -1726,7 +1836,7 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf) ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, MDS_SETATTR_PORTAL, MDC_REPLY_PORTAL, - mds_handle, "mds"); + mds_handle, "mds_setattr", obddev); if (!mds->mds_setattr_service) { CERROR("failed to start getattr service\n"); GOTO(err_thread, rc = -ENOMEM); @@ -1748,7 +1858,7 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf) ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, MDS_READPAGE_PORTAL, MDC_REPLY_PORTAL, - mds_handle, "mds"); + mds_handle, "mds_readpage", obddev); if (!mds->mds_readpage_service) { CERROR("failed to start readpage service\n"); GOTO(err_thread2, rc = -ENOMEM); @@ -1781,7 +1891,7 @@ err_thread: } -static int mdt_cleanup(struct obd_device *obddev) +static int mdt_cleanup(struct obd_device *obddev, int force, int failover) { struct mds_obd *mds = &obddev->u.mds; ENTRY; @@ -1803,14 +1913,15 @@ extern int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn, /* use obd ops to offer management infrastructure */ static struct obd_ops mds_obd_ops = { - o_owner: THIS_MODULE, - o_attach: mds_attach, - o_detach: mds_detach, - o_connect: mds_connect, - o_disconnect: mds_disconnect, - 
o_setup: mds_setup, - o_cleanup: mds_cleanup, - o_iocontrol: mds_iocontrol + o_owner: THIS_MODULE, + o_attach: mds_attach, + o_detach: mds_detach, + o_connect: mds_connect, + o_disconnect: mds_disconnect, + o_setup: mds_setup, + o_cleanup: mds_cleanup, + o_iocontrol: mds_iocontrol, + o_destroy_export: mds_destroy_export }; static struct obd_ops mdt_obd_ops = { @@ -1825,11 +1936,6 @@ static struct obd_ops mdt_obd_ops = { static int __init mds_init(void) { struct lprocfs_static_vars lvars; - mds_file_cache = kmem_cache_create("ll_mds_file_data", - sizeof(struct mds_file_data), - 0, 0, NULL, NULL); - if (mds_file_cache == NULL) - return -ENOMEM; lprocfs_init_multi_vars(0, &lvars); class_register_type(&mds_obd_ops, lvars.module_vars, LUSTRE_MDS_NAME); @@ -1845,8 +1951,6 @@ static void __exit mds_exit(void) ldlm_unregister_intent(); class_unregister_type(LUSTRE_MDS_NAME); class_unregister_type(LUSTRE_MDT_NAME); - if (kmem_cache_destroy(mds_file_cache)) - CERROR("couldn't free MDS file cache\n"); } MODULE_AUTHOR("Cluster File Systems, Inc. 
<info@clusterfs.com>"); diff --git a/lustre/mds/lproc_mds.c b/lustre/mds/lproc_mds.c index e4522fb..5d6fa57 100644 --- a/lustre/mds/lproc_mds.c +++ b/lustre/mds/lproc_mds.c @@ -37,8 +37,7 @@ struct lprocfs_vars lprocfs_mdt_module_vars[] = { {0} }; #else -static inline -int lprocfs_mds_statfs(void *data, struct statfs *sfs) +static inline int lprocfs_mds_statfs(void *data, struct statfs *sfs) { struct obd_device* dev = (struct obd_device*) data; struct mds_obd *mds; @@ -66,16 +65,28 @@ int rd_fstype(char *page, char **start, off_t off, int count, int *eof, return snprintf(page, count, "%s\n", obd->obd_fsops->fs_type); } +int lprocfs_mds_rd_mntdev(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device* obd = (struct obd_device *)data; + + LASSERT(obd != NULL); + LASSERT(obd->u.mds.mds_vfsmnt->mnt_devname); + *eof = 1; + return snprintf(page, count, "%s\n", + obd->u.mds.mds_vfsmnt->mnt_devname); +} struct lprocfs_vars lprocfs_mds_obd_vars[] = { { "uuid", lprocfs_rd_uuid, 0, 0 }, { "blocksize", rd_blksize, 0, 0 }, - { "bytestotal", rd_kbytestotal, 0, 0 }, + { "kbytestotal",rd_kbytestotal, 0, 0 }, { "kbytesfree", rd_kbytesfree, 0, 0 }, { "fstype", rd_fstype, 0, 0 }, { "filestotal", rd_filestotal, 0, 0 }, { "filesfree", rd_filesfree, 0, 0 }, { "filegroups", rd_filegroups, 0, 0 }, + { "mntdev", lprocfs_mds_rd_mntdev, 0, 0 }, { 0 } }; @@ -101,5 +112,5 @@ struct lprocfs_static_vars lprocfs_array_vars[] = { {lprocfs_mds_module_vars, lprocfs_mdt_obd_vars}}; LPROCFS_INIT_MULTI_VARS(lprocfs_array_vars, - (sizeof(lprocfs_array_vars)/ - sizeof(struct lprocfs_static_vars))) + (sizeof(lprocfs_array_vars) / + sizeof(struct lprocfs_static_vars))) diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index 7952101..cefc680 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -42,8 +42,6 @@ #define MDS_MAX_CLIENTS (PAGE_SIZE * 8) #define MDS_MAX_CLIENT_WORDS (MDS_MAX_CLIENTS / sizeof(unsigned long)) -static unsigned long 
last_rcvd_slots[MDS_MAX_CLIENT_WORDS]; - #define LAST_RCVD "last_rcvd" /* Add client data to the MDS. We use a bitmap to locate a free space @@ -51,29 +49,37 @@ static unsigned long last_rcvd_slots[MDS_MAX_CLIENT_WORDS]; * Otherwise, we have just read the data from the last_rcvd file and * we know its offset. */ -int mds_client_add(struct mds_obd *mds, struct mds_export_data *med, int cl_off) +int mds_client_add(struct obd_device *obd, struct mds_obd *mds, + struct mds_export_data *med, int cl_off) { + unsigned long *bitmap = mds->mds_client_bitmap; int new_client = (cl_off == -1); + LASSERT(bitmap != NULL); + + /* XXX if mcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ + if (!strcmp(med->med_mcd->mcd_uuid, "OBD_CLASS_UUID")) + RETURN(0); + /* the bitmap operations can handle cl_off > sizeof(long) * 8, so * there's no need for extra complication here */ if (new_client) { - cl_off = find_first_zero_bit(last_rcvd_slots, MDS_MAX_CLIENTS); + cl_off = find_first_zero_bit(bitmap, MDS_MAX_CLIENTS); repeat: if (cl_off >= MDS_MAX_CLIENTS) { CERROR("no room for clients - fix MDS_MAX_CLIENTS\n"); return -ENOMEM; } - if (test_and_set_bit(cl_off, last_rcvd_slots)) { + if (test_and_set_bit(cl_off, bitmap)) { CERROR("MDS client %d: found bit is set in bitmap\n", cl_off); - cl_off = find_next_zero_bit(last_rcvd_slots, - MDS_MAX_CLIENTS, cl_off); + cl_off = find_next_zero_bit(bitmap, MDS_MAX_CLIENTS, + cl_off); goto repeat; } } else { - if (test_and_set_bit(cl_off, last_rcvd_slots)) { + if (test_and_set_bit(cl_off, bitmap)) { CERROR("MDS client %d: bit already set in bitmap!!\n", cl_off); LBUG(); @@ -89,11 +95,36 @@ int mds_client_add(struct mds_obd *mds, struct mds_export_data *med, int cl_off) struct obd_run_ctxt saved; loff_t off = MDS_LR_CLIENT + (cl_off * MDS_LR_SIZE); ssize_t written; + void *handle; push_ctxt(&saved, &mds->mds_ctxt, NULL); - written = lustre_fwrite(mds->mds_rcvd_filp, - (char *)med->med_mcd, - sizeof(*med->med_mcd), &off); + /* We need to start 
a transaction here first, to avoid a + * possible ordering deadlock on last_rcvd->i_sem and the + * journal lock. In most places we start the journal handle + * first (because we do compound transactions), and then + * later do the write into last_rcvd, which gets i_sem. + * + * Without this transaction, clients connecting at the same + * time other MDS operations are ongoing get last_rcvd->i_sem + * first (in generic_file_write()) and start the journal + * transaction afterwards, and can deadlock with other ops. + * + * We use FSFILT_OP_SETATTR because it is smallest, but all + * ops include enough space for the last_rcvd update so we + * could use any of them, or maybe an FSFILT_OP_NONE is best? + */ + handle = fsfilt_start(obd,mds->mds_rcvd_filp->f_dentry->d_inode, + FSFILT_OP_SETATTR); + if (IS_ERR(handle)) { + written = PTR_ERR(handle); + CERROR("unable to start transaction: rc %d\n", + (int)written); + } else { + written = lustre_fwrite(mds->mds_rcvd_filp,med->med_mcd, + sizeof(*med->med_mcd), &off); + fsfilt_commit(obd,mds->mds_rcvd_filp->f_dentry->d_inode, + handle, 0); + } pop_ctxt(&saved, &mds->mds_ctxt, NULL); if (written != sizeof(*med->med_mcd)) { @@ -115,17 +146,23 @@ int mds_client_free(struct obd_export *exp) struct mds_client_data zero_mcd; struct obd_run_ctxt saved; int written; + unsigned long *bitmap = mds->mds_client_bitmap; loff_t off; + LASSERT(bitmap); if (!med->med_mcd) RETURN(0); + /* XXX if mcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ + if (!strcmp(med->med_mcd->mcd_uuid, "OBD_CLASS_UUID")) + GOTO(free_and_out, 0); + off = MDS_LR_CLIENT + (med->med_off * MDS_LR_SIZE); CDEBUG(D_INFO, "freeing client at offset %u (%lld)with UUID '%s'\n", med->med_off, off, med->med_mcd->mcd_uuid); - if (!test_and_clear_bit(med->med_off, last_rcvd_slots)) { + if (!test_and_clear_bit(med->med_off, bitmap)) { CERROR("MDS client %u: bit already clear in bitmap!!\n", med->med_off); LBUG(); @@ -146,6 +183,7 @@ int mds_client_free(struct 
obd_export *exp) med->med_mcd->mcd_uuid, med->med_off); } + free_and_out: OBD_FREE(med->med_mcd, sizeof(*med->med_mcd)); return 0; @@ -153,6 +191,8 @@ int mds_client_free(struct obd_export *exp) static int mds_server_free_data(struct mds_obd *mds) { + OBD_FREE(mds->mds_client_bitmap, + MDS_MAX_CLIENT_WORDS * sizeof(unsigned long)); OBD_FREE(mds->mds_server_data, sizeof(*mds->mds_server_data)); mds->mds_server_data = NULL; @@ -170,18 +210,27 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f) __u64 last_transno = 0; __u64 last_mount; int rc = 0; - + LASSERT(sizeof(struct mds_client_data) == MDS_LR_SIZE); LASSERT(sizeof(struct mds_server_data) <= MDS_LR_CLIENT); OBD_ALLOC(msd, sizeof(*msd)); if (!msd) RETURN(-ENOMEM); + + OBD_ALLOC(mds->mds_client_bitmap, + MDS_MAX_CLIENT_WORDS * sizeof(unsigned long)); + if (!mds->mds_client_bitmap) { + OBD_FREE(msd, sizeof(*msd)); + RETURN(-ENOMEM); + } + rc = lustre_fread(f, (char *)msd, sizeof(*msd), &off); mds->mds_server_data = msd; if (rc == 0) { - CERROR("empty MDS %s, new MDS?\n", LAST_RCVD); + CERROR("%s: empty MDS %s, new MDS?\n", obddev->obd_name, + LAST_RCVD); RETURN(0); } @@ -252,21 +301,21 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f) sizeof exp->exp_client_uuid.uuid); med = &exp->exp_mds_data; med->med_mcd = mcd; - mds_client_add(mds, med, cl_off); + mds_client_add(obddev, mds, med, cl_off); /* create helper if export init gets more complex */ INIT_LIST_HEAD(&med->med_open_head); spin_lock_init(&med->med_open_lock); mcd = NULL; obddev->obd_recoverable_clients++; + class_export_put(exp); } else { - CDEBUG(D_INFO, - "discarded client %d, UUID '%s', count %Ld\n", - cl_off, mcd->mcd_uuid, - (long long)le64_to_cpu(mcd->mcd_mount_count)); + CDEBUG(D_INFO, "discarded client %d, UUID '%s', count " + LPU64"\n", cl_off, mcd->mcd_uuid, + le64_to_cpu(mcd->mcd_mount_count)); } - CDEBUG(D_OTHER, "client at offset %d has last_rcvd = %Lu\n", + CDEBUG(D_OTHER, "client at offset %d 
has last_transno = %Lu\n", cl_off, (unsigned long long)last_transno); if (last_transno > mds->mds_last_transno) @@ -280,7 +329,7 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f) obddev->obd_recoverable_clients, mds->mds_last_transno); obddev->obd_next_recovery_transno = obddev->obd_last_committed + 1; - obddev->obd_flags |= OBD_RECOVERING; + obddev->obd_recovering = 1; } if (mcd) @@ -315,14 +364,14 @@ static int mds_fs_prep(struct obd_device *obddev) dput(dentry); - dentry = simple_mkdir(current->fs->pwd, "FH", 0700); - if (IS_ERR(dentry)) { - rc = PTR_ERR(dentry); - CERROR("cannot create FH directory: rc = %d\n", rc); + dentry = lookup_one_len("__iopen__", current->fs->pwd, + strlen("__iopen__")); + if (IS_ERR(dentry) || !dentry->d_inode) { + rc = (IS_ERR(dentry)) ? PTR_ERR(dentry): -ENOENT; + CERROR("cannot open iopen FH directory: rc = %d\n", rc); GOTO(err_pop, rc); } - /* XXX probably want to hold on to this later... */ - dput(dentry); + mds->mds_fid_de = dentry; f = filp_open(LAST_RCVD, O_RDWR | O_CREAT, 0644); if (IS_ERR(f)) { @@ -354,7 +403,7 @@ err_pop: return rc; err_client: - class_disconnect_all(obddev); + class_disconnect_exports(obddev, 0); err_filp: if (filp_close(f, 0)) CERROR("can't close %s after error\n", LAST_RCVD); @@ -372,28 +421,33 @@ int mds_fs_setup(struct obd_device *obddev, struct vfsmount *mnt) mds->mds_ctxt.pwdmnt = mnt; mds->mds_ctxt.pwd = mnt->mnt_root; mds->mds_ctxt.fs = get_ds(); - RETURN(mds_fs_prep(obddev)); } -int mds_fs_cleanup(struct obd_device *obddev) +int mds_fs_cleanup(struct obd_device *obddev, int failover) { struct mds_obd *mds = &obddev->u.mds; struct obd_run_ctxt saved; int rc = 0; - class_disconnect_all(obddev); /* this cleans up client info too */ + if (failover) + CERROR("%s: shutting down for failover; client state will" + " be preserved.\n", obddev->obd_name); + + class_disconnect_exports(obddev, failover); /* this cleans up client + info too */ mds_server_free_data(mds); 
push_ctxt(&saved, &mds->mds_ctxt, NULL); if (mds->mds_rcvd_filp) { rc = filp_close(mds->mds_rcvd_filp, 0); mds->mds_rcvd_filp = NULL; - if (rc) CERROR("last_rcvd file won't close, rc=%d\n", rc); } pop_ctxt(&saved, &mds->mds_ctxt, NULL); + shrink_dcache_parent(mds->mds_fid_de); + dput(mds->mds_fid_de); return rc; } diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h new file mode 100644 index 0000000..0b62a92 --- /dev/null +++ b/lustre/mds/mds_internal.h @@ -0,0 +1,15 @@ +struct mds_file_data *mds_mfd_new(void); +void mds_mfd_put(struct mds_file_data *mfd); +void mds_mfd_destroy(struct mds_file_data *mfd); +int mds_update_unpack(struct ptlrpc_request *, int offset, + struct mds_update_record *); + +/* mds/mds_fs.c */ +int mds_client_add(struct obd_device *obd, struct mds_obd *mds, + struct mds_export_data *med, int cl_off); +int mds_client_free(struct obd_export *exp); + +#ifdef __KERNEL__ +void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode); +void mds_pack_inode2body(struct mds_body *body, struct inode *inode); +#endif diff --git a/lustre/mds/mds_lib.c b/lustre/mds/mds_lib.c new file mode 100644 index 0000000..8f16795 --- /dev/null +++ b/lustre/mds/mds_lib.c @@ -0,0 +1,310 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_MDS + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/version.h> +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +# include <linux/locks.h> // for wait_on_buffer +#else +# include <linux/buffer_head.h> // for wait_on_buffer +#endif +#include <linux/unistd.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include <linux/fs.h> +#include <linux/stat.h> +#include <asm/uaccess.h> +#include <linux/slab.h> +#include <asm/segment.h> + +#include <linux/obd_support.h> +#include <linux/lustre_lib.h> +#include <linux/lustre_mds.h> +#include <linux/lustre_lite.h> + +void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode) +{ + fid->id = inode->i_ino; + fid->generation = inode->i_generation; + fid->f_type = (S_IFMT & inode->i_mode); +} + +void mds_pack_inode2body(struct mds_body *b, struct inode *inode) +{ + b->valid = OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | + OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | + OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLTYPE | OBD_MD_FLMODE | + OBD_MD_FLNLINK | OBD_MD_FLGENER; + + /* The MDS file size isn't authoritative for regular files, so don't + * even pretend. 
*/ + if (S_ISREG(inode->i_mode)) + b->valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); + + b->ino = inode->i_ino; + b->atime = LTIME_S(inode->i_atime); + b->mtime = LTIME_S(inode->i_mtime); + b->ctime = LTIME_S(inode->i_ctime); + b->mode = inode->i_mode; + b->size = inode->i_size; + b->blocks = inode->i_blocks; + b->uid = inode->i_uid; + b->gid = inode->i_gid; + b->flags = inode->i_flags; + b->rdev = inode->i_rdev; + b->nlink = inode->i_nlink; + b->generation = inode->i_generation; + b->suppgid = -1; +} +/* unpacking */ +static int mds_setattr_unpack(struct ptlrpc_request *req, int offset, + struct mds_update_record *r) +{ + struct iattr *attr = &r->ur_iattr; + struct mds_rec_setattr *rec; + ENTRY; + + rec = lustre_swab_reqbuf (req, offset, sizeof (*rec), + lustre_swab_mds_rec_setattr); + if (rec == NULL) + RETURN (-EFAULT); + + r->ur_fsuid = rec->sa_fsuid; + r->ur_fsgid = rec->sa_fsgid; + r->ur_cap = rec->sa_cap; + r->ur_suppgid1 = rec->sa_suppgid; + r->ur_suppgid2 = -1; + r->ur_fid1 = &rec->sa_fid; + attr->ia_valid = rec->sa_valid; + attr->ia_mode = rec->sa_mode; + attr->ia_uid = rec->sa_uid; + attr->ia_gid = rec->sa_gid; + attr->ia_size = rec->sa_size; + LTIME_S(attr->ia_atime) = rec->sa_atime; + LTIME_S(attr->ia_mtime) = rec->sa_mtime; + LTIME_S(attr->ia_ctime) = rec->sa_ctime; + attr->ia_attr_flags = rec->sa_attr_flags; + + LASSERT_REQSWAB (req, offset + 1); + if (req->rq_reqmsg->bufcount > offset + 1) { + r->ur_eadata = lustre_msg_buf (req->rq_reqmsg, + offset + 1, 0); + if (r->ur_eadata == NULL) + RETURN (-EFAULT); + r->ur_eadatalen = req->rq_reqmsg->buflens[offset + 1]; + } else { + r->ur_eadata = NULL; + r->ur_eadatalen = 0; + } + + RETURN(0); +} + +static int mds_create_unpack(struct ptlrpc_request *req, int offset, + struct mds_update_record *r) +{ + struct mds_rec_create *rec; + ENTRY; + + rec = lustre_swab_reqbuf (req, offset, sizeof (*rec), + lustre_swab_mds_rec_create); + if (rec == NULL) + RETURN (-EFAULT); + + r->ur_fsuid = rec->cr_fsuid; + r->ur_fsgid = 
rec->cr_fsgid; + r->ur_cap = rec->cr_cap; + r->ur_fid1 = &rec->cr_fid; + r->ur_fid2 = &rec->cr_replayfid; + r->ur_mode = rec->cr_mode; + r->ur_rdev = rec->cr_rdev; + r->ur_uid = rec->cr_uid; + r->ur_gid = rec->cr_gid; + r->ur_time = rec->cr_time; + r->ur_flags = rec->cr_flags; + r->ur_suppgid1 = rec->cr_suppgid; + r->ur_suppgid2 = -1; + + LASSERT_REQSWAB (req, offset + 1); + r->ur_name = lustre_msg_string (req->rq_reqmsg, offset + 1, 0); + if (r->ur_name == NULL) + RETURN (-EFAULT); + r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; + + LASSERT_REQSWAB (req, offset + 2); + if (req->rq_reqmsg->bufcount > offset + 2) { + /* NB for now, we only seem to pass NULL terminated symlink + * target strings here. If this ever changes, we'll have + * to stop checking for a buffer filled completely with a + * NULL terminated string here, and make the callers check + * depending on what they expect. We should probably stash + * it in r->ur_eadata in that case, so it's obvious... -eeb + */ + r->ur_tgt = lustre_msg_string(req->rq_reqmsg, offset + 2, 0); + if (r->ur_tgt == NULL) + RETURN (-EFAULT); + r->ur_tgtlen = req->rq_reqmsg->buflens[offset + 2]; + } else { + r->ur_tgt = NULL; + r->ur_tgtlen = 0; + } + RETURN(0); +} + +static int mds_link_unpack(struct ptlrpc_request *req, int offset, + struct mds_update_record *r) +{ + struct mds_rec_link *rec; + ENTRY; + + rec = lustre_swab_reqbuf (req, offset, sizeof (*rec), + lustre_swab_mds_rec_link); + if (rec == NULL) + RETURN (-EFAULT); + + r->ur_fsuid = rec->lk_fsuid; + r->ur_fsgid = rec->lk_fsgid; + r->ur_cap = rec->lk_cap; + r->ur_suppgid1 = rec->lk_suppgid1; + r->ur_suppgid2 = rec->lk_suppgid2; + r->ur_fid1 = &rec->lk_fid1; + r->ur_fid2 = &rec->lk_fid2; + + LASSERT_REQSWAB (req, offset + 1); + r->ur_name = lustre_msg_string (req->rq_reqmsg, offset + 1, 0); + if (r->ur_name == NULL) + RETURN (-EFAULT); + r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; + RETURN(0); +} + +static int mds_unlink_unpack(struct ptlrpc_request 
*req, int offset, + struct mds_update_record *r) +{ + struct mds_rec_unlink *rec; + ENTRY; + + rec = lustre_swab_reqbuf (req, offset, sizeof (*rec), + lustre_swab_mds_rec_unlink); + if (rec == NULL) + RETURN(-EFAULT); + + r->ur_fsuid = rec->ul_fsuid; + r->ur_fsgid = rec->ul_fsgid; + r->ur_cap = rec->ul_cap; + r->ur_mode = rec->ul_mode; + r->ur_suppgid1 = rec->ul_suppgid; + r->ur_suppgid2 = -1; + r->ur_fid1 = &rec->ul_fid1; + r->ur_fid2 = &rec->ul_fid2; + + LASSERT_REQSWAB (req, offset + 1); + r->ur_name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0); + if (r->ur_name == NULL) + RETURN(-EFAULT); + r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; + RETURN(0); +} + +static int mds_rename_unpack(struct ptlrpc_request *req, int offset, + struct mds_update_record *r) +{ + struct mds_rec_rename *rec; + ENTRY; + + rec = lustre_swab_reqbuf (req, offset, sizeof (*rec), + lustre_swab_mds_rec_rename); + if (rec == NULL) + RETURN(-EFAULT); + + r->ur_fsuid = rec->rn_fsuid; + r->ur_fsgid = rec->rn_fsgid; + r->ur_cap = rec->rn_cap; + r->ur_suppgid1 = rec->rn_suppgid1; + r->ur_suppgid2 = rec->rn_suppgid2; + r->ur_fid1 = &rec->rn_fid1; + r->ur_fid2 = &rec->rn_fid2; + + LASSERT_REQSWAB (req, offset + 1); + r->ur_name = lustre_msg_string(req->rq_reqmsg, offset + 1, 0); + if (r->ur_name == NULL) + RETURN(-EFAULT); + r->ur_namelen = req->rq_reqmsg->buflens[offset + 1]; + + LASSERT_REQSWAB (req, offset + 2); + r->ur_tgt = lustre_msg_string(req->rq_reqmsg, offset + 2, 0); + if (r->ur_tgt == NULL) + RETURN(-EFAULT); + r->ur_tgtlen = req->rq_reqmsg->buflens[offset + 2]; + RETURN(0); +} + +typedef int (*update_unpacker)(struct ptlrpc_request *req, int offset, + struct mds_update_record *r); + +static update_unpacker mds_unpackers[REINT_MAX + 1] = { + [REINT_SETATTR] mds_setattr_unpack, + [REINT_CREATE] mds_create_unpack, + [REINT_LINK] mds_link_unpack, + [REINT_UNLINK] mds_unlink_unpack, + [REINT_RENAME] mds_rename_unpack, + [REINT_OPEN] mds_create_unpack, +}; + +int 
mds_update_unpack(struct ptlrpc_request *req, int offset, + struct mds_update_record *rec) +{ + __u32 *opcodep; + __u32 opcode; + int rc; + ENTRY; + + /* NB don't lustre_swab_reqbuf() here. We're just taking a peek + * and we want to leave it to the specific unpacker once we've + * identified the message type */ + opcodep = lustre_msg_buf (req->rq_reqmsg, offset, sizeof (*opcodep)); + if (opcodep == NULL) + RETURN(-EFAULT); + + opcode = *opcodep; + if (lustre_msg_swabbed (req->rq_reqmsg)) + __swab32s (&opcode); + + if (opcode > REINT_MAX || + mds_unpackers[opcode] == NULL) { + CERROR ("Unexpected opcode %d\n", opcode); + RETURN(-EFAULT); + } + + rec->ur_opcode = opcode; + rc = mds_unpackers[opcode](req, offset, rec); + RETURN(rc); +} diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index 796fcd2..02c53cc 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -33,14 +33,20 @@ #include <linux/obd_lov.h> #include <linux/lustre_lib.h> -/* lov_unpackdesc() is in lov/lov_pack.c */ +void le_lov_desc_to_cpu (struct lov_desc *ld) +{ + ld->ld_tgt_count = le32_to_cpu (ld->ld_tgt_count); + ld->ld_default_stripe_count = le32_to_cpu (ld->ld_default_stripe_count); + ld->ld_default_stripe_size = le32_to_cpu (ld->ld_default_stripe_size); + ld->ld_pattern = le32_to_cpu (ld->ld_pattern); +} -void lov_packdesc(struct lov_desc *ld) +void cpu_to_le_lov_desc (struct lov_desc *ld) { - ld->ld_tgt_count = HTON__u32(ld->ld_tgt_count); - ld->ld_default_stripe_count = HTON__u32(ld->ld_default_stripe_count); - ld->ld_default_stripe_size = HTON__u32(ld->ld_default_stripe_size); - ld->ld_pattern = HTON__u32(ld->ld_pattern); + ld->ld_tgt_count = cpu_to_le32 (ld->ld_tgt_count); + ld->ld_default_stripe_count = cpu_to_le32 (ld->ld_default_stripe_count); + ld->ld_default_stripe_size = cpu_to_le32 (ld->ld_default_stripe_size); + ld->ld_pattern = cpu_to_le32 (ld->ld_pattern); } int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc, @@ -52,6 +58,7 @@ int 
mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc, int tgt_count; int rc; int i; + struct lov_desc *disk_desc; ENTRY; tgt_count = desc->ld_tgt_count; @@ -76,36 +83,44 @@ int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc, RETURN(-EINVAL); } - memcpy(&mds->mds_lov_desc, desc, sizeof *desc); - mds->mds_has_lov_desc = 1; - /* XXX the MDS should not really know about this */ - mds->mds_max_mdsize = lov_mds_md_size(desc->ld_tgt_count); + OBD_ALLOC (disk_desc, sizeof (*disk_desc)); + if (disk_desc == NULL) { + CERROR ("Can't allocate disk_desc\n"); + RETURN (-ENOMEM); + } - lov_packdesc(desc); + *disk_desc = *desc; + cpu_to_le_lov_desc (disk_desc); + rc = 0; push_ctxt(&saved, &mds->mds_ctxt, NULL); + + /* Bug 1186: FIXME: if there is an existing LOVDESC, verify new + * tgt_count > old */ f = filp_open("LOVDESC", O_CREAT|O_RDWR, 0644); if (IS_ERR(f)) { CERROR("Cannot open/create LOVDESC file\n"); GOTO(out, rc = PTR_ERR(f)); } -#warning FIXME: if there is an existing LOVDESC, verify new tgt_count > old - rc = lustre_fwrite(f, (char *)desc, sizeof(*desc), &f->f_pos); + rc = lustre_fwrite(f, (char *)disk_desc, sizeof(*disk_desc), &f->f_pos); if (filp_close(f, 0)) CERROR("Error closing LOVDESC file\n"); if (rc != sizeof(*desc)) { CERROR("Cannot open/create LOVDESC file\n"); - GOTO(out, rc = PTR_ERR(f)); + if (rc >= 0) + rc = -EIO; + GOTO(out, rc); } + /* Bug 1186: FIXME: if there is an existing LOVTGTS, verify + * existing UUIDs same */ f = filp_open("LOVTGTS", O_CREAT|O_RDWR, 0644); if (IS_ERR(f)) { CERROR("Cannot open/create LOVTGTS file\n"); GOTO(out, rc = PTR_ERR(f)); } -#warning FIXME: if there is an existing LOVTGTS, verify existing UUIDs same rc = 0; for (i = 0; i < tgt_count ; i++) { rc = lustre_fwrite(f, uuidarray[i].uuid, @@ -116,14 +131,21 @@ int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc, if (rc >= 0) rc = -EIO; break; - } else - rc = 0; + } + rc = 0; } if (filp_close(f, 0)) CERROR("Error closing LOVTGTS file\n"); + 
memcpy(&mds->mds_lov_desc, desc, sizeof *desc); + mds->mds_has_lov_desc = 1; + /* XXX the MDS should not really know about this */ + mds->mds_max_mdsize = lov_mds_md_size(desc->ld_tgt_count); + out: pop_ctxt(&saved, &mds->mds_ctxt, NULL); + OBD_FREE (disk_desc, sizeof (*disk_desc)); + RETURN(rc); } @@ -150,6 +172,9 @@ int mds_get_lovdesc(struct mds_obd *mds, struct lov_desc *desc) GOTO(out, rc = -EIO); } else rc = 0; + + le_lov_desc_to_cpu (desc); /* convert to my byte order */ + EXIT; out: pop_ctxt(&saved, &mds->mds_ctxt, NULL); @@ -192,7 +217,7 @@ out: } int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn, - int len, void *karg, void *uarg) + int len, void *karg, void *uarg) { struct obd_device *obd = class_conn2obd(conn); struct obd_ioctl_data *data = karg; @@ -236,11 +261,12 @@ int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn, CERROR("UUID array size too small\n"); RETURN(-ENOSPC); } - rc = mds_get_lovtgts(&obd->u.mds, desc->ld_tgt_count, uuidarray); + rc = mds_get_lovtgts(&obd->u.mds, desc->ld_tgt_count, + uuidarray); RETURN(rc); - case OBD_IOC_SET_READONLY: + case OBD_IOC_SET_READONLY: CERROR("setting device %s read-only\n", ll_bdevname(obd->u.mds.mds_sb->s_dev)); #ifdef CONFIG_DEV_RDONLY @@ -248,6 +274,11 @@ int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn, #endif RETURN(0); + case OBD_IOC_ABORT_RECOVERY: + CERROR("aborting recovery for device %s\n", obd->obd_name); + target_abort_recovery(obd); + RETURN(0); + default: RETURN(-EINVAL); } diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index 50ca592..d83e4ee 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -33,17 +33,18 @@ #include <linux/obd_class.h> #include <linux/random.h> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -#include <linux/buffer_head.h> -#include <linux/workqueue.h> +# include <linux/buffer_head.h> +# include <linux/workqueue.h> #else -#include <linux/locks.h> +# include <linux/locks.h> #endif #include <linux/obd_lov.h> 
#include <linux/lustre_mds.h> #include <linux/lustre_fsfilt.h> #include <linux/lprocfs_status.h> -extern kmem_cache_t *mds_file_cache; +#include "mds_internal.h" + extern inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req); int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle, struct ptlrpc_request *req, int rc, __u32 op_data); @@ -57,7 +58,53 @@ extern int enqueue_ordered_locks(int lock_mode, struct obd_device *obd, struct lustre_handle *c1_lockh, struct lustre_handle *c2_lockh); -void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req, +struct mds_file_data *mds_dentry_open(struct dentry *dentry, + struct vfsmount *mnt, + int flags, + struct ptlrpc_request *req) +{ + struct mds_export_data *med = &req->rq_export->exp_mds_data; + struct inode *inode; + int mode; + struct mds_file_data *mfd; + int error; + + mfd = mds_mfd_new(); + if (!mfd) { + CERROR("mds: out of memory\n"); + GOTO(cleanup_dentry, error = -ENOMEM); + } + + mode = (flags+1) & O_ACCMODE; + inode = dentry->d_inode; + + if (mode & FMODE_WRITE) { + error = get_write_access(inode); + if (error) + goto cleanup_mfd; + } + + mfd->mfd_mode = mode; + mfd->mfd_dentry = dentry; + mfd->mfd_xid = req->rq_xid; + + spin_lock(&med->med_open_lock); + list_add(&mfd->mfd_list, &med->med_open_head); + spin_unlock(&med->med_open_lock); + mds_mfd_put(mfd); + return mfd; + +cleanup_mfd: + mds_mfd_put(mfd); + mds_mfd_destroy(mfd); +cleanup_dentry: + dput(dentry); + mntput(mnt); + return ERR_PTR(error); +} + +void reconstruct_open(struct mds_update_record *rec, int offset, + struct ptlrpc_request *req, struct lustre_handle *child_lockh) { struct mds_export_data *med = &req->rq_export->exp_mds_data; @@ -66,21 +113,23 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req, struct mds_file_data *mfd; struct obd_device *obd = req->rq_export->exp_obd; struct dentry *parent, *child; - struct ldlm_reply *rep = lustre_msg_buf(req->rq_repmsg, 0); - 
struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1); + struct ldlm_reply *rep; + struct mds_body *body; int disp, rc; ENTRY; - ENTRY; + LASSERT(offset == 2); /* only called via intent */ + rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rep)); + body = lustre_msg_buf(req->rq_repmsg, 1, sizeof (*body)); /* copy rc, transno and disp; steal locks */ req->rq_transno = mcd->mcd_last_transno; req->rq_status = mcd->mcd_last_result; disp = rep->lock_policy_res1 = mcd->mcd_last_data; - - if (med->med_outstanding_reply) - mds_steal_ack_locks(med, req); - + + if (req->rq_export->exp_outstanding_reply) + mds_steal_ack_locks(req->rq_export, req); + /* We never care about these. */ disp &= ~(IT_OPEN_LOOKUP | IT_OPEN_POS | IT_OPEN_NEG); if (!disp) { @@ -91,10 +140,9 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req, parent = mds_fid2dentry(mds, rec->ur_fid1, NULL); LASSERT(!IS_ERR(parent)); - child = lookup_one_len(lustre_msg_buf(req->rq_reqmsg, 3), - parent, req->rq_reqmsg->buflens[3] - 1); + child = ll_lookup_one_len(rec->ur_name, parent, rec->ur_namelen - 1); LASSERT(!IS_ERR(child)); - + if (!child->d_inode) { GOTO(out_dput, 0); /* child not present to open */ } @@ -108,12 +156,8 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req, GOTO(out_dput, 0); } - if (!med->med_outstanding_reply) { - LBUG(); /* XXX need to get enqueue client lock */ - } - /* get lock (write for O_CREAT, read otherwise) */ - + mds_pack_inode2fid(&body->fid1, child->d_inode); mds_pack_inode2body(body, child->d_inode); if (S_ISREG(child->d_inode->i_mode)) { @@ -127,7 +171,7 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req, /* If we're opening a file without an EA, change to a write lock (unless we already have one). */ - + /* If we have -EEXIST as the status, and we were asked to create * exclusively, we can tell we failed because the file already existed. 
*/ @@ -150,7 +194,7 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req, GOTO(out_dput, 0); } - if (med->med_outstanding_reply) { + if (req->rq_export->exp_outstanding_reply) { struct list_head *t; mfd = NULL; /* XXX can we just look in the old reply to find the handle in @@ -164,28 +208,16 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req, /* if we're not recovering, it had better be found */ LASSERT(mfd); } else { - struct file *file; - mfd = kmem_cache_alloc(mds_file_cache, GFP_KERNEL); + mntget(mds->mds_vfsmnt); + mfd = mds_dentry_open(child, mds->mds_vfsmnt, + rec->ur_flags & ~(O_DIRECT | O_TRUNC), req); if (!mfd) { CERROR("mds: out of memory\n"); GOTO(out_dput, req->rq_status = -ENOMEM); } - mntget(mds->mds_vfsmnt); - file = dentry_open(child, mds->mds_vfsmnt, - rec->ur_flags & ~(O_DIRECT | O_TRUNC)); - LASSERT(!IS_ERR(file)); /* XXX -ENOMEM? */ - file->private_data = mfd; - mfd->mfd_file = file; - mfd->mfd_xid = req->rq_xid; - get_random_bytes(&mfd->mfd_servercookie, - sizeof(mfd->mfd_servercookie)); - spin_lock(&med->med_open_lock); - list_add(&mfd->mfd_list, &med->med_open_head); - spin_unlock(&med->med_open_lock); } - - body->handle.addr = (__u64)(unsigned long)mfd; - body->handle.cookie = mfd->mfd_servercookie; + + body->handle.cookie = mfd->mfd_handle.h_cookie; out_dput: l_dput(child); @@ -196,11 +228,13 @@ void reconstruct_open(struct mds_update_record *rec, struct ptlrpc_request *req, int mds_open(struct mds_update_record *rec, int offset, struct ptlrpc_request *req, struct lustre_handle *child_lockh) { + static const char acc_table [] = {[O_RDONLY] MAY_READ, + [O_WRONLY] MAY_WRITE, + [O_RDWR] MAY_READ | MAY_WRITE}; struct mds_obd *mds = mds_req2mds(req); struct obd_device *obd = req->rq_export->exp_obd; - struct ldlm_reply *rep = lustre_msg_buf(req->rq_repmsg, 0); - struct file *file; - struct mds_body *body = lustre_msg_buf(req->rq_repmsg, 1); + struct ldlm_reply *rep; + struct mds_body 
*body; struct dentry *dchild = NULL, *parent; struct mds_export_data *med; struct mds_file_data *mfd = NULL; @@ -209,9 +243,14 @@ int mds_open(struct mds_update_record *rec, int offset, int rc = 0, parent_mode, child_mode = LCK_PR, lock_flags, created = 0; int cleanup_phase = 0; void *handle = NULL; + int acc_mode; ENTRY; - MDS_CHECK_RESENT(req, reconstruct_open(rec, req, child_lockh)); + LASSERT(offset == 2); /* only called via intent */ + rep = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rep)); + body = lustre_msg_buf(req->rq_repmsg, 1, sizeof (*body)); + + MDS_CHECK_RESENT(req, reconstruct_open(rec, offset, req, child_lockh)); med = &req->rq_export->exp_mds_data; rep->lock_policy_res1 |= IT_OPEN_LOOKUP; @@ -221,6 +260,12 @@ int mds_open(struct mds_update_record *rec, int offset, RETURN(-ENOMEM); } + if ((rec->ur_flags & O_ACCMODE) >= sizeof (acc_table)) + RETURN(-EINVAL); + acc_mode = acc_table [rec->ur_flags & O_ACCMODE]; + if ((rec->ur_flags & O_TRUNC) != 0) + acc_mode |= MAY_WRITE; + /* Step 1: Find and lock the parent */ parent_mode = (rec->ur_flags & O_CREAT) ? 
LCK_PW : LCK_PR; parent = mds_fid2locked_dentry(obd, rec->ur_fid1, NULL, parent_mode, @@ -235,8 +280,7 @@ int mds_open(struct mds_update_record *rec, int offset, cleanup_phase = 1; /* parent dentry and lock */ /* Step 2: Lookup the child */ - dchild = lookup_one_len(lustre_msg_buf(req->rq_reqmsg, 3), - parent, req->rq_reqmsg->buflens[3] - 1); + dchild = ll_lookup_one_len(rec->ur_name, parent, rec->ur_namelen - 1); if (IS_ERR(dchild)) GOTO(cleanup, rc = PTR_ERR(dchild)); @@ -267,6 +311,7 @@ int mds_open(struct mds_update_record *rec, int offset, GOTO(cleanup, rc); created = 1; child_mode = LCK_PW; + acc_mode = 0; /* Don't check for permissions */ } /* Step 4: It's positive, so lock the child */ @@ -277,7 +322,7 @@ int mds_open(struct mds_update_record *rec, int offset, rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, child_res_id, LDLM_PLAIN, NULL, 0, child_mode, &lock_flags, ldlm_completion_ast, - mds_blocking_ast, NULL, NULL, child_lockh); + mds_blocking_ast, NULL, child_lockh); if (rc != ELDLM_OK) { CERROR("ldlm_cli_enqueue: %d\n", rc); GOTO(cleanup, rc = -EIO); @@ -287,21 +332,32 @@ int mds_open(struct mds_update_record *rec, int offset, mds_pack_inode2fid(&body->fid1, dchild->d_inode); mds_pack_inode2body(body, dchild->d_inode); + if (S_ISREG(dchild->d_inode->i_mode)) { + /* Check permissions etc */ + rc = permission(dchild->d_inode, acc_mode); + if (rc != 0) + GOTO(cleanup, rc); + + /* Can't write to a read-only file */ + if (IS_RDONLY(dchild->d_inode) && (acc_mode & MAY_WRITE) != 0) + GOTO(cleanup, rc = -EPERM); + + /* An append-only file must be opened in append mode for + * writing */ + if (IS_APPEND(dchild->d_inode) && + (acc_mode & MAY_WRITE) != 0 && + ((rec->ur_flags & O_APPEND) == 0 || + (rec->ur_flags & O_TRUNC) != 0)) + GOTO (cleanup, rc = -EPERM); + rc = mds_pack_md(obd, req->rq_repmsg, 2, body, dchild->d_inode); if (rc) GOTO(cleanup, rc); - } else { - /* If this isn't a regular file, we can't open it. 
*/ - - /* We want to drop the child dentry, because we're not returning - * failure (which would do this for us in step 2), and we're not - * handing it off to the open file in dentry_open. */ - l_dput(dchild); - GOTO(cleanup, rc = 0); /* returns the lock to the client */ } - if (!created && (rec->ur_flags & O_CREAT) && (rec->ur_flags & O_EXCL)) { + if (!created && (rec->ur_flags & O_CREAT) && + (rec->ur_flags & O_EXCL)) { /* File already exists, we didn't just create it, and we * were passed O_EXCL; err-or. */ GOTO(cleanup, rc = -EEXIST); // returns a lock to the client @@ -309,43 +365,33 @@ int mds_open(struct mds_update_record *rec, int offset, /* If we're opening a file without an EA, the client needs a write * lock. */ - if (child_mode != LCK_PW && !(body->valid & OBD_MD_FLEASIZE)) { + if (S_ISREG(dchild->d_inode->i_mode) && + child_mode != LCK_PW && !(body->valid & OBD_MD_FLEASIZE)) { ldlm_lock_decref(child_lockh, child_mode); child_mode = LCK_PW; goto reacquire; } - /* Step 5: Open it */ + /* if we are following a symlink, don't open */ + if (S_ISLNK(dchild->d_inode->i_mode)) + GOTO(cleanup, rc = 0); + + /* Step 5: mds_open it */ rep->lock_policy_res1 |= IT_OPEN_OPEN; - mfd = kmem_cache_alloc(mds_file_cache, GFP_KERNEL); + + /* dentry_open does a dput(de) and mntput(mds->mds_vfsmnt) on error */ + mfd = mds_dentry_open(dchild, mds->mds_vfsmnt, + rec->ur_flags & ~(O_DIRECT | O_TRUNC), req); if (!mfd) { CERROR("mds: out of memory\n"); + dchild = NULL; /* prevent a double dput in step 2 */ GOTO(cleanup, rc = -ENOMEM); } cleanup_phase = 4; /* mfd allocated */ - - /* dentry_open does a dput(de) and mntput(mds->mds_vfsmnt) on error */ - mntget(mds->mds_vfsmnt); - file = dentry_open(dchild, mds->mds_vfsmnt, - rec->ur_flags & ~(O_DIRECT | O_TRUNC)); - if (IS_ERR(file)) { - dchild = NULL; /* prevent a double dput in step 2 */ - GOTO(cleanup, rc = PTR_ERR(file)); - } - - file->private_data = mfd; - mfd->mfd_file = file; - mfd->mfd_xid = req->rq_xid; - 
get_random_bytes(&mfd->mfd_servercookie, sizeof(mfd->mfd_servercookie)); - spin_lock(&med->med_open_lock); - list_add(&mfd->mfd_list, &med->med_open_head); - spin_unlock(&med->med_open_lock); - - body->handle.addr = (__u64)(unsigned long)mfd; - body->handle.cookie = mfd->mfd_servercookie; - CDEBUG(D_INODE, "file %p: mfd %p, cookie "LPX64"\n", - mfd->mfd_file, mfd, mfd->mfd_servercookie); + body->handle.cookie = mfd->mfd_handle.h_cookie; + CDEBUG(D_INODE, "mfd %p, cookie "LPX64"\n", mfd, + mfd->mfd_handle.h_cookie); GOTO(cleanup, rc = 0); /* returns a lock to the client */ cleanup: @@ -353,18 +399,18 @@ int mds_open(struct mds_update_record *rec, int offset, req, rc, rep->lock_policy_res1); switch (cleanup_phase) { case 4: - if (rc) - kmem_cache_free(mds_file_cache, mfd); + if (rc && !S_ISLNK(dchild->d_inode->i_mode)) + mds_mfd_destroy(mfd); case 3: - /* This is the same logic as in the IT_OPEN part of + /* This is the same logic as in the IT_OPEN part of * ldlm_intent_policy: if we found the dentry, or we tried to * open it (meaning that we created, if it wasn't found), then * we return the lock to the caller and client. 
*/ if (!(rep->lock_policy_res1 & (IT_OPEN_OPEN | IT_OPEN_POS))) ldlm_lock_decref(child_lockh, child_mode); case 2: - if (rc) - l_dput(dchild); + if (rc || S_ISLNK(dchild->d_inode->i_mode)) + l_dput(dchild); case 1: l_dput(parent); if (rc) { diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index 583ba4a..823a7a6 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -37,15 +37,13 @@ #include <linux/lustre_mds.h> #include <linux/lustre_dlm.h> #include <linux/lustre_fsfilt.h> +#include "mds_internal.h" extern inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req); -static void mds_last_rcvd_cb(struct obd_device *obd, __u64 last_rcvd, int error) +static void mds_commit_cb(struct obd_device *obd, __u64 transno, int error) { - CDEBUG(D_HA, "got callback for last_rcvd "LPD64": rc = %d\n", - last_rcvd, error); - if (!error && last_rcvd > obd->obd_last_committed) - obd->obd_last_committed = last_rcvd; + obd_transno_commit_cb(obd, transno, error); } /* Assumes caller has already pushed us into the kernel context. 
*/ @@ -56,15 +54,19 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle, struct mds_export_data *med = &req->rq_export->exp_mds_data; struct mds_client_data *mcd = med->med_mcd; struct obd_device *obd = req->rq_export->exp_obd; - int started_handle = 0, err; + int err; __u64 transno; loff_t off; ssize_t written; ENTRY; - /* we don't allocate new transnos for replayed requests */ - if (req->rq_level == LUSTRE_CONN_RECOVD) - GOTO(out, rc = rc); + /* if the export has already been failed, we have no last_rcvd slot */ + if (req->rq_export->exp_failed) { + CERROR("committing transaction for disconnected client\n"); + if (handle) + GOTO(commit, rc); + GOTO(out, rc); + } if (!handle) { /* if we're starting our own xaction, use our own inode */ @@ -74,15 +76,17 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle, CERROR("fsfilt_start: %ld\n", PTR_ERR(handle)); GOTO(out, rc = PTR_ERR(handle)); } - started_handle = 1; } off = MDS_LR_CLIENT + med->med_off * MDS_LR_SIZE; - spin_lock(&mds->mds_transno_lock); - transno = ++mds->mds_last_transno; - spin_unlock(&mds->mds_transno_lock); - req->rq_repmsg->transno = req->rq_transno = HTON__u64(transno); + transno = req->rq_reqmsg->transno; + if (transno == 0) { + spin_lock(&mds->mds_transno_lock); + transno = ++mds->mds_last_transno; + spin_unlock(&mds->mds_transno_lock); + } + req->rq_repmsg->transno = req->rq_transno = transno; mcd->mcd_last_transno = cpu_to_le64(transno); mcd->mcd_mount_count = cpu_to_le64(mds->mds_mount_count); mcd->mcd_last_xid = cpu_to_le64(req->rq_xid); @@ -90,9 +94,8 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle, mcd->mcd_last_data = cpu_to_le32(op_data); fsfilt_set_last_rcvd(req->rq_export->exp_obd, transno, handle, - mds_last_rcvd_cb); - written = lustre_fwrite(mds->mds_rcvd_filp, (char *)mcd, sizeof(*mcd), - &off); + mds_commit_cb); + written = lustre_fwrite(mds->mds_rcvd_filp, mcd, sizeof(*mcd), &off); CDEBUG(D_INODE, "wrote 
trans "LPU64" client %s at #%u: written = " LPSZ"\n", transno, mcd->mcd_uuid, med->med_off, written); @@ -106,7 +109,8 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle, } } - err = fsfilt_commit(obd, i, handle); +commit: + err = fsfilt_commit(obd, i, handle, 0); if (err) { CERROR("error committing transaction: %d\n", err); if (!rc) @@ -125,11 +129,7 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *i, void *handle, */ int mds_fix_attr(struct inode *inode, struct mds_update_record *rec) { -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - time_t now = CURRENT_TIME; -#else - time_t now = CURRENT_TIME.tv_sec; -#endif + time_t now = LTIME_S(CURRENT_TIME); struct iattr *attr = &rec->ur_iattr; unsigned int ia_valid = attr->ia_valid; int error; @@ -142,19 +142,11 @@ int mds_fix_attr(struct inode *inode, struct mds_update_record *rec) if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) RETURN(-EPERM); -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - attr->ia_ctime = now; - if (!(ia_valid & ATTR_ATIME_SET)) - attr->ia_atime = now; - if (!(ia_valid & ATTR_MTIME_SET)) - attr->ia_mtime = now; -#else - attr->ia_ctime.tv_sec = now; + LTIME_S(attr->ia_ctime) = now; if (!(ia_valid & ATTR_ATIME_SET)) - attr->ia_atime.tv_sec = now; + LTIME_S(attr->ia_atime) = now; if (!(ia_valid & ATTR_MTIME_SET)) - attr->ia_mtime.tv_sec = now; -#endif + LTIME_S(attr->ia_mtime) = now; /* times */ if ((ia_valid & (ATTR_MTIME|ATTR_ATIME))==(ATTR_MTIME|ATTR_ATIME) && @@ -227,8 +219,8 @@ static void reconstruct_reint_setattr(struct mds_update_record *rec, req->rq_transno = mcd->mcd_last_transno; req->rq_status = mcd->mcd_last_result; - if (med->med_outstanding_reply) - mds_steal_ack_locks(med, req); + if (req->rq_export->exp_outstanding_reply) + mds_steal_ack_locks(req->rq_export, req); de = mds_fid2dentry(obd, rec->ur_fid1, NULL); if (IS_ERR(de)) { @@ -236,7 +228,7 @@ static void reconstruct_reint_setattr(struct mds_update_record *rec, return; } - body = 
lustre_msg_buf(req->rq_repmsg, 0); + body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body)); mds_pack_inode2fid(&body->fid1, de->d_inode); mds_pack_inode2body(body, de->d_inode); @@ -262,6 +254,8 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, int rc = 0, cleanup_phase = 0, err, locked = 0; ENTRY; + LASSERT(offset == 0); + MDS_CHECK_RESENT(req, reconstruct_reint_setattr(rec, offset, req)); if (rec->ur_iattr.ia_valid & ATTR_FROM_OPEN) { @@ -297,14 +291,14 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, GOTO(cleanup, rc); rc = fsfilt_setattr(obd, de, handle, &rec->ur_iattr); - if (rc == 0 && S_ISREG(inode->i_mode) && - req->rq_reqmsg->bufcount > 1) { + if (rc == 0 && + S_ISREG(inode->i_mode) && + rec->ur_eadata != NULL) { rc = fsfilt_set_md(obd, inode, handle, - lustre_msg_buf(req->rq_reqmsg, 1), - req->rq_reqmsg->buflens[1]); + rec->ur_eadata, rec->ur_eadatalen); } - body = lustre_msg_buf(req->rq_repmsg, 0); + body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body)); mds_pack_inode2fid(&body->fid1, inode); mds_pack_inode2body(body, inode); @@ -343,21 +337,21 @@ static void reconstruct_reint_create(struct mds_update_record *rec, int offset, struct mds_obd *obd = &req->rq_export->exp_obd->u.mds; struct dentry *parent, *child; struct mds_body *body; - + req->rq_transno = mcd->mcd_last_transno; req->rq_status = mcd->mcd_last_result; - if (med->med_outstanding_reply) - mds_steal_ack_locks(med, req); - + if (req->rq_export->exp_outstanding_reply) + mds_steal_ack_locks(req->rq_export, req); + if (req->rq_status) return; parent = mds_fid2dentry(obd, rec->ur_fid1, NULL); LASSERT(!IS_ERR(parent)); - child = lookup_one_len(rec->ur_name, parent, rec->ur_namelen - 1); + child = ll_lookup_one_len(rec->ur_name, parent, rec->ur_namelen - 1); LASSERT(!IS_ERR(child)); - body = lustre_msg_buf(req->rq_repmsg, offset); + body = lustre_msg_buf(req->rq_repmsg, offset, sizeof (*body)); mds_pack_inode2fid(&body->fid1, child->d_inode); 
mds_pack_inode2body(body, child->d_inode); l_dput(parent); @@ -401,7 +395,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, ldlm_lock_dump_handle(D_OTHER, &lockh); - dchild = lookup_one_len(rec->ur_name, de, rec->ur_namelen - 1); + dchild = ll_lookup_one_len(rec->ur_name, de, rec->ur_namelen - 1); if (IS_ERR(dchild)) { rc = PTR_ERR(dchild); CERROR("child lookup error %d\n", rc); @@ -421,8 +415,6 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, if (rec->ur_fid2->id) dchild->d_fsdata = (void *)(unsigned long)rec->ur_fid2->id; - else - LASSERT(!(rec->ur_opcode & REINT_REPLAYING)); switch (type) { case S_IFREG:{ @@ -445,7 +437,10 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, handle = fsfilt_start(obd, dir, FSFILT_OP_SYMLINK); if (IS_ERR(handle)) GOTO(cleanup, rc = PTR_ERR(handle)); - rc = vfs_symlink(dir, dchild, rec->ur_tgt); + if (rec->ur_tgt == NULL) /* no target supplied */ + rc = -EINVAL; /* -EPROTO? */ + else + rc = vfs_symlink(dir, dchild, rec->ur_tgt); EXIT; break; } @@ -480,15 +475,9 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, struct mds_body *body; created = 1; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - iattr.ia_atime = rec->ur_time; - iattr.ia_ctime = rec->ur_time; - iattr.ia_mtime = rec->ur_time; -#else - iattr.ia_atime.tv_sec = rec->ur_time; - iattr.ia_ctime.tv_sec = rec->ur_time; - iattr.ia_mtime.tv_sec = rec->ur_time; -#endif + LTIME_S(iattr.ia_atime) = rec->ur_time; + LTIME_S(iattr.ia_ctime) = rec->ur_time; + LTIME_S(iattr.ia_mtime) = rec->ur_time; iattr.ia_uid = rec->ur_uid; iattr.ia_gid = rec->ur_gid; iattr.ia_valid = ATTR_UID | ATTR_GID | ATTR_ATIME | @@ -511,7 +500,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, /* XXX should we abort here in case of error? 
*/ } - body = lustre_msg_buf(req->rq_repmsg, offset); + body = lustre_msg_buf(req->rq_repmsg, offset, sizeof (*body)); mds_pack_inode2fid(&body->fid1, inode); mds_pack_inode2body(body, inode); } @@ -519,7 +508,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, cleanup: err = mds_finish_transno(mds, dir, handle, req, rc, 0); - + if (rc && created) { /* Destroy the file we just created. This should not need * extra journal credits, as we have already modified all of @@ -604,7 +593,7 @@ int enqueue_ordered_locks(int lock_mode, struct obd_device *obd, rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, res_id[0], LDLM_PLAIN, NULL, 0, lock_mode, &flags, ldlm_completion_ast, mds_blocking_ast, NULL, - NULL, handles[0]); + handles[0]); if (rc != ELDLM_OK) RETURN(-EIO); ldlm_lock_dump_handle(D_OTHER, handles[0]); @@ -617,7 +606,7 @@ int enqueue_ordered_locks(int lock_mode, struct obd_device *obd, rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, res_id[1], LDLM_PLAIN, NULL, 0, lock_mode, &flags, ldlm_completion_ast, - mds_blocking_ast, NULL, 0, handles[1]); + mds_blocking_ast, NULL, handles[1]); if (rc != ELDLM_OK) { ldlm_lock_decref(handles[0], lock_mode); RETURN(-EIO); @@ -638,9 +627,9 @@ static void reconstruct_reint_unlink(struct mds_update_record *rec, int offset, req->rq_transno = mcd->mcd_last_transno; req->rq_status = mcd->mcd_last_result; - if (med->med_outstanding_reply) - mds_steal_ack_locks(med, req); - + if (req->rq_export->exp_outstanding_reply) + mds_steal_ack_locks(req->rq_export, req); + DEBUG_REQ(D_ERROR, req, "can't get EA for reconstructed unlink, leaking OST inodes"); } @@ -658,12 +647,13 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset, struct lustre_handle parent_lockh; void *handle = NULL; struct ldlm_res_id child_res_id = { .name = {0} }; - char *name; - int namelen, rc = 0, flags = 0, return_lock = 0; + int rc = 0, flags = 0, return_lock = 0; int cleanup_phase = 0; ENTRY; - 
MDS_CHECK_RESENT(req, reconstruct_reint_unlink(rec, offset, req, + LASSERT(offset == 0 || offset == 2); + + MDS_CHECK_RESENT(req, reconstruct_reint_unlink(rec, offset, req, child_lockh)); if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_UNLINK)) @@ -680,28 +670,18 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset, cleanup_phase = 1; /* Have parent dentry lock */ /* Step 2: Lookup the child */ - name = lustre_msg_buf(req->rq_reqmsg, offset + 1); - namelen = req->rq_reqmsg->buflens[offset + 1] - 1; - - dchild = lookup_one_len(name, dir_de, namelen); + dchild = ll_lookup_one_len(rec->ur_name, dir_de, rec->ur_namelen - 1); if (IS_ERR(dchild)) GOTO(cleanup, rc = PTR_ERR(dchild)); - + cleanup_phase = 2; /* child dentry */ child_inode = dchild->d_inode; if (child_inode == NULL) { - if (rec->ur_opcode & REINT_REPLAYING) { - CDEBUG(D_INODE, - "child missing (%lu/%s); OK for REPLAYING\n", - dir_inode->i_ino, rec->ur_name); - rc = 0; - } else { - CDEBUG(D_INODE, - "child doesn't exist (dir %lu, name %s)\n", - dir_inode->i_ino, rec->ur_name); - rc = -ENOENT; - } + CDEBUG(D_INODE, + "child doesn't exist (dir %lu, name %s)\n", + dir_inode->i_ino, rec->ur_name); + rc = -ENOENT; GOTO(cleanup, rc); } @@ -715,7 +695,7 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset, rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, child_res_id, LDLM_PLAIN, NULL, 0, LCK_EX, &flags, ldlm_completion_ast, mds_blocking_ast, - NULL, NULL, child_lockh); + NULL, child_lockh); if (rc != ELDLM_OK) GOTO(cleanup, rc); @@ -724,11 +704,12 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset, OBD_FAIL_WRITE(OBD_FAIL_MDS_REINT_UNLINK_WRITE, to_kdev_t(dir_inode->i_sb->s_dev)); - /* Slightly magical; see ldlm_intent_policy */ + /* ldlm_reply in buf[0] if called via intent */ if (offset) offset = 1; - body = lustre_msg_buf(req->rq_repmsg, offset); + body = lustre_msg_buf(req->rq_repmsg, offset, sizeof (*body)); + LASSERT(body != NULL); /* Step 4: Do 
the unlink: client decides between rmdir/unlink! * (bug 72) */ @@ -742,8 +723,7 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset, case S_IFREG: /* If this is the last reference to this inode, get the OBD EA * data first so the client can destroy OST objects */ - if ((child_inode->i_mode & S_IFMT) == S_IFREG && - child_inode->i_nlink == 1) { + if (S_ISREG(child_inode->i_mode) && child_inode->i_nlink == 1) { mds_pack_inode2fid(&body->fid1, child_inode); mds_pack_inode2body(body, child_inode); mds_pack_md(obd, req->rq_repmsg, offset + 1, @@ -763,7 +743,7 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset, rc = vfs_unlink(dir_inode, dchild); break; default: - CERROR("bad file type %o unlinking %s\n", rec->ur_mode, name); + CERROR("bad file type %o unlinking %s\n", rec->ur_mode, rec->ur_name); LBUG(); GOTO(cleanup, rc = -EINVAL); } @@ -807,11 +787,9 @@ static void reconstruct_reint_link(struct mds_update_record *rec, int offset, req->rq_transno = mcd->mcd_last_transno; req->rq_status = mcd->mcd_last_result; - - if (med->med_outstanding_reply) - mds_steal_ack_locks(med, req); - else - LBUG(); /* don't support it yet, but it'll be fun! 
*/ + + if (req->rq_export->exp_outstanding_reply) + mds_steal_ack_locks(req->rq_export, req); } static int mds_reint_link(struct mds_update_record *rec, int offset, @@ -829,6 +807,8 @@ static int mds_reint_link(struct mds_update_record *rec, int offset, int lock_mode = 0, rc = 0, cleanup_phase = 0; ENTRY; + LASSERT(offset == 0); + MDS_CHECK_RESENT(req, reconstruct_reint_link(rec, offset, req)); if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_LINK)) @@ -866,7 +846,7 @@ static int mds_reint_link(struct mds_update_record *rec, int offset, cleanup_phase = 3; /* locks */ /* Step 3: Lookup the child */ - dchild = lookup_one_len(rec->ur_name, de_tgt_dir, rec->ur_namelen - 1); + dchild = ll_lookup_one_len(rec->ur_name, de_tgt_dir, rec->ur_namelen-1); if (IS_ERR(dchild)) { CERROR("child lookup error %ld\n", PTR_ERR(dchild)); GOTO(cleanup, rc = PTR_ERR(dchild)); @@ -875,17 +855,9 @@ static int mds_reint_link(struct mds_update_record *rec, int offset, cleanup_phase = 4; /* child dentry */ if (dchild->d_inode) { - if (rec->ur_opcode & REINT_REPLAYING) { - /* XXX verify that the link is to the the right file? */ - CDEBUG(D_INODE, - "child exists (dir %lu, name %s) (REPLAYING)\n", - de_tgt_dir->d_inode->i_ino, rec->ur_name); - rc = 0; - } else { - CDEBUG(D_INODE, "child exists (dir %lu, name %s)\n", - de_tgt_dir->d_inode->i_ino, rec->ur_name); - rc = -EEXIST; - } + CDEBUG(D_INODE, "child exists (dir %lu, name %s)\n", + de_tgt_dir->d_inode->i_ino, rec->ur_name); + rc = -EEXIST; GOTO(cleanup, rc); } @@ -944,9 +916,9 @@ static void reconstruct_reint_rename(struct mds_update_record *rec, req->rq_transno = mcd->mcd_last_transno; req->rq_status = mcd->mcd_last_result; - - if (med->med_outstanding_reply) - mds_steal_ack_locks(med, req); + + if (req->rq_export->exp_outstanding_reply) + mds_steal_ack_locks(req->rq_export, req); else LBUG(); /* don't support it yet, but it'll be fun! 
*/ @@ -972,12 +944,14 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset, void *handle = NULL; ENTRY; + LASSERT(offset == 0); + MDS_CHECK_RESENT(req, reconstruct_reint_rename(rec, offset, req)); de_srcdir = mds_fid2dentry(mds, rec->ur_fid1, NULL); if (IS_ERR(de_srcdir)) GOTO(cleanup, rc = PTR_ERR(de_srcdir)); - + cleanup_phase = 1; /* source directory dentry */ de_tgtdir = mds_fid2dentry(mds, rec->ur_fid2, NULL); @@ -1014,7 +988,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset, cleanup_phase = 3; /* parent locks */ /* Step 2: Lookup the children */ - de_old = lookup_one_len(rec->ur_name, de_srcdir, rec->ur_namelen - 1); + de_old = ll_lookup_one_len(rec->ur_name, de_srcdir, rec->ur_namelen-1); if (IS_ERR(de_old)) { CERROR("old child lookup error (%*s): %ld\n", rec->ur_namelen - 1, rec->ur_name, PTR_ERR(de_old)); @@ -1031,7 +1005,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset, de_old->d_inode->i_ino == de_tgtdir->d_inode->i_ino) GOTO(cleanup, rc = -EINVAL); - de_new = lookup_one_len(rec->ur_tgt, de_tgtdir, rec->ur_tgtlen - 1); + de_new = ll_lookup_one_len(rec->ur_tgt, de_tgtdir, rec->ur_tgtlen - 1); if (IS_ERR(de_new)) { CERROR("new child lookup error (%*s): %ld\n", rec->ur_tgtlen - 1, rec->ur_tgt, PTR_ERR(de_new)); @@ -1054,7 +1028,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset, rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, c1_res_id, LDLM_PLAIN, NULL, 0, LCK_EX, &flags, ldlm_completion_ast, - mds_blocking_ast, NULL, NULL, + mds_blocking_ast, NULL, &(dlm_handles[2])); lock_count = 3; } else { @@ -1150,26 +1124,16 @@ int mds_reint_rec(struct mds_update_record *rec, int offset, { struct mds_obd *mds = mds_req2mds(req); struct obd_run_ctxt saved; - struct obd_ucred uc; - int realop = rec->ur_opcode & REINT_OPCODE_MASK, rc; + int rc; ENTRY; - if (realop < 1 || realop > REINT_MAX) { - CERROR("opcode %d not valid (%sREPLAYING)\n", realop, - rec->ur_opcode 
& REINT_REPLAYING ? "" : "not "); - rc = req->rq_status = -EINVAL; - RETURN(rc); - } - - uc.ouc_fsuid = rec->ur_fsuid; - uc.ouc_fsgid = rec->ur_fsgid; - uc.ouc_cap = rec->ur_cap; - uc.ouc_suppgid1 = rec->ur_suppgid1; - uc.ouc_suppgid2 = rec->ur_suppgid2; + /* checked by unpacker */ + LASSERT(rec->ur_opcode <= REINT_MAX && + reinters[rec->ur_opcode] != NULL); - push_ctxt(&saved, &mds->mds_ctxt, &uc); - rc = reinters[realop] (rec, offset, req, lockh); - pop_ctxt(&saved, &mds->mds_ctxt, &uc); + push_ctxt(&saved, &mds->mds_ctxt, &rec->ur_uc); + rc = reinters[rec->ur_opcode] (rec, offset, req, lockh); + pop_ctxt(&saved, &mds->mds_ctxt, &rec->ur_uc); RETURN(rc); } diff --git a/lustre/obdclass/Makefile.am b/lustre/obdclass/Makefile.am index fb04cc1..7b7c5b9 100644 --- a/lustre/obdclass/Makefile.am +++ b/lustre/obdclass/Makefile.am @@ -1,18 +1,17 @@ - # FIXME: we need to make it clear that obdclass.o depends on # lustre_build_version, or 'make -j2' breaks! DEFS= MODULE = obdclass -if LINUX25 -FSMOD = fsfilt_ext3 -else +if EXTN FSMOD = fsfilt_extN +else +FSMOD = fsfilt_ext3 endif if LIBLUSTRE lib_LIBRARIES = liblustreclass.a -liblustreclass_a_SOURCES = uuid.c statfs_pack.c genops.c debug.c class_obd.c lustre_handles.c lustre_peer.c lprocfs_status.c +liblustreclass_a_SOURCES = uuid.c statfs_pack.c genops.c debug.c class_obd.c lustre_handles.c lustre_peer.c lprocfs_status.c simple.c class_obd.o: lustre_version @@ -24,7 +23,8 @@ else modulefs_DATA = lustre_build_version obdclass.o $(FSMOD).o fsfilt_reiserfs.o EXTRA_PROGRAMS = obdclass $(FSMOD) fsfilt_reiserfs -obdclass_SOURCES = class_obd.c debug.c genops.c sysctl.c uuid.c lprocfs_status.c lustre_handles.c lustre_peer.c +obdclass_SOURCES = class_obd.c debug.c genops.c sysctl.c uuid.c simple.c +obdclass_SOURCES += lprocfs_status.c lustre_handles.c lustre_peer.c obdclass_SOURCES += fsfilt.c statfs_pack.c endif @@ -33,7 +33,7 @@ include $(top_srcdir)/Rules # XXX I'm sure there's some automake mv-if-different helper for this. 
lustre_build_version: perl $(top_srcdir)/scripts/version_tag.pl $(top_srcdir) $(top_builddir) > tmpver - cmp -z $(top_builddir)/include/linux/lustre_build_version.h tmpver \ + cmp -s $(top_builddir)/include/linux/lustre_build_version.h tmpver \ 2> /dev/null && \ $(RM) tmpver || \ mv tmpver $(top_builddir)/include/linux/lustre_build_version.h diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 6209d75..1e180a8 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -51,6 +51,7 @@ #include <asm/poll.h> #include <asm/uaccess.h> #include <linux/miscdevice.h> +#include <linux/smp_lock.h> #else # include <liblustre.h> @@ -60,7 +61,6 @@ #include <linux/obd_support.h> #include <linux/obd_class.h> #include <linux/lustre_debug.h> -#include <linux/smp_lock.h> #include <linux/lprocfs_status.h> #include <portals/lib-types.h> /* for PTL_MD_MAX_IOV */ #include <linux/lustre_build_version.h> @@ -77,9 +77,11 @@ struct proc_dir_entry *proc_lustre_root = NULL; /* The following are visible and mutable through /proc/sys/lustre/. 
*/ unsigned long obd_fail_loc; unsigned long obd_timeout = 100; -char obd_recovery_upcall[128] = "/usr/lib/lustre/ha_assist"; +unsigned long obd_bulk_timeout = 1; +char obd_lustre_upcall[128] = "/usr/lib/lustre/lustre_upcall"; unsigned long obd_sync_filter; /* = 0, don't sync by default */ +#ifdef __KERNEL__ /* opening /dev/obd */ static int obd_class_open(struct inode * inode, struct file * file) { @@ -93,10 +95,38 @@ static int obd_class_open(struct inode * inode, struct file * file) INIT_LIST_HEAD(&ocus->ocus_conns); file->private_data = ocus; - MOD_INC_USE_COUNT; + PORTAL_MODULE_USE; RETURN(0); } +/* closing /dev/obd */ +static int obd_class_release(struct inode * inode, struct file * file) +{ + struct obd_class_user_state *ocus = file->private_data; + struct obd_class_user_conn *c; + ENTRY; + + while (!list_empty (&ocus->ocus_conns)) { + c = list_entry (ocus->ocus_conns.next, + struct obd_class_user_conn, ocuc_chain); + list_del (&c->ocuc_chain); + + CDEBUG (D_IOCTL, "Auto-disconnect %p\n", &c->ocuc_conn); + + down (&obd_conf_sem); + obd_disconnect (&c->ocuc_conn, 0); + up (&obd_conf_sem); + + OBD_FREE (c, sizeof (*c)); + } + + OBD_FREE (ocus, sizeof (*ocus)); + + PORTAL_MODULE_UNUSE; + RETURN(0); +} +#endif + static int obd_class_add_user_conn (struct obd_class_user_state *ocus, struct lustre_handle *conn) @@ -125,7 +155,7 @@ obd_class_remove_user_conn (struct obd_class_user_state *ocus, list_for_each (e, &ocus->ocus_conns) { c = list_entry (e, struct obd_class_user_conn, ocuc_chain); - if (!memcmp (conn, &c->ocuc_conn, sizeof (*conn))) { + if (conn->cookie == c->ocuc_conn.cookie) { list_del (&c->ocuc_chain); OBD_FREE (c, sizeof (*c)); return; @@ -133,76 +163,39 @@ obd_class_remove_user_conn (struct obd_class_user_state *ocus, } } -/* closing /dev/obd */ -static int obd_class_release(struct inode * inode, struct file * file) -{ - struct obd_class_user_state *ocus = file->private_data; - struct obd_class_user_conn *c; - ENTRY; - - while (!list_empty 
(&ocus->ocus_conns)) { - c = list_entry (ocus->ocus_conns.next, - struct obd_class_user_conn, ocuc_chain); - list_del (&c->ocuc_chain); - - CDEBUG (D_IOCTL, "Auto-disconnect %p\n", &c->ocuc_conn); - - down (&obd_conf_sem); - obd_disconnect (&c->ocuc_conn); - up (&obd_conf_sem); - - OBD_FREE (c, sizeof (*c)); - } - - OBD_FREE (ocus, sizeof (*ocus)); - - MOD_DEC_USE_COUNT; - RETURN(0); -} - static inline void obd_data2conn(struct lustre_handle *conn, struct obd_ioctl_data *data) { - conn->addr = data->ioc_addr; + memset(conn, 0, sizeof *conn); conn->cookie = data->ioc_cookie; } static inline void obd_conn2data(struct obd_ioctl_data *data, struct lustre_handle *conn) { - data->ioc_addr = conn->addr; data->ioc_cookie = conn->cookie; } -static void forcibly_detach_exports(struct obd_device *obd) +static void dump_exports(struct obd_device *obd) { - int rc; struct list_head *tmp, *n; - struct lustre_handle fake_conn; - CDEBUG(D_IOCTL, "OBD device %d (%p) has exports, " - "disconnecting them", obd->obd_minor, obd); list_for_each_safe(tmp, n, &obd->obd_exports) { struct obd_export *exp = list_entry(tmp, struct obd_export, exp_obd_chain); - fake_conn.addr = (__u64)(unsigned long)exp; - fake_conn.cookie = exp->exp_cookie; - rc = obd_disconnect(&fake_conn); - if (rc) { - CDEBUG(D_IOCTL, "disconnecting export %p failed: %d\n", - exp, rc); - } else { - CDEBUG(D_IOCTL, "export %p disconnected\n", exp); - } + CDEBUG(D_ERROR, "%s: %p %s %d %d %p\n", + obd->obd_name, exp, exp->exp_client_uuid.uuid, + atomic_read(&exp->exp_refcount), + exp->exp_failed, exp->exp_outstanding_reply ); } } - int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, unsigned long arg) { char *buf = NULL; struct obd_ioctl_data *data; + struct portals_debug_ioctl_data *debug_data; struct obd_device *obd = ocus->ocus_current_obd; struct lustre_handle conn; int err = 0, len = 0, serialised = 0; @@ -211,6 +204,14 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, 
if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ RETURN(err = -ENOTTY); + /* only for debugging */ + if (cmd == PTL_IOC_DEBUG_MASK) { + debug_data = (struct portals_debug_ioctl_data*)arg; + portal_subsystem_debug = debug_data->subs; + portal_debug = debug_data->debug; + return 0; + } + switch (cmd) { case OBD_IOC_BRW_WRITE: case OBD_IOC_BRW_READ: @@ -227,9 +228,9 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, CDEBUG(D_IOCTL, "cmd = %x, obd = %p\n", cmd, obd); if (!obd && cmd != OBD_IOC_DEVICE && cmd != OBD_IOC_LIST && cmd != OBD_GET_VERSION && - cmd != OBD_IOC_NAME2DEV && cmd != OBD_IOC_NEWDEV && - cmd != OBD_IOC_ADD_UUID && cmd != OBD_IOC_DEL_UUID && - cmd != OBD_IOC_CLOSE_UUID) { + cmd != OBD_IOC_NAME2DEV && cmd != OBD_IOC_UUID2DEV && + cmd != OBD_IOC_NEWDEV && cmd != OBD_IOC_ADD_UUID && + cmd != OBD_IOC_DEL_UUID && cmd != OBD_IOC_CLOSE_UUID) { CERROR("OBD ioctl: No device\n"); GOTO(out, err = -EINVAL); } @@ -244,12 +245,12 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, CDEBUG(D_IOCTL, "\n"); if (data->ioc_dev >= MAX_OBD_DEVICES || data->ioc_dev < 0) { CERROR("OBD ioctl: DEVICE insufficient devices\n"); - GOTO(out, err=-EINVAL); + GOTO(out, err = -EINVAL); } CDEBUG(D_IOCTL, "device %d\n", data->ioc_dev); ocus->ocus_current_obd = &obd_dev[data->ioc_dev]; - GOTO(out, err=0); + GOTO(out, err = 0); } case OBD_IOC_LIST: { @@ -259,7 +260,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, if (!data->ioc_inlbuf1) { CERROR("No buffer passed!\n"); - GOTO(out, err=-EINVAL); + GOTO(out, err = -EINVAL); } @@ -270,9 +271,11 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, if (!obd->obd_type) continue; - if (obd->obd_flags & OBD_SET_UP) + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) status = "UP"; - else if (obd->obd_flags & OBD_ATTACHED) + else if (obd->obd_attached) status = "AT"; else status = "-"; @@ -321,11 
+324,11 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, if (!data->ioc_inllen1 || !data->ioc_inlbuf1 ) { CERROR("No name passed,!\n"); - GOTO(out, err=-EINVAL); + GOTO(out, err = -EINVAL); } - if (data->ioc_inlbuf1[data->ioc_inllen1-1] !=0) { + if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) { CERROR("Name not nul terminated!\n"); - GOTO(out, err=-EINVAL); + GOTO(out, err = -EINVAL); } CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1); @@ -334,7 +337,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, if (dev == -1) { CDEBUG(D_IOCTL, "No device for name %s!\n", data->ioc_inlbuf1); - GOTO(out, err=-EINVAL); + GOTO(out, err = -EINVAL); } CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1, @@ -354,11 +357,11 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, if (!data->ioc_inllen1 || !data->ioc_inlbuf1) { CERROR("No UUID passed!\n"); - GOTO(out, err=-EINVAL); + GOTO(out, err = -EINVAL); } - if (data->ioc_inlbuf1[data->ioc_inllen1-1] !=0) { - CERROR("Name not nul terminated!\n"); - GOTO(out, err=-EINVAL); + if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) { + CERROR("UUID not NUL terminated!\n"); + GOTO(out, err = -EINVAL); } CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1); @@ -366,9 +369,9 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, dev = class_uuid2dev(&uuid); data->ioc_dev = dev; if (dev == -1) { - CDEBUG(D_IOCTL, "No device for name %s!\n", + CDEBUG(D_IOCTL, "No device for UUID %s!\n", data->ioc_inlbuf1); - GOTO(out, err=-EINVAL); + GOTO(out, err = -EINVAL); } CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1, @@ -379,6 +382,8 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, GOTO(out, err); } + + case OBD_IOC_NEWDEV: { int dev = -1; int i; @@ -396,7 +401,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, data->ioc_dev = dev; if (dev == -1) - 
GOTO(out, err=-EINVAL); + GOTO(out, err = -EINVAL); err = copy_to_user((void *)arg, data, sizeof(*data)); if (err) @@ -409,7 +414,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, int minor, len; /* have we attached a type to this device */ - if (obd->obd_flags & OBD_ATTACHED || obd->obd_type) { + if (obd->obd_attached|| obd->obd_type) { CERROR("OBD: Device %d already typed as %s.\n", obd->obd_minor, MKSTR(obd->obd_type->typ_name)); GOTO(out, err = -EBUSY); @@ -419,7 +424,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, CERROR("No type passed!\n"); GOTO(out, err = -EINVAL); } - if (data->ioc_inlbuf1[data->ioc_inllen1-1] !=0) { + if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) { CERROR("Type not nul terminated!\n"); GOTO(out, err = -EINVAL); } @@ -427,6 +432,19 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, CERROR("No name passed!\n"); GOTO(out, err = -EINVAL); } + if (data->ioc_inlbuf2[data->ioc_inllen2 - 1] != 0) { + CERROR("Name not nul terminated!\n"); + GOTO(out, err = -EINVAL); + } + if (!data->ioc_inllen3 || !data->ioc_inlbuf3) { + CERROR("No UUID passed!\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inlbuf3[data->ioc_inllen3 - 1] != 0) { + CERROR("UUID not nul terminated!\n"); + GOTO(out, err = -EINVAL); + } + CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", MKSTR(data->ioc_inlbuf1), MKSTR(data->ioc_inlbuf2), MKSTR(data->ioc_inlbuf3)); @@ -445,6 +463,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, INIT_LIST_HEAD(&obd->obd_exports); INIT_LIST_HEAD(&obd->obd_imports); spin_lock_init(&obd->obd_dev_lock); + init_waitqueue_head(&obd->obd_refcount_waitq); /* XXX belong ins setup not attach */ /* recovery data */ @@ -453,6 +472,8 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, INIT_LIST_HEAD(&obd->obd_recovery_queue); INIT_LIST_HEAD(&obd->obd_delayed_reply_queue); + 
init_waitqueue_head(&obd->obd_commit_waitq); + len = strlen(data->ioc_inlbuf2) + 1; OBD_ALLOC(obd->obd_name, len); if (!obd->obd_name) { @@ -462,20 +483,19 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, } memcpy(obd->obd_name, data->ioc_inlbuf2, len); - if (data->ioc_inlbuf3) { - int len = strlen(data->ioc_inlbuf3); - if (len >= sizeof(obd->obd_uuid)) { - CERROR("uuid must be < "LPSZ" bytes long\n", - sizeof(obd->obd_uuid)); - if (obd->obd_name) - OBD_FREE(obd->obd_name, - strlen(obd->obd_name) + 1); - class_put_type(obd->obd_type); - obd->obd_type = NULL; - GOTO(out, err=-EINVAL); - } - memcpy(obd->obd_uuid.uuid, data->ioc_inlbuf3, len); + len = strlen(data->ioc_inlbuf3); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("uuid must be < "LPSZ" bytes long\n", + sizeof(obd->obd_uuid)); + if (obd->obd_name) + OBD_FREE(obd->obd_name, + strlen(obd->obd_name) + 1); + class_put_type(obd->obd_type); + obd->obd_type = NULL; + GOTO(out, err = -EINVAL); } + memcpy(obd->obd_uuid.uuid, data->ioc_inlbuf3, len); + /* do the attach */ if (OBP(obd, attach)) err = OBP(obd,attach)(obd, sizeof(*data), data); @@ -486,7 +506,7 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, class_put_type(obd->obd_type); obd->obd_type = NULL; } else { - obd->obd_flags |= OBD_ATTACHED; + obd->obd_attached = 1; type->typ_refcnt++; CDEBUG(D_IOCTL, "OBD: dev %d attached type %s\n", @@ -498,13 +518,13 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, case OBD_IOC_DETACH: { ENTRY; - if (obd->obd_flags & OBD_SET_UP) { + if (obd->obd_set_up) { CERROR("OBD device %d still set up\n", obd->obd_minor); - GOTO(out, err=-EBUSY); + GOTO(out, err = -EBUSY); } - if (!(obd->obd_flags & OBD_ATTACHED) ) { + if (!obd->obd_attached) { CERROR("OBD device %d not attached\n", obd->obd_minor); - GOTO(out, err=-ENODEV); + GOTO(out, err = -ENODEV); } if (OBP(obd, detach)) err = OBP(obd,detach)(obd); @@ -514,7 +534,7 @@ int 
class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, obd->obd_name = NULL; } - obd->obd_flags &= ~OBD_ATTACHED; + obd->obd_attached = 0; obd->obd_type->typ_refcnt--; class_put_type(obd->obd_type); obd->obd_type = NULL; @@ -523,49 +543,106 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, case OBD_IOC_SETUP: { /* have we attached a type to this device? */ - if (!(obd->obd_flags & OBD_ATTACHED)) { + if (!obd->obd_attached) { CERROR("Device %d not attached\n", obd->obd_minor); - GOTO(out, err=-ENODEV); + GOTO(out, err = -ENODEV); } /* has this been done already? */ - if ( obd->obd_flags & OBD_SET_UP ) { + if (obd->obd_set_up) { CERROR("Device %d already setup (type %s)\n", obd->obd_minor, obd->obd_type->typ_name); - GOTO(out, err=-EBUSY); + GOTO(out, err = -EBUSY); } + atomic_set(&obd->obd_refcount, 0); + if ( OBT(obd) && OBP(obd, setup) ) err = obd_setup(obd, sizeof(*data), data); if (!err) { obd->obd_type->typ_refcnt++; - obd->obd_flags |= OBD_SET_UP; + obd->obd_set_up = 1; + atomic_inc(&obd->obd_refcount); } GOTO(out, err); } case OBD_IOC_CLEANUP: { - /* have we attached a type to this device? 
*/ - if (!(obd->obd_flags & OBD_ATTACHED)) { - CERROR("Device %d not attached\n", obd->obd_minor); - GOTO(out, err=-ENODEV); - } - if (!list_empty(&obd->obd_exports)) { - if (!data->ioc_inlbuf1 || data->ioc_inlbuf1[0] != 'F') { - CERROR("OBD device %d (%p) has exports\n", - obd->obd_minor, obd); + int force = 0, failover = 0; + char * flag; + + if (!obd->obd_set_up) { + CERROR("Device %d not setup\n", obd->obd_minor); + GOTO(out, err = -ENODEV); + } + + if (data->ioc_inlbuf1) { + for (flag = data->ioc_inlbuf1; *flag != 0; flag++) + switch (*flag) { + case 'F': + force = 1; + break; + case 'A': + failover = 1; + break; + default: + CERROR("unrecognised flag '%c'\n", + *flag); + } + } + + if (atomic_read(&obd->obd_refcount) == 1 || force) { + /* this will stop new connections, and need to + do it before class_disconnect_exports() */ + obd->obd_stopping = 1; + } + + if (atomic_read(&obd->obd_refcount) > 1) { + struct l_wait_info lwi = LWI_TIMEOUT_INTR(60 * HZ, NULL, + NULL, NULL); + int rc; + + if (!force) { + CERROR("OBD device %d (%p) has refcount %d\n", + obd->obd_minor, obd, + atomic_read(&obd->obd_refcount)); + dump_exports(obd); GOTO(out, err = -EBUSY); } - forcibly_detach_exports(obd); + class_disconnect_exports(obd, failover); + CDEBUG(D_IOCTL, + "%s: waiting for obd refs to go away: %d\n", + obd->obd_name, atomic_read(&obd->obd_refcount)); + + rc = l_wait_event(obd->obd_refcount_waitq, + atomic_read(&obd->obd_refcount) < 2, &lwi); + if (rc == 0) { + LASSERT(atomic_read(&obd->obd_refcount) == 1); + } else { + CERROR("wait cancelled cleaning anyway. 
" + "refcount: %d\n", + atomic_read(&obd->obd_refcount)); + dump_exports(obd); + } + CDEBUG(D_IOCTL, "%s: awake, now finishing cleanup\n", + obd->obd_name); } + if (OBT(obd) && OBP(obd, cleanup)) - err = obd_cleanup(obd); + err = obd_cleanup(obd, force, failover); if (!err) { - obd->obd_flags &= ~OBD_SET_UP; + obd->obd_set_up = obd->obd_stopping = 0; obd->obd_type->typ_refcnt--; + atomic_dec(&obd->obd_refcount); + /* XXX this should be an LASSERT */ + if (atomic_read(&obd->obd_refcount) > 0) + CERROR("%s still has refcount %d after " + "cleanup.\n", obd->obd_name, + atomic_read(&obd->obd_refcount)); } + GOTO(out, err); } @@ -573,24 +650,24 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, struct obd_uuid cluuid = { "OBD_CLASS_UUID" }; obd_data2conn(&conn, data); - err = obd_connect(&conn, obd, &cluuid, NULL, NULL); + err = obd_connect(&conn, obd, &cluuid); - CDEBUG(D_IOCTL, "assigned export "LPX64"\n", conn.addr); + CDEBUG(D_IOCTL, "assigned export "LPX64"\n", conn.cookie); obd_conn2data(data, &conn); if (err) GOTO(out, err); err = obd_class_add_user_conn (ocus, &conn); if (err != 0) { - obd_disconnect (&conn); + obd_disconnect (&conn, 0); GOTO (out, err); } err = copy_to_user((void *)arg, data, sizeof(*data)); if (err != 0) { obd_class_remove_user_conn (ocus, &conn); - obd_disconnect (&conn); - GOTO (out, err=-EFAULT); + obd_disconnect (&conn, 0); + GOTO (out, err = -EFAULT); } GOTO(out, err); } @@ -598,19 +675,19 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, case OBD_IOC_DISCONNECT: { obd_data2conn(&conn, data); obd_class_remove_user_conn (ocus, &conn); - err = obd_disconnect(&conn); + err = obd_disconnect(&conn, 0); GOTO(out, err); } case OBD_IOC_NO_TRANSNO: { - if (!(obd->obd_flags & OBD_ATTACHED)) { + if (!obd->obd_attached) { CERROR("Device %d not attached\n", obd->obd_minor); - GOTO(out, err=-ENODEV); + GOTO(out, err = -ENODEV); } CDEBUG(D_IOCTL, "disabling committed-transno notifications on 
%d\n", obd->obd_minor); - obd->obd_flags |= OBD_NO_TRANSNO; + obd->obd_no_transno = 1; GOTO(out, err = 0); } @@ -654,11 +731,11 @@ int class_handle_ioctl(struct obd_class_user_state *ocus, unsigned int cmd, out: if (buf) - OBD_FREE(buf, len); + obd_ioctl_freedata(buf, len); if (serialised) up(&obd_conf_sem); RETURN(err); -} /* obd_class_ioctl */ +} /* class_handle_ioctl */ @@ -688,86 +765,20 @@ static struct miscdevice obd_psdev = { void *obd_psdev = NULL; #endif -void (*class_signal_connection_failure)(struct ptlrpc_connection *); - -#ifdef CONFIG_HIGHMEM -/* Allow at most 3/4 of the kmap mappings to be consumed by vector I/O - * requests. This avoids deadlocks on servers which have a lot of clients - * doing vector I/O. We don't need to do this for non-vector I/O requests - * because singleton requests will just block on the kmap itself and never - * deadlock waiting for additional kmaps to complete. - * - * If we are a "server" task, we can have at most a single reservation - * in excess of the maximum. This avoids a deadlock when multiple client - * threads are on the same machine as the server threads, and the clients - * have consumed all of the available mappings. As long as a single server - * thread is can make progress, we are guaranteed to avoid deadlock. 
- */ -#define OBD_KMAP_MAX (LAST_PKMAP * 3 / 4) -static atomic_t obd_kmap_count = ATOMIC_INIT(OBD_KMAP_MAX); -static DECLARE_WAIT_QUEUE_HEAD(obd_kmap_waitq); - -void obd_kmap_get(int count, int server) -{ - //CERROR("getting %d kmap counts (%d/%d)\n", count, - // atomic_read(&obd_kmap_count), OBD_KMAP_MAX); - if (count == 1) - atomic_dec(&obd_kmap_count); - else while (atomic_add_negative(-count, &obd_kmap_count)) { - struct l_wait_info lwi = { 0 }; - static long next_show = 0; - static int skipped = 0; - - if (server && atomic_read(&obd_kmap_count) >= -PTL_MD_MAX_IOV) - break; - - CDEBUG(D_OTHER, "negative kmap reserved count: %d\n", - atomic_read(&obd_kmap_count)); - atomic_add(count, &obd_kmap_count); - - if (time_after(jiffies, next_show)) { - CERROR("blocking %s (and %d others) for kmaps\n", - current->comm, skipped); - next_show = jiffies + 5*HZ; - skipped = 0; - } else - skipped++; - l_wait_event(obd_kmap_waitq, - atomic_read(&obd_kmap_count) >= count, &lwi); - } -} - -void obd_kmap_put(int count) -{ - atomic_add(count, &obd_kmap_count); - /* Wake up sleepers. Sadly, this wakes up all of the tasks at once. - * We could have something smarter here like: - while (atomic_read(&obd_kmap_count) > 0) - wake_up_nr(obd_kmap_waitq, 1); - although we would need to set somewhere (probably obd_class_init): - obd_kmap_waitq.flags |= WQ_FLAG_EXCLUSIVE; - For now the wait_event() condition will handle this OK I believe. 
- */ - if (atomic_read(&obd_kmap_count) > 0) - wake_up(&obd_kmap_waitq); -} - -EXPORT_SYMBOL(obd_kmap_get); -EXPORT_SYMBOL(obd_kmap_put); -#endif - EXPORT_SYMBOL(obd_dev); EXPORT_SYMBOL(obdo_cachep); EXPORT_SYMBOL(obd_memory); EXPORT_SYMBOL(obd_memmax); EXPORT_SYMBOL(obd_fail_loc); EXPORT_SYMBOL(obd_timeout); -EXPORT_SYMBOL(obd_recovery_upcall); +EXPORT_SYMBOL(obd_lustre_upcall); EXPORT_SYMBOL(obd_sync_filter); EXPORT_SYMBOL(ptlrpc_put_connection_superhack); EXPORT_SYMBOL(ptlrpc_abort_inflight_superhack); EXPORT_SYMBOL(proc_lustre_root); +EXPORT_SYMBOL(lctl_fake_uuid); + EXPORT_SYMBOL(class_register_type); EXPORT_SYMBOL(class_unregister_type); EXPORT_SYMBOL(class_get_type); @@ -775,19 +786,26 @@ EXPORT_SYMBOL(class_put_type); EXPORT_SYMBOL(class_name2dev); EXPORT_SYMBOL(class_uuid2dev); EXPORT_SYMBOL(class_uuid2obd); +EXPORT_SYMBOL(class_export_get); +EXPORT_SYMBOL(class_export_put); EXPORT_SYMBOL(class_new_export); -EXPORT_SYMBOL(class_destroy_export); +EXPORT_SYMBOL(class_unlink_export); +EXPORT_SYMBOL(class_import_get); +EXPORT_SYMBOL(class_import_put); +EXPORT_SYMBOL(class_new_import); +EXPORT_SYMBOL(class_destroy_import); EXPORT_SYMBOL(class_connect); EXPORT_SYMBOL(class_conn2export); EXPORT_SYMBOL(class_conn2obd); EXPORT_SYMBOL(class_conn2cliimp); EXPORT_SYMBOL(class_conn2ldlmimp); EXPORT_SYMBOL(class_disconnect); -EXPORT_SYMBOL(class_disconnect_all); -EXPORT_SYMBOL(class_uuid_unparse); +EXPORT_SYMBOL(class_disconnect_exports); EXPORT_SYMBOL(lustre_uuid_to_peer); -EXPORT_SYMBOL(class_signal_connection_failure); +/* uuid.c */ +EXPORT_SYMBOL(class_uuid_unparse); +EXPORT_SYMBOL(client_tgtuuid2obd); EXPORT_SYMBOL(class_handle_hash); EXPORT_SYMBOL(class_handle_unhash); @@ -851,7 +869,7 @@ static void cleanup_obdclass(void) misc_deregister(&obd_psdev); for (i = 0; i < MAX_OBD_DEVICES; i++) { struct obd_device *obd = &obd_dev[i]; - if (obd->obd_type && (obd->obd_flags & OBD_SET_UP) && + if (obd->obd_type && obd->obd_set_up && OBT(obd) && OBP(obd, detach)) { /* XXX 
should this call generic detach otherwise? */ OBP(obd, detach)(obd); @@ -879,14 +897,15 @@ static void cleanup_obdclass(void) * kernel patch */ #ifdef __KERNEL__ #include <linux/lustre_version.h> -#define LUSTRE_SOURCE_VERSION 13 -#if (LUSTRE_KERNEL_VERSION < LUSTRE_SOURCE_VERSION) +#define LUSTRE_MIN_VERSION 18 +#define LUSTRE_MAX_VERSION 19 +#if (LUSTRE_KERNEL_VERSION < LUSTRE_MIN_VERSION) # error Cannot continue: Your Lustre kernel patch is older than the sources -#elif (LUSTRE_KERNEL_VERSION > LUSTRE_SOURCE_VERSION) +#elif (LUSTRE_KERNEL_VERSION > LUSTRE_MAX_VERSION) # error Cannot continue: Your Lustre sources are older than the kernel patch #endif -#else -#warning "Lib Lustre - no versioning information" + #else +# warning "Lib Lustre - no versioning information" #endif #ifdef __KERNEL__ diff --git a/lustre/obdclass/debug.c b/lustre/obdclass/debug.c index 6118084..f824b98 100644 --- a/lustre/obdclass/debug.c +++ b/lustre/obdclass/debug.c @@ -1,20 +1,31 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Helper routines for dumping data structs for debugging. + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. * - * Copryright (C) 2002 Cluster File Systems, Inc. 
+ * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * + * Helper routines for dumping data structs for debugging. */ #define DEBUG_SUBSYSTEM D_OTHER #define EXPORT_SYMTAB #ifndef __KERNEL__ -#include <liblustre.h> +# include <liblustre.h> #endif #include <linux/obd_ost.h> @@ -24,15 +35,16 @@ int dump_ioo(struct obd_ioobj *ioo) { - CERROR("obd_ioobj: ioo_id="LPD64", ioo_gr="LPD64", ioo_type=%d, ioo_bufct=%d\n", + CERROR("obd_ioobj: ioo_id="LPD64", ioo_gr="LPD64", ioo_type=%d, " + "ioo_bufct=%d\n", ioo->ioo_id, ioo->ioo_gr, ioo->ioo_type, ioo->ioo_bufcnt); return -EINVAL; } int dump_lniobuf(struct niobuf_local *nb) { - CERROR("niobuf_local: addr=%p, offset="LPD64", len=%d, xid=%d, page=%p\n", - nb->addr, nb->offset, nb->len, nb->xid, nb->page); + CERROR("niobuf_local: offset="LPD64", len=%d, page=%p, rc=%d\n", + nb->offset, nb->len, nb->page, nb->rc); CERROR("nb->page: index = %ld\n", nb->page ? 
nb->page->index : -1); return -EINVAL; @@ -40,8 +52,8 @@ int dump_lniobuf(struct niobuf_local *nb) int dump_rniobuf(struct niobuf_remote *nb) { - CERROR("niobuf_remote: offset="LPD64", len=%d, flags=%x, xid=%d\n", - nb->offset, nb->len, nb->flags, nb->xid); + CERROR("niobuf_remote: offset="LPU64", len=%d, flags=%x\n", + nb->offset, nb->len, nb->flags); return -EINVAL; } @@ -104,8 +116,8 @@ int page_debug_setup(void *addr, int len, __u64 off, __u64 id) { LASSERT(addr); - off = HTON__u64(off); - id = HTON__u64(id); + off = cpu_to_le64 (off); + id = cpu_to_le64 (id); memcpy(addr, (char *)&off, LPDS); memcpy(addr + LPDS, (char *)&id, LPDS); @@ -123,28 +135,28 @@ int page_debug_check(char *who, void *addr, int end, __u64 off, __u64 id) LASSERT(addr); - ne_off = HTON__u64(off); - id = HTON__u64(id); + ne_off = le64_to_cpu (off); + id = le64_to_cpu (id); if (memcmp(addr, (char *)&ne_off, LPDS)) { - CERROR("%s: id "LPU64" offset "LPU64" off: "LPX64" != "LPX64"\n", - who, id, off, *(__u64 *)addr, ne_off); + CERROR("%s: id "LPX64" offset "LPU64" off: "LPX64" != " + LPX64"\n", who, id, off, *(__u64 *)addr, ne_off); err = -EINVAL; } if (memcmp(addr + LPDS, (char *)&id, LPDS)) { - CERROR("%s: id "LPU64" offset "LPU64" id: "LPX64" != "LPX64"\n", + CERROR("%s: id "LPX64" offset "LPU64" id: "LPX64" != "LPX64"\n", who, id, off, *(__u64 *)(addr + LPDS), id); err = -EINVAL; } addr += end - LPDS - LPDS; if (memcmp(addr, (char *)&ne_off, LPDS)) { - CERROR("%s: id "LPU64" offset "LPU64" end off: "LPX64" != "LPX64"\n", - who, id, off, *(__u64 *)addr, ne_off); + CERROR("%s: id "LPX64" offset "LPU64" end off: "LPX64" != " + LPX64"\n", who, id, off, *(__u64 *)addr, ne_off); err = -EINVAL; } if (memcmp(addr + LPDS, (char *)&id, LPDS)) { - CERROR("%s: id "LPU64" offset "LPU64" end id: "LPX64" != "LPX64"\n", - who, id, off, *(__u64 *)(addr + LPDS), id); + CERROR("%s: id "LPX64" offset "LPU64" end id: "LPX64" != " + LPX64"\n", who, id, off, *(__u64 *)(addr + LPDS), id); err = -EINVAL; } diff 
--git a/lustre/obdclass/fsfilt.c b/lustre/obdclass/fsfilt.c index 07ce0b3..4357b79 100644 --- a/lustre/obdclass/fsfilt.c +++ b/lustre/obdclass/fsfilt.c @@ -38,7 +38,7 @@ int fsfilt_register_ops(struct fsfilt_operations *fs_ops) RETURN(-EEXIST); } } else { - MOD_INC_USE_COUNT; + PORTAL_MODULE_USE; list_add(&fs_ops->fs_list, &fsfilt_types); } @@ -57,7 +57,7 @@ void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops) found = list_entry(p, typeof(*found), fs_list); if (found == fs_ops) { list_del(p); - MOD_DEC_USE_COUNT; + PORTAL_MODULE_UNUSE; break; } } diff --git a/lustre/obdclass/fsfilt_ext3.c b/lustre/obdclass/fsfilt_ext3.c index 72f2830..a02f1f5 100644 --- a/lustre/obdclass/fsfilt_ext3.c +++ b/lustre/obdclass/fsfilt_ext3.c @@ -4,7 +4,7 @@ * lustre/lib/fsfilt_ext3.c * Lustre filesystem abstraction routines * - * Copyright (C) 2002 Cluster File Systems, Inc. + * Copyright (C) 2002, 2003 Cluster File Systems, Inc. * Author: Andreas Dilger <adilger@clusterfs.com> * * This file is part of Lustre, http://www.lustre.org. @@ -23,25 +23,20 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -//#error "FIXME: this needs to be updated to match fsfilt_extN.c" - #define DEBUG_SUBSYSTEM S_FILTER #include <linux/fs.h> #include <linux/jbd.h> #include <linux/slab.h> -#include <linux/init.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> #include <linux/ext3_fs.h> #include <linux/ext3_jbd.h> -#include <linux/version.h> -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -# include <linux/ext3_xattr.h> -#else -# include <asm/statfs.h> -#endif +#include <linux/ext3_xattr.h> #include <linux/kp30.h> #include <linux/lustre_fsfilt.h> #include <linux/obd.h> +#include <linux/obd_class.h> #include <linux/module.h> static kmem_cache_t *fcb_cache; @@ -75,18 +70,21 @@ static void *fsfilt_ext3_start(struct inode *inode, int op) nblocks += EXT3_DELETE_TRANS_BLOCKS; break; case FSFILT_OP_RENAME: - /* We may be modifying two directories */ + /* modify additional directory */ nblocks += EXT3_DATA_TRANS_BLOCKS; + /* no break */ case FSFILT_OP_SYMLINK: - /* Possible new block + block bitmap + GDT for long symlink */ + /* additional block + block bitmap + GDT for long symlink */ nblocks += 3; + /* no break */ case FSFILT_OP_CREATE: case FSFILT_OP_MKDIR: case FSFILT_OP_MKNOD: - /* New inode + block bitmap + GDT for new file */ + /* modify one inode + block bitmap + GDT */ nblocks += 3; + /* no break */ case FSFILT_OP_LINK: - /* Change parent directory */ + /* modify parent directory */ nblocks += EXT3_INDEX_EXTRA_TRANS_BLOCKS+EXT3_DATA_TRANS_BLOCKS; break; case FSFILT_OP_SETATTR: @@ -97,6 +95,7 @@ static void *fsfilt_ext3_start(struct inode *inode, int op) LBUG(); } + LASSERT(!current->journal_info); lock_kernel(); handle = journal_start(EXT3_JOURNAL(inode), nblocks); unlock_kernel(); @@ -104,12 +103,135 @@ static void *fsfilt_ext3_start(struct inode *inode, int op) return handle; } -static int fsfilt_ext3_commit(struct inode *inode, void *handle) +/* + * Calculate the number of buffer credits needed to write multiple pages in + * a single ext3 transaction. 
No, this shouldn't be here, but as yet ext3 + * doesn't have a nice API for calculating this sort of thing in advance. + * + * See comment above ext3_writepage_trans_blocks for details. We assume + * no data journaling is being done, but it does allow for all of the pages + * being non-contiguous. If we are guaranteed contiguous pages we could + * reduce the number of (d)indirect blocks a lot. + * + * With N blocks per page and P pages, for each inode we have at most: + * N*P indirect + * min(N*P, blocksize/4 + 1) dindirect blocks + * niocount tindirect + * + * For the entire filesystem, we have at most: + * min(sum(nindir + P), ngroups) bitmap blocks (from the above) + * min(sum(nindir + P), gdblocks) group descriptor blocks (from the above) + * objcount inode blocks + * 1 superblock + * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files + * + * 1 EXT3_DATA_TRANS_BLOCKS for the last_rcvd update. + */ +static int fsfilt_ext3_credits_needed(int objcount, struct fsfilt_objinfo *fso) +{ + struct super_block *sb = fso->fso_dentry->d_inode->i_sb; + int blockpp = 1 << (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); + int addrpp = EXT3_ADDR_PER_BLOCK(sb) * blockpp; + int nbitmaps = 0; + int ngdblocks = 0; + int needed = objcount + 1; + int i; + + for (i = 0; i < objcount; i++, fso++) { + int nblocks = fso->fso_bufcnt * blockpp; + int ndindirect = min(nblocks, addrpp + 1); + int nindir = nblocks + ndindirect + 1; + + nbitmaps += nindir + nblocks; + ngdblocks += nindir + nblocks; + + needed += nindir; + } + + /* Assumes ext3 and ext3 have same sb_info layout at the start. */ + if (nbitmaps > EXT3_SB(sb)->s_groups_count) + nbitmaps = EXT3_SB(sb)->s_groups_count; + if (ngdblocks > EXT3_SB(sb)->s_gdb_count) + ngdblocks = EXT3_SB(sb)->s_gdb_count; + + needed += nbitmaps + ngdblocks; + + /* last_rcvd update */ + needed += EXT3_DATA_TRANS_BLOCKS; + +#ifdef CONFIG_QUOTA + /* We assume that there will be 1 bit set in s_dquot.flags for each + * quota file that is active. 
This is at least true for now. + */ + needed += hweight32(sb_any_quota_enabled(sb)) * + EXT3_SINGLEDATA_TRANS_BLOCKS; +#endif + + return needed; +} + +/* We have to start a huge journal transaction here to hold all of the + * metadata for the pages being written here. This is necessitated by + * the fact that we do lots of prepare_write operations before we do + * any of the matching commit_write operations, so even if we split + * up to use "smaller" transactions none of them could complete until + * all of them were opened. By having a single journal transaction, + * we eliminate duplicate reservations for common blocks like the + * superblock and group descriptors or bitmaps. + * + * We will start the transaction here, but each prepare_write will + * add a refcount to the transaction, and each commit_write will + * remove a refcount. The transaction will be closed when all of + * the pages have been written. + */ +static void *fsfilt_ext3_brw_start(int objcount, struct fsfilt_objinfo *fso, + int niocount, struct niobuf_remote *nb) +{ + journal_t *journal; + handle_t *handle; + int needed; + ENTRY; + + LASSERT(!current->journal_info); + journal = EXT3_SB(fso->fso_dentry->d_inode->i_sb)->s_journal; + needed = fsfilt_ext3_credits_needed(objcount, fso); + + /* The number of blocks we could _possibly_ dirty can very large. + * We reduce our request if it is absurd (and we couldn't get that + * many credits for a single handle anyways). + * + * At some point we have to limit the size of I/Os sent at one time, + * increase the size of the journal, or we have to calculate the + * actual journal requirements more carefully by checking all of + * the blocks instead of being maximally pessimistic. It remains to + * be seen if this is a real problem or not. 
+ */ + if (needed > journal->j_max_transaction_buffers) { + CERROR("want too many journal credits (%d) using %d instead\n", + needed, journal->j_max_transaction_buffers); + needed = journal->j_max_transaction_buffers; + } + + lock_kernel(); + handle = journal_start(journal, needed); + unlock_kernel(); + if (IS_ERR(handle)) + CERROR("can't get handle for %d credits: rc = %ld\n", needed, + PTR_ERR(handle)); + + RETURN(handle); +} + +static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync) { int rc; + handle_t *handle = h; + + if (force_sync) + handle->h_sync = 1; /* recovery likes this */ lock_kernel(); - rc = journal_stop((handle_t *)handle); + rc = journal_stop(handle); unlock_kernel(); return rc; @@ -122,10 +244,38 @@ static int fsfilt_ext3_setattr(struct dentry *dentry, void *handle, int rc; lock_kernel(); + + /* A _really_ horrible hack to avoid removing the data stored + * in the block pointers; this is really the "small" stripe MD data. + * We can avoid further hackery by virtue of the MDS file size being + * zero all the time (which doesn't invoke block truncate at unlink + * time), so we assert we never change the MDS file size from zero. 
+ */ + if (iattr->ia_valid & ATTR_SIZE) { + CERROR("hmm, setting %*s file size to %lld\n", + dentry->d_name.len, dentry->d_name.name, iattr->ia_size); + LASSERT(iattr->ia_size == 0); +#if 0 + /* ATTR_SIZE would invoke truncate: clear it */ + iattr->ia_valid &= ~ATTR_SIZE; + inode->i_size = iattr->ia_size; + + /* make sure _something_ gets set - so new inode + * goes to disk (probably won't work over XFS + */ + if (!iattr->ia_valid & ATTR_MODE) { + iattr->ia_valid |= ATTR_MODE; + iattr->ia_mode = inode->i_mode; + } +#endif + } if (inode->i_op->setattr) rc = inode->i_op->setattr(dentry, iattr); - else - rc = inode_setattr(inode, iattr); + else{ + rc = inode_change_ok(inode, iattr); + if (!rc) + rc = inode_setattr(inode, iattr); + } unlock_kernel(); @@ -137,29 +287,58 @@ static int fsfilt_ext3_set_md(struct inode *inode, void *handle, { int rc; - down(&inode->i_sem); - lock_kernel(); - rc = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_LUSTRE, - XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0); - unlock_kernel(); - up(&inode->i_sem); + /* Nasty hack city - store stripe MD data in the block pointers if + * it will fit, because putting it in an EA currently kills the MDS + * performance. We'll fix this with "fast EAs" in the future. + */ + if (lmm_size <= sizeof(EXT3_I(inode)->i_data) - + sizeof(EXT3_I(inode)->i_data[0])) { + /* XXX old_size is debugging only */ + int old_size = EXT3_I(inode)->i_data[0]; + if (old_size != 0) { + LASSERT(old_size < sizeof(EXT3_I(inode)->i_data)); + CERROR("setting EA on %lu again... 
interesting\n", + inode->i_ino); + } - if (rc) { + EXT3_I(inode)->i_data[0] = cpu_to_le32(lmm_size); + memcpy(&EXT3_I(inode)->i_data[1], lmm, lmm_size); + mark_inode_dirty(inode); + return 0; + } else { + down(&inode->i_sem); + lock_kernel(); + rc = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_LUSTRE, + XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0); + unlock_kernel(); + up(&inode->i_sem); + } + + if (rc) CERROR("error adding MD data to inode %lu: rc = %d\n", inode->i_ino, rc); - if (rc != -ENOSPC) LBUG(); - } return rc; } -static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int size) +static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size) { int rc; + if (EXT3_I(inode)->i_data[0]) { + int size = le32_to_cpu(EXT3_I(inode)->i_data[0]); + LASSERT(size < sizeof(EXT3_I(inode)->i_data)); + if (lmm) { + if (size > lmm_size) + return -ERANGE; + memcpy(lmm, &EXT3_I(inode)->i_data[1], size); + } + return size; + } + down(&inode->i_sem); lock_kernel(); rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_LUSTRE, - XATTR_LUSTRE_MDS_OBJID, lmm, size); + XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size); unlock_kernel(); up(&inode->i_sem); @@ -170,7 +349,7 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int size) if (rc < 0) { CDEBUG(D_INFO, "error getting EA %s from inode %lu: " "rc = %d\n", XATTR_LUSTRE_MDS_OBJID, inode->i_ino, rc); - memset(lmm, 0, size); + memset(lmm, 0, lmm_size); return (rc == -ENODATA) ? 
0 : rc; } @@ -178,26 +357,55 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int size) } static ssize_t fsfilt_ext3_readpage(struct file *file, char *buf, size_t count, - loff_t *offset) + loff_t *off) { struct inode *inode = file->f_dentry->d_inode; int rc = 0; if (S_ISREG(inode->i_mode)) - rc = file->f_op->read(file, buf, count, offset); + rc = file->f_op->read(file, buf, count, off); else { - struct buffer_head *bh; - - /* FIXME: this assumes the blocksize == count, but the calling - * function will detect this as an error for now */ - bh = ext3_bread(NULL, inode, - *offset >> inode->i_sb->s_blocksize_bits, - 0, &rc); - - if (bh) { - memcpy(buf, bh->b_data, inode->i_blksize); - brelse(bh); - rc = inode->i_blksize; + const int blkbits = inode->i_sb->s_blocksize_bits; + const int blksize = inode->i_sb->s_blocksize; + + CDEBUG(D_EXT2, "reading "LPSZ" at dir %lu+%llu\n", + count, inode->i_ino, *off); + while (count > 0) { + struct buffer_head *bh; + + bh = NULL; + if (*off < inode->i_size) { + int err = 0; + + bh = ext3_bread(NULL, inode, *off >> blkbits, + 0, &err); + + CDEBUG(D_EXT2, "read %u@%llu\n", blksize, *off); + + if (bh) { + memcpy(buf, bh->b_data, blksize); + brelse(bh); + } else if (err) { + /* XXX in theory we should just fake + * this buffer and continue like ext3, + * especially if this is a partial read + */ + CERROR("error read dir %lu+%llu: %d\n", + inode->i_ino, *off, err); + RETURN(err); + } + } + if (!bh) { + struct ext3_dir_entry_2 *fake = (void *)buf; + + CDEBUG(D_EXT2, "fake %u@%llu\n", blksize, *off); + memset(fake, 0, sizeof(*fake)); + fake->rec_len = cpu_to_le32(blksize); + } + count -= blksize; + buf += blksize; + *off += blksize; + rc += blksize; } } @@ -210,18 +418,17 @@ static void fsfilt_ext3_cb_func(struct journal_callback *jcb, int error) fcb->cb_func(fcb->cb_obd, fcb->cb_last_rcvd, error); - kmem_cache_free(fcb_cache, fcb); + OBD_SLAB_FREE(fcb, fcb_cache, sizeof *fcb); atomic_dec(&fcb_cache_count); } static int 
fsfilt_ext3_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd, void *handle, fsfilt_cb_t cb_func) { -#ifdef HAVE_JOURNAL_CALLBACK_STATUS struct fsfilt_cb_data *fcb; - fcb = kmem_cache_alloc(fcb_cache, GFP_NOFS); - if (!fcb) + OBD_SLAB_ALLOC(fcb, fcb_cache, GFP_NOFS, sizeof *fcb); + if (fcb == NULL) RETURN(-ENOMEM); atomic_inc(&fcb_cache_count); @@ -235,17 +442,6 @@ static int fsfilt_ext3_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd, journal_callback_set(handle, fsfilt_ext3_cb_func, (struct journal_callback *)fcb); unlock_kernel(); -#else -#warning "no journal callback kernel patch, faking it..." - static long next = 0; - - if (time_after(jiffies, next)) { - CERROR("no journal callback kernel patch, faking it...\n"); - next = jiffies + 300 * HZ; - } - - cb_func(obd, last_rcvd, 0); -#endif return 0; } @@ -266,13 +462,17 @@ static int fsfilt_ext3_journal_data(struct file *filp) * * This can be removed when the ext3 EA code is fixed. */ -static int fsfilt_ext3_statfs(struct super_block *sb, struct statfs *sfs) +static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs) { - int rc = vfs_statfs(sb, sfs); + struct statfs sfs; + int rc = vfs_statfs(sb, &sfs); - if (!rc && sfs->f_bfree < sfs->f_ffree) - sfs->f_ffree = sfs->f_bfree; + if (!rc && sfs.f_bfree < sfs.f_ffree) { + sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree; + sfs.f_ffree = sfs.f_bfree; + } + statfs_pack(osfs, &sfs); return rc; } @@ -281,10 +481,19 @@ static int fsfilt_ext3_sync(struct super_block *sb) return ext3_force_commit(sb); } +extern int ext3_prep_san_write(struct inode *inode, long *blocks, + int nblocks, loff_t newsize); +static int fsfilt_ext3_prep_san_write(struct inode *inode, long *blocks, + int nblocks, loff_t newsize) +{ + return ext3_prep_san_write(inode, blocks, nblocks, newsize); +} + static struct fsfilt_operations fsfilt_ext3_ops = { fs_type: "ext3", fs_owner: THIS_MODULE, fs_start: fsfilt_ext3_start, + fs_brw_start: fsfilt_ext3_brw_start, 
fs_commit: fsfilt_ext3_commit, fs_setattr: fsfilt_ext3_setattr, fs_set_md: fsfilt_ext3_set_md, @@ -294,6 +503,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = { fs_set_last_rcvd: fsfilt_ext3_set_last_rcvd, fs_statfs: fsfilt_ext3_statfs, fs_sync: fsfilt_ext3_sync, + fs_prep_san_write: fsfilt_ext3_prep_san_write, }; static int __init fsfilt_ext3_init(void) diff --git a/lustre/obdclass/fsfilt_extN.c b/lustre/obdclass/fsfilt_extN.c index d029785..ddec807 100644 --- a/lustre/obdclass/fsfilt_extN.c +++ b/lustre/obdclass/fsfilt_extN.c @@ -222,15 +222,13 @@ static void *fsfilt_extN_brw_start(int objcount, struct fsfilt_objinfo *fso, RETURN(handle); } -static int fsfilt_extN_commit(struct inode *inode, void *h /*, force_sync */) +static int fsfilt_extN_commit(struct inode *inode, void *h, int force_sync) { int rc; handle_t *handle = h; -#if 0 if (force_sync) handle->h_sync = 1; /* recovery likes this */ -#endif lock_kernel(); rc = journal_stop(handle); @@ -273,8 +271,11 @@ static int fsfilt_extN_setattr(struct dentry *dentry, void *handle, } if (inode->i_op->setattr) rc = inode->i_op->setattr(dentry, iattr); - else - rc = inode_setattr(inode, iattr); + else{ + rc = inode_change_ok(inode, iattr); + if (!rc) + rc = inode_setattr(inode, iattr); + } unlock_kernel(); @@ -386,7 +387,7 @@ static ssize_t fsfilt_extN_readpage(struct file *file, char *buf, size_t count, brelse(bh); } else if (err) { /* XXX in theory we should just fake - * this buffer and continue like ext3, + * this buffer and continue like extN, * especially if this is a partial read */ CERROR("error read dir %lu+%llu: %d\n", @@ -417,7 +418,7 @@ static void fsfilt_extN_cb_func(struct journal_callback *jcb, int error) fcb->cb_func(fcb->cb_obd, fcb->cb_last_rcvd, error); - kmem_cache_free(fcb_cache, fcb); + OBD_SLAB_FREE(fcb, fcb_cache, sizeof *fcb); atomic_dec(&fcb_cache_count); } @@ -426,8 +427,8 @@ static int fsfilt_extN_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd, { struct fsfilt_cb_data *fcb; - 
fcb = kmem_cache_alloc(fcb_cache, GFP_NOFS); - if (!fcb) + OBD_SLAB_ALLOC(fcb, fcb_cache, GFP_NOFS, sizeof *fcb); + if (fcb == NULL) RETURN(-ENOMEM); atomic_inc(&fcb_cache_count); @@ -466,8 +467,10 @@ static int fsfilt_extN_statfs(struct super_block *sb, struct obd_statfs *osfs) struct statfs sfs; int rc = vfs_statfs(sb, &sfs); - if (!rc && sfs.f_bfree < sfs.f_ffree) + if (!rc && sfs.f_bfree < sfs.f_ffree) { + sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree; sfs.f_ffree = sfs.f_bfree; + } statfs_pack(osfs, &sfs); return rc; diff --git a/lustre/obdclass/fsfilt_reiserfs.c b/lustre/obdclass/fsfilt_reiserfs.c index 06302c5..2aba0f1 100644 --- a/lustre/obdclass/fsfilt_reiserfs.c +++ b/lustre/obdclass/fsfilt_reiserfs.c @@ -59,7 +59,8 @@ static void *fsfilt_reiserfs_brw_start(int objcount, struct fsfilt_objinfo *fso, return (void *)0xf00f00be; } -static int fsfilt_reiserfs_commit(struct inode *inode, void *handle) +static int fsfilt_reiserfs_commit(struct inode *inode, void *handle, + int force_sync) { if (handle != (void *)0xf00f00be) { CERROR("bad handle %p", handle); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index bd43554..9000771 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -37,13 +37,14 @@ #include <linux/lprocfs_status.h> extern struct list_head obd_types; +static spinlock_t obd_types_lock = SPIN_LOCK_UNLOCKED; kmem_cache_t *obdo_cachep = NULL; kmem_cache_t *import_cachep = NULL; -kmem_cache_t *export_cachep = NULL; int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); -void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp, - int dying_import); +void (*ptlrpc_abort_inflight_superhack)(struct obd_import *imp); + +struct obd_uuid lctl_fake_uuid = { .uuid = "OBD_CLASS_UUID" }; /* * support functions: we could use inter-module communication, but this @@ -53,17 +54,17 @@ static struct obd_type *class_search_type(char *name) { struct list_head *tmp; struct obd_type *type; - CDEBUG(D_INFO, 
"SEARCH %s\n", name); - tmp = &obd_types; + spin_lock(&obd_types_lock); list_for_each(tmp, &obd_types) { type = list_entry(tmp, struct obd_type, typ_chain); - CDEBUG(D_INFO, "TYP %s\n", type->typ_name); if (strlen(type->typ_name) == strlen(name) && strcmp(type->typ_name, name) == 0) { + spin_unlock(&obd_types_lock); return type; } } + spin_unlock(&obd_types_lock); return NULL; } @@ -117,17 +118,19 @@ int class_register_type(struct obd_ops *ops, struct lprocfs_vars *vars, *(type->typ_ops) = *ops; strcpy(type->typ_name, name); - list_add(&type->typ_chain, &obd_types); type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root, vars, type); if (IS_ERR(type->typ_procroot)) { rc = PTR_ERR(type->typ_procroot); type->typ_procroot = NULL; - list_del(&type->typ_chain); GOTO (failed, rc); } + spin_lock(&obd_types_lock); + list_add(&type->typ_chain, &obd_types); + spin_unlock(&obd_types_lock); + RETURN (0); failed: @@ -161,7 +164,9 @@ int class_unregister_type(char *name) type->typ_procroot = NULL; } + spin_lock(&obd_types_lock); list_del(&type->typ_chain); + spin_unlock(&obd_types_lock); OBD_FREE(type->typ_name, strlen(name) + 1); if (type->typ_ops != NULL) OBD_FREE(type->typ_ops, sizeof(*type->typ_ops)); @@ -171,7 +176,6 @@ int class_unregister_type(char *name) int class_name2dev(char *name) { - int res = -1; int i; if (!name) @@ -179,39 +183,33 @@ int class_name2dev(char *name) for (i = 0; i < MAX_OBD_DEVICES; i++) { struct obd_device *obd = &obd_dev[i]; - if (obd->obd_name && strcmp(name, obd->obd_name) == 0) { - res = i; - return res; - } + if (obd->obd_name && strcmp(name, obd->obd_name) == 0) + return i; } - return res; + return -1; } int class_uuid2dev(struct obd_uuid *uuid) { - int res = -1; int i; for (i = 0; i < MAX_OBD_DEVICES; i++) { struct obd_device *obd = &obd_dev[i]; - if (strncmp(uuid->uuid, obd->obd_uuid.uuid, sizeof(obd->obd_uuid.uuid)) == 0) { - res = i; - return res; - } + if (obd_uuid_equals(uuid, &obd->obd_uuid)) + return i; } - return res; 
+ return -1; } - struct obd_device *class_uuid2obd(struct obd_uuid *uuid) { int i; for (i = 0; i < MAX_OBD_DEVICES; i++) { struct obd_device *obd = &obd_dev[i]; - if (strncmp(uuid->uuid, obd->obd_uuid.uuid, sizeof(obd->obd_uuid.uuid)) == 0) + if (obd_uuid_equals(uuid, &obd->obd_uuid)) return obd; } @@ -234,12 +232,6 @@ void obd_cleanup_caches(void) CERROR("Cannot destory ll_import_cache\n"); import_cachep = NULL; } - if (export_cachep) { - rc = kmem_cache_destroy(export_cachep); - if (rc) - CERROR("Cannot destory ll_export_cache\n"); - export_cachep = NULL; - } EXIT; } @@ -252,13 +244,6 @@ int obd_init_caches(void) if (!obdo_cachep) GOTO(out, -ENOMEM); - LASSERT(export_cachep == NULL); - export_cachep = kmem_cache_create("ll_export_cache", - sizeof(struct obd_export), - 0, 0, NULL, NULL); - if (!export_cachep) - GOTO(out, -ENOMEM); - LASSERT(import_cachep == NULL); import_cachep = kmem_cache_create("ll_import_cache", sizeof(struct obd_import), @@ -284,262 +269,277 @@ struct obd_export *class_conn2export(struct lustre_handle *conn) RETURN(NULL); } - if (conn->addr == -1) { /* this means assign a new connection */ + if (conn->cookie == -1) { /* this means assign a new connection */ CDEBUG(D_CACHE, "want a new connection\n"); RETURN(NULL); } - if (!conn->addr) { - CDEBUG(D_CACHE, "looking for null addr\n"); - fixme(); - RETURN(NULL); - } - - CDEBUG(D_IOCTL, "looking for export addr "LPX64" cookie "LPX64"\n", - conn->addr, conn->cookie); - export = (struct obd_export *) (unsigned long)conn->addr; - if (!kmem_cache_validate(export_cachep, (void *)export)) - RETURN(NULL); - - if (export->exp_cookie != conn->cookie) - RETURN(NULL); + CDEBUG(D_IOCTL, "looking for export cookie "LPX64"\n", conn->cookie); + export = class_handle2object(conn->cookie); RETURN(export); -} /* class_conn2export */ +} struct obd_device *class_conn2obd(struct lustre_handle *conn) { struct obd_export *export; export = class_conn2export(conn); - if (export) - return export->exp_obd; - fixme(); + if 
(export) { + struct obd_device *obd = export->exp_obd; + class_export_put(export); + return obd; + } return NULL; } struct obd_import *class_conn2cliimp(struct lustre_handle *conn) { - return &class_conn2obd(conn)->u.cli.cl_import; + return class_conn2obd(conn)->u.cli.cl_import; } struct obd_import *class_conn2ldlmimp(struct lustre_handle *conn) { - return &class_conn2export(conn)->exp_ldlm_data.led_import; + struct obd_export *export; + export = class_conn2export(conn); + if (export) { + struct obd_import *imp = export->exp_ldlm_data.led_import; + class_export_put(export); + return imp; + } + fixme(); + return NULL; +} + +/* Export management functions */ +static void export_handle_addref(void *export) +{ + class_export_get(export); +} + +struct obd_export *class_export_get(struct obd_export *exp) +{ + atomic_inc(&exp->exp_refcount); + CDEBUG(D_INFO, "GETting export %p : new refcount %d\n", exp, + atomic_read(&exp->exp_refcount)); + return exp; +} + +void class_export_put(struct obd_export *exp) +{ + ENTRY; + + CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp, + atomic_read(&exp->exp_refcount) - 1); + LASSERT(atomic_read(&exp->exp_refcount) > 0); + LASSERT(atomic_read(&exp->exp_refcount) < 0x5a5a5a); + if (atomic_dec_and_test(&exp->exp_refcount)) { + struct obd_device *obd = exp->exp_obd; + CDEBUG(D_IOCTL, "destroying export %p/%s\n", exp, + exp->exp_client_uuid.uuid); + + LASSERT(obd != NULL); + + /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. 
*/ + if (exp->exp_connection) + ptlrpc_put_connection_superhack(exp->exp_connection); + + LASSERT(list_empty(&exp->exp_handle.h_link)); + + obd_destroy_export(exp); + + OBD_FREE(exp, sizeof(*exp)); + atomic_dec(&obd->obd_refcount); + wake_up(&obd->obd_refcount_waitq); + } + EXIT; } struct obd_export *class_new_export(struct obd_device *obddev) { struct obd_export *export; - PORTAL_SLAB_ALLOC(export, export_cachep, sizeof(*export)); + OBD_ALLOC(export, sizeof(*export)); if (!export) { CERROR("no memory! (minor %d)\n", obddev->obd_minor); return NULL; } - get_random_bytes(&export->exp_cookie, sizeof(export->exp_cookie)); + atomic_set(&export->exp_refcount, 2); export->exp_obd = obddev; /* XXX this should be in LDLM init */ INIT_LIST_HEAD(&export->exp_ldlm_data.led_held_locks); - INIT_LIST_HEAD(&export->exp_conn_chain); + + INIT_LIST_HEAD(&export->exp_handle.h_link); + class_handle_hash(&export->exp_handle, export_handle_addref); + spin_lock_init(&export->exp_lock); + spin_lock(&obddev->obd_dev_lock); + LASSERT(!obddev->obd_stopping); /* shouldn't happen, but might race */ + atomic_inc(&obddev->obd_refcount); list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports); spin_unlock(&obddev->obd_dev_lock); return export; } -void class_destroy_export(struct obd_export *exp) +void class_unlink_export(struct obd_export *exp) { - LASSERT(exp->exp_cookie != DEAD_HANDLE_MAGIC); - - CDEBUG(D_IOCTL, "destroying export %p/%s\n", exp, - exp->exp_client_uuid.uuid); + class_handle_unhash(&exp->exp_handle); spin_lock(&exp->exp_obd->obd_dev_lock); - list_del(&exp->exp_obd_chain); + list_del_init(&exp->exp_obd_chain); spin_unlock(&exp->exp_obd->obd_dev_lock); - /* XXXshaver no connection here... 
*/ - if (exp->exp_connection) - spin_lock(&exp->exp_connection->c_lock); - list_del(&exp->exp_conn_chain); - if (exp->exp_connection) { - spin_unlock(&exp->exp_connection->c_lock); - ptlrpc_put_connection_superhack(exp->exp_connection); - } + class_export_put(exp); +} - /* Abort any inflight DLM requests and NULL out their (about to be - * freed) import. */ - if (exp->exp_ldlm_data.led_import.imp_obd) - ptlrpc_abort_inflight_superhack(&exp->exp_ldlm_data.led_import, - 1); +/* Import management functions */ +static void import_handle_addref(void *import) +{ + class_import_get(import); +} - PORTAL_SLAB_FREE(exp, export_cachep, sizeof(*exp)); +struct obd_import *class_import_get(struct obd_import *import) +{ + atomic_inc(&import->imp_refcount); + CDEBUG(D_IOCTL, "import %p refcount=%d\n", import, + atomic_read(&import->imp_refcount)); + return import; } -/* a connection defines an export context in which preallocation can - be managed. */ -int class_connect(struct lustre_handle *exporth, struct obd_device *obd, - struct obd_uuid *cluuid) +void class_import_put(struct obd_import *import) { - struct obd_export * export; - if (exporth == NULL) { - LBUG(); - return -EINVAL; - } + ENTRY; - if (obd == NULL) { - LBUG(); - return -EINVAL; - } + CDEBUG(D_IOCTL, "import %p refcount=%d\n", import, + atomic_read(&import->imp_refcount) - 1); - if (cluuid == NULL) { - LBUG(); - return -EINVAL; + LASSERT(atomic_read(&import->imp_refcount) > 0); + LASSERT(atomic_read(&import->imp_refcount) < 0x5a5a5a); + if (!atomic_dec_and_test(&import->imp_refcount)) { + EXIT; + return; } - export = class_new_export(obd); - if (!export) - return -ENOMEM; + CDEBUG(D_IOCTL, "destroying import %p\n", import); - exporth->addr = (__u64) (unsigned long)export; - exporth->cookie = export->exp_cookie; - memcpy(&export->exp_client_uuid, cluuid, sizeof(export->exp_client_uuid)); + ptlrpc_put_connection_superhack(import->imp_connection); - CDEBUG(D_IOCTL, "connect: addr %Lx cookie %Lx\n", - (long 
long)exporth->addr, (long long)exporth->cookie); - return 0; + LASSERT(list_empty(&import->imp_handle.h_link)); + OBD_FREE(import, sizeof(*import)); + EXIT; } -int class_disconnect(struct lustre_handle *conn) +struct obd_import *class_new_import(void) { - struct obd_export *export; - ENTRY; + struct obd_import *imp; - if (!(export = class_conn2export(conn))) { - fixme(); - CDEBUG(D_IOCTL, "disconnect: attempting to free " - "nonexistent client "LPX64"\n", conn->addr); - RETURN(-EINVAL); - } + OBD_ALLOC(imp, sizeof(*imp)); + if (imp == NULL) + return NULL; - CDEBUG(D_IOCTL, "disconnect: addr %Lx cookie %Lx\n", - (long long)conn->addr, (long long)conn->cookie); + INIT_LIST_HEAD(&imp->imp_replay_list); + INIT_LIST_HEAD(&imp->imp_sending_list); + INIT_LIST_HEAD(&imp->imp_delayed_list); + spin_lock_init(&imp->imp_lock); + imp->imp_max_transno = 0; + imp->imp_peer_committed_transno = 0; - class_destroy_export(export); + atomic_set(&imp->imp_refcount, 2); + INIT_LIST_HEAD(&imp->imp_handle.h_link); + class_handle_hash(&imp->imp_handle, import_handle_addref); - RETURN(0); + return imp; } -void class_disconnect_all(struct obd_device *obddev) +void class_destroy_import(struct obd_import *import) { - int again = 1; - - while (again) { - spin_lock(&obddev->obd_dev_lock); - if (!list_empty(&obddev->obd_exports)) { - struct obd_export *export; - struct lustre_handle conn; - int rc; - - export = list_entry(obddev->obd_exports.next, - struct obd_export, - exp_obd_chain); - conn.addr = (__u64)(unsigned long)export; - conn.cookie = export->exp_cookie; - spin_unlock(&obddev->obd_dev_lock); - CERROR("force disconnecting %s:%s export %p\n", - export->exp_obd->obd_type->typ_name, - export->exp_connection ? - (char *)export->exp_connection->c_remote_uuid.uuid : - "<unconnected>", export); - rc = obd_disconnect(&conn); - if (rc < 0) { - /* AED: not so sure about this... We can't - * loop here forever, yet we shouldn't leak - * exports on a struct we will soon destroy. 
- */ - CERROR("destroy export %p with err: rc = %d\n", - export, rc); - class_destroy_export(export); - } - } else { - spin_unlock(&obddev->obd_dev_lock); - again = 0; - } - } -} + LASSERT(import != NULL); -#if 0 + class_handle_unhash(&import->imp_handle); -/* FIXME: Data is a space- or comma-separated list of device IDs. This will - * have to change. */ -int class_multi_setup(struct obd_device *obddev, uint32_t len, void *data) -{ - int count, rc; - char *p; - ENTRY; + /* Abort any inflight DLM requests and NULL out their (about to be + * freed) import. */ + ptlrpc_abort_inflight_superhack(import); - for (p = data, count = 0; p < (char *)data + len; count++) { - char *end; - int tmp = simple_strtoul(p, &end, 0); + class_import_put(import); +} - if (p == end) { - CERROR("invalid device ID starting at: %s\n", p); - GOTO(err_disconnect, rc = -EINVAL); - } +/* a connection defines an export context in which preallocation can + be managed. */ +int class_connect(struct lustre_handle *exporth, struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct obd_export *export; + LASSERT(exporth != NULL); + LASSERT(obd != NULL); + LASSERT(cluuid != NULL); - if (tmp < 0 || tmp >= MAX_OBD_DEVICES) { - CERROR("Trying to sub dev %d - dev no too large\n", - tmp); - GOTO(err_disconnect, rc = -EINVAL); - } + export = class_new_export(obd); + if (export == NULL) + return -ENOMEM; - rc = obd_connect(&obddev->obd_multi_conn[count], &obd_dev[tmp]); - if (rc) { - CERROR("cannot connect to device %d: rc = %d\n", tmp, - rc); - GOTO(err_disconnect, rc); - } + exporth->cookie = export->exp_handle.h_cookie; + memcpy(&export->exp_client_uuid, cluuid, + sizeof(export->exp_client_uuid)); + class_export_put(export); - CDEBUG(D_INFO, "target OBD %d is of type %s\n", count, - obd_dev[tmp].obd_type->typ_name); + CDEBUG(D_IOCTL, "connect: client %s, cookie "LPX64"\n", + cluuid->uuid, exporth->cookie); + return 0; +} - p = end + 1; +int class_disconnect(struct lustre_handle *conn, int failover) +{ + 
struct obd_export *export = class_conn2export(conn); + ENTRY; + + if (export == NULL) { + fixme(); + CDEBUG(D_IOCTL, "disconnect: attempting to free " + "nonexistent client "LPX64"\n", conn->cookie); + RETURN(-EINVAL); } - obddev->obd_multi_count = count; + CDEBUG(D_IOCTL, "disconnect: cookie "LPX64"\n", conn->cookie); + class_unlink_export(export); + class_export_put(export); RETURN(0); - - err_disconnect: - for (count--; count >= 0; count--) - obd_disconnect(&obddev->obd_multi_conn[count]); - return rc; } -/* - * remove all connections to this device - * close all connections to lower devices - * needed for forced unloads of OBD client drivers - */ -int class_multi_cleanup(struct obd_device *obddev) +void class_disconnect_exports(struct obd_device *obd, int failover) { - int i; + int rc; + struct list_head *tmp, *n, work_list; + struct lustre_handle fake_conn; + ENTRY; - for (i = 0; i < obddev->obd_multi_count; i++) { - int rc; - struct obd_device *obd = - class_conn2obd(&obddev->obd_multi_conn[i]); + /* Move all of the exports from obd_exports to a work list, en masse. 
*/ + spin_lock(&obd->obd_dev_lock); + list_add(&work_list, &obd->obd_exports); + list_del_init(&obd->obd_exports); + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "OBD device %d (%p) has exports, " + "disconnecting them\n", obd->obd_minor, obd); + list_for_each_safe(tmp, n, &work_list) { + struct obd_export *exp = list_entry(tmp, struct obd_export, + exp_obd_chain); + + class_export_get(exp); + fake_conn.cookie = exp->exp_handle.h_cookie; + rc = obd_disconnect(&fake_conn, failover); + /* exports created from last_rcvd data, and "fake" + exports created by lctl don't have an import */ + if (exp->exp_ldlm_data.led_import != NULL) + class_destroy_import(exp->exp_ldlm_data.led_import); + class_export_put(exp); - if (!obd) { - CERROR("no such device [i %d]\n", i); - RETURN(-EINVAL); + if (rc) { + CDEBUG(D_IOCTL, "disconnecting export %p failed: %d\n", + exp, rc); + } else { + CDEBUG(D_IOCTL, "export %p disconnected\n", exp); } - - rc = obd_disconnect(&obddev->obd_multi_conn[i]); - if (rc) - CERROR("disconnect failure %d\n", obd->obd_minor); } - return 0; + EXIT; } -#endif diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 26bbdf7..2984e9c 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -31,6 +31,7 @@ #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) #include <asm/statfs.h> #endif +#include <linux/seq_file.h> #else #include <liblustre.h> @@ -100,7 +101,7 @@ int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, OBD_FREE(pathcopy, pathsize); - if ((cur_root==NULL) || (proc==NULL)) { + if ((cur_root == NULL) || (proc == NULL)) { CERROR("LprocFS: No memory to create /proc entry %s", list->name); return -ENOMEM; @@ -259,13 +260,14 @@ int lprocfs_rd_filegroups(char* page, char **start, off_t off, int count, int lprocfs_rd_server_uuid(char* page, char **start, off_t off, int count, int *eof, void *data) { - struct obd_device* obd = (struct obd_device*)data; - struct 
client_obd* cli; + struct obd_device *obd = (struct obd_device *)data; + struct client_obd *cli; LASSERT(obd != NULL); cli = &obd->u.cli; *eof = 1; - return snprintf(page, count, "%s\n", cli->cl_target_uuid.uuid); + return snprintf(page, count, "%s\n", + cli->cl_import->imp_target_uuid.uuid); } int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count, @@ -275,7 +277,7 @@ int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count, struct ptlrpc_connection *conn; LASSERT(obd != NULL); - conn = obd->u.cli.cl_import.imp_connection; + conn = obd->u.cli.cl_import->imp_connection; LASSERT(conn != NULL); *eof = 1; return snprintf(page, count, "%s\n", conn->c_remote_uuid.uuid); @@ -318,6 +320,251 @@ int lprocfs_obd_detach(struct obd_device *dev) return 0; } +struct lprocfs_counters* lprocfs_alloc_counters(unsigned int num) +{ + struct lprocfs_counters* cntrs; + int csize; + if (num == 0) + return NULL; + + csize = offsetof(struct lprocfs_counters, cntr[num]); + OBD_ALLOC(cntrs, csize); + if (cntrs != NULL) { + cntrs->num = num; + } + return cntrs; +} + +void lprocfs_free_counters(struct lprocfs_counters* cntrs) +{ + if (cntrs != NULL) { + int csize = offsetof(struct lprocfs_counters, cntr[cntrs->num]); OBD_FREE(cntrs, csize); + } +} + +/* Reset counter under lock */ +int lprocfs_counter_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct lprocfs_counters *cntrs = (struct lprocfs_counters*) data; + unsigned int i; + LASSERT(cntrs != NULL); + + for (i = 0; i < cntrs->num; i++) { + struct lprocfs_counter *cntr = &(cntrs->cntr[i]); + spinlock_t *lock = (cntr->config & LPROCFS_CNTR_EXTERNALLOCK) ? 
+ cntr->l.external : &cntr->l.internal; + + spin_lock(lock); + cntr->count = 0; + cntr->sum = 0; + cntr->min = (~(__u64)0); + cntr->max = 0; + cntr->sumsquare = 0; + spin_unlock(lock); + } + return 0; +} + +static void *lprocfs_counters_seq_start(struct seq_file *p, loff_t *pos) +{ + struct lprocfs_counters *cntrs = p->private; + return (*pos >= cntrs->num) ? NULL : (void*) &cntrs->cntr[*pos]; +} + +static void lprocfs_counters_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lprocfs_counters_seq_next(struct seq_file *p, void *v, + loff_t *pos) +{ + struct lprocfs_counters *cntrs = p->private; + ++*pos; + return (*pos >= cntrs->num) ? NULL : (void*) &(cntrs->cntr[*pos]); +} + +/* seq file export of one lprocfs counter */ +static int lprocfs_counters_seq_show(struct seq_file *p, void *v) +{ + struct lprocfs_counters *cntrs = p->private; + struct lprocfs_counter *cntr = v; + spinlock_t *lock; + struct lprocfs_counter c; + int rc = 0; + + if (cntr == &(cntrs->cntr[0])) { + struct timeval now; + do_gettimeofday(&now); + rc = seq_printf(p, "%-25s %lu.%lu secs.usecs\n", + "snapshot_time", now.tv_sec, now.tv_usec); + if (rc < 0) + return rc; + } + + /* Take a snapshot of the counter under lock */ + lock = (cntr->config & LPROCFS_CNTR_EXTERNALLOCK) ? + cntr->l.external : &cntr->l.internal; + spin_lock(lock); + + c.count = cntr->count; + c.sum = cntr->sum; + c.min = cntr->min; + c.max = cntr->max; + c.sumsquare = cntr->sumsquare; + + spin_unlock(lock); + + rc = seq_printf(p, "%-25s "LPU64" samples [%s]", cntr->name, c.count, + cntr->units); + if (rc < 0) + goto out; + + if ((cntr->config & LPROCFS_CNTR_AVGMINMAX) && (c.count > 0)) { + rc = seq_printf(p, " "LPU64" "LPU64" "LPU64, c.min,c.max,c.sum); + if (rc < 0) + goto out; + if (cntr->config & LPROCFS_CNTR_STDDEV) + rc = seq_printf(p, " "LPU64, c.sumsquare); + if (rc < 0) + goto out; + } + rc = seq_printf(p, "\n"); + out: + return (rc < 0) ? 
rc : 0; +} + +struct seq_operations lprocfs_counters_seq_sops = { + .start = lprocfs_counters_seq_start, + .stop = lprocfs_counters_seq_stop, + .next = lprocfs_counters_seq_next, + .show = lprocfs_counters_seq_show, +}; + +static int lprocfs_counters_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = inode->u.generic_ip; + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lprocfs_counters_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = dp->data; + return 0; +} + +struct file_operations lprocfs_counters_seq_fops = { + .open = lprocfs_counters_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int lprocfs_register_counters(struct proc_dir_entry *root, const char* name, + struct lprocfs_counters *cntrs) +{ + struct proc_dir_entry *entry; + LASSERT(root != NULL); + + entry = create_proc_entry(name, 0444, root); + if (entry == NULL) + return -ENOMEM; + entry->proc_fops = &lprocfs_counters_seq_fops; + entry->data = (void*) cntrs; + entry->write_proc = lprocfs_counter_write; + return 0; +} + +#define LPROCFS_OBD_OP_INIT(base, cntrs, op) \ +do { \ + unsigned int coffset = base + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < cntrs->num); \ + LPROCFS_COUNTER_INIT(&cntrs->cntr[coffset], 0, NULL, #op, "reqs"); \ +} while (0) + + +int lprocfs_alloc_obd_counters(struct obd_device *obddev, + unsigned int num_private_counters) +{ + struct lprocfs_counters* obdops_cntrs; + unsigned int num_counters; + int rc, i; + + LASSERT(obddev->counters == NULL); + LASSERT(obddev->obd_proc_entry != NULL); + LASSERT(obddev->cntr_base == 0); + + num_counters = 1 + OBD_COUNTER_OFFSET(san_preprw)+num_private_counters; + obdops_cntrs = lprocfs_alloc_counters(num_counters); + if (!obdops_cntrs) + return -ENOMEM; + + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, iocontrol); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, get_info); + LPROCFS_OBD_OP_INIT(num_private_counters, 
obdops_cntrs, set_info); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, attach); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, detach); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, setup); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, cleanup); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, connect); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, disconnect); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, statfs); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, syncfs); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, packmd); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, unpackmd); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, preallocate); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, create); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, destroy); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, setattr); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, getattr); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, getattr_async); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, open); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, close); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, brw); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, brw_async); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, punch); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, sync); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, migrate); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, copy); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, iterate); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, preprw); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, commitrw); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, enqueue); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, match); + 
LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, cancel); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, cancel_unused); + LPROCFS_OBD_OP_INIT(num_private_counters, obdops_cntrs, san_preprw); + + for (i = num_private_counters; i < num_counters; i++) { + /* If this assertion failed, it is likely that an obd + * operation was added to struct obd_ops in + * <linux/obd.h>, and that the corresponding line item + * LPROCFS_OBD_OP_INIT(.., .., opname) + * is missing from the list above. */ + LASSERT(obdops_cntrs->cntr[i].name != NULL); + } + rc = lprocfs_register_counters(obddev->obd_proc_entry, "obd_stats", + obdops_cntrs); + if (rc < 0) { + lprocfs_free_counters(obdops_cntrs); + } else { + obddev->counters = obdops_cntrs; + obddev->cntr_base = num_private_counters; + } + return rc; +} + +void lprocfs_free_obd_counters(struct obd_device *obddev) +{ + struct lprocfs_counters* obdops_cntrs = obddev->counters; + if (obdops_cntrs != NULL) { + obddev->counters = NULL; + lprocfs_free_counters(obdops_cntrs); + } +} + #endif /* LPROCFS*/ EXPORT_SYMBOL(lprocfs_register); @@ -325,6 +572,11 @@ EXPORT_SYMBOL(lprocfs_remove); EXPORT_SYMBOL(lprocfs_add_vars); EXPORT_SYMBOL(lprocfs_obd_attach); EXPORT_SYMBOL(lprocfs_obd_detach); +EXPORT_SYMBOL(lprocfs_alloc_counters); +EXPORT_SYMBOL(lprocfs_free_counters); +EXPORT_SYMBOL(lprocfs_register_counters); +EXPORT_SYMBOL(lprocfs_alloc_obd_counters); +EXPORT_SYMBOL(lprocfs_free_obd_counters); EXPORT_SYMBOL(lprocfs_rd_u64); EXPORT_SYMBOL(lprocfs_rd_uuid); diff --git a/lustre/obdclass/lustre_handles.c b/lustre/obdclass/lustre_handles.c index 01dd75b..06f86ad 100644 --- a/lustre/obdclass/lustre_handles.c +++ b/lustre/obdclass/lustre_handles.c @@ -20,7 +20,7 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_CLASS #ifdef __KERNEL__ #include <linux/types.h> #include <linux/random.h> @@ -146,7 +146,7 @@ static void cleanup_all_handles(void) class_handle_unhash_nolock(h); } } - spin_lock(&handle_lock); + spin_unlock(&handle_lock); } void class_handle_cleanup(void) diff --git a/lustre/lib/simple.c b/lustre/obdclass/simple.c similarity index 83% rename from lustre/lib/simple.c rename to lustre/obdclass/simple.c index c0d4f31..0ce54a3 100644 --- a/lustre/lib/simple.c +++ b/lustre/obdclass/simple.c @@ -29,21 +29,18 @@ #define DEBUG_SUBSYSTEM S_FILTER -#include <linux/obd_support.h> #include <linux/obd.h> -#include <linux/lustre_mds.h> #include <linux/lustre_lib.h> -#include <linux/lustre_net.h> -#ifdef OBD_CTXT_DEBUG /* Debugging check only needed during development */ -#define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC) -#define ASSERT_NOT_KERNEL_CTXT(msg) LASSERT(!segment_eq(get_fs(), get_ds())) -#define ASSERT_KERNEL_CTXT(msg) LASSERT(segment_eq(get_fs(), get_ds())) +#ifdef OBD_CTXT_DEBUG +# define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC) +# define ASSERT_NOT_KERNEL_CTXT(msg) LASSERT(!segment_eq(get_fs(), get_ds())) +# define ASSERT_KERNEL_CTXT(msg) LASSERT(segment_eq(get_fs(), get_ds())) #else -#define ASSERT_CTXT_MAGIC(magic) do {} while(0) -#define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0) -#define ASSERT_KERNEL_CTXT(msg) do {} while(0) +# define ASSERT_CTXT_MAGIC(magic) do {} while(0) +# define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0) +# define ASSERT_KERNEL_CTXT(msg) do {} while(0) #endif /* push / pop to root of obd store */ @@ -70,6 +67,7 @@ void push_ctxt(struct obd_run_ctxt *save, struct obd_run_ctxt *new_ctx, LASSERT(atomic_read(&new_ctx->pwd->d_count)); save->pwd = dget(current->fs->pwd); save->pwdmnt = mntget(current->fs->pwdmnt); + save->ngroups = current->ngroups; LASSERT(save->pwd); LASSERT(save->pwdmnt); @@ -77,13 +75,17 @@ void push_ctxt(struct 
obd_run_ctxt *save, struct obd_run_ctxt *new_ctx, LASSERT(new_ctx->pwdmnt); if (uc) { - save->fsuid = current->fsuid; - save->fsgid = current->fsgid; - save->cap = current->cap_effective; + save->ouc.ouc_fsuid = current->fsuid; + save->ouc.ouc_fsgid = current->fsgid; + save->ouc.ouc_cap = current->cap_effective; + save->ouc.ouc_suppgid1 = current->groups[0]; + save->ouc.ouc_suppgid2 = current->groups[1]; current->fsuid = uc->ouc_fsuid; current->fsgid = uc->ouc_fsgid; current->cap_effective = uc->ouc_cap; + current->ngroups = 0; + if (uc->ouc_suppgid1 != -1) current->groups[current->ngroups++] = uc->ouc_suppgid1; if (uc->ouc_suppgid2 != -1) @@ -103,6 +105,7 @@ void push_ctxt(struct obd_run_ctxt *save, struct obd_run_ctxt *new_ctx, atomic_read(¤t->fs->pwdmnt->mnt_count)); */ } +EXPORT_SYMBOL(push_ctxt); void pop_ctxt(struct obd_run_ctxt *saved, struct obd_run_ctxt *new_ctx, struct obd_ucred *uc) @@ -132,14 +135,13 @@ void pop_ctxt(struct obd_run_ctxt *saved, struct obd_run_ctxt *new_ctx, dput(saved->pwd); mntput(saved->pwdmnt); if (uc) { - current->fsuid = saved->fsuid; - current->fsgid = saved->fsgid; - current->cap_effective = saved->cap; + current->fsuid = saved->ouc.ouc_fsuid; + current->fsgid = saved->ouc.ouc_fsgid; + current->cap_effective = saved->ouc.ouc_cap; + current->ngroups = saved->ngroups; - if (uc->ouc_suppgid1 != -1) - current->ngroups--; - if (uc->ouc_suppgid2 != -1) - current->ngroups--; + current->groups[0] = saved->ouc.ouc_suppgid1; + current->groups[1] = saved->ouc.ouc_suppgid2; } /* @@ -153,6 +155,7 @@ void pop_ctxt(struct obd_run_ctxt *saved, struct obd_run_ctxt *new_ctx, atomic_read(¤t->fs->pwdmnt->mnt_count)); */ } +EXPORT_SYMBOL(pop_ctxt); /* utility to make a file */ struct dentry *simple_mknod(struct dentry *dir, char *name, int mode) @@ -169,7 +172,7 @@ struct dentry *simple_mknod(struct dentry *dir, char *name, int mode) GOTO(out_up, dchild); if (dchild->d_inode) { - if ((dchild->d_inode->i_mode & S_IFMT) != S_IFREG) + if 
(!S_ISREG(dchild->d_inode->i_mode)) GOTO(out_err, err = -EEXIST); GOTO(out_up, dchild); @@ -187,6 +190,7 @@ out_err: out_up: return dchild; } +EXPORT_SYMBOL(simple_mknod); /* utility to make a directory */ struct dentry *simple_mkdir(struct dentry *dir, char *name, int mode) @@ -220,25 +224,27 @@ out_err: out_up: return dchild; } +EXPORT_SYMBOL(simple_mkdir); /* * Read a file from within kernel context. Prior to calling this * function we should already have done a push_ctxt(). */ -int lustre_fread(struct file *file, char *str, int len, loff_t *off) +int lustre_fread(struct file *file, void *buf, int len, loff_t *off) { ASSERT_KERNEL_CTXT("kernel doing read outside kernel context\n"); if (!file || !file->f_op || !file->f_op->read || !off) RETURN(-ENOSYS); - return file->f_op->read(file, str, len, off); + return file->f_op->read(file, buf, len, off); } +EXPORT_SYMBOL(lustre_fread); /* * Write a file from within kernel context. Prior to calling this * function we should already have done a push_ctxt(). */ -int lustre_fwrite(struct file *file, const char *str, int len, loff_t *off) +int lustre_fwrite(struct file *file, const void *buf, int len, loff_t *off) { ENTRY; ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n"); @@ -252,8 +258,9 @@ int lustre_fwrite(struct file *file, const char *str, int len, loff_t *off) if (!file->f_op->write) RETURN(-EROFS); - RETURN(file->f_op->write(file, str, len, off)); + RETURN(file->f_op->write(file, buf, len, off)); } +EXPORT_SYMBOL(lustre_fwrite); /* * Sync a file from within kernel context. 
Prior to calling this @@ -268,3 +275,4 @@ int lustre_fsync(struct file *file) RETURN(file->f_op->fsync(file, file->f_dentry, 0)); } +EXPORT_SYMBOL(lustre_fsync); diff --git a/lustre/obdclass/statfs_pack.c b/lustre/obdclass/statfs_pack.c index 1998ba3..1a5f6fa 100644 --- a/lustre/obdclass/statfs_pack.c +++ b/lustre/obdclass/statfs_pack.c @@ -28,35 +28,18 @@ #define EXPORT_SYMTAB #ifndef __KERNEL__ #include <liblustre.h> -#endif - +#else #include <linux/version.h> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) #include <asm/statfs.h> #endif +#endif #include <linux/lustre_export.h> #include <linux/lustre_net.h> #include <linux/obd_support.h> #include <linux/obd_class.h> -void obd_statfs_pack(struct obd_statfs *tgt, struct obd_statfs *src) -{ - tgt->os_type = HTON__u64(src->os_type); - tgt->os_blocks = HTON__u64(src->os_blocks); - tgt->os_bfree = HTON__u64(src->os_bfree); - tgt->os_bavail = HTON__u64(src->os_bavail); - tgt->os_files = HTON__u64(src->os_files); - tgt->os_ffree = HTON__u64(src->os_ffree); - tgt->os_bsize = HTON__u32(src->os_bsize); - tgt->os_namelen = HTON__u32(src->os_namelen); -} - -void obd_statfs_unpack(struct obd_statfs *tgt, struct obd_statfs *src) -{ - obd_statfs_pack(tgt, src); -} - void statfs_pack(struct obd_statfs *osfs, struct statfs *sfs) { osfs->os_type = sfs->f_type; @@ -89,27 +72,33 @@ int obd_self_statfs(struct obd_device *obd, struct statfs *sfs) int rc; ENTRY; + LASSERT( obd != NULL ); + + spin_lock(&obd->obd_dev_lock); if (list_empty(&obd->obd_exports)) { + spin_unlock(&obd->obd_dev_lock); export = my_export = class_new_export(obd); if (export == NULL) RETURN(-ENOMEM); - } else + } else { export = list_entry(obd->obd_exports.next, typeof(*export), exp_obd_chain); - conn.addr = (unsigned long)export; - conn.cookie = export->exp_cookie; + export = class_export_get(export); + spin_unlock(&obd->obd_dev_lock); + } + conn.cookie = export->exp_handle.h_cookie; rc = obd_statfs(&conn, &osfs); if (!rc) statfs_unpack(sfs, &osfs); if 
(my_export) - class_destroy_export(my_export); + class_unlink_export(my_export); + + class_export_put(export); RETURN(rc); } -EXPORT_SYMBOL(obd_statfs_pack); -EXPORT_SYMBOL(obd_statfs_unpack); EXPORT_SYMBOL(statfs_pack); EXPORT_SYMBOL(statfs_unpack); EXPORT_SYMBOL(obd_self_statfs); diff --git a/lustre/obdclass/sysctl.c b/lustre/obdclass/sysctl.c index 125f392..3d68f2e 100644 --- a/lustre/obdclass/sysctl.c +++ b/lustre/obdclass/sysctl.c @@ -75,7 +75,7 @@ static ctl_table obd_table[] = { {OBD_RESET, "reset", NULL, 0, 0644, NULL, &obd_sctl_reset}, {OBD_TIMEOUT, "timeout", &obd_timeout, sizeof(int), 0644, NULL, &proc_dointvec}, /* XXX need to lock so we avoid update races with the recovery upcall! */ - {OBD_UPCALL, "recovery_upcall", obd_recovery_upcall, 128, 0644, NULL, + {OBD_UPCALL, "upcall", obd_lustre_upcall, 128, 0644, NULL, &proc_dostring, &sysctl_string }, {OBD_SYNCFILTER, "filter_sync_on_commit", &obd_sync_filter, sizeof(int), 0644, NULL, &proc_dointvec}, diff --git a/lustre/obdclass/uuid.c b/lustre/obdclass/uuid.c index fed9a8f..9f103df 100644 --- a/lustre/obdclass/uuid.c +++ b/lustre/obdclass/uuid.c @@ -13,15 +13,17 @@ #define DEBUG_SUBSYSTEM S_CLASS #ifdef __KERNEL__ -#include <linux/ctype.h> -#include <linux/kernel.h> -#else -#include <liblustre.h> +# include <linux/ctype.h> +# include <linux/kernel.h> +# include <linux/sched.h> +# include <linux/smp_lock.h> +#else +# include <liblustre.h> #endif #include <linux/obd_support.h> #include <linux/obd_class.h> -#include <linux/smp_lock.h> +#include <linux/obd_ost.h> struct uuid { __u32 time_low; @@ -138,3 +140,26 @@ void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) uuid.node[0], uuid.node[1], uuid.node[2], uuid.node[3], uuid.node[4], uuid.node[5]); } + +struct obd_device *client_tgtuuid2obd(struct obd_uuid *tgtuuid) +{ + int i; + + for (i = 0; i < MAX_OBD_DEVICES; i++) { + struct obd_device *obd = &obd_dev[i]; + if (obd->obd_type == NULL) + continue; + if ((strncmp(obd->obd_type->typ_name, 
LUSTRE_OSC_NAME, + sizeof LUSTRE_OSC_NAME) == 0) || + (strncmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME, + sizeof LUSTRE_MDC_NAME) == 0)) { + struct client_obd *cli = &obd->u.cli; + struct obd_import *imp = cli->cl_import; + if (strncmp(tgtuuid->uuid, imp->imp_target_uuid.uuid, + sizeof(imp->imp_target_uuid)) == 0) + return obd; + } + } + + return NULL; +} diff --git a/lustre/obdecho/Makefile.am b/lustre/obdecho/Makefile.am index f8ed503..08136d7 100644 --- a/lustre/obdecho/Makefile.am +++ b/lustre/obdecho/Makefile.am @@ -17,4 +17,3 @@ obdecho_SOURCES = echo.c echo_client.c lproc_echo.c $(LINX) endif include $(top_srcdir)/Rules - diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c index 1796957..1eaa282 100644 --- a/lustre/obdecho/echo.c +++ b/lustre/obdecho/echo.c @@ -63,7 +63,7 @@ struct xprocfs_io_stat { __u64 st_create_reqs; __u64 st_destroy_reqs; __u64 st_statfs_reqs; - __u64 st_sync_reqs; + __u64 st_syncfs_reqs; __u64 st_open_reqs; __u64 st_close_reqs; __u64 st_punch_reqs; @@ -77,6 +77,7 @@ do { \ xprocfs_iostats[smp_processor_id()].field += (count); \ } while (0) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) #define DECLARE_XPROCFS_SUM_STAT(field) \ static long long \ xprocfs_sum_##field (void) \ @@ -88,7 +89,7 @@ xprocfs_sum_##field (void) \ stat += xprocfs_iostats[i].field; \ return (stat); \ } -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + DECLARE_XPROCFS_SUM_STAT (st_read_bytes) DECLARE_XPROCFS_SUM_STAT (st_read_reqs) DECLARE_XPROCFS_SUM_STAT (st_write_bytes) @@ -98,7 +99,7 @@ DECLARE_XPROCFS_SUM_STAT (st_setattr_reqs) DECLARE_XPROCFS_SUM_STAT (st_create_reqs) DECLARE_XPROCFS_SUM_STAT (st_destroy_reqs) DECLARE_XPROCFS_SUM_STAT (st_statfs_reqs) -DECLARE_XPROCFS_SUM_STAT (st_sync_reqs) +DECLARE_XPROCFS_SUM_STAT (st_syncfs_reqs) DECLARE_XPROCFS_SUM_STAT (st_open_reqs) DECLARE_XPROCFS_SUM_STAT (st_close_reqs) DECLARE_XPROCFS_SUM_STAT (st_punch_reqs) @@ -146,7 +147,7 @@ xprocfs_init (char *name) xprocfs_dir = proc_mkdir (dirname, NULL); if 
(xprocfs_dir == NULL) { - CERROR ("Can't make dir\n"); + CERROR ("Can't make procfs dir %s\n", dirname); return; } @@ -160,7 +161,7 @@ xprocfs_init (char *name) xprocfs_add_stat ("create_reqs", xprocfs_sum_st_create_reqs); xprocfs_add_stat ("destroy_reqs", xprocfs_sum_st_destroy_reqs); xprocfs_add_stat ("statfs_reqs", xprocfs_sum_st_statfs_reqs); - xprocfs_add_stat ("sync_reqs", xprocfs_sum_st_sync_reqs); + xprocfs_add_stat ("syncfs_reqs", xprocfs_sum_st_syncfs_reqs); xprocfs_add_stat ("open_reqs", xprocfs_sum_st_open_reqs); xprocfs_add_stat ("close_reqs", xprocfs_sum_st_close_reqs); xprocfs_add_stat ("punch_reqs", xprocfs_sum_st_punch_reqs); @@ -181,7 +182,7 @@ void xprocfs_fini (void) remove_proc_entry ("create_reqs", xprocfs_dir); remove_proc_entry ("destroy_reqs", xprocfs_dir); remove_proc_entry ("statfs_reqs", xprocfs_dir); - remove_proc_entry ("sync_reqs", xprocfs_dir); + remove_proc_entry ("syncfs_reqs", xprocfs_dir); remove_proc_entry ("open_reqs", xprocfs_dir); remove_proc_entry ("close_reqs", xprocfs_dir); remove_proc_entry ("punch_reqs", xprocfs_dir); @@ -191,20 +192,20 @@ void xprocfs_fini (void) } static int echo_connect(struct lustre_handle *conn, struct obd_device *obd, - struct obd_uuid *cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) + struct obd_uuid *cluuid) { return class_connect(conn, obd, cluuid); } -static int echo_disconnect(struct lustre_handle *conn) +static int echo_disconnect(struct lustre_handle *conn, int failover) { struct obd_export *exp = class_conn2export(conn); LASSERT (exp != NULL); - ldlm_cancel_locks_for_export (exp); - return (class_disconnect (conn)); + ldlm_cancel_locks_for_export(exp); + class_export_put(exp); + return (class_disconnect(conn, failover)); } static __u64 echo_next_id(struct obd_device *obddev) @@ -226,7 +227,7 @@ int echo_create(struct lustre_handle *conn, struct obdo *oa, XPROCFS_BUMP_MYCPU_IOSTAT (st_create_reqs, 1); if (!obd) { - CERROR("invalid client "LPX64"\n", conn->addr); + 
CERROR("invalid client cookie "LPX64"\n", conn->cookie); return -EINVAL; } @@ -255,7 +256,7 @@ int echo_destroy(struct lustre_handle *conn, struct obdo *oa, XPROCFS_BUMP_MYCPU_IOSTAT (st_destroy_reqs, 1); if (!obd) { - CERROR("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", conn->cookie); RETURN(-EINVAL); } @@ -275,7 +276,8 @@ int echo_destroy(struct lustre_handle *conn, struct obdo *oa, } static int echo_open(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *md, struct obd_trans_info *oti) + struct lov_stripe_md *md, struct obd_trans_info *oti, + struct obd_client_handle *och) { struct lustre_handle *fh = obdo_handle (oa); struct obd_device *obd = class_conn2obd (conn); @@ -283,7 +285,7 @@ static int echo_open(struct lustre_handle *conn, struct obdo *oa, XPROCFS_BUMP_MYCPU_IOSTAT (st_open_reqs, 1); if (!obd) { - CERROR ("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", conn->cookie); return (-EINVAL); } @@ -292,7 +294,6 @@ static int echo_open(struct lustre_handle *conn, struct obdo *oa, return (-EINVAL); } - fh->addr = oa->o_id; fh->cookie = ECHO_HANDLE_MAGIC; oa->o_valid |= OBD_MD_FLHANDLE; @@ -308,7 +309,7 @@ static int echo_close(struct lustre_handle *conn, struct obdo *oa, XPROCFS_BUMP_MYCPU_IOSTAT (st_close_reqs, 1); if (!obd) { - CERROR("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", conn->cookie); return (-EINVAL); } @@ -334,7 +335,7 @@ static int echo_getattr(struct lustre_handle *conn, struct obdo *oa, XPROCFS_BUMP_MYCPU_IOSTAT (st_getattr_reqs, 1); if (!obd) { - CERROR("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", conn->cookie); RETURN(-EINVAL); } @@ -357,7 +358,7 @@ static int echo_setattr(struct lustre_handle *conn, struct obdo *oa, XPROCFS_BUMP_MYCPU_IOSTAT (st_setattr_reqs, 1); if (!obd) { - CERROR("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", 
conn->cookie); RETURN(-EINVAL); } @@ -376,7 +377,7 @@ static int echo_setattr(struct lustre_handle *conn, struct obdo *oa, /* This allows us to verify that desc_private is passed unmolested */ #define DESC_PRIV 0x10293847 -int echo_preprw(int cmd, struct lustre_handle *conn, int objcount, +int echo_preprw(int cmd, struct obd_export *export, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_remote *nb, struct niobuf_local *res, void **desc_private, struct obd_trans_info *oti) @@ -392,11 +393,9 @@ int echo_preprw(int cmd, struct lustre_handle *conn, int objcount, else XPROCFS_BUMP_MYCPU_IOSTAT (st_read_reqs, 1); - obd = class_conn2obd(conn); - if (!obd) { - CERROR("invalid client "LPX64"\n", conn->addr); + obd = export->exp_obd; + if (obd == NULL) RETURN(-EINVAL); - } memset(res, 0, sizeof(*res) * niocount); @@ -405,8 +404,6 @@ int echo_preprw(int cmd, struct lustre_handle *conn, int objcount, *desc_private = (void *)DESC_PRIV; - obd_kmap_get(niocount, 1); - for (i = 0; i < objcount; i++, obj++) { int gfp_mask = (obj->ioo_id & 1) ? 
GFP_HIGHUSER : GFP_KERNEL; int isobj0 = obj->ioo_id == 0; @@ -434,24 +431,30 @@ int echo_preprw(int cmd, struct lustre_handle *conn, int objcount, atomic_inc(&obd->u.echo.eo_prep); r->offset = nb->offset; - r->addr = kmap(r->page); r->len = nb->len; + LASSERT ((r->offset & (PAGE_SIZE - 1)) + r->len <= PAGE_SIZE); - CDEBUG(D_PAGE, "$$$$ get page %p, addr %p@"LPU64"\n", - r->page, r->addr, r->offset); + CDEBUG(D_PAGE, "$$$$ get page %p @ "LPU64" for %d\n", + r->page, r->offset, r->len); if (cmd == OBD_BRW_READ) { + r->rc = r->len; XPROCFS_BUMP_MYCPU_IOSTAT(st_read_bytes,r->len); - if (verify) - page_debug_setup(r->addr, r->len, + if (verify) { + page_debug_setup(kmap (r->page), r->len, r->offset,obj->ioo_id); + kunmap (r->page); + } + r->rc = r->len; } else { XPROCFS_BUMP_MYCPU_IOSTAT(st_write_bytes, r->len); - if (verify) - page_debug_setup(r->addr, r->len, + if (verify) { + page_debug_setup(kmap (r->page), r->len, 0xecc0ecc0ecc0ecc0, 0xecc0ecc0ecc0ecc0); + kunmap (r->page); + } } } } @@ -474,28 +477,23 @@ preprw_cleanup: __free_pages(r->page, 0); atomic_dec(&obd->u.echo.eo_prep); } - obd_kmap_put(niocount); memset(res, 0, sizeof(*res) * niocount); return rc; } -int echo_commitrw(int cmd, struct lustre_handle *conn, int objcount, +int echo_commitrw(int cmd, struct obd_export *export, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_local *res, void *desc_private, struct obd_trans_info *oti) { struct obd_device *obd; struct niobuf_local *r = res; - int rc = 0; - int vrc = 0; - int i; + int i, vrc = 0, rc = 0; ENTRY; - obd = class_conn2obd(conn); - if (!obd) { - CERROR("invalid client "LPX64"\n", conn->addr); + obd = export->exp_obd; + if (obd == NULL) RETURN(-EINVAL); - } if ((cmd & OBD_BRW_RWMASK) == OBD_BRW_READ) { CDEBUG(D_PAGE, "reading %d obdos with %d IOs\n", @@ -520,11 +518,14 @@ int echo_commitrw(int cmd, struct lustre_handle *conn, int objcount, struct page *page = r->page; void *addr; + kmap (page); + if (!page || !(addr = 
page_address(page)) || !kern_addr_valid(addr)) { CERROR("bad page objid "LPU64":%p, buf %d/%d\n", obj->ioo_id, page, j, obj->ioo_bufcnt); + kunmap (page); GOTO(commitrw_cleanup, rc = -EFAULT); } @@ -541,7 +542,6 @@ int echo_commitrw(int cmd, struct lustre_handle *conn, int objcount, kunmap(page); /* NB see comment above regarding object0 pages */ - obd_kmap_put(1); __free_pages(page, 0); atomic_dec(&obd->u.echo.eo_prep); } @@ -556,8 +556,6 @@ commitrw_cleanup: while (++r < res + niocount) { struct page *page = r->page; - kunmap(page); - obd_kmap_put(1); /* NB see comment above regarding object0 pages */ __free_pages(page, 0); atomic_dec(&obd->u.echo.eo_prep); @@ -584,7 +582,7 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(0); } -static int echo_cleanup(struct obd_device *obddev) +static int echo_cleanup(struct obd_device *obddev, int force, int failover) { ENTRY; diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index 2239762..31f7334 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -355,8 +355,14 @@ echo_put_object (struct ec_object *eco) eco->eco_refcount--; LASSERT (eco->eco_refcount >= 0); - if (eco->eco_refcount != 0 || - !eco->eco_deleted) { + CDEBUG(D_INFO, "put %p: "LPX64"=%u#%u&%d refs %d del %d\n", + eco, eco->eco_id, + eco->eco_lsm->lsm_stripe_size, + eco->eco_lsm->lsm_stripe_count, + eco->eco_lsm->lsm_stripe_offset, + eco->eco_refcount, eco->eco_deleted); + + if (eco->eco_refcount != 0 || !eco->eco_deleted) { spin_unlock (&ec->ec_lock); return; } @@ -367,7 +373,7 @@ echo_put_object (struct ec_object *eco) * attempting to enqueue on this object number until we can be * sure there will be no more lock callbacks. 
*/ - obd_cancel_unused (&ec->ec_conn, eco->eco_lsm, 0); + obd_cancel_unused(&ec->ec_conn, eco->eco_lsm, 0, NULL); /* now we can let it go */ spin_lock (&ec->ec_lock); @@ -414,7 +420,6 @@ echo_client_kbrw (struct obd_device *obd, int rw, obd_off offset, obd_size count) { struct echo_client_obd *ec = &obd->u.echo_client; - struct obd_brw_set *set; obd_count npages; struct brw_page *pga; struct brw_page *pgp; @@ -438,17 +443,12 @@ echo_client_kbrw (struct obd_device *obd, int rw, lsm->lsm_object_id != oa->o_id)) return (-EINVAL); - set = obd_brw_set_new(); - if (set == NULL) - return (-ENOMEM); - /* XXX think again with misaligned I/O */ npages = count >> PAGE_SHIFT; - rc = -ENOMEM; OBD_ALLOC(pga, npages * sizeof(*pga)); if (pga == NULL) - goto out_0; + return (-ENOMEM); for (i = 0, pgp = pga, off = offset; i < npages; @@ -459,7 +459,7 @@ echo_client_kbrw (struct obd_device *obd, int rw, rc = -ENOMEM; pgp->pg = alloc_pages (gfp_mask, 0); if (pgp->pg == NULL) - goto out_1; + goto out; pgp->count = PAGE_SIZE; pgp->off = off; @@ -484,12 +484,9 @@ echo_client_kbrw (struct obd_device *obd, int rw, } } - set->brw_callback = ll_brw_sync_wait; - rc = obd_brw(rw, &ec->ec_conn, lsm, npages, pga, set, NULL); - if (rc == 0) - rc = ll_brw_sync_wait(set, CB_PHASE_START); + rc = obd_brw(rw, &ec->ec_conn, lsm, npages, pga, NULL); - out_1: + out: if (rc != 0) verify = 0; @@ -514,8 +511,6 @@ echo_client_kbrw (struct obd_device *obd, int rw, __free_pages(pgp->pg, 0); } OBD_FREE(pga, npages * sizeof(*pga)); - out_0: - obd_brw_set_decref(set); return (rc); } @@ -526,7 +521,6 @@ static int echo_client_ubrw(struct obd_device *obd, int rw, obd_off offset, obd_size count, char *buffer) { struct echo_client_obd *ec = &obd->u.echo_client; - struct obd_brw_set *set; obd_count npages; struct brw_page *pga; struct brw_page *pgp; @@ -546,17 +540,12 @@ static int echo_client_ubrw(struct obd_device *obd, int rw, (lsm != NULL && lsm->lsm_object_id != oa->o_id)) return (-EINVAL); - set = 
obd_brw_set_new(); - if (set == NULL) - return (-ENOMEM); - /* XXX think again with misaligned I/O */ npages = count >> PAGE_SHIFT; - rc = -ENOMEM; OBD_ALLOC(pga, npages * sizeof(*pga)); if (pga == NULL) - goto out_0; + return (-ENOMEM); rc = alloc_kiovec (1, &kiobuf); if (rc != 0) @@ -579,11 +568,7 @@ static int echo_client_ubrw(struct obd_device *obd, int rw, pgp->flag = 0; } - set->brw_callback = ll_brw_sync_wait; - rc = obd_brw(rw, &ec->ec_conn, lsm, npages, pga, set, NULL); - - if (rc == 0) - rc = ll_brw_sync_wait(set, CB_PHASE_START); + rc = obd_brw(rw, &ec->ec_conn, lsm, npages, pga, NULL); // if (rw == OBD_BRW_READ) // mark_dirty_kiobuf (kiobuf, count); @@ -593,8 +578,6 @@ static int echo_client_ubrw(struct obd_device *obd, int rw, free_kiovec (1, &kiobuf); out_1: OBD_FREE(pga, npages * sizeof(*pga)); - out_0: - obd_brw_set_decref(set); return (rc); } #else @@ -620,14 +603,14 @@ echo_open (struct obd_export *exp, struct obdo *oa) rc = echo_get_object (&eco, obd, oa); if (rc != 0) - return (rc); + return rc; rc = -ENOMEM; OBD_ALLOC (ecoo, sizeof (*ecoo)); if (ecoo == NULL) goto failed_0; - rc = obd_open (&ec->ec_conn, oa, eco->eco_lsm, NULL); + rc = obd_open(&ec->ec_conn, oa, eco->eco_lsm, NULL, &ecoo->ecoo_och); if (rc != 0) goto failed_1; @@ -638,12 +621,9 @@ echo_open (struct obd_export *exp, struct obdo *oa) spin_lock (&ec->ec_lock); list_add (&ecoo->ecoo_exp_chain, &exp->exp_ec_data.eced_open_head); - - ufh->addr = (__u64)((long) ecoo); ufh->cookie = ecoo->ecoo_cookie = ec->ec_unique++; - spin_unlock (&ec->ec_lock); - return (0); + return 0; failed_1: OBD_FREE (ecoo, sizeof (*ecoo)); @@ -664,24 +644,23 @@ echo_close (struct obd_export *exp, struct obdo *oa) int rc; if ((oa->o_valid & OBD_MD_FLHANDLE) == 0) - return (-EINVAL); + return -EINVAL; spin_lock (&ec->ec_lock); list_for_each (el, &exp->exp_ec_data.eced_open_head) { ecoo = list_entry (el, struct ec_open_object, ecoo_exp_chain); - if ((__u64)((long)ecoo) == ufh->addr) { - found = 
(ecoo->ecoo_cookie == ufh->cookie); - if (found) - list_del (&ecoo->ecoo_exp_chain); + found = (ecoo->ecoo_cookie == ufh->cookie); + if (found) { + list_del (&ecoo->ecoo_exp_chain); break; } } spin_unlock (&ec->ec_lock); - if (!found) - return (-EINVAL); + memcpy(&ecoo->ecoo_oa.o_inline, &ecoo->ecoo_och, FD_OSTDATA_SIZE); + ecoo->ecoo_oa.o_valid |= OBD_MD_FLHANDLE; rc = obd_close (&ec->ec_conn, &ecoo->ecoo_oa, ecoo->ecoo_object->eco_lsm, NULL); @@ -718,16 +697,16 @@ echo_ldlm_callback (struct ldlm_lock *lock, struct ldlm_lock_desc *new, switch (flag) { case LDLM_CB_BLOCKING: - CDEBUG (D_INFO, "blocking callback on "LPX64", handle "LPX64"." - LPX64"\n", eco->eco_id, lockh.addr, lockh.cookie); + CDEBUG(D_INFO, "blocking callback on "LPX64", handle "LPX64"\n", + eco->eco_id, lockh.cookie); rc = ldlm_cli_cancel (&lockh); if (rc != ELDLM_OK) CERROR ("ldlm_cli_cancel failed: %d\n", rc); break; case LDLM_CB_CANCELING: - CDEBUG (D_INFO, "canceling callback on "LPX64", handle "LPX64"." - LPX64"\n", eco->eco_id, lockh.addr, lockh.cookie); + CDEBUG(D_INFO, "cancel callback on "LPX64", handle "LPX64"\n", + eco->eco_id, lockh.cookie); break; default: @@ -750,15 +729,15 @@ echo_enqueue (struct obd_export *exp, struct obdo *oa, int rc; if (!(mode == LCK_PR || mode == LCK_PW)) - return (-EINVAL); + return -EINVAL; if ((offset & (PAGE_SIZE - 1)) != 0 || (nob & (PAGE_SIZE - 1)) != 0) - return (-EINVAL); + return -EINVAL; rc = echo_get_object (&eco, obd, oa); if (rc != 0) - return (rc); + return rc; rc = -ENOMEM; OBD_ALLOC (ecl, sizeof (*ecl)); @@ -768,32 +747,28 @@ echo_enqueue (struct obd_export *exp, struct obdo *oa, ecl->ecl_mode = mode; ecl->ecl_object = eco; ecl->ecl_extent.start = offset; - ecl->ecl_extent.end = (nob == 0) ? ((obd_off)-1) : (offset + nob - 1); + ecl->ecl_extent.end = (nob == 0) ? 
((obd_off) -1) : (offset + nob - 1); flags = 0; rc = obd_enqueue (&ec->ec_conn, eco->eco_lsm, NULL, LDLM_EXTENT, &ecl->ecl_extent,sizeof(ecl->ecl_extent), mode, &flags, echo_ldlm_callback, eco, sizeof (*eco), - &ecl->ecl_handle); + &ecl->ecl_lock_handle); if (rc != 0) goto failed_1; - CDEBUG (D_INFO, "enqueue handle "LPX64"."LPX64"\n", - ecl->ecl_handle.addr, ecl->ecl_handle.cookie); + CDEBUG(D_INFO, "enqueue handle "LPX64"\n", ecl->ecl_lock_handle.cookie); /* NB ecl takes object ref from echo_get_object() above */ + spin_lock(&ec->ec_lock); - spin_lock (&ec->ec_lock); - - list_add (&ecl->ecl_exp_chain, &exp->exp_ec_data.eced_locks); - - ulh->addr = (__u64)((long)ecl); + list_add(&ecl->ecl_exp_chain, &exp->exp_ec_data.eced_locks); ulh->cookie = ecl->ecl_cookie = ec->ec_unique++; - spin_unlock (&ec->ec_lock); + spin_unlock(&ec->ec_lock); oa->o_valid |= OBD_MD_FLHANDLE; - return (0); + return 0; failed_1: OBD_FREE (ecl, sizeof (*ecl)); @@ -814,17 +789,15 @@ echo_cancel (struct obd_export *exp, struct obdo *oa) int rc; if ((oa->o_valid & OBD_MD_FLHANDLE) == 0) - return (-EINVAL); + return -EINVAL; spin_lock (&ec->ec_lock); list_for_each (el, &exp->exp_ec_data.eced_locks) { ecl = list_entry (el, struct ec_lock, ecl_exp_chain); - - if ((__u64)((long)ecl) == ulh->addr) { - found = (ecl->ecl_cookie == ulh->cookie); - if (found) - list_del (&ecl->ecl_exp_chain); + found = (ecl->ecl_cookie == ulh->cookie); + if (found) { + list_del (&ecl->ecl_exp_chain); break; } } @@ -834,15 +807,13 @@ echo_cancel (struct obd_export *exp, struct obdo *oa) if (!found) return (-ENOENT); - rc = obd_cancel (&ec->ec_conn, - ecl->ecl_object->eco_lsm, - ecl->ecl_mode, - &ecl->ecl_handle); + rc = obd_cancel(&ec->ec_conn, ecl->ecl_object->eco_lsm, ecl->ecl_mode, + &ecl->ecl_lock_handle); echo_put_object (ecl->ecl_object); OBD_FREE (ecl, sizeof (*ecl)); - return (rc); + return rc; } static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn, @@ -987,8 +958,10 @@ static int 
echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn, GOTO (out, rc = -ENOTTY); } + EXIT; out: - RETURN(rc); + class_export_put(exp); + return rc; } static int echo_setup(struct obd_device *obddev, obd_count len, void *buf) @@ -1013,8 +986,7 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf) obd_str2uuid(&uuid, data->ioc_inlbuf1); tgt = class_uuid2obd(&uuid); - if (!tgt || !(tgt->obd_flags & OBD_ATTACHED) || - !(tgt->obd_flags & OBD_SET_UP)) { + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { CERROR("device not attached or not set up (%d)\n", data->ioc_dev); RETURN(rc = -EINVAL); @@ -1024,7 +996,7 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf) INIT_LIST_HEAD (&ec->ec_objects); ec->ec_unique = 0; - rc = obd_connect(&ec->ec_conn, tgt, &echo_uuid, NULL, NULL); + rc = obd_connect(&ec->ec_conn, tgt, &echo_uuid); if (rc) { CERROR("fail to connect to device %d\n", data->ioc_dev); return (rc); @@ -1033,7 +1005,7 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf) ec->ec_lsmsize = obd_alloc_memmd (&ec->ec_conn, &lsm); if (ec->ec_lsmsize < 0) { CERROR ("Can't get # stripes: %d\n", rc); - obd_disconnect (&ec->ec_conn); + obd_disconnect (&ec->ec_conn, 0); rc = ec->ec_lsmsize; } else { ec->ec_nstripes = lsm->lsm_stripe_count; @@ -1043,7 +1015,7 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(rc); } -static int echo_cleanup(struct obd_device * obddev) +static int echo_cleanup(struct obd_device * obddev, int force, int failover) { struct list_head *el; struct ec_object *eco; @@ -1067,7 +1039,7 @@ static int echo_cleanup(struct obd_device * obddev) echo_put_object (eco); } - rc = obd_disconnect (&ec->ec_conn); + rc = obd_disconnect (&ec->ec_conn, 0); if (rc != 0) CERROR("fail to disconnect device: %d\n", rc); @@ -1075,8 +1047,7 @@ static int echo_cleanup(struct obd_device * obddev) } static int echo_connect(struct lustre_handle *conn, struct 
obd_device *src, - struct obd_uuid *cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) + struct obd_uuid *cluuid) { struct obd_export *exp; int rc; @@ -1084,14 +1055,15 @@ static int echo_connect(struct lustre_handle *conn, struct obd_device *src, rc = class_connect(conn, src, cluuid); if (rc == 0) { exp = class_conn2export (conn); - INIT_LIST_HEAD (&exp->exp_ec_data.eced_open_head); - INIT_LIST_HEAD (&exp->exp_ec_data.eced_locks); + INIT_LIST_HEAD(&exp->exp_ec_data.eced_open_head); + INIT_LIST_HEAD(&exp->exp_ec_data.eced_locks); + class_export_put(exp); } RETURN (rc); } -static int echo_disconnect(struct lustre_handle *conn) +static int echo_disconnect(struct lustre_handle *conn, int failover) { struct obd_export *exp = class_conn2export (conn); struct obd_device *obd; @@ -1101,7 +1073,7 @@ static int echo_disconnect(struct lustre_handle *conn) int rc; if (exp == NULL) - return (-EINVAL); + GOTO(out, rc = -EINVAL); obd = exp->exp_obd; ec = &obd->u.echo_client; @@ -1113,9 +1085,9 @@ static int echo_disconnect(struct lustre_handle *conn) list_del (&ecl->ecl_exp_chain); rc = obd_cancel (&ec->ec_conn, ecl->ecl_object->eco_lsm, - ecl->ecl_mode, &ecl->ecl_handle); + ecl->ecl_mode, &ecl->ecl_lock_handle); - CERROR ("Cancel lock on object "LPX64" on disconnect (%d)\n", + CDEBUG (D_INFO, "Cancel lock on object "LPX64" on disconnect (%d)\n", ecl->ecl_object->eco_id, rc); echo_put_object (ecl->ecl_object); @@ -1128,6 +1100,10 @@ static int echo_disconnect(struct lustre_handle *conn) struct ec_open_object, ecoo_exp_chain); list_del (&ecoo->ecoo_exp_chain); + memcpy (&ecoo->ecoo_oa.o_inline, &ecoo->ecoo_och, + FD_OSTDATA_SIZE); + ecoo->ecoo_oa.o_valid |= OBD_MD_FLHANDLE; + rc = obd_close (&ec->ec_conn, &ecoo->ecoo_oa, ecoo->ecoo_object->eco_lsm, NULL); @@ -1138,8 +1114,11 @@ static int echo_disconnect(struct lustre_handle *conn) OBD_FREE (ecoo, sizeof (*ecoo)); } - rc = class_disconnect (conn); - RETURN (rc); + rc = class_disconnect (conn, 0); + GOTO(out, rc); 
+ out: + class_export_put(exp); + return rc; } static struct obd_ops echo_obd_ops = { diff --git a/lustre/obdfilter/Makefile.am b/lustre/obdfilter/Makefile.am index 4e4e8b1..b9addf1 100644 --- a/lustre/obdfilter/Makefile.am +++ b/lustre/obdfilter/Makefile.am @@ -6,16 +6,6 @@ MODULE = obdfilter modulefs_DATA = obdfilter.o EXTRA_PROGRAMS = obdfilter - -LINX=simple.c -simple.c: - test -e simple.c || ln -sf $(top_srcdir)/lib/simple.c - -FILTERC = filter.c lproc_obdfilter.c -obdfilter_SOURCES = $(FILTERC) $(LINX) - -dist-hook: - list='$(LINX)'; for f in $$list; do rm -f $(distdir)/$$f; done +obdfilter_SOURCES = filter.c lproc_obdfilter.c include $(top_srcdir)/Rules - diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 0632af0..21d05ef 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -53,8 +53,13 @@ #include <linux/mount.h> #endif -static kmem_cache_t *filter_open_cache; -static kmem_cache_t *filter_dentry_cache; +enum { + LPROC_FILTER_READS = 0, + LPROC_FILTER_READ_BYTES = 1, + LPROC_FILTER_WRITES = 2, + LPROC_FILTER_WRITE_BYTES = 3, + LPROC_FILTER_LAST = LPROC_FILTER_WRITE_BYTES +1 +}; /* should be generic per-obd stats... 
*/ struct xprocfs_io_stat { @@ -149,9 +154,9 @@ xprocfs_init (char *name) snprintf (dirname, sizeof (dirname), "sys/%s", name); - xprocfs_dir = proc_mkdir ("sys/obdfilter", NULL); + xprocfs_dir = proc_mkdir (dirname, NULL); if (xprocfs_dir == NULL) { - CERROR ("Can't make dir\n"); + CERROR ("Can't make procfs dir %s\n", dirname); return; } @@ -212,25 +217,66 @@ static inline const char *obd_mode_to_type(int mode) return obd_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } -static void filter_last_rcvd_cb(struct obd_device *obd, __u64 last_rcvd, - int error) +static void filter_ffd_addref(void *ffdp) +{ + struct filter_file_data *ffd = ffdp; + + atomic_inc(&ffd->ffd_refcount); + CDEBUG(D_INFO, "GETting ffd %p : new refcount %d\n", ffd, + atomic_read(&ffd->ffd_refcount)); +} + +static struct filter_file_data *filter_ffd_new(void) { - CDEBUG(D_HA, "got callback for last_rcvd "LPD64": rc = %d\n", - last_rcvd, error); - if (!error && last_rcvd > obd->obd_last_committed) - obd->obd_last_committed = last_rcvd; + struct filter_file_data *ffd; + + OBD_ALLOC(ffd, sizeof *ffd); + if (ffd == NULL) { + CERROR("out of memory\n"); + return NULL; + } + + atomic_set(&ffd->ffd_refcount, 2); + + INIT_LIST_HEAD(&ffd->ffd_handle.h_link); + class_handle_hash(&ffd->ffd_handle, filter_ffd_addref); + + return ffd; } -void filter_start_transno(struct obd_export *export) +static struct filter_file_data *filter_handle2ffd(struct lustre_handle *handle) { -#ifdef FILTER_TRANSNO_SEM - struct obd_device * obd = export->exp_obd; + struct filter_file_data *ffd = NULL; ENTRY; + LASSERT(handle != NULL); + ffd = class_handle2object(handle->cookie); + if (ffd != NULL) + LASSERT(ffd->ffd_file->private_data == ffd); + RETURN(ffd); +} - down(&obd->u.filter.fo_transno_sem); -#endif +static void filter_ffd_put(struct filter_file_data *ffd) +{ + CDEBUG(D_INFO, "PUTting ffd %p : new refcount %d\n", ffd, + atomic_read(&ffd->ffd_refcount) - 1); + LASSERT(atomic_read(&ffd->ffd_refcount) > 0 && + 
atomic_read(&ffd->ffd_refcount) < 0x5a5a); + if (atomic_dec_and_test(&ffd->ffd_refcount)) { + LASSERT(list_empty(&ffd->ffd_handle.h_link)); + OBD_FREE(ffd, sizeof *ffd); + } } +static void filter_ffd_destroy(struct filter_file_data *ffd) +{ + class_handle_unhash(&ffd->ffd_handle); + filter_ffd_put(ffd); +} + +static void filter_commit_cb(struct obd_device *obd, __u64 transno, int error) +{ + obd_transno_commit_cb(obd, transno, error); +} /* Assumes caller has already pushed us into the kernel context. */ int filter_finish_transno(struct obd_export *export, void *handle, struct obd_trans_info *oti, int rc) @@ -244,16 +290,11 @@ int filter_finish_transno(struct obd_export *export, void *handle, ssize_t written; /* Propagate error code. */ - if (rc) { -#ifdef FILTER_TRANSNO_SEM - up(&filter->fo_transno_sem); -#endif + if (rc) RETURN(rc); - } - if (!(obd->obd_flags & OBD_REPLAYABLE)) { - RETURN(0); - } + if (!obd->obd_replayable) + RETURN(rc); /* we don't allocate new transnos for replayed requests */ #if 0 @@ -264,14 +305,10 @@ int filter_finish_transno(struct obd_export *export, void *handle, off = fed->fed_lr_off; -#ifndef FILTER_TRANSNO_SEM spin_lock(&filter->fo_translock); -#endif last_rcvd = le64_to_cpu(filter->fo_fsd->fsd_last_rcvd); filter->fo_fsd->fsd_last_rcvd = cpu_to_le64(last_rcvd + 1); -#ifndef FILTER_TRANSNO_SEM spin_unlock(&filter->fo_translock); -#endif if (oti) oti->oti_transno = last_rcvd; fcd->fcd_last_rcvd = cpu_to_le64(last_rcvd); @@ -285,15 +322,12 @@ int filter_finish_transno(struct obd_export *export, void *handle, #else fcd->fcd_last_xid = 0; #endif - fsfilt_set_last_rcvd(obd, last_rcvd, handle, filter_last_rcvd_cb); + fsfilt_set_last_rcvd(obd, last_rcvd, handle, filter_commit_cb); written = lustre_fwrite(filter->fo_rcvd_filp, (char *)fcd, sizeof(*fcd), &off); CDEBUG(D_INODE, "wrote trans #"LPD64" for client %s at #%d: written = " LPSZ"\n", last_rcvd, fcd->fcd_uuid, fed->fed_lr_idx, written); -#ifdef FILTER_TRANSNO_SEM - 
up(&filter->fo_transno_sem); -#endif if (written == sizeof(*fcd)) RETURN(0); CERROR("error writing to last_rcvd file: rc = %d\n", (int)written); @@ -305,9 +339,9 @@ int filter_finish_transno(struct obd_export *export, void *handle, /* write the pathname into the string */ static char *filter_id(char *buf, struct filter_obd *filter, obd_id id, - obd_mode mode) + obd_mode mode) { - if ((mode & S_IFMT) != S_IFREG || filter->fo_subdir_count == 0) + if (!S_ISREG(mode) || filter->fo_subdir_count == 0) sprintf(buf, "O/%s/"LPU64, obd_mode_to_type(mode), id); else sprintf(buf, "O/%s/d%d/"LPU64, obd_mode_to_type(mode), @@ -330,7 +364,7 @@ static inline void f_dput(struct dentry *dentry) static void filter_drelease(struct dentry *dentry) { if (dentry->d_fsdata) - kmem_cache_free(filter_dentry_cache, dentry->d_fsdata); + OBD_FREE(dentry->d_fsdata, sizeof(struct filter_dentry_data)); } struct dentry_operations filter_dops = { @@ -349,34 +383,38 @@ struct dentry_operations filter_dops = { * Otherwise, we have just read the data from the last_rcvd file and * we know its offset. 
*/ -int filter_client_add(struct filter_obd *filter, +int filter_client_add(struct obd_device *obd, struct filter_obd *filter, struct filter_export_data *fed, int cl_idx) { + unsigned long *bitmap = filter->fo_last_rcvd_slots; int new_client = (cl_idx == -1); - LASSERT(filter->fo_last_rcvd_slots != NULL); + LASSERT(bitmap != NULL); + + /* XXX if mcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ + if (!strcmp(fed->fed_fcd->fcd_uuid, "OBD_CLASS_UUID")) + RETURN(0); /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so * there's no need for extra complication here */ if (new_client) { - cl_idx = find_first_zero_bit(filter->fo_last_rcvd_slots, - FILTER_LR_MAX_CLIENTS); + cl_idx = find_first_zero_bit(bitmap, FILTER_LR_MAX_CLIENTS); repeat: if (cl_idx >= FILTER_LR_MAX_CLIENTS) { CERROR("no client slots - fix FILTER_LR_MAX_CLIENTS\n"); return -ENOMEM; } - if (test_and_set_bit(cl_idx, filter->fo_last_rcvd_slots)) { + if (test_and_set_bit(cl_idx, bitmap)) { CERROR("FILTER client %d: found bit is set in bitmap\n", cl_idx); - cl_idx = find_next_zero_bit(filter->fo_last_rcvd_slots, + cl_idx = find_next_zero_bit(bitmap, FILTER_LR_MAX_CLIENTS, cl_idx); goto repeat; } } else { - if (test_and_set_bit(cl_idx, filter->fo_last_rcvd_slots)) { + if (test_and_set_bit(cl_idx, bitmap)) { CERROR("FILTER client %d: bit already set in bitmap!\n", cl_idx); LBUG(); @@ -394,14 +432,28 @@ int filter_client_add(struct filter_obd *filter, struct obd_run_ctxt saved; loff_t off = fed->fed_lr_off; ssize_t written; + void *handle; CDEBUG(D_INFO, "writing client fcd at idx %u (%llu) (len %u)\n", fed->fed_lr_idx,off,(unsigned int)sizeof(*fed->fed_fcd)); push_ctxt(&saved, &filter->fo_ctxt, NULL); - written = lustre_fwrite(filter->fo_rcvd_filp, + /* Transaction needed to fix for bug 1403 */ + handle = fsfilt_start(obd, + filter->fo_rcvd_filp->f_dentry->d_inode, + FSFILT_OP_SETATTR); + if (IS_ERR(handle)) { + written = PTR_ERR(handle); + CERROR("unable to start transaction: rc %d\n", 
+ (int)written); + } else { + written = lustre_fwrite(filter->fo_rcvd_filp, (char *)fed->fed_fcd, sizeof(*fed->fed_fcd), &off); + fsfilt_commit(obd, + filter->fo_rcvd_filp->f_dentry->d_inode, + handle, 0); + } pop_ctxt(&saved, &filter->fo_ctxt, NULL); if (written != sizeof(*fed->fed_fcd)) { @@ -413,7 +465,7 @@ int filter_client_add(struct filter_obd *filter, return 0; } -int filter_client_free(struct obd_export *exp) +int filter_client_free(struct obd_export *exp, int failover) { struct filter_export_data *fed = &exp->exp_filter_data; struct filter_obd *filter = &exp->exp_obd->u.filter; @@ -425,6 +477,11 @@ int filter_client_free(struct obd_export *exp) if (!fed->fed_fcd) RETURN(0); + if (failover != 0) { + OBD_FREE(fed->fed_fcd, sizeof(*fed->fed_fcd)); + RETURN(0); + } + LASSERT(filter->fo_last_rcvd_slots != NULL); off = fed->fed_lr_off; @@ -444,7 +501,9 @@ int filter_client_free(struct obd_export *exp) sizeof(zero_fcd), &off); /* XXX: this write gets lost sometimes, unless this sync is here. */ - file_fsync(filter->fo_rcvd_filp, filter->fo_rcvd_filp->f_dentry, 1); + if (written > 0) + file_fsync(filter->fo_rcvd_filp, + filter->fo_rcvd_filp->f_dentry, 1); pop_ctxt(&saved, &filter->fo_ctxt, NULL); if (written != sizeof(zero_fcd)) { @@ -522,7 +581,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp, RETURN(-ENOMEM); filter->fo_fsd = fsd; - OBD_ALLOC(filter->fo_last_rcvd_slots, + OBD_ALLOC(filter->fo_last_rcvd_slots, FILTER_LR_MAX_CLIENT_WORDS * sizeof(unsigned long)); if (filter->fo_last_rcvd_slots == NULL) { OBD_FREE(fsd, sizeof(*fsd)); @@ -585,7 +644,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp, * the header. If we find clients with higher last_rcvd values * then those clients may need recovery done. 
*/ - if (!(obd->obd_flags & OBD_REPLAYABLE)) { + if (!obd->obd_replayable) { CERROR("%s: recovery support OFF\n", obd->obd_name); GOTO(out, rc = 0); } @@ -634,9 +693,8 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp, LPU64"\n", fcd->fcd_uuid, cl_idx, last_rcvd, le64_to_cpu(fsd->fsd_last_rcvd), le64_to_cpu(fcd->fcd_mount_count), mount_count); - /* disabled until OST recovery is actually working */ - - if (!exp) { + if (exp == NULL) { + /* XXX this rc is ignored */ rc = -ENOMEM; break; } @@ -644,13 +702,14 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp, sizeof exp->exp_client_uuid.uuid); fed = &exp->exp_filter_data; fed->fed_fcd = fcd; - filter_client_add(filter, fed, cl_idx); + filter_client_add(obd, filter, fed, cl_idx); /* create helper if export init gets more complex */ INIT_LIST_HEAD(&fed->fed_open_head); spin_lock_init(&fed->fed_lock); fcd = NULL; obd->obd_recoverable_clients++; + class_export_put(exp); } else { CDEBUG(D_INFO, "discarded client %d UUID '%s' count "LPU64"\n", @@ -664,20 +723,22 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp, if (last_rcvd > le64_to_cpu(filter->fo_fsd->fsd_last_rcvd)) filter->fo_fsd->fsd_last_rcvd = cpu_to_le64(last_rcvd); - obd->obd_last_committed = le64_to_cpu(filter->fo_fsd->fsd_last_rcvd); + obd->obd_last_committed = + le64_to_cpu(filter->fo_fsd->fsd_last_rcvd); if (obd->obd_recoverable_clients) { CERROR("RECOVERY: %d recoverable clients, last_rcvd " LPU64"\n", obd->obd_recoverable_clients, le64_to_cpu(filter->fo_fsd->fsd_last_rcvd)); - obd->obd_next_recovery_transno = obd->obd_last_committed + 1; - obd->obd_flags |= OBD_RECOVERING; + obd->obd_next_recovery_transno = + obd->obd_last_committed + 1; + obd->obd_recovering = 1; } - if (fcd) - OBD_FREE(fcd, sizeof(*fcd)); - } + if (fcd) + OBD_FREE(fcd, sizeof(*fcd)); + out: fsd->fsd_mount_count = cpu_to_le64(mount_count + 1); @@ -805,7 +866,7 @@ err_O_sub: 
OBD_FREE(filter->fo_dentry_O_sub, filter->fo_subdir_count * sizeof(dentry)); err_client: - class_disconnect_all(obd); + class_disconnect_exports(obd, 0); err_filp: if (filp_close(file, 0)) CERROR("can't close %s after error\n", LAST_RCVD); @@ -932,78 +993,174 @@ static struct dentry *filter_fid2dentry(struct obd_device *obd, RETURN(dchild); } +/* direct cut-n-paste of mds_blocking_ast() */ +int filter_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + int do_ast; + ENTRY; + + if (flag == LDLM_CB_CANCELING) { + /* Don't need to do anything here. */ + RETURN(0); + } + + /* XXX layering violation! -phil */ + l_lock(&lock->l_resource->lr_namespace->ns_lock); + /* Get this: if filter_blocking_ast is racing with ldlm_intent_policy, + * such that mds_blocking_ast is called just before l_i_p takes the + * ns_lock, then by the time we get the lock, we might not be the + * correct blocking function anymore. So check, and return early, if + * so. */ + if (lock->l_blocking_ast != filter_blocking_ast) { + l_unlock(&lock->l_resource->lr_namespace->ns_lock); + RETURN(0); + } + + lock->l_flags |= LDLM_FL_CBPENDING; + do_ast = (!lock->l_readers && !lock->l_writers); + l_unlock(&lock->l_resource->lr_namespace->ns_lock); + + if (do_ast) { + struct lustre_handle lockh; + int rc; + + LDLM_DEBUG(lock, "already unused, calling ldlm_cli_cancel"); + ldlm_lock2handle(lock, &lockh); + rc = ldlm_cli_cancel(&lockh); + if (rc < 0) + CERROR("ldlm_cli_cancel: %d\n", rc); + } else { + LDLM_DEBUG(lock, "Lock still has references, will be " + "cancelled later"); + } + RETURN(0); +} + +static int filter_lock_dentry(struct obd_device *obd, struct dentry *de, + int lock_mode, struct lustre_handle *lockh) +{ + struct ldlm_res_id res_id = { .name = {0} }; + int flags = 0, rc; + ENTRY; + + res_id.name[0] = de->d_inode->i_ino; + res_id.name[1] = de->d_inode->i_generation; + rc = ldlm_cli_enqueue(NULL, NULL, obd->obd_namespace, NULL, + res_id, LDLM_PLAIN, NULL, 0, 
lock_mode, + &flags, ldlm_completion_ast, + filter_blocking_ast, NULL, lockh); + + RETURN(rc == ELDLM_OK ? 0 : -ENOLCK); /* XXX translate ldlm code */ +} + static inline struct dentry *filter_parent(struct obd_device *obd, obd_mode mode, obd_id objid) { struct filter_obd *filter = &obd->u.filter; - LASSERT((mode & S_IFMT) == S_IFREG); /* only regular files for now */ - if ((mode & S_IFMT) != S_IFREG || filter->fo_subdir_count == 0) + LASSERT(S_ISREG(mode)); /* only regular files for now */ + if (!S_ISREG(mode) || filter->fo_subdir_count == 0) return filter->fo_dentry_O_mode[(mode & S_IFMT) >> S_SHIFT]; return filter->fo_dentry_O_sub[objid & (filter->fo_subdir_count - 1)]; } +static inline struct dentry *filter_parent_lock(struct obd_device *obd, + obd_mode mode, obd_id objid, + int lock_mode, + struct lustre_handle *lockh) +{ + struct dentry *de = filter_parent(obd, mode, objid); + int rc; + + if (IS_ERR(de)) + return de; + + rc = filter_lock_dentry(obd, de, lock_mode, lockh); + return rc ? 
ERR_PTR(rc) : de; +} + static struct file *filter_obj_open(struct obd_export *export, - __u64 id, __u32 type) + __u64 id, __u32 type, int parent_mode, + struct lustre_handle *parent_lockh) { - struct filter_obd *filter = &export->exp_obd->u.filter; + struct obd_device *obd = export->exp_obd; + struct filter_obd *filter = &obd->u.filter; struct super_block *sb = filter->fo_sb; - struct dentry *dentry; + struct dentry *dchild = NULL, *parent; struct filter_export_data *fed = &export->exp_filter_data; - struct filter_dentry_data *fdd; - struct filter_file_data *ffd; + struct filter_dentry_data *fdd = NULL; + struct filter_file_data *ffd = NULL; struct obd_run_ctxt saved; char name[24]; struct file *file; + int len, cleanup_phase = 0; ENTRY; + push_ctxt(&saved, &filter->fo_ctxt, NULL); + if (!sb || !sb->s_dev) { CERROR("fatal: device not initialized.\n"); - RETURN(ERR_PTR(-ENXIO)); + GOTO(cleanup, file = ERR_PTR(-ENXIO)); } if (!id) { CERROR("fatal: invalid obdo "LPU64"\n", id); - RETURN(ERR_PTR(-ESTALE)); + GOTO(cleanup, file = ERR_PTR(-ESTALE)); } if (!(type & S_IFMT)) { CERROR("OBD %s, object "LPU64" has bad type: %o\n", __FUNCTION__, id, type); - RETURN(ERR_PTR(-EINVAL)); + GOTO(cleanup, file = ERR_PTR(-EINVAL)); } - PORTAL_SLAB_ALLOC(ffd, filter_open_cache, sizeof(*ffd)); - if (!ffd) { + ffd = filter_ffd_new(); + if (ffd == NULL) { CERROR("obdfilter: out of memory\n"); - RETURN(ERR_PTR(-ENOMEM)); + GOTO(cleanup, file = ERR_PTR(-ENOMEM)); } + cleanup_phase = 1; + /* We preallocate this to avoid blocking while holding fo_fddlock */ - fdd = kmem_cache_alloc(filter_dentry_cache, SLAB_KERNEL); - if (!fdd) { + OBD_ALLOC(fdd, sizeof *fdd); + if (fdd == NULL) { CERROR("obdfilter: out of memory\n"); - GOTO(out_ffd, file = ERR_PTR(-ENOMEM)); + GOTO(cleanup, file = ERR_PTR(-ENOMEM)); } - push_ctxt(&saved, &filter->fo_ctxt, NULL); - file = filp_open(filter_id(name, filter, id, type), - O_RDWR | O_LARGEFILE, type); - pop_ctxt(&saved, &filter->fo_ctxt, NULL); + cleanup_phase = 
2; + + parent = filter_parent_lock(obd, type, id, parent_mode, parent_lockh); + if (IS_ERR(parent)) + GOTO(cleanup, file = (void *)parent); + + cleanup_phase = 3; + + len = snprintf(name, sizeof(name), LPU64, id); + dchild = lookup_one_len(name, parent, len); + if (IS_ERR(dchild)) + GOTO(cleanup, file = (void *)dchild); + LASSERT(dchild->d_inode); + cleanup_phase = 4; + + /* dentry_open does a dput(de) and mntput(mds->mds_vfsmnt) on error */ + mntget(filter->fo_vfsmnt); + file = dentry_open(dchild, filter->fo_vfsmnt, O_RDWR | O_LARGEFILE); if (IS_ERR(file)) { + dchild = NULL; /* prevent a double dput in step 4 */ CERROR("error opening %s: rc %ld\n", name, PTR_ERR(file)); - GOTO(out_fdd, file); + GOTO(cleanup, file); } - dentry = file->f_dentry; spin_lock(&filter->fo_fddlock); - if (dentry->d_fsdata) { + if (dchild->d_fsdata) { spin_unlock(&filter->fo_fddlock); - kmem_cache_free(filter_dentry_cache, fdd); - fdd = dentry->d_fsdata; - LASSERT(kmem_cache_validate(filter_dentry_cache, fdd)); + OBD_FREE(fdd, sizeof *fdd); + fdd = dchild->d_fsdata; /* should only happen during client recovery */ if (fdd->fdd_flags & FILTER_FLAG_DESTROY) CDEBUG(D_INODE,"opening destroyed object "LPU64"\n",id); @@ -1013,35 +1170,43 @@ static struct file *filter_obj_open(struct obd_export *export, fdd->fdd_flags = 0; fdd->fdd_objid = id; /* If this is racy, then we can use {cmp}xchg and atomic_add */ - dentry->d_fsdata = fdd; + dchild->d_fsdata = fdd; spin_unlock(&filter->fo_fddlock); } - get_random_bytes(&ffd->ffd_servercookie, sizeof(ffd->ffd_servercookie)); ffd->ffd_file = file; LASSERT(file->private_data == NULL); file->private_data = ffd; - if (!dentry->d_op) - dentry->d_op = &filter_dops; + if (!dchild->d_op) + dchild->d_op = &filter_dops; else - LASSERT(dentry->d_op == &filter_dops); + LASSERT(dchild->d_op == &filter_dops); spin_lock(&fed->fed_lock); list_add(&ffd->ffd_export_list, &fed->fed_open_head); spin_unlock(&fed->fed_lock); CDEBUG(D_INODE, "opened objid "LPU64": rc = %p\n", 
id, file); - EXIT; -out: - return file; - -out_fdd: - kmem_cache_free(filter_dentry_cache, fdd); -out_ffd: - ffd->ffd_servercookie = DEAD_HANDLE_MAGIC; - PORTAL_SLAB_FREE(ffd, filter_open_cache, sizeof(*ffd)); - goto out; +cleanup: + switch (cleanup_phase) { + case 4: + if (IS_ERR(file)) + l_dput(dchild); + case 3: + if (IS_ERR(file)) + ldlm_lock_decref(parent_lockh, parent_mode); + case 2: + if (IS_ERR(file)) + OBD_FREE(fdd, sizeof *fdd); + case 1: + if (IS_ERR(file)) + filter_ffd_destroy(ffd); + filter_ffd_put(ffd); + case 0: + pop_ctxt(&saved, &filter->fo_ctxt, NULL); + } + RETURN(file); } /* Caller must hold i_sem on dir_dentry->d_inode */ @@ -1071,16 +1236,23 @@ static int filter_destroy_internal(struct obd_device *obd, RETURN(rc); } +/* If closing because we are failing this device, then + don't do the unlink on close. +*/ static int filter_close_internal(struct obd_export *export, struct filter_file_data *ffd, - struct obd_trans_info *oti) + struct obd_trans_info *oti, + int failover) { struct obd_device *obd = export->exp_obd; struct filter_obd *filter = &obd->u.filter; struct file *filp = ffd->ffd_file; struct dentry *object_dentry = dget(filp->f_dentry); struct filter_dentry_data *fdd = object_dentry->d_fsdata; - int rc, rc2; + struct lustre_handle parent_lockh; + int rc, rc2, cleanup_phase = 0; + struct dentry *dir_dentry; + struct obd_run_ctxt saved; ENTRY; LASSERT(filp->private_data == ffd); @@ -1089,39 +1261,56 @@ static int filter_close_internal(struct obd_export *export, rc = filp_close(filp, 0); if (atomic_dec_and_test(&fdd->fdd_open_count) && - fdd->fdd_flags & FILTER_FLAG_DESTROY) { - struct dentry *dir_dentry = filter_parent(obd, S_IFREG, fdd->fdd_objid); - struct obd_run_ctxt saved; + fdd->fdd_flags & FILTER_FLAG_DESTROY && !failover) { void *handle; - down(&dir_dentry->d_inode->i_sem); push_ctxt(&saved, &filter->fo_ctxt, NULL); - filter_start_transno(export); + cleanup_phase = 1; + + dir_dentry = filter_parent_lock(obd, S_IFREG, 
fdd->fdd_objid, + LCK_PW, &parent_lockh); + if (IS_ERR(dir_dentry)) + GOTO(cleanup, rc = PTR_ERR(dir_dentry)); + cleanup_phase = 2; + handle = fsfilt_start(obd, dir_dentry->d_inode, FSFILT_OP_UNLINK); - if (IS_ERR(handle)) { - rc = filter_finish_transno(export, handle, oti, - PTR_ERR(handle)); - GOTO(out, rc); - } + if (IS_ERR(handle)) + GOTO(cleanup, rc = PTR_ERR(handle)); + /* XXX unlink from PENDING directory now too */ rc2 = filter_destroy_internal(obd, dir_dentry, object_dentry); if (rc2 && !rc) rc = rc2; rc = filter_finish_transno(export, handle, oti, rc); - rc2 = fsfilt_commit(obd, dir_dentry->d_inode, handle); + rc2 = fsfilt_commit(obd, dir_dentry->d_inode, handle, 0); if (rc2) { CERROR("error on commit, err = %d\n", rc2); if (!rc) rc = rc2; } - out: - pop_ctxt(&saved, &filter->fo_ctxt, NULL); - up(&dir_dentry->d_inode->i_sem); } - f_dput(object_dentry); - PORTAL_SLAB_FREE(ffd, filter_open_cache, sizeof(*ffd)); +cleanup: + switch(cleanup_phase) { + case 2: + if (rc || oti == NULL) { + ldlm_lock_decref(&parent_lockh, LCK_PW); + } else { + memcpy(&oti->oti_ack_locks[0].lock, &parent_lockh, + sizeof(parent_lockh)); + oti->oti_ack_locks[0].mode = LCK_PW; + } + case 1: + pop_ctxt(&saved, &filter->fo_ctxt, NULL); + case 0: + f_dput(object_dentry); + filter_ffd_destroy(ffd); + break; + default: + CERROR("invalid cleanup_phase %d\n", cleanup_phase); + LBUG(); + } RETURN(rc); } @@ -1149,9 +1338,17 @@ static int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, if (IS_ERR(mnt)) GOTO(err_ops, rc); -#if OST_RECOVERY - obd->obd_flags |= OBD_REPLAYABLE; -#endif + if (data->ioc_inllen3 > 0 && data->ioc_inlbuf3) { + if (*data->ioc_inlbuf3 == 'f') { + obd->obd_replayable = 1; + obd_sync_filter = 1; + CERROR("%s: configured for recovery and sync write\n", + obd->obd_name); + } else { + CERROR("unrecognised flag '%c'\n", + *data->ioc_inlbuf3); + } + } filter = &obd->u.filter; filter->fo_vfsmnt = mnt; @@ -1168,11 +1365,7 @@ static int 
filter_common_setup(struct obd_device *obd, obd_count len, void *buf, if (rc) GOTO(err_kfree, rc); -#ifdef FILTER_TRANSNO_SEM - init_MUTEX(&filter->fo_transno_sem); -#else spin_lock_init(&filter->fo_translock); -#endif spin_lock_init(&filter->fo_fddlock); spin_lock_init(&filter->fo_objidlock); INIT_LIST_HEAD(&filter->fo_export_list); @@ -1202,7 +1395,13 @@ err_ops: static int filter_setup(struct obd_device *obd, obd_count len, void *buf) { - return filter_common_setup(obd, len, buf, NULL); + struct obd_ioctl_data* data = buf; + char *option = NULL; + + if (!strcmp(data->ioc_inlbuf2, "ext3")) + option = "asyncdel"; + + return filter_common_setup(obd, len, buf, option); } /* sanobd setup methods - use a specific mount option */ @@ -1215,23 +1414,28 @@ static int filter_san_setup(struct obd_device *obd, obd_count len, void *buf) RETURN(-EINVAL); /* for extN/ext3 filesystem, we must mount it with 'writeback' mode */ - if (!strcmp(data->ioc_inlbuf2, "extN") || - !strcmp(data->ioc_inlbuf2, "ext3")) + if (!strcmp(data->ioc_inlbuf2, "extN")) option = "data=writeback"; + else if (!strcmp(data->ioc_inlbuf2, "ext3")) + option = "data=writeback,asyncdel"; else LBUG(); /* just a reminder */ return filter_common_setup(obd, len, buf, option); } -static int filter_cleanup(struct obd_device *obd) +static int filter_cleanup(struct obd_device *obd, int force, int failover) { struct super_block *sb; ENTRY; + if (failover) + CERROR("%s: shutting down for failover; client state will" + " be preserved.\n", obd->obd_name); + if (!list_empty(&obd->obd_exports)) { - CERROR("still has clients!\n"); - class_disconnect_all(obd); + CERROR("%s: still has clients!\n", obd->obd_name); + class_disconnect_exports(obd, failover); if (!list_empty(&obd->obd_exports)) { CERROR("still has exports after forced cleanup?\n"); RETURN(-EBUSY); @@ -1248,8 +1452,16 @@ static int filter_cleanup(struct obd_device *obd) shrink_dcache_parent(sb->s_root); unlock_kernel(); + + if 
(atomic_read(&obd->u.filter.fo_vfsmnt->mnt_count) > 1){ + CERROR("%s: mount point busy, mnt_count: %d\n", obd->obd_name, + atomic_read(&obd->u.filter.fo_vfsmnt->mnt_count)); + } + mntput(obd->u.filter.fo_vfsmnt); obd->u.filter.fo_sb = 0; +/* destroy_buffers(obd->u.filter.fo_sb->s_dev);*/ + kfree(obd->u.filter.fo_fstype); fsfilt_put_ops(obd->obd_fsops); @@ -1261,20 +1473,43 @@ static int filter_cleanup(struct obd_device *obd) int filter_attach(struct obd_device *dev, obd_count len, void *data) { struct lprocfs_static_vars lvars; + struct lprocfs_counters* cntrs; + int rc; lprocfs_init_vars(&lvars); - return lprocfs_obd_attach(dev, lvars.obd_vars); + rc = lprocfs_obd_attach(dev, lvars.obd_vars); + if (rc != 0) + return rc; + + rc = lprocfs_alloc_obd_counters(dev, LPROC_FILTER_LAST); + if (rc != 0) + return rc; + + /* Init obdfilter private counters here */ + cntrs = dev->counters; + LPROCFS_COUNTER_INIT(&cntrs->cntr[LPROC_FILTER_READS], + 0, NULL, "read", "reqs"); + LPROCFS_COUNTER_INIT(&cntrs->cntr[LPROC_FILTER_READ_BYTES], + LPROCFS_CNTR_AVGMINMAX, + NULL, "read_bytes", "bytes"); + LPROCFS_COUNTER_INIT(&cntrs->cntr[LPROC_FILTER_WRITES], + 0, NULL, "write", "reqs"); + + LPROCFS_COUNTER_INIT(&cntrs->cntr[LPROC_FILTER_WRITE_BYTES], + LPROCFS_CNTR_AVGMINMAX, + NULL, "write_bytes", "bytes"); + return rc; } int filter_detach(struct obd_device *dev) { + lprocfs_free_obd_counters(dev); return lprocfs_obd_detach(dev); } /* nearly identical to mds_connect */ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd, - struct obd_uuid *cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) + struct obd_uuid *cluuid) { struct obd_export *exp; struct filter_export_data *fed; @@ -1294,11 +1529,12 @@ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd, LASSERT(exp); fed = &exp->exp_filter_data; + class_export_put(exp); INIT_LIST_HEAD(&exp->exp_filter_data.fed_open_head); spin_lock_init(&exp->exp_filter_data.fed_lock); - if 
(!(obd->obd_flags & OBD_REPLAYABLE)) + if (!obd->obd_replayable) RETURN(0); OBD_ALLOC(fcd, sizeof(*fcd)); @@ -1311,7 +1547,7 @@ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd, fed->fed_fcd = fcd; fcd->fcd_mount_count = cpu_to_le64(filter->fo_fsd->fsd_mount_count); - rc = filter_client_add(filter, fed, -1); + rc = filter_client_add(obd, filter, fed, -1); if (rc) GOTO(out_fcd, rc); @@ -1320,21 +1556,16 @@ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd, out_fcd: OBD_FREE(fcd, sizeof(*fcd)); out_export: - class_disconnect(conn); + class_disconnect(conn, 0); RETURN(rc); } -/* also incredibly similar to mds_disconnect */ -static int filter_disconnect(struct lustre_handle *conn) +static void filter_destroy_export(struct obd_export *exp) { - struct obd_export *exp = class_conn2export(conn); - struct filter_export_data *fed; - int rc; - ENTRY; + struct filter_export_data *fed = &exp->exp_filter_data; - LASSERT(exp); - fed = &exp->exp_filter_data; + ENTRY; spin_lock(&fed->fed_lock); while (!list_empty(&fed->fed_open_head)) { struct filter_file_data *ffd; @@ -1347,20 +1578,37 @@ static int filter_disconnect(struct lustre_handle *conn) CERROR("force close file %*s (hdl %p:"LPX64") on disconnect\n", ffd->ffd_file->f_dentry->d_name.len, ffd->ffd_file->f_dentry->d_name.name, - ffd, ffd->ffd_servercookie); + ffd, ffd->ffd_handle.h_cookie); - filter_close_internal(exp, ffd, NULL); + filter_close_internal(exp, ffd, NULL, exp->exp_failover); spin_lock(&fed->fed_lock); } spin_unlock(&fed->fed_lock); + if (exp->exp_obd->obd_replayable) + filter_client_free(exp, exp->exp_failover); + EXIT; +} + +/* also incredibly similar to mds_disconnect */ +static int filter_disconnect(struct lustre_handle *conn, int failover) +{ + struct obd_export *exp = class_conn2export(conn); + int rc; + unsigned long flags; + ENTRY; + + LASSERT(exp); ldlm_cancel_locks_for_export(exp); - if (exp->exp_obd->obd_flags & OBD_REPLAYABLE) - 
filter_client_free(exp); + spin_lock_irqsave(&exp->exp_lock, flags); + exp->exp_failover = failover; + spin_unlock_irqrestore(&exp->exp_lock, flags); - rc = class_disconnect(conn); + rc = class_disconnect(conn, failover); + fsfilt_sync(exp->exp_obd, exp->exp_obd->u.filter.fo_sb); + class_export_put(exp); /* XXX cleanup preallocated inodes */ RETURN(rc); } @@ -1386,25 +1634,6 @@ static void filter_from_inode(struct obdo *oa, struct inode *inode, int valid) EXIT; } -static struct filter_file_data *filter_handle2ffd(struct lustre_handle *handle) -{ - struct filter_file_data *ffd = NULL; - ENTRY; - - if (!handle || !handle->addr) - RETURN(NULL); - - ffd = (struct filter_file_data *)(unsigned long)(handle->addr); - if (!kmem_cache_validate(filter_open_cache, (void *)ffd)) - RETURN(NULL); - - if (ffd->ffd_servercookie != handle->cookie) - RETURN(NULL); - - LASSERT(ffd->ffd_file->private_data == ffd); - RETURN(ffd); -} - static struct dentry *__filter_oa2dentry(struct lustre_handle *conn, struct obdo *oa, int locked,char *what) { @@ -1414,14 +1643,16 @@ static struct dentry *__filter_oa2dentry(struct lustre_handle *conn, struct lustre_handle *ost_handle = obdo_handle(oa); struct filter_file_data *ffd = filter_handle2ffd(ost_handle); - if (ffd) + if (ffd != NULL) { dentry = dget(ffd->ffd_file->f_dentry); + filter_ffd_put(ffd); + } } if (!dentry) { struct obd_device *obd = class_conn2obd(conn); if (!obd) { - CERROR("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", conn->cookie); RETURN(ERR_PTR(-EINVAL)); } dentry = filter_fid2dentry(obd, filter_parent(obd, oa->o_mode, @@ -1437,7 +1668,6 @@ static struct dentry *__filter_oa2dentry(struct lustre_handle *conn, if (!dentry->d_inode) { CERROR("%s on non-existent object: "LPU64"\n", what, oa->o_id); f_dput(dentry); - LBUG(); RETURN(ERR_PTR(-ENOENT)); } @@ -1486,7 +1716,7 @@ static int filter_setattr(struct lustre_handle *conn, struct obdo *oa, dentry = filter_oa2dentry(conn, oa, 0); if 
(IS_ERR(dentry)) - RETURN(PTR_ERR(dentry)); + GOTO(out_exp, rc = PTR_ERR(dentry)); iattr_from_obdo(&iattr, oa, oa->o_valid); iattr.ia_mode = (iattr.ia_mode & ~S_IFMT) | S_IFREG; @@ -1497,19 +1727,16 @@ static int filter_setattr(struct lustre_handle *conn, struct obdo *oa, if (iattr.ia_valid & ATTR_SIZE) down(&inode->i_sem); - filter_start_transno(export); handle = fsfilt_start(obd, dentry->d_inode, FSFILT_OP_SETATTR); - if (IS_ERR(handle)) { - rc = filter_finish_transno(export, handle, oti,PTR_ERR(handle)); - GOTO(out_unlock, rc); - } + if (IS_ERR(handle)) + GOTO(out_unlock, rc = PTR_ERR(handle)); if (inode->i_op->setattr) rc = inode->i_op->setattr(dentry, &iattr); else rc = inode_setattr(inode, &iattr); rc = filter_finish_transno(export, handle, oti, rc); - rc2 = fsfilt_commit(obd, dentry->d_inode, handle); + rc2 = fsfilt_commit(obd, dentry->d_inode, handle, 0); if (rc2) { CERROR("error on commit, err = %d\n", rc2); if (!rc) @@ -1527,28 +1754,34 @@ out_unlock: pop_ctxt(&saved, &filter->fo_ctxt, NULL); f_dput(dentry); + out_exp: + class_export_put(export); RETURN(rc); } static int filter_open(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *ea, struct obd_trans_info *oti) + struct lov_stripe_md *ea, struct obd_trans_info *oti, + struct obd_client_handle *och) { struct obd_export *export; struct lustre_handle *handle; struct filter_file_data *ffd; struct file *filp; + struct lustre_handle parent_lockh; int rc = 0; ENTRY; export = class_conn2export(conn); if (!export) { - CDEBUG(D_IOCTL, "fatal: invalid client "LPX64"\n", conn->addr); - RETURN(-EINVAL); + CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n", + conn->cookie); + GOTO(out, rc = -EINVAL); } XPROCFS_BUMP_MYCPU_IOSTAT (st_open_reqs, 1); - filp = filter_obj_open(export, oa->o_id, oa->o_mode); + filp = filter_obj_open(export, oa->o_id, oa->o_mode, + LCK_PR, &parent_lockh); if (IS_ERR(filp)) GOTO(out, rc = PTR_ERR(filp)); @@ -1556,42 +1789,45 @@ static int filter_open(struct lustre_handle 
*conn, struct obdo *oa, ffd = filp->private_data; handle = obdo_handle(oa); - handle->addr = (__u64)(unsigned long)ffd; - handle->cookie = ffd->ffd_servercookie; + handle->cookie = ffd->ffd_handle.h_cookie; oa->o_valid |= OBD_MD_FLHANDLE; - EXIT; + out: - return rc; -} /* filter_open */ + class_export_put(export); + if (!rc) { + memcpy(&oti->oti_ack_locks[0].lock, &parent_lockh, + sizeof(parent_lockh)); + oti->oti_ack_locks[0].mode = LCK_PR; + } + RETURN(rc); +} static int filter_close(struct lustre_handle *conn, struct obdo *oa, struct lov_stripe_md *ea, struct obd_trans_info *oti) { - struct obd_export *exp; + struct obd_export *exp = class_conn2export(conn); struct filter_file_data *ffd; struct filter_export_data *fed; int rc; ENTRY; - exp = class_conn2export(conn); if (!exp) { - CDEBUG(D_IOCTL, "fatal: invalid client "LPX64"\n", conn->addr); - RETURN(-EINVAL); + CDEBUG(D_IOCTL, "invalid client cookie"LPX64"\n", conn->cookie); + GOTO(out, rc = -EINVAL); } XPROCFS_BUMP_MYCPU_IOSTAT (st_close_reqs, 1); if (!(oa->o_valid & OBD_MD_FLHANDLE)) { CERROR("no handle for close of objid "LPU64"\n", oa->o_id); - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); } ffd = filter_handle2ffd(obdo_handle(oa)); - if (!ffd) { - struct lustre_handle *handle = obdo_handle(oa); - CERROR("bad handle ("LPX64") or cookie ("LPX64") for close\n", - handle->addr, handle->cookie); - RETURN(-ESTALE); + if (ffd == NULL) { + CERROR("bad handle ("LPX64") for close\n", + obdo_handle(oa)->cookie); + GOTO(out, rc = -ESTALE); } fed = &exp->exp_filter_data; @@ -1599,57 +1835,69 @@ static int filter_close(struct lustre_handle *conn, struct obdo *oa, list_del(&ffd->ffd_export_list); spin_unlock(&fed->fed_lock); - rc = filter_close_internal(exp, ffd, oti); - - RETURN(rc); -} /* filter_close */ + rc = filter_close_internal(exp, ffd, oti, 0); + filter_ffd_put(ffd); + GOTO(out, rc); + out: + class_export_put(exp); + return rc; +} static int filter_create(struct lustre_handle *conn, struct obdo *oa, struct 
lov_stripe_md **ea, struct obd_trans_info *oti) { - struct obd_export *export = class_conn2export(conn); + struct obd_export *export; struct obd_device *obd = class_conn2obd(conn); struct filter_obd *filter = &obd->u.filter; struct obd_run_ctxt saved; struct dentry *dir_dentry; - struct dentry *new; + struct lustre_handle parent_lockh; + struct dentry *new = NULL; struct iattr; void *handle; - int err, rc; + int err, rc, cleanup_phase; ENTRY; if (!obd) { - CERROR("invalid client "LPX64"\n", conn->addr); - return -EINVAL; + CERROR("invalid client cookie "LPX64"\n", conn->cookie); + RETURN(-EINVAL); } + export = class_conn2export(conn); XPROCFS_BUMP_MYCPU_IOSTAT (st_create_reqs, 1); oa->o_id = filter_next_id(obd); push_ctxt(&saved, &filter->fo_ctxt, NULL); - dir_dentry = filter_parent(obd, S_IFREG, oa->o_id); - down(&dir_dentry->d_inode->i_sem); + retry: + cleanup_phase = 0; + dir_dentry = filter_parent_lock(obd, S_IFREG, oa->o_id, LCK_PW, + &parent_lockh); + if (IS_ERR(dir_dentry)) + GOTO(cleanup, rc = PTR_ERR(dir_dentry)); + cleanup_phase = 1; + new = filter_fid2dentry(obd, dir_dentry, oa->o_id, 0); if (IS_ERR(new)) - GOTO(out, rc = PTR_ERR(new)); - + GOTO(cleanup, rc = PTR_ERR(new)); if (new->d_inode) { char buf[32]; /* This would only happen if lastobjid was bad on disk */ - CERROR("objid %s already exists\n", - filter_id(buf, filter, oa->o_mode, oa->o_id)); - LBUG(); - GOTO(out, rc = -EEXIST); + CERROR("Serious error: objid %s already exists; is this " + "filesystem corrupt? 
I will try to work around it.\n", + filter_id(buf, filter, oa->o_id, oa->o_mode)); + f_dput(new); + ldlm_lock_decref(&parent_lockh, LCK_PW); + oa->o_id = filter_next_id(obd); + goto retry; } - filter_start_transno(export); + cleanup_phase = 2; handle = fsfilt_start(obd, dir_dentry->d_inode, FSFILT_OP_CREATE); - if (IS_ERR(handle)) { - rc = filter_finish_transno(export, handle, oti,PTR_ERR(handle)); - GOTO(out_put, rc); - } + if (IS_ERR(handle)) + GOTO(cleanup, rc = PTR_ERR(handle)); + rc = vfs_create(dir_dentry->d_inode, new, oa->o_mode); if (rc) CERROR("create failed rc = %d\n", rc); @@ -1661,7 +1909,7 @@ static int filter_create(struct lustre_handle *conn, struct obdo *oa, if (!rc) rc = err; } - err = fsfilt_commit(obd, dir_dentry->d_inode, handle); + err = fsfilt_commit(obd, dir_dentry->d_inode, handle, 0); if (err) { CERROR("error on commit, err = %d\n", err); if (!rc) @@ -1669,7 +1917,7 @@ static int filter_create(struct lustre_handle *conn, struct obdo *oa, } if (rc) - GOTO(out_put, rc); + GOTO(cleanup, rc); /* Set flags for fields we have set in the inode struct */ oa->o_valid = OBD_MD_FLID | OBD_MD_FLBLKSZ | OBD_MD_FLBLOCKS | @@ -1677,50 +1925,70 @@ static int filter_create(struct lustre_handle *conn, struct obdo *oa, filter_from_inode(oa, new->d_inode, oa->o_valid); EXIT; -out_put: - f_dput(new); -out: - up(&dir_dentry->d_inode->i_sem); - pop_ctxt(&saved, &filter->fo_ctxt, NULL); - return rc; +cleanup: + switch(cleanup_phase) { + case 2: + f_dput(new); + case 1: /* locked parent dentry */ + if (rc || oti == NULL) { + ldlm_lock_decref(&parent_lockh, LCK_PW); + } else { + memcpy(&oti->oti_ack_locks[0].lock, &parent_lockh, + sizeof(parent_lockh)); + oti->oti_ack_locks[0].mode = LCK_PW; + } + case 0: + pop_ctxt(&saved, &filter->fo_ctxt, NULL); + class_export_put(export); + break; + default: + CERROR("invalid cleanup_phase %d\n", cleanup_phase); + LBUG(); + } + + RETURN(rc); } static int filter_destroy(struct lustre_handle *conn, struct obdo *oa, struct 
lov_stripe_md *ea, struct obd_trans_info *oti) { - struct obd_export *export = class_conn2export(conn); + struct obd_export *export; struct obd_device *obd = class_conn2obd(conn); struct filter_obd *filter = &obd->u.filter; - struct dentry *dir_dentry, *object_dentry; + struct dentry *dir_dentry, *object_dentry = NULL; struct filter_dentry_data *fdd; struct obd_run_ctxt saved; - void *handle; - int rc, rc2; + void *handle = NULL; + struct lustre_handle parent_lockh; + int rc, rc2, cleanup_phase = 0; ENTRY; if (!obd) { - CERROR("invalid client "LPX64"\n", conn->addr); + CERROR("invalid client cookie "LPX64"\n", conn->cookie); RETURN(-EINVAL); } + export = class_conn2export(conn); XPROCFS_BUMP_MYCPU_IOSTAT (st_destroy_reqs, 1); CDEBUG(D_INODE, "destroying objid "LPU64"\n", oa->o_id); - dir_dentry = filter_parent(obd, oa->o_mode, oa->o_id); - down(&dir_dentry->d_inode->i_sem); + push_ctxt(&saved, &filter->fo_ctxt, NULL); + dir_dentry = filter_parent_lock(obd, oa->o_mode, oa->o_id, + LCK_PW, &parent_lockh); + if (IS_ERR(dir_dentry)) + GOTO(cleanup, rc = PTR_ERR(dir_dentry)); + cleanup_phase = 1; object_dentry = filter_oa2dentry(conn, oa, 0); if (IS_ERR(object_dentry)) - GOTO(out, rc = -ENOENT); + GOTO(cleanup, rc = -ENOENT); + cleanup_phase = 2; - push_ctxt(&saved, &filter->fo_ctxt, NULL); - filter_start_transno(export); handle = fsfilt_start(obd, dir_dentry->d_inode, FSFILT_OP_UNLINK); - if (IS_ERR(handle)) { - rc = filter_finish_transno(export, handle, oti,PTR_ERR(handle)); - GOTO(out_ctxt, rc); - } + if (IS_ERR(handle)) + GOTO(cleanup, rc = PTR_ERR(handle)); + cleanup_phase = 3; fdd = object_dentry->d_fsdata; if (fdd && atomic_read(&fdd->fdd_open_count)) { @@ -1734,28 +2002,41 @@ static int filter_destroy(struct lustre_handle *conn, struct obdo *oa, CDEBUG(D_INODE, "repeat destroy of %dx open objid "LPU64"\n", atomic_read(&fdd->fdd_open_count), oa->o_id); - GOTO(out_commit, rc = 0); + GOTO(cleanup, rc = 0); } rc = filter_destroy_internal(obd, dir_dentry, 
object_dentry); -out_commit: - /* XXX save last_rcvd on disk */ - rc = filter_finish_transno(export, handle, oti, rc); - rc2 = fsfilt_commit(obd, dir_dentry->d_inode, handle); - if (rc2) { - CERROR("error on commit, err = %d\n", rc2); - if (!rc) - rc = rc2; +cleanup: + switch(cleanup_phase) { + case 3: + rc = filter_finish_transno(export, handle, oti, rc); + rc2 = fsfilt_commit(obd, dir_dentry->d_inode, handle, 0); + if (rc2) { + CERROR("error on commit, err = %d\n", rc2); + if (!rc) + rc = rc2; + } + case 2: + f_dput(object_dentry); + case 1: + if (rc || oti == NULL) { + ldlm_lock_decref(&parent_lockh, LCK_PW); + } else { + memcpy(&oti->oti_ack_locks[0].lock, &parent_lockh, + sizeof(parent_lockh)); + oti->oti_ack_locks[0].mode = LCK_PW; + } + case 0: + pop_ctxt(&saved, &filter->fo_ctxt, NULL); + class_export_put(export); + break; + default: + CERROR("invalid cleanup_phase %d\n", cleanup_phase); + LBUG(); } -out_ctxt: - pop_ctxt(&saved, &filter->fo_ctxt, NULL); - f_dput(object_dentry); - EXIT; -out: - up(&dir_dentry->d_inode->i_sem); - return rc; + RETURN(rc); } /* NB start and end are used for punch, but not truncate */ @@ -1770,7 +2051,8 @@ static int filter_truncate(struct lustre_handle *conn, struct obdo *oa, XPROCFS_BUMP_MYCPU_IOSTAT (st_punch_reqs, 1); if (end != OBD_OBJECT_EOF) - CERROR("PUNCH not supported, only truncate works\n"); + CERROR("PUNCH not supported, only truncate: end = "LPX64"\n", + end); CDEBUG(D_INODE, "calling truncate for object "LPU64", valid = %x, " "o_size = "LPD64"\n", oa->o_id, oa->o_valid, start); @@ -1781,43 +2063,73 @@ static int filter_truncate(struct lustre_handle *conn, struct obdo *oa, static inline void lustre_put_page(struct page *page) { - kunmap(page); page_cache_release(page); } - -static struct page * -lustre_get_page_read(struct inode *inode, struct niobuf_local *lnb) +static int filter_start_page_read(struct inode *inode, struct niobuf_local *lnb) { - unsigned long index = lnb->offset >> PAGE_SHIFT; struct address_space 
*mapping = inode->i_mapping; struct page *page; + unsigned long index = lnb->offset >> PAGE_SHIFT; int rc; - page = read_cache_page(mapping, index, - (filler_t*)mapping->a_ops->readpage, NULL); - if (!IS_ERR(page)) { - wait_on_page(page); - lnb->addr = kmap(page); - lnb->page = page; - if (!PageUptodate(page)) { - CERROR("page index %lu not uptodate\n", index); - GOTO(err_page, rc = -EIO); - } - if (PageError(page)) { - CERROR("page index %lu has error\n", index); - GOTO(err_page, rc = -EIO); - } + page = grab_cache_page(mapping, index); /* locked page */ + if (IS_ERR(page)) + return lnb->rc = PTR_ERR(page); + + lnb->page = page; + + if (inode->i_size < lnb->offset + lnb->len - 1) + lnb->rc = inode->i_size - lnb->offset; + else + lnb->rc = lnb->len; + + if (PageUptodate(page)) { + unlock_page(page); + return 0; + } + + rc = mapping->a_ops->readpage(NULL, page); + if (rc < 0) { + CERROR("page index %lu, rc = %d\n", index, rc); + lnb->page = NULL; + lustre_put_page(page); + return lnb->rc = rc; } - return page; + + return 0; +} + +static int filter_finish_page_read(struct niobuf_local *lnb) +{ + if (lnb->page == NULL) + return 0; + + if (PageUptodate(lnb->page)) + return 0; + + wait_on_page(lnb->page); + if (!PageUptodate(lnb->page)) { + CERROR("page index %lu/offset "LPX64" not uptodate\n", + lnb->page->index, lnb->offset); + GOTO(err_page, lnb->rc = -EIO); + } + if (PageError(lnb->page)) { + CERROR("page index %lu/offset "LPX64" has error\n", + lnb->page->index, lnb->offset); + GOTO(err_page, lnb->rc = -EIO); + } + + return 0; err_page: - lustre_put_page(page); - return ERR_PTR(rc); + lustre_put_page(lnb->page); + lnb->page = NULL; + return lnb->rc; } -static struct page * -lustre_get_page_write(struct inode *inode, unsigned long index) +static struct page *lustre_get_page_write(struct inode *inode, + unsigned long index) { struct address_space *mapping = inode->i_mapping; struct page *page; @@ -1826,7 +2138,6 @@ lustre_get_page_write(struct inode *inode, unsigned 
long index) page = grab_cache_page(mapping, index); /* locked page */ if (!IS_ERR(page)) { - kmap(page); /* Note: Called with "O" and "PAGE_SIZE" this is essentially * a no-op for most filesystems, because we write the whole * page. For partial-page I/O this will read in the page. @@ -1888,7 +2199,7 @@ static int lustre_commit_write(struct niobuf_local *lnb) LASSERT(to <= PAGE_SIZE); err = page->mapping->a_ops->commit_write(NULL, page, from, to); if (!err && IS_SYNC(inode)) - err = waitfor_one_page(page); + waitfor_one_page(page); //SetPageUptodate(page); // the client commit_write will do this SetPageReferenced(page); @@ -1897,8 +2208,8 @@ static int lustre_commit_write(struct niobuf_local *lnb) return err; } -struct page *filter_get_page_write(struct inode *inode, - struct niobuf_local *lnb, int *pglocked) +int filter_get_page_write(struct inode *inode, struct niobuf_local *lnb, + int *pglocked) { unsigned long index = lnb->offset >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; @@ -1923,14 +2234,11 @@ struct page *filter_get_page_write(struct inode *inode, } POISON((void *)addr, 0xBA, PAGE_SIZE); page = virt_to_page(addr); - kmap(page); page->index = index; - lnb->addr = (void *)addr; lnb->page = page; lnb->flags |= N_LOCAL_TEMP_PAGE; } else if (!IS_ERR(page)) { (*pglocked)++; - kmap(page); rc = mapping->a_ops->prepare_write(NULL, page, lnb->offset & ~PAGE_MASK, @@ -1946,17 +2254,16 @@ struct page *filter_get_page_write(struct inode *inode, LBUG(); GOTO(err_unlock, rc = -EIO); } - lnb->addr = page_address(page); lnb->page = page; } - return page; + return 0; err_unlock: unlock_page(page); lustre_put_page(page); err: - return ERR_PTR(rc); + return lnb->rc = rc; } /* @@ -1987,30 +2294,34 @@ static int filter_commit_write(struct niobuf_local *lnb, int err) for (bh = head, block_start = 0; bh != head || !block_start; block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; - if (buffer_new(bh)) - memset(lnb->addr + 
block_start, 0, blocksize); + if (buffer_new(bh)) { + memset(kmap(lnb->page) + block_start, 0, + blocksize); + kunmap(lnb->page); + } } } #endif return lustre_commit_write(lnb); } -static int filter_preprw(int cmd, struct lustre_handle *conn, +static int filter_preprw(int cmd, struct obd_export *export, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_remote *nb, struct niobuf_local *res, void **desc_private, struct obd_trans_info *oti) { struct obd_run_ctxt saved; - struct obd_export *export; struct obd_device *obd; struct obd_ioobj *o; - struct niobuf_remote *rnb = nb; - struct niobuf_local *lnb = res; + struct niobuf_remote *rnb; + struct niobuf_local *lnb; struct fsfilt_objinfo *fso; - int pglocked = 0; - int rc = 0; - int i; + struct dentry *dentry; + struct inode *inode; + struct lprocfs_counters *cntrs; + int pglocked = 0, rc = 0, i, j; + ENTRY; if ((cmd & OBD_BRW_WRITE) != 0) @@ -2020,14 +2331,18 @@ static int filter_preprw(int cmd, struct lustre_handle *conn, memset(res, 0, niocount * sizeof(*res)); - export = class_conn2export(conn); - obd = class_conn2obd(conn); - if (!obd) { - CDEBUG(D_IOCTL, "invalid client "LPX64"\n", conn->addr); + obd = export->exp_obd; + if (obd == NULL) RETURN(-EINVAL); - } - LASSERT(objcount < 16); // theoretically we support multi-obj BRW + cntrs = obd->counters; + if ((cmd & OBD_BRW_WRITE) != 0) + LPROCFS_COUNTER_INCBY1(&cntrs->cntr[LPROC_FILTER_WRITES]); + else + LPROCFS_COUNTER_INCBY1(&cntrs->cntr[LPROC_FILTER_READS]); + + // theoretically we support multi-obj BRW RPCs, but until then... 
+ LASSERT(objcount == 1); OBD_ALLOC(fso, objcount * sizeof(*fso)); if (!fso) @@ -2037,7 +2352,6 @@ static int filter_preprw(int cmd, struct lustre_handle *conn, for (i = 0, o = obj; i < objcount; i++, o++) { struct filter_dentry_data *fdd; - struct dentry *dentry; LASSERT(o->ioo_bufcnt); @@ -2045,7 +2359,7 @@ static int filter_preprw(int cmd, struct lustre_handle *conn, o->ioo_id), o->ioo_id, 0); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) GOTO(out_objinfo, rc = PTR_ERR(dentry)); fso[i].fso_dentry = dentry; @@ -2054,10 +2368,26 @@ static int filter_preprw(int cmd, struct lustre_handle *conn, if (!dentry->d_inode) { CERROR("trying to BRW to non-existent file "LPU64"\n", o->ioo_id); - f_dput(dentry); GOTO(out_objinfo, rc = -ENOENT); } + /* If we ever start to support mutli-object BRW RPCs, we will + * need to get locks on mulitple inodes (in order) or use the + * DLM to do the locking for us (and use the same locking in + * filter_setattr() for truncate). That isn't all, because + * there still exists the possibility of a truncate starting + * a new transaction while holding the ext3 rwsem = write + * while some writes (which have started their transactions + * here) blocking on the ext3 rwsem = read => lock inversion. + * + * The handling gets very ugly when dealing with locked pages. + * It may be easier to just get rid of the locked page code + * (which has problems of its own) and either discover we do + * not need it anymore (i.e. it was a symptom of another bug) + * or ensure we get the page locks in an appropriate order. 
+ */ + if (cmd & OBD_BRW_WRITE) + down(&dentry->d_inode->i_sem); fdd = dentry->d_fsdata; if (!fdd || !atomic_read(&fdd->fdd_open_count)) CDEBUG(D_PAGE, "I/O to unopened object "LPU64"\n", @@ -2065,22 +2395,6 @@ static int filter_preprw(int cmd, struct lustre_handle *conn, } if (cmd & OBD_BRW_WRITE) { -#warning "FIXME: we need inode->i_sem for each object to protect vs truncate" - /* Even worse, we need to get locks on mulitple inodes (in - * order) or use the DLM to do the locking for us (and use - * the same locking in filter_setattr() for truncate. The - * handling gets very ugly when dealing with locked pages. - * It may be easier to just get rid of the locked page code - * (which has problems of its own) and either discover we do - * not need it anymore (i.e. it was a symptom of another bug) - * or ensure we get the page locks in an appropriate order. - */ - /* Danger, Will Robinson! You are taking a lock here and also - * starting a transaction and releasing/finishing then in - * filter_commitrw(), so you must call fsfilt_commit() and - * finish_transno() if an error occurs in this function. - */ - filter_start_transno(export); *desc_private = fsfilt_brw_start(obd, objcount, fso, niocount, nb); if (IS_ERR(*desc_private)) { @@ -2092,52 +2406,65 @@ static int filter_preprw(int cmd, struct lustre_handle *conn, } } - obd_kmap_get(niocount, 1); - - for (i = 0, o = obj; i < objcount; i++, o++) { - struct dentry *dentry; - struct inode *inode; - int j; - + for (i = 0, o = obj, rnb = nb, lnb = res; i < objcount; i++, o++) { dentry = fso[i].fso_dentry; inode = dentry->d_inode; for (j = 0; j < o->ioo_bufcnt; j++, rnb++, lnb++) { - struct page *page; - if (j == 0) lnb->dentry = dentry; else lnb->dentry = dget(dentry); - /* lnb->offset is aligned, while rnb->offset isn't, - * and we need to copy the fields to lnb anyways. 
- */ - memcpy(lnb, rnb, sizeof(*rnb)); + lnb->offset = rnb->offset; + lnb->len = rnb->len; + lnb->flags = rnb->flags; + if (cmd & OBD_BRW_WRITE) { - page = filter_get_page_write(inode, lnb, - &pglocked); + rc = filter_get_page_write(inode,lnb,&pglocked); XPROCFS_BUMP_MYCPU_IOSTAT(st_write_bytes, lnb->len); + LPROCFS_COUNTER_INCR(&cntrs->cntr[LPROC_FILTER_WRITE_BYTES], lnb->len); + } else if (inode->i_size <= rnb->offset) { + /* If there's no more data, abort early. + * lnb->page == NULL and lnb->rc == 0, so it's + * easy to detect later. */ + f_dput(lnb->dentry); + lnb->dentry = NULL; + break; } else { - page = lustre_get_page_read(inode, lnb); + rc = filter_start_page_read(inode, lnb); XPROCFS_BUMP_MYCPU_IOSTAT(st_read_bytes, lnb->len); + LPROCFS_COUNTER_INCR(&cntrs->cntr[LPROC_FILTER_READ_BYTES], lnb->len); } - if (IS_ERR(page)) { - rc = PTR_ERR(page); + if (rc) { CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR, "error on page @"LPU64"%u/%u: rc = %d\n", lnb->offset, j, o->ioo_bufcnt, rc); f_dput(dentry); GOTO(out_pages, rc); } + + if ((cmd & OBD_BRW_READ) && lnb->rc < lnb->len) { + /* Likewise with a partial read */ + break; + } } } + while ((cmd & OBD_BRW_READ) && lnb-- > res) { + rc = filter_finish_page_read(lnb); + if (rc) { + CERROR("error on page %u@"LPU64": rc = %d\n", + lnb->len, lnb->offset, rc); + f_dput(lnb->dentry); + GOTO(out_pages, rc); + } + } EXIT; out: OBD_FREE(fso, objcount * sizeof(*fso)); @@ -2147,30 +2474,36 @@ out: out_pages: while (lnb-- > res) { - if (cmd & OBD_BRW_WRITE) + if (cmd & OBD_BRW_WRITE) { filter_commit_write(lnb, rc); - else + up(&lnb->dentry->d_inode->i_sem); + } else { lustre_put_page(lnb->page); + } f_dput(lnb->dentry); } - obd_kmap_put(niocount); if (cmd & OBD_BRW_WRITE) { filter_finish_transno(export, *desc_private, oti, rc); fsfilt_commit(obd, filter_parent(obd,S_IFREG,obj->ioo_id)->d_inode, - *desc_private); + *desc_private, 0); } goto out; /* dropped the dentry refs already (one per page) */ out_objinfo: - for (i = 0; i < 
objcount && fso[i].fso_dentry; i++) + for (i = 0; i < objcount && fso[i].fso_dentry; i++) { + if (cmd & OBD_BRW_WRITE) + up(&fso[i].fso_dentry->d_inode->i_sem); f_dput(fso[i].fso_dentry); + } goto out; } static int filter_write_locked_page(struct niobuf_local *lnb) { struct page *lpage; + void *lpage_addr; + void *lnb_addr; int rc; ENTRY; @@ -2195,11 +2528,15 @@ static int filter_write_locked_page(struct niobuf_local *lnb) RETURN(rc); } - /* lpage is kmapped in lustre_get_page_write() above and kunmapped in - * lustre_commit_write() below, lnb->page was kmapped previously in - * filter_get_page_write() and kunmapped in lustre_put_page() below. - */ - memcpy(page_address(lpage), page_address(lnb->page), PAGE_SIZE); + /* 2 kmaps == vanishingly small deadlock opportunity */ + lpage_addr = kmap(lpage); + lnb_addr = kmap(lnb->page); + + memcpy(lpage_addr, lnb_addr, PAGE_SIZE); + + kunmap(lnb->page); + kunmap(lpage); + lustre_put_page(lnb->page); lnb->page = lpage; @@ -2211,19 +2548,17 @@ static int filter_write_locked_page(struct niobuf_local *lnb) RETURN(rc); } -static int filter_syncfs(struct lustre_handle *conn) +static int filter_syncfs(struct obd_export *exp) { - struct obd_device *obd; + struct obd_device *obd = exp->exp_obd; ENTRY; - obd = class_conn2obd(conn); - XPROCFS_BUMP_MYCPU_IOSTAT (st_syncfs_reqs, 1); RETURN(fsfilt_sync(obd, obd->u.filter.fo_sb)); } -static int filter_commitrw(int cmd, struct lustre_handle *conn, +static int filter_commitrw(int cmd, struct obd_export *export, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_local *res, void *desc_private, struct obd_trans_info *oti) @@ -2231,11 +2566,8 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn, struct obd_run_ctxt saved; struct obd_ioobj *o; struct niobuf_local *lnb; - struct obd_export *export = class_conn2export(conn); - struct obd_device *obd = class_conn2obd(conn); - int found_locked = 0; - int rc = 0; - int i; + struct obd_device *obd = export->exp_obd; + int 
found_locked = 0, rc = 0, i; ENTRY; push_ctxt(&saved, &obd->u.filter.fo_ctxt, NULL); @@ -2246,9 +2578,14 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn, for (i = 0, o = obj, lnb = res; i < objcount; i++, o++) { int j; - if (cmd & OBD_BRW_WRITE) + if (cmd & OBD_BRW_WRITE) { inode_update_time(lnb->dentry->d_inode, 1); + up(&lnb->dentry->d_inode->i_sem); + } for (j = 0 ; j < o->ioo_bufcnt ; j++, lnb++) { + if (lnb->page == NULL) { + continue; + } if (lnb->flags & N_LOCAL_TEMP_PAGE) { found_locked++; continue; @@ -2259,16 +2596,16 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn, if (!rc) rc = err; - } else + } else { lustre_put_page(lnb->page); + } - obd_kmap_put(1); f_dput(lnb->dentry); } } for (i = 0, o = obj, lnb = res; found_locked > 0 && i < objcount; - i++, o++) { + i++, o++) { int j; for (j = 0 ; j < o->ioo_bufcnt ; j++, lnb++) { int err; @@ -2276,7 +2613,6 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn, continue; err = filter_write_locked_page(lnb); - obd_kmap_put(1); if (!rc) rc = err; f_dput(lnb->dentry); @@ -2290,14 +2626,13 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn, int err; rc = filter_finish_transno(export, desc_private, oti, rc); - err = fsfilt_commit(obd, dir_dentry->d_inode, desc_private); + err = fsfilt_commit(obd, dir_dentry->d_inode, desc_private, + obd_sync_filter); if (err) rc = err; - if (obd_sync_filter) { - /* this can fail with ENOMEM, what should we do then? 
*/ - filter_syncfs(conn); - } - /* XXX <adilger> LASSERT(last_rcvd == last_committed)*/ + if (obd_sync_filter) + LASSERT(oti->oti_transno <= obd->obd_last_committed); + } LASSERT(!current->journal_info); @@ -2308,9 +2643,9 @@ static int filter_commitrw(int cmd, struct lustre_handle *conn, static int filter_brw(int cmd, struct lustre_handle *conn, struct lov_stripe_md *lsm, obd_count oa_bufs, - struct brw_page *pga, struct obd_brw_set *set, - struct obd_trans_info *oti) + struct brw_page *pga, struct obd_trans_info *oti) { + struct obd_export *export = class_conn2export(conn); struct obd_ioobj ioo; struct niobuf_local *lnb; struct niobuf_remote *rnb; @@ -2319,6 +2654,9 @@ static int filter_brw(int cmd, struct lustre_handle *conn, int ret = 0; ENTRY; + if (export == NULL) + RETURN(-EINVAL); + OBD_ALLOC(lnb, oa_bufs * sizeof(struct niobuf_local)); OBD_ALLOC(rnb, oa_bufs * sizeof(struct niobuf_remote)); @@ -2335,7 +2673,7 @@ static int filter_brw(int cmd, struct lustre_handle *conn, ioo.ioo_type = S_IFREG; ioo.ioo_bufcnt = oa_bufs; - ret = filter_preprw(cmd, conn, 1, &ioo, oa_bufs, rnb, lnb, + ret = filter_preprw(cmd, export, 1, &ioo, oa_bufs, rnb, lnb, &desc_private, oti); if (ret != 0) GOTO(out, ret); @@ -2343,16 +2681,20 @@ static int filter_brw(int cmd, struct lustre_handle *conn, for (i = 0; i < oa_bufs; i++) { void *virt = kmap(pga[i].pg); obd_off off = pga[i].off & ~PAGE_MASK; + void *addr = kmap(lnb[i].page); + + /* 2 kmaps == vanishingly small deadlock opportunity */ if (cmd & OBD_BRW_WRITE) - memcpy(lnb[i].addr + off, virt + off, pga[i].count); + memcpy(addr + off, virt + off, pga[i].count); else - memcpy(virt + off, lnb[i].addr + off, pga[i].count); + memcpy(virt + off, addr + off, pga[i].count); + kunmap(addr); kunmap(virt); } - ret = filter_commitrw(cmd, conn, 1, &ioo, oa_bufs, lnb, desc_private, + ret = filter_commitrw(cmd, export, 1, &ioo, oa_bufs, lnb, desc_private, oti); out: @@ -2360,6 +2702,7 @@ out: OBD_FREE(lnb, oa_bufs * sizeof(struct 
niobuf_local)); if (rnb) OBD_FREE(rnb, oa_bufs * sizeof(struct niobuf_remote)); + class_export_put(export); RETURN(ret); } @@ -2381,7 +2724,8 @@ static int filter_san_preprw(int cmd, struct lustre_handle *conn, obd = class_conn2obd(conn); if (!obd) { - CDEBUG(D_IOCTL, "invalid client "LPX64"\n", conn->addr); + CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n", + conn->cookie); RETURN(-EINVAL); } @@ -2451,29 +2795,32 @@ static int filter_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) RETURN(fsfilt_statfs(obd, obd->u.filter.fo_sb, osfs)); } -static int filter_get_info(struct lustre_handle *conn, obd_count keylen, - void *key, obd_count *vallen, void **val) +static int filter_get_info(struct lustre_handle *conn, __u32 keylen, + void *key, __u32 *vallen, void *val) { struct obd_device *obd; ENTRY; obd = class_conn2obd(conn); if (!obd) { - CDEBUG(D_IOCTL, "invalid client "LPX64"\n", conn->addr); + CDEBUG(D_IOCTL, "invalid client cookie "LPX64"\n", + conn->cookie); RETURN(-EINVAL); } - if ( keylen == strlen("blocksize") && - memcmp(key, "blocksize", keylen) == 0 ) { - *vallen = sizeof(long); - *val = (void *)(long)obd->u.filter.fo_sb->s_blocksize; + if (keylen == strlen("blocksize") && + memcmp(key, "blocksize", keylen) == 0) { + __u32 *blocksize = val; + *vallen = sizeof(*blocksize); + *blocksize = obd->u.filter.fo_sb->s_blocksize; RETURN(0); } - if ( keylen == strlen("blocksize_bits") && - memcmp(key, "blocksize_bits", keylen) == 0 ){ - *vallen = sizeof(long); - *val = (void *)(long)obd->u.filter.fo_sb->s_blocksize_bits; + if (keylen == strlen("blocksize_bits") && + memcmp(key, "blocksize_bits", keylen) == 0) { + __u32 *blocksize_bits = val; + *vallen = sizeof(*blocksize_bits); + *blocksize_bits = obd->u.filter.fo_sb->s_blocksize_bits; RETURN(0); } @@ -2505,12 +2852,7 @@ int filter_copy_data(struct lustre_handle *dst_conn, struct obdo *dst, if (page == NULL) RETURN(-ENOMEM); -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - while (TryLockPage(page)) - 
___wait_on_page(page); -#else - wait_on_page_locked(page); -#endif + wait_on_page(page); /* XXX with brw vector I/O, we could batch up reads and writes here, * all we need to do is allocate multiple pages to handle the I/Os @@ -2518,14 +2860,6 @@ int filter_copy_data(struct lustre_handle *dst_conn, struct obdo *dst, */ while (index < ((src->o_size + PAGE_SIZE - 1) >> PAGE_SHIFT)) { struct brw_page pg; - struct obd_brw_set *set; - - set = obd_brw_set_new(); - if (set == NULL) { - err = -ENOMEM; - EXIT; - break; - } pg.pg = page; pg.count = PAGE_SIZE; @@ -2533,26 +2867,16 @@ int filter_copy_data(struct lustre_handle *dst_conn, struct obdo *dst, pg.flag = 0; page->index = index; - set->brw_callback = ll_brw_sync_wait; - err = obd_brw(OBD_BRW_READ, src_conn, &srcmd, 1, &pg, set,NULL); - obd_brw_set_decref(set); + err = obd_brw(OBD_BRW_READ, src_conn, &srcmd, 1, &pg, NULL); if (err) { EXIT; break; } - set = obd_brw_set_new(); - if (set == NULL) { - err = -ENOMEM; - EXIT; - break; - } pg.flag = OBD_BRW_CREATE; CDEBUG(D_INFO, "Read page %ld ...\n", page->index); - set->brw_callback = ll_brw_sync_wait; - err = obd_brw(OBD_BRW_WRITE, dst_conn, &dstmd, 1, &pg, set,oti); - obd_brw_set_decref(set); + err = obd_brw(OBD_BRW_WRITE, dst_conn, &dstmd, 1, &pg, oti); /* XXX should handle dst->o_size, dst->o_blocks here */ if (err) { @@ -2574,26 +2898,27 @@ int filter_copy_data(struct lustre_handle *dst_conn, struct obdo *dst, } static struct obd_ops filter_obd_ops = { - o_owner: THIS_MODULE, - o_attach: filter_attach, - o_detach: filter_detach, - o_get_info: filter_get_info, - o_setup: filter_setup, - o_cleanup: filter_cleanup, - o_connect: filter_connect, - o_disconnect: filter_disconnect, - o_statfs: filter_statfs, - o_syncfs: filter_syncfs, - o_getattr: filter_getattr, - o_create: filter_create, - o_setattr: filter_setattr, - o_destroy: filter_destroy, - o_open: filter_open, - o_close: filter_close, - o_brw: filter_brw, - o_punch: filter_truncate, - o_preprw: filter_preprw, - 
o_commitrw: filter_commitrw + o_owner: THIS_MODULE, + o_attach: filter_attach, + o_detach: filter_detach, + o_get_info: filter_get_info, + o_setup: filter_setup, + o_cleanup: filter_cleanup, + o_connect: filter_connect, + o_disconnect: filter_disconnect, + o_statfs: filter_statfs, + o_syncfs: filter_syncfs, + o_getattr: filter_getattr, + o_create: filter_create, + o_setattr: filter_setattr, + o_destroy: filter_destroy, + o_open: filter_open, + o_close: filter_close, + o_brw: filter_brw, + o_punch: filter_truncate, + o_preprw: filter_preprw, + o_commitrw: filter_commitrw, + o_destroy_export: filter_destroy_export, #if 0 o_san_preprw: filter_san_preprw, o_preallocate: filter_preallocate_inodes, @@ -2604,26 +2929,27 @@ static struct obd_ops filter_obd_ops = { }; static struct obd_ops filter_sanobd_ops = { - o_owner: THIS_MODULE, - o_attach: filter_attach, - o_detach: filter_detach, - o_get_info: filter_get_info, - o_setup: filter_san_setup, - o_cleanup: filter_cleanup, - o_connect: filter_connect, - o_disconnect: filter_disconnect, - o_statfs: filter_statfs, - o_getattr: filter_getattr, - o_create: filter_create, - o_setattr: filter_setattr, - o_destroy: filter_destroy, - o_open: filter_open, - o_close: filter_close, - o_brw: filter_brw, - o_punch: filter_truncate, - o_preprw: filter_preprw, - o_commitrw: filter_commitrw, - o_san_preprw: filter_san_preprw, + o_owner: THIS_MODULE, + o_attach: filter_attach, + o_detach: filter_detach, + o_get_info: filter_get_info, + o_setup: filter_san_setup, + o_cleanup: filter_cleanup, + o_connect: filter_connect, + o_disconnect: filter_disconnect, + o_statfs: filter_statfs, + o_getattr: filter_getattr, + o_create: filter_create, + o_setattr: filter_setattr, + o_destroy: filter_destroy, + o_open: filter_open, + o_close: filter_close, + o_brw: filter_brw, + o_punch: filter_truncate, + o_preprw: filter_preprw, + o_commitrw: filter_commitrw, + o_san_preprw: filter_san_preprw, + o_destroy_export: filter_destroy_export #if 0 
o_preallocate: filter_preallocate_inodes, o_migrate: filter_migrate, @@ -2639,41 +2965,19 @@ static int __init obdfilter_init(void) int rc; printk(KERN_INFO "Lustre Filtering OBD driver; info@clusterfs.com\n"); - filter_open_cache = kmem_cache_create("ll_filter_fdata", - sizeof(struct filter_file_data), - 0, 0, NULL, NULL); - if (!filter_open_cache) - RETURN(-ENOMEM); - - filter_dentry_cache = kmem_cache_create("ll_filter_dentry", - sizeof(struct filter_dentry_data), - 0, 0, NULL, NULL); - if (!filter_dentry_cache) { - rc = -ENOMEM; - goto err1; - } xprocfs_init ("filter"); - lprocfs_init_vars(&lvars); rc = class_register_type(&filter_obd_ops, lvars.module_vars, OBD_FILTER_DEVICENAME); if (rc) - goto err2; + return rc; rc = class_register_type(&filter_sanobd_ops, lvars.module_vars, OBD_FILTER_SAN_DEVICENAME); if (rc) - goto err3; - - return 0; -err3: - class_unregister_type(OBD_FILTER_DEVICENAME); -err2: - kmem_cache_destroy(filter_dentry_cache); -err1: - kmem_cache_destroy(filter_open_cache); + class_unregister_type(OBD_FILTER_DEVICENAME); return rc; } @@ -2681,10 +2985,6 @@ static void __exit obdfilter_exit(void) { class_unregister_type(OBD_FILTER_SAN_DEVICENAME); class_unregister_type(OBD_FILTER_DEVICENAME); - if (kmem_cache_destroy(filter_dentry_cache)) - CERROR("couldn't free obdfilter dentry cache\n"); - if (kmem_cache_destroy(filter_open_cache)) - CERROR("couldn't free obdfilter open cache\n"); xprocfs_fini (); } diff --git a/lustre/obdfilter/lproc_obdfilter.c b/lustre/obdfilter/lproc_obdfilter.c index c4e0747..89203e5 100644 --- a/lustre/obdfilter/lproc_obdfilter.c +++ b/lustre/obdfilter/lproc_obdfilter.c @@ -55,6 +55,18 @@ int rd_fstype(char *page, char **start, off_t off, int count, int *eof, return snprintf(page, count, "%s\n", dev->u.filter.fo_fstype); } +int lprocfs_filter_rd_mntdev(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device* obd = (struct obd_device *)data; + + LASSERT(obd != NULL); + 
LASSERT(obd->u.filter.fo_vfsmnt->mnt_devname); + *eof = 1; + return snprintf(page, count, "%s\n", + obd->u.filter.fo_vfsmnt->mnt_devname); +} + struct lprocfs_vars lprocfs_obd_vars[] = { { "uuid", lprocfs_rd_uuid, 0, 0 }, { "blocksize", rd_blksize, 0, 0 }, @@ -64,6 +76,7 @@ struct lprocfs_vars lprocfs_obd_vars[] = { { "filesfree", rd_filesfree, 0, 0 }, { "filegroups", rd_filegroups, 0, 0 }, { "fstype", rd_fstype, 0, 0 }, + { "mntdev", lprocfs_filter_rd_mntdev, 0, 0 }, { 0 } }; diff --git a/lustre/osc/Makefile.am b/lustre/osc/Makefile.am index 19fd65c..dc0b4d8 100644 --- a/lustre/osc/Makefile.am +++ b/lustre/osc/Makefile.am @@ -5,25 +5,14 @@ DEFS= - if LIBLUSTRE lib_LIBRARIES = libosc.a -LINX= obd_pack.c client.c -libosc_a_SOURCES = osc_request.c $(LINX) +libosc_a_SOURCES = osc_request.c else MODULE = osc modulefs_DATA = osc.o EXTRA_PROGRAMS = osc -LINX= obd_pack.c client.c -osc_SOURCES = osc_request.c lproc_osc.c $(LINX) +osc_SOURCES = osc_request.c lproc_osc.c osc_lib.c endif -obd_pack.c: - test -e obd_pack.c || ln -sf $(top_srcdir)/lib/obd_pack.c -client.c: - test -e client.c || ln -sf $(top_srcdir)/lib/client.c - -dist-hook: - list='$(LINX)'; for f in $$list; do rm -f $(distdir)/$$f; done - include $(top_srcdir)/Rules diff --git a/lustre/osc/osc_lib.c b/lustre/osc/osc_lib.c new file mode 100644 index 0000000..aa04a1a --- /dev/null +++ b/lustre/osc/osc_lib.c @@ -0,0 +1,76 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define EXPORT_SYMTAB +#define DEBUG_SUBSYSTEM S_OSC + +#ifdef __KERNEL__ +# include <linux/module.h> +# include <linux/obd.h> +# include <linux/obd_ost.h> +# include <linux/lustre_net.h> +# include <linux/lustre_dlm.h> + +/* convert a pathname into a kdev_t */ +static kdev_t path2dev(char *path) +{ + struct dentry *dentry; + struct nameidata nd; + kdev_t dev; + KDEVT_VAL(dev, 0); + + if (!path_init(path, LOOKUP_FOLLOW, &nd)) + return 0; + + if (path_walk(path, &nd)) + return 0; + + dentry = nd.dentry; + if (dentry->d_inode && !is_bad_inode(dentry->d_inode) && + S_ISBLK(dentry->d_inode->i_mode)) + dev = dentry->d_inode->i_rdev; + path_release(&nd); + + return dev; +} + +int client_sanobd_setup(struct obd_device *obddev, obd_count len, void *buf) +{ + struct obd_ioctl_data* data = buf; + struct client_obd *cli = &obddev->u.cli; + ENTRY; + + if (data->ioc_inllen3 < 1) { + CERROR("setup requires a SAN device pathname\n"); + RETURN(-EINVAL); + } + + client_obd_setup(obddev, len, buf); + + cli->cl_sandev = path2dev(data->ioc_inlbuf3); + if (!kdev_t_to_nr(cli->cl_sandev)) { + CERROR("%s seems not a valid SAN device\n", data->ioc_inlbuf3); + RETURN(-EINVAL); + } + + RETURN(0); +} +#endif diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 515aa70..2289c74 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -48,65 +48,20 @@ #include <linux/kp30.h> #include <linux/lustre_mds.h> /* for mds_objid */ #include <linux/obd_ost.h> -#include <linux/obd_lov.h> /* for IOC_LOV_SET_OSC_ACTIVE */ + +#ifndef __CYGWIN__ #include <linux/ctype.h> #include <linux/init.h> +#else +#include <ctype.h> +#endif + #include <linux/lustre_ha.h> #include <linux/obd_support.h> /* for OBD_FAIL_CHECK */ #include 
<linux/lustre_lite.h> /* for ll_i2info */ #include <portals/lib-types.h> /* for PTL_MD_MAX_IOV */ #include <linux/lprocfs_status.h> -/* It is important that ood_fh remain the first item in this structure: that - * way, we don't have to re-pack the obdo's inline data before we send it to - * the server, we can just send the whole struct unaltered. */ -#define OSC_OBDO_DATA_MAGIC 0xD15EA5ED -struct osc_obdo_data { - struct lustre_handle ood_fh; - struct ptlrpc_request *ood_request; - __u32 ood_magic; -}; -#include <linux/obd_lov.h> /* just for the startup assertion; is that wrong? */ - -static int send_sync(struct obd_import *imp, struct ll_fid *rootfid, - int level, int msg_flags) -{ - struct ptlrpc_request *req; - struct mds_body *body; - int rc, size = sizeof(*body); - ENTRY; - - req = ptlrpc_prep_req(imp, OST_SYNCFS, 1, &size, NULL); - if (!req) - GOTO(out, rc = -ENOMEM); - - body = lustre_msg_buf(req->rq_reqmsg, 0); - req->rq_level = level; - req->rq_replen = lustre_msg_size(1, &size); - - req->rq_reqmsg->flags |= msg_flags; - rc = ptlrpc_queue_wait(req); - - if (!rc) { - CDEBUG(D_NET, "last_committed="LPU64 - ", last_xid="LPU64"\n", - req->rq_repmsg->last_committed, - req->rq_repmsg->last_xid); - } - - EXIT; - out: - ptlrpc_req_finished(req); - return rc; -} - -static int signal_completed_replay(struct obd_import *imp) -{ - struct ll_fid fid; - - return send_sync(imp, &fid, LUSTRE_CONN_RECOVD, MSG_LAST_REPLAY); -} - static int osc_attach(struct obd_device *dev, obd_count len, void *data) { struct lprocfs_static_vars lvars; @@ -120,7 +75,7 @@ static int osc_detach(struct obd_device *dev) return lprocfs_obd_detach(dev); } -/* Pack OSC object metadata for shipment to the MDS. */ +/* Pack OSC object metadata for disk storage (LE byte order). 
*/ static int osc_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp, struct lov_stripe_md *lsm) { @@ -142,20 +97,36 @@ static int osc_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp, if (!*lmmp) RETURN(-ENOMEM); } + if (lsm) { LASSERT(lsm->lsm_object_id); - (*lmmp)->lmm_object_id = (lsm->lsm_object_id); + (*lmmp)->lmm_object_id = cpu_to_le64 (lsm->lsm_object_id); } RETURN(lmm_size); } +/* Unpack OSC object metadata from disk storage (LE byte order). */ static int osc_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp, - struct lov_mds_md *lmm) + struct lov_mds_md *lmm, int lmm_bytes) { int lsm_size; ENTRY; + if (lmm != NULL) { + if (lmm_bytes < sizeof (*lmm)) { + CERROR("lov_mds_md too small: %d, need %d\n", + lmm_bytes, (int)sizeof(*lmm)); + RETURN (-EINVAL); + } + /* XXX LOV_MAGIC etc check? */ + + if (lmm->lmm_object_id == cpu_to_le64 (0)) { + CERROR ("lov_mds_md: zero lmm_object_id\n"); + RETURN (-EINVAL); + } + } + lsm_size = sizeof(**lsmp); if (!lsmp) RETURN(lsm_size); @@ -172,21 +143,76 @@ static int osc_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp, RETURN(-ENOMEM); } - /* XXX endianness */ if (lmm) { - (*lsmp)->lsm_object_id = (lmm->lmm_object_id); + /* XXX zero *lsmp? 
*/ + (*lsmp)->lsm_object_id = le64_to_cpu (lmm->lmm_object_id); + (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; LASSERT((*lsmp)->lsm_object_id); } RETURN(lsm_size); } -inline void oti_from_request(struct obd_trans_info *oti, - struct ptlrpc_request *req) +#warning "FIXME: make this be sent from OST" +#define OSC_BRW_MAX_SIZE 65536 +#define OSC_BRW_MAX_IOV min_t(int, PTL_MD_MAX_IOV, OSC_BRW_MAX_SIZE/PAGE_SIZE) + +static int osc_getattr_interpret(struct ptlrpc_request *req, + struct osc_getattr_async_args *aa, int rc) { - if (oti && req->rq_repmsg) - oti->oti_transno = NTOH__u64(req->rq_repmsg->transno); - EXIT; + struct obdo *oa = aa->aa_oa; + struct ost_body *body; + ENTRY; + + if (rc != 0) { + CERROR("failed: rc = %d\n", rc); + RETURN (rc); + } + + body = lustre_swab_repbuf (req, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) { + CERROR ("can't unpack ost_body\n"); + RETURN (-EPROTO); + } + + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); + memcpy(oa, &body->oa, sizeof(*oa)); + + /* This should really be sent by the OST */ + oa->o_blksize = OSC_BRW_MAX_SIZE; + oa->o_valid |= OBD_MD_FLBLKSZ; + + RETURN (0); +} + +static int osc_getattr_async(struct lustre_handle *conn, struct obdo *oa, + struct lov_stripe_md *md, + struct ptlrpc_request_set *set) +{ + struct ptlrpc_request *request; + struct ost_body *body; + int size = sizeof(*body); + struct osc_getattr_async_args *aa; + ENTRY; + + request = ptlrpc_prep_req(class_conn2cliimp(conn), OST_GETATTR, 1, + &size, NULL); + if (!request) + RETURN(-ENOMEM); + + body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body)); + memcpy(&body->oa, oa, sizeof(*oa)); + + request->rq_replen = lustre_msg_size(1, &size); + request->rq_interpret_reply = osc_getattr_interpret; + + LASSERT (sizeof (*aa) <= sizeof (request->rq_async_args)); + aa = (struct osc_getattr_async_args *)&request->rq_async_args; + aa->aa_oa = oa; + + ptlrpc_set_add_req (set, request); + RETURN (0); } static int osc_getattr(struct 
lustre_handle *conn, struct obdo *oa, @@ -202,8 +228,7 @@ static int osc_getattr(struct lustre_handle *conn, struct obdo *oa, if (!request) RETURN(-ENOMEM); - body = lustre_msg_buf(request->rq_reqmsg, 0); -#warning FIXME: pack only valid fields instead of memcpy, endianness + body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body)); memcpy(&body->oa, oa, sizeof(*oa)); request->rq_replen = lustre_msg_size(1, &size); @@ -214,32 +239,103 @@ static int osc_getattr(struct lustre_handle *conn, struct obdo *oa, GOTO(out, rc); } - body = lustre_msg_buf(request->rq_repmsg, 0); + body = lustre_swab_repbuf(request, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) { + CERROR ("can't unpack ost_body\n"); + GOTO (out, rc = -EPROTO); + } + CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); memcpy(oa, &body->oa, sizeof(*oa)); + /* This should really be sent by the OST */ + oa->o_blksize = OSC_BRW_MAX_SIZE; + oa->o_valid |= OBD_MD_FLBLKSZ; + EXIT; out: ptlrpc_req_finished(request); return rc; } +/* The import lock must already be held. 
*/ +static inline void osc_update_body_handle(struct list_head *head, + struct lustre_handle *old, + struct lustre_handle *new, int op) +{ + struct list_head *tmp; + struct ost_body *body; + struct ptlrpc_request *req; + struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ + + list_for_each(tmp, head) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + + /* XXX ok to remove when bug 1303 resolved - rread 05/27/03 */ + LASSERT (req != last_req); + last_req = req; + + if (req->rq_reqmsg->opc != op) + continue; + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*body)); + if (memcmp(obdo_handle(&body->oa), old, sizeof(*old))) + continue; + + DEBUG_REQ(D_HA, req, "updating close body with new fh"); + memcpy(obdo_handle(&body->oa), new, sizeof(*new)); + } +} + +static void osc_replay_open(struct ptlrpc_request *req) +{ + struct lustre_handle old; + struct ost_body *body; + struct obd_client_handle *och = req->rq_replay_data; + struct lustre_handle *oa_handle; + ENTRY; + + body = lustre_swab_repbuf (req, 0, sizeof (*body), + lustre_swab_ost_body); + LASSERT (body != NULL); + + oa_handle = obdo_handle(&body->oa); + + memcpy(&old, &och->och_fh, sizeof(old)); + CDEBUG(D_HA, "updating cookie from "LPD64" to "LPD64"\n", + och->och_fh.cookie, oa_handle->cookie); + memcpy(&och->och_fh, oa_handle, sizeof(och->och_fh)); + + /* A few frames up, ptlrpc_replay holds the lock, so this is safe. 
*/ + osc_update_body_handle(&req->rq_import->imp_sending_list, &old, + &och->och_fh, OST_CLOSE); + osc_update_body_handle(&req->rq_import->imp_delayed_list, &old, + &och->och_fh, OST_CLOSE); + EXIT; +} + + static int osc_open(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md *md, struct obd_trans_info *oti) + struct lov_stripe_md *md, struct obd_trans_info *oti, + struct obd_client_handle *och) { struct ptlrpc_request *request; struct ost_body *body; + unsigned long flags; int rc, size = sizeof(*body); ENTRY; + LASSERT(och != NULL); request = ptlrpc_prep_req(class_conn2cliimp(conn), OST_OPEN, 1, &size, NULL); if (!request) RETURN(-ENOMEM); - request->rq_flags |= PTL_RPC_FL_REPLAY; - body = lustre_msg_buf(request->rq_reqmsg, 0); -#warning FIXME: pack only valid fields instead of memcpy, endianness + spin_lock_irqsave (&request->rq_lock, flags); + request->rq_replay = 1; + spin_unlock_irqrestore (&request->rq_lock, flags); + + body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body)); memcpy(&body->oa, oa, sizeof(*oa)); request->rq_replen = lustre_msg_size(1, &size); @@ -248,28 +344,34 @@ static int osc_open(struct lustre_handle *conn, struct obdo *oa, if (rc) GOTO(out, rc); - if (oa) { - struct osc_obdo_data ood; - body = lustre_msg_buf(request->rq_repmsg, 0); - memcpy(oa, &body->oa, sizeof(*oa)); - - /* If the open succeeded, we better have a handle */ - /* BlueArc OSTs don't send back (o_valid | FLHANDLE). sigh. - * Temporary workaround until fixed. -phil 24 Feb 03 */ - //LASSERT(oa->o_valid & OBD_MD_FLHANDLE); - oa->o_valid |= OBD_MD_FLHANDLE; - - memcpy(&ood.ood_fh, obdo_handle(oa), sizeof(ood.ood_fh)); - ood.ood_request = ptlrpc_request_addref(request); - ood.ood_magic = OSC_OBDO_DATA_MAGIC; - - /* Save this data in the request; it will be passed back to us - * in future obdos. This memcpy is guaranteed to be safe, - * because we check at compile-time that sizeof(ood) is smaller - * than oa->o_inline. 
*/ - memcpy(&oa->o_inline, &ood, sizeof(ood)); + body = lustre_swab_repbuf (request, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) { + CERROR ("Can't unpack ost_body\n"); + GOTO (out, rc = -EPROTO); } + memcpy(oa, &body->oa, sizeof(*oa)); + + /* If the open succeeded, we better have a handle */ + /* BlueArc OSTs don't send back (o_valid | FLHANDLE). sigh. + * Temporary workaround until fixed. -phil 24 Feb 03 */ + // if ((oa->o_valid & OBD_MD_FLHANDLE) == 0) { + // CERROR ("No file handle\n"); + // GOTO (out, rc = -EPROTO); + // } + oa->o_valid |= OBD_MD_FLHANDLE; + + /* This should really be sent by the OST */ + oa->o_blksize = OSC_BRW_MAX_SIZE; + oa->o_valid |= OBD_MD_FLBLKSZ; + + memcpy(&och->och_fh, obdo_handle(oa), sizeof(och->och_fh)); + request->rq_replay_cb = osc_replay_open; + request->rq_replay_data = och; + och->och_req = ptlrpc_request_addref(request); + och->och_magic = OBD_CLIENT_HANDLE_MAGIC; + EXIT; out: ptlrpc_req_finished(request); @@ -282,55 +384,70 @@ static int osc_close(struct lustre_handle *conn, struct obdo *oa, struct obd_import *import = class_conn2cliimp(conn); struct ptlrpc_request *request; struct ost_body *body; - struct osc_obdo_data *ood; + struct obd_client_handle *och; unsigned long flags; int rc, size = sizeof(*body); ENTRY; LASSERT(oa != NULL); - ood = (struct osc_obdo_data *)&oa->o_inline; - LASSERT(ood->ood_magic == OSC_OBDO_DATA_MAGIC); + och = (struct obd_client_handle *)&oa->o_inline; + if (och->och_magic == 0) { + /* Zero magic means that this file was never opened on this + * OST--almost certainly because the OST was inactive at + * open-time */ + RETURN(0); + } + LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC); request = ptlrpc_prep_req(import, OST_CLOSE, 1, &size, NULL); if (!request) RETURN(-ENOMEM); - body = lustre_msg_buf(request->rq_reqmsg, 0); -#warning FIXME: pack only valid fields instead of memcpy, endianness + body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body)); memcpy(&body->oa, oa, 
sizeof(*oa)); request->rq_replen = lustre_msg_size(1, &size); rc = ptlrpc_queue_wait(request); - if (rc) { - /* FIXME: Does this mean that the file is still open locally? - * If not, and I somehow suspect not, we need to cleanup - * below */ - GOTO(out, rc); - } - - spin_lock_irqsave(&import->imp_lock, flags); - ood->ood_request->rq_flags &= ~PTL_RPC_FL_REPLAY; - /* see comments in llite/file.c:ll_mdc_close() */ - if (ood->ood_request->rq_transno) { - LBUG(); /* this can't happen yet */ - if (!request->rq_transno) { - request->rq_transno = ood->ood_request->rq_transno; - ptlrpc_retain_replayable_request(request, import); + if (rc) + CDEBUG(D_HA, "Suppressing close error %d\n", rc); // bug 1036 + + /* och_req == NULL can't happen any more, right? --phik */ + if (och->och_req != NULL) { + spin_lock_irqsave(&import->imp_lock, flags); + spin_lock (&och->och_req->rq_lock); + och->och_req->rq_replay = 0; + spin_unlock (&och->och_req->rq_lock); + /* see comments in llite/file.c:ll_mdc_close() */ + if (och->och_req->rq_transno) { + /* this can't happen yet, because the OSTs don't yet + * issue transnos for OPEN requests -phik 21 Apr 2003 */ + LBUG(); + if (!request->rq_transno && import->imp_replayable) { + request->rq_transno = och->och_req->rq_transno; + ptlrpc_retain_replayable_request(request, + import); + } + spin_unlock_irqrestore(&import->imp_lock, flags); + } else { + spin_unlock_irqrestore(&import->imp_lock, flags); } - spin_unlock_irqrestore(&import->imp_lock, flags); - } else { - spin_unlock_irqrestore(&import->imp_lock, flags); - ptlrpc_req_finished(ood->ood_request); + + ptlrpc_req_finished(och->och_req); } - body = lustre_msg_buf(request->rq_repmsg, 0); - memcpy(oa, &body->oa, sizeof(*oa)); + if (!rc) { + body = lustre_swab_repbuf (request, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) { + rc = -EPROTO; + CDEBUG(D_HA, "Suppressing close error %d\n", rc); // bug 1036 + } else + memcpy(oa, &body->oa, sizeof(*oa)); + } - EXIT; - out: 
ptlrpc_req_finished(request); - return rc; + RETURN(0); } static int osc_setattr(struct lustre_handle *conn, struct obdo *oa, @@ -346,7 +463,7 @@ static int osc_setattr(struct lustre_handle *conn, struct obdo *oa, if (!request) RETURN(-ENOMEM); - body = lustre_msg_buf(request->rq_reqmsg, 0); + body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body)); memcpy(&body->oa, oa, sizeof(*oa)); request->rq_replen = lustre_msg_size(1, &size); @@ -358,12 +475,11 @@ static int osc_setattr(struct lustre_handle *conn, struct obdo *oa, } static int osc_create(struct lustre_handle *conn, struct obdo *oa, - struct lov_stripe_md **ea, struct obd_trans_info *oti_in) + struct lov_stripe_md **ea, struct obd_trans_info *oti) { struct ptlrpc_request *request; struct ost_body *body; struct lov_stripe_md *lsm; - struct obd_trans_info *oti, trans_info; int rc, size = sizeof(*body); ENTRY; @@ -377,17 +493,12 @@ static int osc_create(struct lustre_handle *conn, struct obdo *oa, RETURN(rc); } - if (oti_in) - oti = oti_in; - else - oti = &trans_info; - request = ptlrpc_prep_req(class_conn2cliimp(conn), OST_CREATE, 1, &size, NULL); if (!request) GOTO(out, rc = -ENOMEM); - body = lustre_msg_buf(request->rq_reqmsg, 0); + body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body)); memcpy(&body->oa, oa, sizeof(*oa)); request->rq_replen = lustre_msg_size(1, &size); @@ -396,15 +507,28 @@ static int osc_create(struct lustre_handle *conn, struct obdo *oa, if (rc) GOTO(out_req, rc); - body = lustre_msg_buf(request->rq_repmsg, 0); + body = lustre_swab_repbuf (request, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) { + CERROR ("can't unpack ost_body\n"); + GOTO (out_req, rc = -EPROTO); + } + memcpy(oa, &body->oa, sizeof(*oa)); + /* This should really be sent by the OST */ + oa->o_blksize = OSC_BRW_MAX_SIZE; + oa->o_valid |= OBD_MD_FLBLKSZ; + lsm->lsm_object_id = oa->o_id; lsm->lsm_stripe_count = 0; + lsm->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; *ea = lsm; - oti_from_request(oti, 
request); - CDEBUG(D_HA, "transno: "LPD64"\n", oti->oti_transno); + if (oti != NULL) + oti->oti_transno = request->rq_repmsg->transno; + + CDEBUG(D_HA, "transno: "LPD64"\n", request->rq_repmsg->transno); EXIT; out_req: ptlrpc_req_finished(request); @@ -433,14 +557,13 @@ static int osc_punch(struct lustre_handle *conn, struct obdo *oa, if (!request) RETURN(-ENOMEM); - body = lustre_msg_buf(request->rq_reqmsg, 0); -#warning FIXME: pack only valid fields instead of memcpy, endianness, valid + body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body)); memcpy(&body->oa, oa, sizeof(*oa)); /* overload the size and blocks fields in the oa with start/end */ - body->oa.o_size = HTON__u64(start); - body->oa.o_blocks = HTON__u64(end); - body->oa.o_valid |= HTON__u32(OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); + body->oa.o_size = start; + body->oa.o_blocks = end; + body->oa.o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); request->rq_replen = lustre_msg_size(1, &size); @@ -448,7 +571,13 @@ static int osc_punch(struct lustre_handle *conn, struct obdo *oa, if (rc) GOTO(out, rc); - body = lustre_msg_buf(request->rq_repmsg, 0); + body = lustre_swab_repbuf (request, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) { + CERROR ("can't unpack ost_body\n"); + GOTO (out, rc = -EPROTO); + } + memcpy(oa, &body->oa, sizeof(*oa)); EXIT; @@ -474,8 +603,7 @@ static int osc_destroy(struct lustre_handle *conn, struct obdo *oa, if (!request) RETURN(-ENOMEM); - body = lustre_msg_buf(request->rq_reqmsg, 0); -#warning FIXME: pack only valid fields instead of memcpy, endianness + body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body)); memcpy(&body->oa, oa, sizeof(*oa)); request->rq_replen = lustre_msg_size(1, &size); @@ -484,7 +612,13 @@ static int osc_destroy(struct lustre_handle *conn, struct obdo *oa, if (rc) GOTO(out, rc); - body = lustre_msg_buf(request->rq_repmsg, 0); + body = lustre_swab_repbuf (request, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) { + CERROR 
("Can't unpack body\n"); + GOTO (out, rc = -EPROTO); + } + memcpy(oa, &body->oa, sizeof(*oa)); EXIT; @@ -493,191 +627,259 @@ static int osc_destroy(struct lustre_handle *conn, struct obdo *oa, return rc; } -/* Our bulk-unmapping bottom half. */ -static void unmap_and_decref_bulk_desc(void *data) +/* We assume that the reason this OSC got a short read is because it read + * beyond the end of a stripe file; i.e. lustre is reading a sparse file + * via the LOV, and it _knows_ it's reading inside the file, it's just that + * this stripe never got written at or beyond this stripe offset yet. */ +static void handle_short_read(int nob_read, obd_count page_count, + struct brw_page *pga) { - struct ptlrpc_bulk_desc *desc = data; - struct list_head *tmp; - ENTRY; - - list_for_each(tmp, &desc->bd_page_list) { - struct ptlrpc_bulk_page *bulk; - bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link); + char *ptr; + + /* skip bytes read OK */ + while (nob_read > 0) { + LASSERT (page_count > 0); + + if (pga->count > nob_read) { + /* EOF inside this page */ + ptr = kmap(pga->pg) + (pga->off & ~PAGE_MASK); + memset(ptr + nob_read, 0, pga->count - nob_read); + kunmap(pga->pg); + page_count--; + pga++; + break; + } - kunmap(bulk->bp_page); - obd_kmap_put(1); + nob_read -= pga->count; + page_count--; + pga++; } - ptlrpc_bulk_decref(desc); - EXIT; + /* zero remaining pages */ + while (page_count-- > 0) { + ptr = kmap(pga->pg) + (pga->off & ~PAGE_MASK); + memset(ptr, 0, pga->count); + kunmap(pga->pg); + pga++; + } } - -/* this is the callback function which is invoked by the Portals - * event handler associated with the bulk_sink queue and bulk_source queue. 
- */
-static void osc_ptl_ev_hdlr(struct ptlrpc_bulk_desc *desc)
+/* Check the per-niobuf return codes carried in reply buffer 1 of a write
+ * reply.
+ *
+ * Returns 0 when all niocount codes are zero, the first negative code found
+ * (the error reported for that niobuf), or -EPROTO when the vector is
+ * missing/short or contains a positive value.  page_count and pga are not
+ * used here. */
+static int check_write_rcs (struct ptlrpc_request *request,
+                            int niocount, obd_count page_count,
+                            struct brw_page *pga)
 {
-        ENTRY;
-
-        LASSERT(desc->bd_brw_set != NULL);
-        LASSERT(desc->bd_brw_set->brw_callback != NULL);
-
-        /* It's important that you don't use desc->bd_brw_set after this
-         * callback runs.  If you do, take a reference on it. */
-        desc->bd_brw_set->brw_callback(desc->bd_brw_set, CB_PHASE_FINISH);
-
-        /* We can't kunmap the desc from interrupt context, so we do it from
-         * the bottom half above. */
-        prepare_work(&desc->bd_queue, unmap_and_decref_bulk_desc, desc);
-        schedule_work(&desc->bd_queue);
+        int    i;
+        /* The codes are signed errnos: reading them through an unsigned
+         * pointer would make the "< 0" test below always false and turn
+         * every remote error into -EPROTO. */
+        __s32 *remote_rcs;
+
+        /* return error if any niobuf was in error */
+        remote_rcs = lustre_swab_repbuf(request, 1,
+                                        sizeof(*remote_rcs) * niocount, NULL);
+        if (remote_rcs == NULL) {
+                CERROR ("Missing/short RC vector on BRW_WRITE reply\n");
+                return (-EPROTO);
+        }
+        /* No swabber was passed above, so byte-swap the vector by hand when
+         * the reply message was swabbed. */
+        if (lustre_msg_swabbed (request->rq_repmsg))
+                for (i = 0; i < niocount; i++)
+                        __swab32s ((__u32 *)&remote_rcs[i]);
+
+        for (i = 0; i < niocount; i++) {
+                if (remote_rcs[i] < 0)
+                        return (remote_rcs[i]);
+
+                if (remote_rcs[i] != 0) {
+                        CERROR ("rc[%d] invalid (%d) req %p\n",
+                                i, remote_rcs[i], request);
+                        return (-EPROTO);
+                }
+        }
-        EXIT;
+        return (0);
 }
-/*
- * This is called when there was a bulk error return.  However, we don't know
- * whether the bulk completed or not.  We cancel the portals bulk descriptors,
- * so that if the OST decides to send them later we don't double free.  Then
- * remove this descriptor from the set so that the set callback doesn't wait
- * forever for the last CB_PHASE_FINISH to be called, and finally dump all of
- * the bulk descriptor references.
- */ -static void osc_ptl_ev_abort(struct ptlrpc_bulk_desc *desc) +static inline int can_merge_pages (struct brw_page *p1, struct brw_page *p2) { - ENTRY; - - LASSERT(desc->bd_brw_set != NULL); - - /* XXX reconcile this with ll_sync_brw_timeout() handling, and/or - * just make osc_ptl_ev_hdlr() check desc->bd_flags for either - * PTL_BULK_FL_RCVD or PTL_BULK_FL_SENT, and pass CB_PHASE_ABORT - * to brw_callback() and do the rest of the cleanup there. I - * also think ll_sync_brw_timeout() is missing an PtlMEUnlink, - * but I could be wrong. - */ - if (ptlrpc_abort_bulk(desc)) { - EXIT; - return; + if (p1->flag != p2->flag) { + /* XXX we don't make much use of 'flag' right now + * but this will warn about usage when we do */ + CERROR ("different flags set %d, %d\n", + p1->flag, p2->flag); + return (0); } - obd_brw_set_del(desc); - unmap_and_decref_bulk_desc(desc); - EXIT; + return (p1->off + p1->count == p2->off); } -static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm, - obd_count page_count, struct brw_page *pga, - struct obd_brw_set *set) +#if CHECKSUM_BULK +static __u64 cksum_pages(int nob, obd_count page_count, struct brw_page *pga) { - struct obd_import *imp = class_conn2cliimp(conn); - struct ptlrpc_connection *connection = imp->imp_connection; - struct ptlrpc_request *request = NULL; - struct ptlrpc_bulk_desc *desc = NULL; - struct ost_body *body; - int rc, size[3] = {sizeof(*body)}, mapped = 0; - struct obd_ioobj *iooptr; - struct niobuf_remote *nioptr; - __u32 xid; - ENTRY; - -restart_bulk: - size[1] = sizeof(struct obd_ioobj); - size[2] = page_count * sizeof(struct niobuf_remote); + __u64 cksum = 0; + char *ptr; + int i; - request = ptlrpc_prep_req(imp, OST_READ, 3, size, NULL); - if (!request) - RETURN(-ENOMEM); + while (nob > 0) { + LASSERT (page_count > 0); - body = lustre_msg_buf(request->rq_reqmsg, 0); - body->oa.o_valid = HTON__u32(OBD_MD_FLCKSUM * CHECKSUM_BULK); + ptr = kmap (pga->pg); + ost_checksum (&cksum, ptr + 
(pga->off & (PAGE_SIZE - 1)), + pga->count > nob ? nob : pga->count); + kunmap (pga->pg); - desc = ptlrpc_prep_bulk(connection); - if (!desc) - GOTO(out_req, rc = -ENOMEM); - desc->bd_portal = OST_BULK_PORTAL; - desc->bd_ptl_ev_hdlr = osc_ptl_ev_hdlr; - CDEBUG(D_PAGE, "desc = %p\n", desc); + nob -= pga->count; + page_count--; + pga++; + } - iooptr = lustre_msg_buf(request->rq_reqmsg, 1); - nioptr = lustre_msg_buf(request->rq_reqmsg, 2); - ost_pack_ioo(iooptr, lsm, page_count); - /* end almost identical to brw_write case */ + return (cksum); +} +#endif - xid = ptlrpc_next_xid(); /* single xid for all pages */ +static int osc_brw_prep_request(struct obd_import *imp, + struct lov_stripe_md *lsm, obd_count page_count, + struct brw_page *pga, int cmd, + int *requested_nobp, int *niocountp, + struct ptlrpc_request **reqp) +{ + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + struct ost_body *body; + struct obd_ioobj *ioobj; + struct niobuf_remote *niobuf; + unsigned long flags; + int niocount; + int size[3]; + int i; + int requested_nob; + int opc; + int rc; + + opc = ((cmd & OBD_BRW_WRITE) != 0) ? 
OST_WRITE : OST_READ; + + for (niocount = i = 1; i < page_count; i++) + if (!can_merge_pages (&pga[i - 1], &pga[i])) + niocount++; + + size[0] = sizeof (*body); + size[1] = sizeof (*ioobj); + size[2] = niocount * sizeof (*niobuf); + + req = ptlrpc_prep_req (imp, opc, 3, size, NULL); + if (req == NULL) + return (-ENOMEM); + + if (opc == OST_WRITE) + desc = ptlrpc_prep_bulk_imp(req, BULK_GET_SOURCE, + OST_BULK_PORTAL); + else + desc = ptlrpc_prep_bulk_imp(req, BULK_PUT_SINK, + OST_BULK_PORTAL); + if (desc == NULL) + GOTO (out, rc = -ENOMEM); + /* NB request now owns desc and will free it when it gets freed */ + + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body)); + ioobj = lustre_msg_buf(req->rq_reqmsg, 1, sizeof(*ioobj)); + niobuf = lustre_msg_buf(req->rq_reqmsg, 2, niocount * sizeof(*niobuf)); + + ioobj->ioo_id = lsm->lsm_object_id; + ioobj->ioo_gr = 0; + ioobj->ioo_type = S_IFREG; + ioobj->ioo_bufcnt = niocount; + + LASSERT (page_count > 0); + for (requested_nob = i = 0; i < page_count; i++, niobuf++) { + struct brw_page *pg = &pga[i]; + struct brw_page *pg_prev = pg - 1; + + LASSERT (pg->count > 0); + LASSERT ((pg->off & (PAGE_SIZE - 1)) + pg->count <= PAGE_SIZE); + LASSERT (i == 0 || pg->off > pg_prev->off); + + rc = ptlrpc_prep_bulk_page (desc, pg->pg, + pg->off & (PAGE_SIZE - 1), + pg->count); + if (rc != 0) + GOTO (out, rc); - obd_kmap_get(page_count, 0); + requested_nob += pg->count; - for (mapped = 0; mapped < page_count; mapped++, nioptr++) { - struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); - if (bulk == NULL) { - unmap_and_decref_bulk_desc(desc); - GOTO(out_req, rc = -ENOMEM); + if (i > 0 && + can_merge_pages (pg_prev, pg)) { + niobuf--; + niobuf->len += pg->count; + } else { + niobuf->offset = pg->off; + niobuf->len = pg->count; + niobuf->flags = pg->flag; } - - LASSERT(mapped == 0 || pga[mapped].off > pga[mapped - 1].off); - - bulk->bp_xid = xid; /* single xid for all pages */ - bulk->bp_buf = kmap(pga[mapped].pg); - bulk->bp_page = 
pga[mapped].pg; - bulk->bp_buflen = PAGE_SIZE; - ost_pack_niobuf(nioptr, pga[mapped].off, pga[mapped].count, - pga[mapped].flag, bulk->bp_xid); } - /* - * Register the bulk first, because the reply could arrive out of order, - * and we want to be ready for the bulk data. - * - * One reference is released when osc_ptl_ev_hdlr() is called by - * portals, the other when the caller removes us from the "set" list. - * - * On error, we never do the brw_finish, so we handle all decrefs. - */ - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_READ_BULK)) { - CERROR("obd_fail_loc=%x, skipping register_bulk\n", - OBD_FAIL_OSC_BRW_READ_BULK); + LASSERT ((void *)(niobuf - niocount) == + lustre_msg_buf(req->rq_reqmsg, 2, niocount * sizeof(*niobuf))); +#if CHECKSUM_BULK + body->oa.o_valid |= OBD_MD_FLCKSUM; + if (opc == OST_BRW_WRITE) + body->oa.o_rdev = cksum_pages (requested_nob, page_count, pga); +#endif + spin_lock_irqsave (&req->rq_lock, flags); + req->rq_no_resend = 1; + spin_unlock_irqrestore (&req->rq_lock, flags); + + /* size[0] still sizeof (*body) */ + if (opc == OST_WRITE) { + /* 1 RC per niobuf */ + size[1] = sizeof(__u32) * niocount; + req->rq_replen = lustre_msg_size(2, size); } else { - rc = ptlrpc_register_bulk_put(desc); - if (rc) { - unmap_and_decref_bulk_desc(desc); - GOTO(out_req, rc); - } - obd_brw_set_add(set, desc); + /* 1 RC for the whole I/O */ + req->rq_replen = lustre_msg_size(1, size); } - request->rq_flags |= PTL_RPC_FL_NO_RESEND; - request->rq_replen = lustre_msg_size(1, size); - rc = ptlrpc_queue_wait(request); + *niocountp = niocount; + *requested_nobp = requested_nob; + *reqp = req; + return (0); - /* XXX bug 937 here */ - if (rc == -ETIMEDOUT && (request->rq_flags & PTL_RPC_FL_RESEND)) { - DEBUG_REQ(D_HA, request, "BULK TIMEOUT"); - ptlrpc_req_finished(request); - goto restart_bulk; + out: + ptlrpc_req_finished (req); + return (rc); +} + +static int osc_brw_fini_request (struct ptlrpc_request *req, + int requested_nob, int niocount, + obd_count page_count, 
struct brw_page *pga, + int rc) +{ + if (rc < 0) + return (rc); + + if (req->rq_reqmsg->opc == OST_WRITE) { + if (rc > 0) { + CERROR ("Unexpected +ve rc %d\n", rc); + return (-EPROTO); + } + + return (check_write_rcs(req, niocount, page_count, pga)); } - if (rc) { - osc_ptl_ev_abort(desc); - GOTO(out_req, rc); + if (rc > requested_nob) { + CERROR ("Unexpected rc %d (%d requested)\n", + rc, requested_nob); + return (-EPROTO); } + if (rc < requested_nob) + handle_short_read (rc, page_count, pga); + #if CHECKSUM_BULK - body = lustre_msg_buf(request->rq_repmsg, 0); - if (body->oa.o_valid & NTOH__u32(OBD_MD_FLCKSUM)) { + imp = req->rq_import; + body = lustre_swab_repmsg (req, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) { + CERROR ("Can't unpack body\n"); + } else if (body->oa.o_valid & OBD_MD_FLCKSUM) { static int cksum_counter; - __u64 server_cksum = NTOH__u64(body->oa.o_rdev); - __u64 cksum = 0; - - for (mapped = 0; mapped < page_count; mapped++) { - char *ptr = kmap(pga[mapped].pg); - int off = pga[mapped].off & (PAGE_SIZE - 1); - int len = pga[mapped].count; - - LASSERT(off + len <= PAGE_SIZE); - ost_checksum(&cksum, ptr + off, len); - kunmap(pga[mapped].pg); - } + __u64 server_cksum = body->oa.o_rdev; + __u64 cksum = cksum_pages (rc, page_count, pga); cksum_counter++; if (server_cksum != cksum) { @@ -698,143 +900,208 @@ restart_bulk: imp->imp_connection->c_peer.peer_nid); } #endif - - EXIT; - out_req: - ptlrpc_req_finished(request); - return rc; + return (0); } -static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *lsm, - obd_count page_count, struct brw_page *pga, - struct obd_brw_set *set, struct obd_trans_info *oti) +static int osc_brw_internal(struct lustre_handle *conn, + struct lov_stripe_md *lsm, + obd_count page_count, struct brw_page *pga, int cmd) { - struct obd_import *imp = class_conn2cliimp(conn); - struct ptlrpc_connection *connection = imp->imp_connection; - struct ptlrpc_request *request = NULL; - struct 
ptlrpc_bulk_desc *desc = NULL; - struct ost_body *body; - int rc, size[3] = {sizeof(*body)}, mapped = 0; - struct obd_ioobj *iooptr; - struct niobuf_remote *nioptr; - __u32 xid; -#if CHECKSUM_BULK - __u64 cksum = 0; -#endif + int requested_nob; + int niocount; + struct ptlrpc_request *request; + int rc; ENTRY; restart_bulk: - size[1] = sizeof(struct obd_ioobj); - size[2] = page_count * sizeof(struct niobuf_remote); + rc = osc_brw_prep_request(class_conn2cliimp(conn), lsm, page_count, pga, + cmd, &requested_nob, &niocount, &request); + /* NB ^ sets rq_no_resend */ - request = ptlrpc_prep_req(imp, OST_WRITE, 3, size, NULL); - if (!request) - RETURN(-ENOMEM); + if (rc != 0) + return (rc); - body = lustre_msg_buf(request->rq_reqmsg, 0); + rc = ptlrpc_queue_wait(request); - desc = ptlrpc_prep_bulk(connection); - if (!desc) - GOTO(out_req, rc = -ENOMEM); - desc->bd_portal = OSC_BULK_PORTAL; - desc->bd_ptl_ev_hdlr = osc_ptl_ev_hdlr; - CDEBUG(D_PAGE, "desc = %p\n", desc); + if (rc == -ETIMEDOUT && request->rq_resend) { + DEBUG_REQ(D_HA, request, "BULK TIMEOUT"); + ptlrpc_req_finished(request); + goto restart_bulk; + } - iooptr = lustre_msg_buf(request->rq_reqmsg, 1); - nioptr = lustre_msg_buf(request->rq_reqmsg, 2); - ost_pack_ioo(iooptr, lsm, page_count); - /* end almost identical to brw_read case */ + rc = osc_brw_fini_request (request, requested_nob, niocount, + page_count, pga, rc); - xid = ptlrpc_next_xid(); /* single xid for all pages */ + ptlrpc_req_finished(request); + RETURN (rc); +} - obd_kmap_get(page_count, 0); +static int brw_interpret(struct ptlrpc_request *request, + struct osc_brw_async_args *aa, int rc) +{ + int requested_nob = aa->aa_requested_nob; + int niocount = aa->aa_nio_count; + obd_count page_count = aa->aa_page_count; + struct brw_page *pga = aa->aa_pga; + ENTRY; - for (mapped = 0; mapped < page_count; mapped++, nioptr++) { - struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); - if (bulk == NULL) { - unmap_and_decref_bulk_desc(desc); - 
GOTO(out_req, rc = -ENOMEM); - } + /* XXX bug 937 here */ + if (rc == -ETIMEDOUT && request->rq_resend) { + DEBUG_REQ(D_HA, request, "BULK TIMEOUT"); + LBUG(); /* re-send. later. */ + //goto restart_bulk; + } - LASSERT(mapped == 0 || pga[mapped].off > pga[mapped - 1].off); + rc = osc_brw_fini_request (request, requested_nob, niocount, + page_count, pga, rc); + RETURN (rc); +} - bulk->bp_xid = xid; /* single xid for all pages */ - bulk->bp_buf = kmap(pga[mapped].pg); - bulk->bp_page = pga[mapped].pg; - /* matching ptlrpc_bulk_get assert */ - LASSERT(pga[mapped].count > 0); - bulk->bp_buflen = pga[mapped].count; - ost_pack_niobuf(nioptr, pga[mapped].off, pga[mapped].count, - pga[mapped].flag, bulk->bp_xid); - ost_checksum(&cksum, bulk->bp_buf, bulk->bp_buflen); +static int async_internal(struct lustre_handle *conn, struct lov_stripe_md *lsm, + obd_count page_count, struct brw_page *pga, + struct ptlrpc_request_set *set, int cmd) +{ + struct ptlrpc_request *request; + int requested_nob; + int nio_count; + struct osc_brw_async_args *aa; + int rc; + ENTRY; + + rc = osc_brw_prep_request (class_conn2cliimp(conn), + lsm, page_count, pga, cmd, + &requested_nob, &nio_count, &request); + /* NB ^ sets rq_no_resend */ + + if (rc == 0) { + LASSERT (sizeof (*aa) <= sizeof (request->rq_async_args)); + aa = (struct osc_brw_async_args *)&request->rq_async_args; + aa->aa_requested_nob = requested_nob; + aa->aa_nio_count = nio_count; + aa->aa_page_count = page_count; + aa->aa_pga = pga; + + request->rq_interpret_reply = brw_interpret; + ptlrpc_set_add_req(set, request); } + RETURN (rc); +} -#if CHECKSUM_BULK - body->oa.o_rdev = HTON__u64(cksum); - body->oa.o_valid |= HTON__u32(OBD_MD_FLCKSUM); +#ifndef min_t +#define min_t(type,x,y) \ + ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) #endif - /* - * Register the bulk first, because the reply could arrive out of - * order, and we want to be ready for the bulk data. 
- * - * One reference is released when brw_finish is complete, the other - * when the caller removes us from the "set" list. - * - * On error, we never do the brw_finish, so we handle all decrefs. - */ - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_WRITE_BULK)) { - CERROR("obd_fail_loc=%x, skipping register_bulk\n", - OBD_FAIL_OSC_BRW_WRITE_BULK); - } else { - rc = ptlrpc_register_bulk_get(desc); - if (rc) { - unmap_and_decref_bulk_desc(desc); - GOTO(out_req, rc); + +/* + * ugh, we want disk allocation on the target to happen in offset order. we'll + * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do + * fine for our small page arrays and doesn't require allocation. its an + * insertion sort that swaps elements that are strides apart, shrinking the + * stride down until its '1' and the array is sorted. + */ +static void sort_brw_pages(struct brw_page *array, int num) +{ + int stride, i, j; + struct brw_page tmp; + + if (num == 1) + return; + for (stride = 1; stride < num ; stride = (stride * 3) + 1) + ; + + do { + stride /= 3; + for (i = stride ; i < num ; i++) { + tmp = array[i]; + j = i; + while (j >= stride && array[j - stride].off > tmp.off) { + array[j] = array[j - stride]; + j -= stride; + } + array[j] = tmp; + } + } while (stride > 1); +} + +/* make sure we the regions we're passing to elan don't violate its '4 + * fragments' constraint. portal headers are a fragment, all full + * PAGE_SIZE long pages count as 1 fragment, and each partial page + * counts as a fragment. I think. see bug 934. 
*/ +static obd_count check_elan_limit(struct brw_page *pg, obd_count pages) +{ + int frags_left = 3; + int saw_whole_frag = 0; + int i; + + for (i = 0 ; frags_left && i < pages ; pg++, i++) { + if (pg->count == PAGE_SIZE) { + if (!saw_whole_frag) { + saw_whole_frag = 1; + frags_left--; + } + } else { + frags_left--; } - obd_brw_set_add(set, desc); } + return i; +} - request->rq_flags |= PTL_RPC_FL_NO_RESEND; - request->rq_replen = lustre_msg_size(1, size); - rc = ptlrpc_queue_wait(request); +static int osc_brw(int cmd, struct lustre_handle *conn, + struct lov_stripe_md *md, obd_count page_count, + struct brw_page *pga, struct obd_trans_info *oti) +{ + ENTRY; - /* XXX bug 937 here */ - if (rc == -ETIMEDOUT && (request->rq_flags & PTL_RPC_FL_RESEND)) { - DEBUG_REQ(D_HA, request, "BULK TIMEOUT"); - ptlrpc_req_finished(request); - goto restart_bulk; - } + if (cmd == OBD_BRW_CHECK) { + /* The caller just wants to know if there's a chance that this + * I/O can succeed */ + struct obd_import *imp = class_conn2cliimp(conn); - if (rc) { - osc_ptl_ev_abort(desc); - GOTO(out_req, rc); + if (imp == NULL || imp->imp_invalid) + RETURN(-EIO); + RETURN(0); } - EXIT; - out_req: - ptlrpc_req_finished(request); - return rc; -} + while (page_count) { + obd_count pages_per_brw; + int rc; -#ifndef min_t -#define min_t(a,b,c) ( b<c ) ? 
b : c -#endif + if (page_count > OSC_BRW_MAX_IOV) + pages_per_brw = OSC_BRW_MAX_IOV; + else + pages_per_brw = page_count; -#warning "FIXME: make values dynamic based on get_info at setup (bug 665)" -#define OSC_BRW_MAX_SIZE 65536 -#define OSC_BRW_MAX_IOV min_t(int, PTL_MD_MAX_IOV, OSC_BRW_MAX_SIZE/PAGE_SIZE) + sort_brw_pages(pga, pages_per_brw); + pages_per_brw = check_elan_limit(pga, pages_per_brw); -#warning "FIXME: make these values dynamic based on a get_info call at setup" -#define OSC_BRW_MAX_SIZE 65536 -#define OSC_BRW_MAX_IOV min_t(int, PTL_MD_MAX_IOV, OSC_BRW_MAX_SIZE/PAGE_SIZE) + rc = osc_brw_internal(conn, md, pages_per_brw, pga, cmd); -static int osc_brw(int cmd, struct lustre_handle *conn, - struct lov_stripe_md *md, obd_count page_count, - struct brw_page *pga, struct obd_brw_set *set, - struct obd_trans_info *oti) + if (rc != 0) + RETURN(rc); + + page_count -= pages_per_brw; + pga += pages_per_brw; + } + RETURN(0); +} + +static int osc_brw_async(int cmd, struct lustre_handle *conn, + struct lov_stripe_md *md, obd_count page_count, + struct brw_page *pga, struct ptlrpc_request_set *set, + struct obd_trans_info *oti) { ENTRY; + if (cmd == OBD_BRW_CHECK) { + /* The caller just wants to know if there's a chance that this + * I/O can succeed */ + struct obd_import *imp = class_conn2cliimp(conn); + + if (imp == NULL || imp->imp_invalid) + RETURN(-EIO); + RETURN(0); + } + while (page_count) { obd_count pages_per_brw; int rc; @@ -844,11 +1111,10 @@ static int osc_brw(int cmd, struct lustre_handle *conn, else pages_per_brw = page_count; - if (cmd & OBD_BRW_WRITE) - rc = osc_brw_write(conn, md, pages_per_brw, pga, - set, oti); - else - rc = osc_brw_read(conn, md, pages_per_brw, pga, set); + sort_brw_pages(pga, pages_per_brw); + pages_per_brw = check_elan_limit(pga, pages_per_brw); + + rc = async_internal(conn, md, pages_per_brw, pga, set, cmd); if (rc != 0) RETURN(rc); @@ -865,16 +1131,18 @@ static int osc_brw(int cmd, struct lustre_handle *conn, static int 
sanosc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm, obd_count page_count, - struct brw_page *pga, - struct obd_brw_set *set) + struct brw_page *pga) { struct ptlrpc_request *request = NULL; struct ost_body *body; struct niobuf_remote *nioptr; struct obd_ioobj *iooptr; - int rc, j, size[3] = {sizeof(*body)}, mapped = 0; + int rc, size[3] = {sizeof(*body)}, mapped = 0; + int swab; ENTRY; + /* XXX does not handle 'new' brw protocol */ + size[1] = sizeof(struct obd_ioobj); size[2] = page_count * sizeof(*nioptr); @@ -883,20 +1151,23 @@ static int sanosc_brw_read(struct lustre_handle *conn, if (!request) RETURN(-ENOMEM); - body = lustre_msg_buf(request->rq_reqmsg, 0); - iooptr = lustre_msg_buf(request->rq_reqmsg, 1); - nioptr = lustre_msg_buf(request->rq_reqmsg, 2); - ost_pack_ioo(iooptr, lsm, page_count); + body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body)); + iooptr = lustre_msg_buf(request->rq_reqmsg, 1, sizeof (*iooptr)); + nioptr = lustre_msg_buf(request->rq_reqmsg, 2, + sizeof (*nioptr) * page_count); - obd_kmap_get(page_count, 0); + iooptr->ioo_id = lsm->lsm_object_id; + iooptr->ioo_gr = 0; + iooptr->ioo_type = S_IFREG; + iooptr->ioo_bufcnt = page_count; for (mapped = 0; mapped < page_count; mapped++, nioptr++) { LASSERT(PageLocked(pga[mapped].pg)); LASSERT(mapped == 0 || pga[mapped].off > pga[mapped - 1].off); - kmap(pga[mapped].pg); - ost_pack_niobuf(nioptr, pga[mapped].off, pga[mapped].count, - pga[mapped].flag, 0); + nioptr->offset = pga[mapped].off; + nioptr->len = pga[mapped].count; + nioptr->flags = pga[mapped].flag; } size[1] = page_count * sizeof(*nioptr); @@ -904,25 +1175,25 @@ static int sanosc_brw_read(struct lustre_handle *conn, rc = ptlrpc_queue_wait(request); if (rc) - GOTO(out_unmap, rc); - - nioptr = lustre_msg_buf(request->rq_repmsg, 1); - if (!nioptr) - GOTO(out_unmap, rc = -EINVAL); + GOTO(out_req, rc); - if (request->rq_repmsg->buflens[1] != size[1]) { - CERROR("buffer length wrong (%d vs. 
%d)\n", - request->rq_repmsg->buflens[1], size[1]); - GOTO(out_unmap, rc = -EINVAL); + swab = lustre_msg_swabbed (request->rq_repmsg); + LASSERT_REPSWAB (request, 1); + nioptr = lustre_msg_buf(request->rq_repmsg, 1, size[1]); + if (!nioptr) { + /* nioptr missing or short */ + GOTO(out_req, rc = -EPROTO); } /* actual read */ - for (j = 0; j < page_count; j++, nioptr++) { - struct page *page = pga[j].pg; + for (mapped = 0; mapped < page_count; mapped++, nioptr++) { + struct page *page = pga[mapped].pg; struct buffer_head *bh; kdev_t dev; - ost_unpack_niobuf(nioptr, nioptr); + if (swab) + lustre_swab_niobuf_remote (nioptr); + /* got san device associated */ LASSERT(class_conn2obd(conn)); dev = class_conn2obd(conn)->u.cli.cl_sandev; @@ -970,35 +1241,26 @@ static int sanosc_brw_read(struct lustre_handle *conn, if (!buffer_uptodate(bh)) { /* I/O error */ rc = -EIO; - goto out_unmap; + goto out_req; } } out_req: ptlrpc_req_finished(request); RETURN(rc); - -out_unmap: - /* Clean up on error. */ - while (mapped-- > 0) - kunmap(pga[mapped].pg); - - obd_kmap_put(page_count); - - goto out_req; } static int sanosc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *lsm, obd_count page_count, - struct brw_page *pga, - struct obd_brw_set *set) + struct brw_page *pga) { struct ptlrpc_request *request = NULL; struct ost_body *body; struct niobuf_remote *nioptr; struct obd_ioobj *iooptr; - int rc, j, size[3] = {sizeof(*body)}, mapped = 0; + int rc, size[3] = {sizeof(*body)}, mapped = 0; + int swab; ENTRY; size[1] = sizeof(struct obd_ioobj); @@ -1009,20 +1271,24 @@ static int sanosc_brw_write(struct lustre_handle *conn, if (!request) RETURN(-ENOMEM); - body = lustre_msg_buf(request->rq_reqmsg, 0); - iooptr = lustre_msg_buf(request->rq_reqmsg, 1); - nioptr = lustre_msg_buf(request->rq_reqmsg, 2); - ost_pack_ioo(iooptr, lsm, page_count); + body = lustre_msg_buf(request->rq_reqmsg, 0, sizeof (*body)); + iooptr = lustre_msg_buf(request->rq_reqmsg, 1, sizeof (*iooptr)); + nioptr = 
lustre_msg_buf(request->rq_reqmsg, 2, + sizeof (*nioptr) * page_count); + + iooptr->ioo_id = lsm->lsm_object_id; + iooptr->ioo_gr = 0; + iooptr->ioo_type = S_IFREG; + iooptr->ioo_bufcnt = page_count; - /* map pages, and pack request */ - obd_kmap_get(page_count, 0); + /* pack request */ for (mapped = 0; mapped < page_count; mapped++, nioptr++) { LASSERT(PageLocked(pga[mapped].pg)); LASSERT(mapped == 0 || pga[mapped].off > pga[mapped - 1].off); - kmap(pga[mapped].pg); - ost_pack_niobuf(nioptr, pga[mapped].off, pga[mapped].count, - pga[mapped].flag, 0); + nioptr->offset = pga[mapped].off; + nioptr->len = pga[mapped].count; + nioptr->flags = pga[mapped].flag; } size[1] = page_count * sizeof(*nioptr); @@ -1030,25 +1296,25 @@ static int sanosc_brw_write(struct lustre_handle *conn, rc = ptlrpc_queue_wait(request); if (rc) - GOTO(out_unmap, rc); - - nioptr = lustre_msg_buf(request->rq_repmsg, 1); - if (!nioptr) - GOTO(out_unmap, rc = -EINVAL); + GOTO(out_req, rc); - if (request->rq_repmsg->buflens[1] != size[1]) { - CERROR("buffer length wrong (%d vs. 
%d)\n", - request->rq_repmsg->buflens[1], size[1]); - GOTO(out_unmap, rc = -EINVAL); + swab = lustre_msg_swabbed (request->rq_repmsg); + LASSERT_REPSWAB (request, 1); + nioptr = lustre_msg_buf(request->rq_repmsg, 1, size[1]); + if (!nioptr) { + CERROR("absent/short niobuf array\n"); + GOTO(out_req, rc = -EPROTO); } /* actual write */ - for (j = 0; j < page_count; j++, nioptr++) { - struct page *page = pga[j].pg; + for (mapped = 0; mapped < page_count; mapped++, nioptr++) { + struct page *page = pga[mapped].pg; struct buffer_head *bh; kdev_t dev; - ost_unpack_niobuf(nioptr, nioptr); + if (swab) + lustre_swab_niobuf_remote (nioptr); + /* got san device associated */ LASSERT(class_conn2obd(conn)); dev = class_conn2obd(conn)->u.cli.cl_sandev; @@ -1089,28 +1355,18 @@ static int sanosc_brw_write(struct lustre_handle *conn, if (!buffer_uptodate(bh) || test_bit(BH_Dirty, &bh->b_state)) { /* I/O error */ rc = -EIO; - goto out_unmap; + goto out_req; } } out_req: ptlrpc_req_finished(request); RETURN(rc); - -out_unmap: - /* Clean up on error. 
*/ - while (mapped-- > 0) - kunmap(pga[mapped].pg); - - obd_kmap_put(page_count); - - goto out_req; } static int sanosc_brw(int cmd, struct lustre_handle *conn, struct lov_stripe_md *lsm, obd_count page_count, - struct brw_page *pga, struct obd_brw_set *set, - struct obd_trans_info *oti) + struct brw_page *pga, struct obd_trans_info *oti) { ENTRY; @@ -1124,10 +1380,9 @@ static int sanosc_brw(int cmd, struct lustre_handle *conn, pages_per_brw = page_count; if (cmd & OBD_BRW_WRITE) - rc = sanosc_brw_write(conn, lsm, pages_per_brw, - pga, set); + rc = sanosc_brw_write(conn, lsm, pages_per_brw, pga); else - rc = sanosc_brw_read(conn, lsm, pages_per_brw, pga,set); + rc = sanosc_brw_read(conn, lsm, pages_per_brw, pga); if (rc != 0) RETURN(rc); @@ -1152,20 +1407,17 @@ static int osc_enqueue(struct lustre_handle *connh, struct lov_stripe_md *lsm, int rc; ENTRY; - /* Filesystem locks are given a bit of special treatment: if - * this is not a file size lock (which has end == -1), we - * fixup the lock to start and end on page boundaries. */ - if (extent->end != OBD_OBJECT_EOF) { - extent->start &= PAGE_MASK; - extent->end = (extent->end & PAGE_MASK) + PAGE_SIZE - 1; - } + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother. */ + extent->start -= extent->start & ~PAGE_MASK; + extent->end |= ~PAGE_MASK; /* Next, search for already existing extent locks that will cover us */ rc = ldlm_lock_match(obddev->obd_namespace, 0, &res_id, type, extent, sizeof(extent), mode, lockh); if (rc == 1) /* We already have a lock, and it's referenced */ - RETURN(ELDLM_LOCK_MATCHED); + RETURN(ELDLM_OK); /* If we're trying to read, we also search for an existing PW lock. 
The * VFS and page cache already protect us locally, so lots of readers/ @@ -1189,14 +1441,52 @@ static int osc_enqueue(struct lustre_handle *connh, struct lov_stripe_md *lsm, ldlm_lock_addref(lockh, LCK_PR); ldlm_lock_decref(lockh, LCK_PW); - RETURN(ELDLM_LOCK_MATCHED); + RETURN(ELDLM_OK); } } rc = ldlm_cli_enqueue(connh, NULL, obddev->obd_namespace, parent_lock, res_id, type, extent, sizeof(extent), mode, flags, - ldlm_completion_ast, callback, data, NULL, - lockh); + ldlm_completion_ast, callback, data, lockh); + RETURN(rc); +} + +static int osc_match(struct lustre_handle *connh, struct lov_stripe_md *lsm, + __u32 type, void *extentp, int extent_len, __u32 mode, + int *flags, struct lustre_handle *lockh) +{ + struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} }; + struct obd_device *obddev = class_conn2obd(connh); + struct ldlm_extent *extent = extentp; + int rc; + ENTRY; + + /* Filesystem lock extents are extended to page boundaries so that + * dealing with the page cache is a little smoother */ + extent->start -= extent->start & ~PAGE_MASK; + extent->end |= ~PAGE_MASK; + + /* Next, search for already existing extent locks that will cover us */ + rc = ldlm_lock_match(obddev->obd_namespace, *flags, &res_id, type, + extent, sizeof(extent), mode, lockh); + if (rc) + RETURN(rc); + + /* If we're trying to read, we also search for an existing PW lock. The + * VFS and page cache already protect us locally, so lots of readers/ + * writers can share a single PW lock. */ + if (mode == LCK_PR) { + rc = ldlm_lock_match(obddev->obd_namespace, *flags, &res_id, + type, extent, sizeof(extent), LCK_PW, + lockh); + if (rc == 1) { + /* FIXME: This is not incredibly elegant, but it might + * be more elegant than adding another parameter to + * lock_match. I want a second opinion. 
*/ + ldlm_lock_addref(lockh, LCK_PR); + ldlm_lock_decref(lockh, LCK_PW); + } + } RETURN(rc); } @@ -1211,16 +1501,18 @@ static int osc_cancel(struct lustre_handle *oconn, struct lov_stripe_md *md, } static int osc_cancel_unused(struct lustre_handle *connh, - struct lov_stripe_md *lsm, int flags) + struct lov_stripe_md *lsm, int flags, void *opaque) { struct obd_device *obddev = class_conn2obd(connh); struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} }; - return ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags); + return ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags, + opaque); } static int osc_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) { + struct obd_statfs *msfs; struct ptlrpc_request *request; int rc, size = sizeof(*osfs); ENTRY; @@ -1238,7 +1530,14 @@ static int osc_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) GOTO(out, rc); } - obd_statfs_unpack(osfs, lustre_msg_buf(request->rq_repmsg, 0)); + msfs = lustre_swab_repbuf (request, 0, sizeof (*msfs), + lustre_swab_obd_statfs); + if (msfs == NULL) { + CERROR ("Can't unpack obd_statfs\n"); + GOTO (out, rc = -EPROTO); + } + + memcpy (osfs, msfs, sizeof (*msfs)); EXIT; out: @@ -1299,55 +1598,6 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, ENTRY; switch (cmd) { -#if 0 - case IOC_LDLM_TEST: { - err = ldlm_test(obddev, conn); - CERROR("-- done err %d\n", err); - GOTO(out, err); - } - case IOC_LDLM_REGRESS_START: { - unsigned int numthreads = 1; - unsigned int numheld = 10; - unsigned int numres = 10; - unsigned int numext = 10; - char *parse; - - if (data->ioc_inllen1) { - parse = data->ioc_inlbuf1; - if (*parse != '\0') { - while(isspace(*parse)) parse++; - numthreads = simple_strtoul(parse, &parse, 0); - while(isspace(*parse)) parse++; - } - if (*parse != '\0') { - while(isspace(*parse)) parse++; - numheld = simple_strtoul(parse, &parse, 0); - while(isspace(*parse)) parse++; - } - if (*parse != '\0') { - 
while(isspace(*parse)) parse++; - numres = simple_strtoul(parse, &parse, 0); - while(isspace(*parse)) parse++; - } - if (*parse != '\0') { - while(isspace(*parse)) parse++; - numext = simple_strtoul(parse, &parse, 0); - while(isspace(*parse)) parse++; - } - } - - err = ldlm_regression_start(obddev, conn, numthreads, - numheld, numres, numext); - - CERROR("-- done err %d\n", err); - GOTO(out, err); - } - case IOC_LDLM_REGRESS_STOP: { - err = ldlm_regression_stop(); - CERROR("-- done err %d\n", err); - GOTO(out, err); - } -#endif case IOC_OSC_REGISTER_LOV: { if (obddev->u.cli.cl_containing_lov) GOTO(out, err = -EALREADY); @@ -1390,7 +1640,7 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, err = copy_to_user((void *)uarg, buf, len); if (err) err = -EFAULT; - OBD_FREE(buf, len); + obd_ioctl_freedata(buf, len); GOTO(out, err); } case LL_IOC_LOV_SETSTRIPE: @@ -1401,6 +1651,14 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, case LL_IOC_LOV_GETSTRIPE: err = osc_getstripe(conn, karg, uarg); GOTO(out, err); + case OBD_IOC_CLIENT_RECOVER: + err = ptlrpc_recover_import(obddev->u.cli.cl_import, + data->ioc_inlbuf1); + GOTO(out, err); + case IOC_OSC_SET_ACTIVE: + err = ptlrpc_set_import_active(obddev->u.cli.cl_import, + data->ioc_offset); + GOTO(out, err); default: CERROR ("osc_ioctl(): unrecognised ioctl %#x\n", cmd); GOTO(out, err = -ENOTTY); @@ -1409,166 +1667,21 @@ out: return err; } -static void set_osc_active(struct obd_import *imp, int active) +static int osc_get_info(struct lustre_handle *conn, obd_count keylen, + void *key, __u32 *vallen, void *val) { - struct obd_device *notify_obd; - - LASSERT(imp->imp_obd); - - notify_obd = imp->imp_obd->u.cli.cl_containing_lov; - - if (notify_obd == NULL) - return; - - /* How gross is _this_? 
*/ - if (!list_empty(&notify_obd->obd_exports)) { - int rc; - struct lustre_handle fakeconn; - struct obd_ioctl_data ioc_data = { 0 }; - struct obd_export *exp = - list_entry(notify_obd->obd_exports.next, - struct obd_export, exp_obd_chain); - - fakeconn.addr = (__u64)(unsigned long)exp; - fakeconn.cookie = exp->exp_cookie; - ioc_data.ioc_inlbuf1 = - (char *)&imp->imp_obd->u.cli.cl_target_uuid; - ioc_data.ioc_offset = active; - rc = obd_iocontrol(IOC_LOV_SET_OSC_ACTIVE, &fakeconn, - sizeof ioc_data, &ioc_data, NULL); - if (rc) - CERROR("error disabling %s on LOV %p/%s: %d\n", - imp->imp_obd->u.cli.cl_target_uuid.uuid, - notify_obd, notify_obd->obd_uuid.uuid, rc); - } else { - CDEBUG(D_HA, "No exports for obd %p/%s, can't notify about " - "%p\n", notify_obd, notify_obd->obd_uuid.uuid, - imp->imp_obd->obd_uuid.uuid); - } -} - -static int osc_recover(struct obd_import *imp, int phase) -{ - int rc; - unsigned long flags; - int msg_flags; - struct ptlrpc_request *req; - struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; ENTRY; + if (!vallen || !val) + RETURN(-EFAULT); - CDEBUG(D_HA, "%s: entering phase: %d\n", - imp->imp_obd->obd_name, phase); - switch(phase) { - - case PTLRPC_RECOVD_PHASE_PREPARE: { - if (imp->imp_flags & IMP_REPLAYABLE) { - CDEBUG(D_HA, "failover OST\n"); - /* If we're a failover OSC/OST, just cancel unused - * locks to simplify lock replay. - */ - ldlm_cli_cancel_unused(ns, NULL, LDLM_FL_LOCAL_ONLY); - } else { - CDEBUG(D_HA, "non-failover OST\n"); - /* Non-failover OSTs (LLNL scenario) disable the OSC - * and invalidate local state. - */ - ldlm_namespace_cleanup(ns, 1 /* no network ops */); - ptlrpc_abort_inflight(imp, 0); - set_osc_active(imp, 0 /* inactive */); - } - RETURN(0); - } - - case PTLRPC_RECOVD_PHASE_RECOVER: { - reconnect: - imp->imp_flags &= ~IMP_INVALID; - rc = ptlrpc_reconnect_import(imp, OST_CONNECT, &req); - - msg_flags = req->rq_repmsg - ? 
lustre_msg_get_op_flags(req->rq_repmsg) - : 0; - - if (rc == -EBUSY && (msg_flags & MSG_CONNECT_RECOVERING)) - CERROR("reconnect denied by recovery; should retry\n"); - - if (rc) { - if (phase != PTLRPC_RECOVD_PHASE_NOTCONN) { - CERROR("can't reconnect, invalidating\n"); - ldlm_namespace_cleanup(ns, 1); - ptlrpc_abort_inflight(imp, 0); - } - imp->imp_flags |= IMP_INVALID; - ptlrpc_req_finished(req); - RETURN(rc); - } - - if (msg_flags & MSG_CONNECT_RECOVERING) { - /* Replay if they want it. */ - DEBUG_REQ(D_HA, req, "OST wants replay"); - rc = ptlrpc_replay(imp); - if (rc) - GOTO(check_rc, rc); - - rc = ldlm_replay_locks(imp); - if (rc) - GOTO(check_rc, rc); - - rc = signal_completed_replay(imp); - if (rc) - GOTO(check_rc, rc); - } else if (msg_flags & MSG_CONNECT_RECONNECT) { - DEBUG_REQ(D_HA, req, "reconnecting to MDS\n"); - /* Nothing else to do here. */ - } else { - DEBUG_REQ(D_HA, req, "evicted: invalidating\n"); - /* Otherwise, clean everything up. */ - ldlm_namespace_cleanup(ns, 1); - ptlrpc_abort_inflight(imp, 0); - } - - ptlrpc_req_finished(req); - - spin_lock_irqsave(&imp->imp_lock, flags); - imp->imp_level = LUSTRE_CONN_FULL; - imp->imp_flags &= ~IMP_INVALID; - spin_unlock_irqrestore(&imp->imp_lock, flags); - - /* Is this the right place? Should we do this in _PREPARE - * as well? What about raising the level right away? - */ - ptlrpc_wake_delayed(imp); - - rc = ptlrpc_resend(imp); - if (rc) - GOTO(check_rc, rc); - - set_osc_active(imp, 1 /* active */); + if (keylen > strlen("lock_to_stripe") && + strcmp(key, "lock_to_stripe") == 0) { + __u32 *stripe = val; + *vallen = sizeof(*stripe); + *stripe = 0; RETURN(0); - - check_rc: - /* If we get disconnected in the middle, recovery has probably - * failed. Reconnect and find out. 
- */ - if (rc == -ENOTCONN) - goto reconnect; - RETURN(rc); - } - case PTLRPC_RECOVD_PHASE_NOTCONN: - osc_recover(imp, PTLRPC_RECOVD_PHASE_PREPARE); - RETURN(osc_recover(imp, PTLRPC_RECOVD_PHASE_RECOVER)); - - default: - RETURN(-EINVAL); } -} - -static int osc_connect(struct lustre_handle *conn, struct obd_device *obd, - struct obd_uuid *cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) -{ - struct obd_import *imp = &obd->u.cli.cl_import; - imp->imp_recover = osc_recover; - return client_obd_connect(conn, obd, cluuid, recovd, recover); + RETURN(-EINVAL); } struct obd_ops osc_obd_ops = { @@ -1577,23 +1690,27 @@ struct obd_ops osc_obd_ops = { o_detach: osc_detach, o_setup: client_obd_setup, o_cleanup: client_obd_cleanup, - o_connect: osc_connect, - o_disconnect: client_obd_disconnect, + o_connect: client_import_connect, + o_disconnect: client_import_disconnect, o_statfs: osc_statfs, o_packmd: osc_packmd, o_unpackmd: osc_unpackmd, o_create: osc_create, o_destroy: osc_destroy, o_getattr: osc_getattr, + o_getattr_async: osc_getattr_async, o_setattr: osc_setattr, o_open: osc_open, o_close: osc_close, o_brw: osc_brw, + o_brw_async: osc_brw_async, o_punch: osc_punch, o_enqueue: osc_enqueue, + o_match: osc_match, o_cancel: osc_cancel, o_cancel_unused: osc_cancel_unused, - o_iocontrol: osc_iocontrol + o_iocontrol: osc_iocontrol, + o_get_info: osc_get_info }; struct obd_ops sanosc_obd_ops = { @@ -1601,14 +1718,15 @@ struct obd_ops sanosc_obd_ops = { o_attach: osc_attach, o_detach: osc_detach, o_cleanup: client_obd_cleanup, - o_connect: osc_connect, - o_disconnect: client_obd_disconnect, + o_connect: client_import_connect, + o_disconnect: client_import_disconnect, o_statfs: osc_statfs, o_packmd: osc_packmd, o_unpackmd: osc_unpackmd, o_create: osc_create, o_destroy: osc_destroy, o_getattr: osc_getattr, + o_getattr_async: osc_getattr_async, o_setattr: osc_setattr, o_open: osc_open, o_close: osc_close, @@ -1618,6 +1736,7 @@ struct obd_ops sanosc_obd_ops = { 
#endif o_punch: osc_punch, o_enqueue: osc_enqueue, + o_match: osc_match, o_cancel: osc_cancel, o_cancel_unused: osc_cancel_unused, o_iocontrol: osc_iocontrol, @@ -1629,7 +1748,8 @@ int __init osc_init(void) int rc; ENTRY; - LASSERT(sizeof(struct osc_obdo_data) <= FD_OSTDATA_SIZE); + LASSERT(sizeof(struct obd_client_handle) <= FD_OSTDATA_SIZE); + LASSERT(sizeof(struct obd_client_handle) <= OBD_INLINESZ); lprocfs_init_vars(&lvars); diff --git a/lustre/ost/Makefile.am b/lustre/ost/Makefile.am index c158a0f..b2e51c3 100644 --- a/lustre/ost/Makefile.am +++ b/lustre/ost/Makefile.am @@ -4,19 +4,10 @@ # See the file COPYING in this distribution DEFS= + MODULE = ost modulefs_DATA = ost.o EXTRA_PROGRAMS = ost - -LINX=obd_pack.c target.c - -obd_pack.c: - test -e obd_pack.c || ln -sf $(top_srcdir)/lib/obd_pack.c -target.c: - test -e target.c || ln -sf $(top_srcdir)/lib/target.c - -ost_SOURCES = ost_handler.c lproc_ost.c $(LINX) -dist-hook: - list='$(LINX)'; for f in $$list; do rm -f $(distdir)/$$f; done +ost_SOURCES = ost_handler.c lproc_ost.c include $(top_srcdir)/Rules diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 848336c..f14d82f 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -41,21 +41,40 @@ #include <linux/init.h> #include <linux/lprocfs_status.h> -inline void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req) +inline void oti_to_request(struct obd_trans_info *oti, + struct ptlrpc_request *req) { - if (oti && req->rq_repmsg) - req->rq_repmsg->transno = HTON__u64(oti->oti_transno); + int i; + struct oti_req_ack_lock *ack_lock; + + if(oti == NULL) + return; + + if (req->rq_repmsg) + req->rq_repmsg->transno = oti->oti_transno; + + /* XXX 4 == entries in oti_ack_locks??? 
*/ + for (ack_lock = oti->oti_ack_locks, i = 0; i < 4; i++, ack_lock++) { + if (!ack_lock->mode) + break; + memcpy(&req->rq_ack_locks[i].lock, &ack_lock->lock, + sizeof(req->rq_ack_locks[i].lock)); + req->rq_ack_locks[i].mode = ack_lock->mode; + } EXIT; } static int ost_destroy(struct ptlrpc_request *req, struct obd_trans_info *oti) { - struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; + struct lustre_handle *conn = &req->rq_reqmsg->handle; struct ost_body *body; int rc, size = sizeof(*body); ENTRY; - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_swab_reqbuf (req, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) + RETURN (-EFAULT); rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg); if (rc) @@ -72,14 +91,16 @@ static int ost_getattr(struct ptlrpc_request *req) int rc, size = sizeof(*body); ENTRY; - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_swab_reqbuf (req, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) + RETURN (-EFAULT); rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg); if (rc) RETURN(rc); - repbody = lustre_msg_buf(req->rq_repmsg, 0); - /* FIXME: unpack only valid fields instead of memcpy, endianness */ + repbody = lustre_msg_buf (req->rq_repmsg, 0, sizeof (*repbody)); memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); req->rq_status = obd_getattr(conn, &repbody->oa, NULL); RETURN(0); @@ -96,23 +117,18 @@ static int ost_statfs(struct ptlrpc_request *req) if (rc) RETURN(rc); - osfs = lustre_msg_buf(req->rq_repmsg, 0); + osfs = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*osfs)); memset(osfs, 0, size); - rc = obd_statfs(conn, osfs); - if (rc) { - CERROR("ost: statfs failed: rc %d\n", rc); - req->rq_status = rc; - RETURN(rc); - } - obd_statfs_pack(osfs, osfs); + req->rq_status = obd_statfs(conn, osfs); + if (req->rq_status != 0) + CERROR("ost: statfs failed: rc %d\n", req->rq_status); RETURN(0); } static int ost_syncfs(struct ptlrpc_request *req) { 
- struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct obd_statfs *osfs; int rc, size = sizeof(*osfs); ENTRY; @@ -121,7 +137,7 @@ static int ost_syncfs(struct ptlrpc_request *req) if (rc) RETURN(rc); - rc = obd_syncfs(conn); + rc = obd_syncfs(req->rq_export); if (rc) { CERROR("ost: syncfs failed: rc %d\n", rc); req->rq_status = rc; @@ -135,19 +151,21 @@ static int ost_open(struct ptlrpc_request *req, struct obd_trans_info *oti) { struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ost_body *body, *repbody; - int rc, size = sizeof(*body); + int rc, size = sizeof(*repbody); ENTRY; - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_swab_reqbuf (req, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) + return (-EFAULT); rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg); if (rc) RETURN(rc); - repbody = lustre_msg_buf(req->rq_repmsg, 0); - /* FIXME: unpack only valid fields instead of memcpy, endianness */ + repbody = lustre_msg_buf (req->rq_repmsg, 0, sizeof (*repbody)); memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); - req->rq_status = obd_open(conn, &repbody->oa, NULL, oti); + req->rq_status = obd_open(conn, &repbody->oa, NULL, oti, NULL); RETURN(0); } @@ -155,17 +173,19 @@ static int ost_close(struct ptlrpc_request *req, struct obd_trans_info *oti) { struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ost_body *body, *repbody; - int rc, size = sizeof(*body); + int rc, size = sizeof(*repbody); ENTRY; - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_swab_reqbuf (req, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) + RETURN (-EFAULT); rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg); if (rc) RETURN(rc); - repbody = lustre_msg_buf(req->rq_repmsg, 0); - /* FIXME: unpack only valid fields instead of memcpy, endianness */ + repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*repbody)); memcpy(&repbody->oa, 
&body->oa, sizeof(body->oa)); req->rq_status = obd_close(conn, &repbody->oa, NULL, oti); RETURN(0); @@ -175,17 +195,19 @@ static int ost_create(struct ptlrpc_request *req, struct obd_trans_info *oti) { struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ost_body *body, *repbody; - int rc, size = sizeof(*body); + int rc, size = sizeof(*repbody); ENTRY; - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_swab_reqbuf (req, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) + RETURN (-EFAULT); rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg); if (rc) RETURN(rc); - repbody = lustre_msg_buf(req->rq_repmsg, 0); - /* FIXME: unpack only valid fields instead of memcpy, endianness */ + repbody = lustre_msg_buf (req->rq_repmsg, 0, sizeof (*repbody)); memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); req->rq_status = obd_create(conn, &repbody->oa, NULL, oti); RETURN(0); @@ -195,12 +217,15 @@ static int ost_punch(struct ptlrpc_request *req, struct obd_trans_info *oti) { struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ost_body *body, *repbody; - int rc, size = sizeof(*body); + int rc, size = sizeof(*repbody); ENTRY; - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_swab_reqbuf (req, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) + RETURN (-EFAULT); - if ((NTOH__u32(body->oa.o_valid) & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS))!= + if ((body->oa.o_valid & (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) != (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS)) RETURN(-EINVAL); @@ -208,30 +233,32 @@ static int ost_punch(struct ptlrpc_request *req, struct obd_trans_info *oti) if (rc) RETURN(rc); - repbody = lustre_msg_buf(req->rq_repmsg, 0); - /* FIXME: unpack only valid fields instead of memcpy, endianness */ + repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*repbody)); memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); - req->rq_status = obd_punch(conn, &repbody->oa, NULL, - repbody->oa.o_size, 
repbody->oa.o_blocks, oti); + req->rq_status = obd_punch(conn, &repbody->oa, NULL, repbody->oa.o_size, + repbody->oa.o_blocks, oti); RETURN(0); } static int ost_setattr(struct ptlrpc_request *req, struct obd_trans_info *oti) { - struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; + struct lustre_handle *conn = &req->rq_reqmsg->handle; struct ost_body *body, *repbody; - int rc, size = sizeof(*body); + int rc, size = sizeof(*repbody); ENTRY; - body = lustre_msg_buf(req->rq_reqmsg, 0); + body = lustre_swab_reqbuf (req, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) + RETURN (-EFAULT); rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg); if (rc) RETURN(rc); - repbody = lustre_msg_buf(req->rq_repmsg, 0); - /* FIXME: unpack only valid fields instead of memcpy, endianness */ + repbody = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*repbody)); memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); + req->rq_status = obd_setattr(conn, &repbody->oa, NULL, oti); RETURN(0); } @@ -245,128 +272,274 @@ static int ost_bulk_timeout(void *data) RETURN(1); } -static int ost_brw_read(struct ptlrpc_request *req) +static int get_per_page_niobufs (struct obd_ioobj *ioo, int nioo, + struct niobuf_remote *rnb, int nrnb, + struct niobuf_remote **pp_rnbp) { - struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; - struct ptlrpc_bulk_desc *desc; - struct niobuf_remote *remote_nb; - struct niobuf_local *local_nb = NULL; - struct obd_ioobj *ioo; - struct ost_body *body; - struct l_wait_info lwi; - void *desc_priv = NULL; - void *end2; - int cmd, i, j, objcount, niocount, size = sizeof(*body); - int rc = 0; + /* Copy a remote niobuf, splitting it into page-sized chunks + * and setting ioo[i].ioo_bufcnt accordingly */ + struct niobuf_remote *pp_rnb; + int i; + int j; + int page; + int rnbidx = 0; + int npages = 0; + + /* first count and check the number of pages required */ + for (i = 0; i < nioo; i++) + for (j = 0; j < ioo->ioo_bufcnt; 
j++, rnbidx++) { + obd_off offset = rnb[rnbidx].offset; + obd_off p0 = offset >> PAGE_SHIFT; + obd_off pn = (offset + rnb[rnbidx].len - 1)>>PAGE_SHIFT; + + LASSERT (rnbidx < nrnb); + + npages += (pn + 1 - p0); + + if (rnb[rnbidx].len == 0) { + CERROR("zero len BRW: obj %d objid "LPX64 + " buf %u\n", i, ioo[i].ioo_id, j); + return (-EINVAL); + } + if (j > 0 && + rnb[rnbidx].offset <= rnb[rnbidx-1].offset) { + CERROR("unordered BRW: obj %d objid "LPX64 + " buf %u offset "LPX64" <= "LPX64"\n", + i, ioo[i].ioo_id, j, rnb[rnbidx].offset, + rnb[rnbidx].offset); + return (-EINVAL); + } + } + + LASSERT (rnbidx == nrnb); + + if (npages == nrnb) { /* all niobufs are for single pages */ + *pp_rnbp = rnb; + return (npages); + } + + OBD_ALLOC (pp_rnb, sizeof (*pp_rnb) * npages); + if (pp_rnb == NULL) + return (-ENOMEM); + + /* now do the actual split */ + page = rnbidx = 0; + for (i = 0; i < nioo; i++) { + int obj_pages = 0; + + for (j = 0; j < ioo[i].ioo_bufcnt; j++, rnbidx++) { + obd_off off = rnb[rnbidx].offset; + int nob = rnb[rnbidx].len; + + LASSERT (rnbidx < nrnb); + do { + obd_off poff = off & (PAGE_SIZE - 1); + int pnob = (poff + nob > PAGE_SIZE) ? 
+ PAGE_SIZE - poff : nob; + + LASSERT (page < npages); + pp_rnb[page].len = pnob; + pp_rnb[page].offset = off; + pp_rnb[page].flags = rnb->flags; + + CDEBUG (D_PAGE, " obj %d id "LPX64 + "page %d(%d) "LPX64" for %d\n", + i, ioo[i].ioo_id, obj_pages, page, + pp_rnb[page].offset, pp_rnb[page].len); + page++; + obj_pages++; + + off += pnob; + nob -= pnob; + } while (nob > 0); + LASSERT (nob == 0); + } + ioo[i].ioo_bufcnt = obj_pages; + } + LASSERT (page == npages); + + *pp_rnbp = pp_rnb; + return (npages); +} + +static void free_per_page_niobufs (int npages, struct niobuf_remote *pp_rnb, + struct niobuf_remote *rnb) +{ + if (pp_rnb == rnb) /* didn't allocate above */ + return; + + OBD_FREE (pp_rnb, sizeof (*pp_rnb) * npages); +} + #if CHECKSUM_BULK - __u64 cksum = 0; +__u64 ost_checksum_bulk (struct ptlrpc_bulk_desc *desc) +{ + __u64 cksum = 0; + struct list_head *tmp; + char *ptr; + + list_for_each (tmp, &desc->bd_page_list) { + struct ptlrpc_bulk_page *bp; + + bp = list_entry (tmp, struct ptlrpc_bulk_page, bp_link); + ptr = kmap (bp->bp_page); + ost_checksum (&cksum, ptr + bp->bp_pageoffset, bp->bp_buflen); + kunmap (bp->bp_page); + } +} #endif - ENTRY; - body = lustre_msg_buf(req->rq_reqmsg, 0); - ioo = lustre_msg_buf(req->rq_reqmsg, 1); - remote_nb = lustre_msg_buf(req->rq_reqmsg, 2); - end2 = (char *)remote_nb + req->rq_reqmsg->buflens[2]; - objcount = req->rq_reqmsg->buflens[1] / sizeof(*ioo); - niocount = req->rq_reqmsg->buflens[2] / sizeof(*remote_nb); - cmd = OBD_BRW_READ; +static int ost_brw_read(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc; + struct niobuf_remote *remote_nb; + struct niobuf_remote *pp_rnb; + struct niobuf_local *local_nb; + struct obd_ioobj *ioo; + struct ost_body *body; + struct l_wait_info lwi; + void *desc_priv = NULL; + int size[1] = { sizeof(*body) }; + int comms_error = 0; + int niocount; + int npages; + int nob = 0; + int rc; + int i; + ENTRY; if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK)) - GOTO(out, 
req->rq_status = -EIO); + GOTO(out, rc = -EIO); - /* Hmm, we don't return anything in this reply buffer? - * We should be returning per-page status codes and also - * per-object size, blocks count, mtime, ctime. (bug 593) */ - rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg); - if (rc) - GOTO(out, req->rq_status = rc); - - for (i = 0; i < objcount; i++, ioo++) { - ost_unpack_ioo(ioo, ioo); - if ((void *)(remote_nb + ioo->ioo_bufcnt) > end2) { - CERROR("BRW: objid "LPX64" count %u larger than %u\n", - ioo->ioo_id, ioo->ioo_bufcnt, - (int)(end2 - (void *)remote_nb)); - LBUG(); - GOTO(out, rc = -EINVAL); - } - for (j = 0; j < ioo->ioo_bufcnt; j++, remote_nb++) { - ost_unpack_niobuf(remote_nb, remote_nb); - if (remote_nb->len == 0) { - CERROR("zero len BRW: objid "LPX64" buf %u\n", - ioo->ioo_id, j); - GOTO(out, rc = -EINVAL); - } - if (j && remote_nb->offset <= (remote_nb - 1)->offset) { - CERROR("unordered BRW: objid "LPX64 - " buf %u offset "LPX64" <= "LPX64"\n", - ioo->ioo_id, j, remote_nb->offset, - (remote_nb - 1)->offset); - GOTO(out, rc = -EINVAL); - } - } + body = lustre_swab_reqbuf (req, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) { + CERROR ("Missing/short ost_body\n"); + GOTO (out, rc = -EFAULT); } - OBD_ALLOC(local_nb, sizeof(*local_nb) * niocount); - if (local_nb == NULL) - GOTO(out, rc = -ENOMEM); + ioo = lustre_swab_reqbuf (req, 1, sizeof (*ioo), + lustre_swab_obd_ioobj); + if (ioo == NULL) { + CERROR ("Missing/short ioobj\n"); + GOTO (out, rc = -EFAULT); + } - /* The unpackers move ioo and remote_nb, so reset them before using */ - ioo = lustre_msg_buf(req->rq_reqmsg, 1); - remote_nb = lustre_msg_buf(req->rq_reqmsg, 2); - req->rq_status = obd_preprw(cmd, conn, objcount, ioo, niocount, - remote_nb, local_nb, &desc_priv, NULL); + niocount = ioo->ioo_bufcnt; + remote_nb = lustre_swab_reqbuf(req, 2, niocount * sizeof (*remote_nb), + lustre_swab_niobuf_remote); + if (remote_nb == NULL) { + CERROR ("Missing/short 
niobuf\n"); + GOTO (out, rc = -EFAULT); + } + if (lustre_msg_swabbed (req->rq_reqmsg)) { /* swab remaining niobufs */ + for (i = 1; i < niocount; i++) + lustre_swab_niobuf_remote (&remote_nb[i]); + } - if (req->rq_status) - GOTO(out, req->rq_status); + rc = lustre_pack_msg(1, size, NULL, &req->rq_replen, &req->rq_repmsg); + if (rc) + GOTO(out, rc); + + /* CAVEAT EMPTOR this sets ioo->ioo_bufcnt to # pages */ + npages = get_per_page_niobufs (ioo, 1, remote_nb, niocount, &pp_rnb); + if (npages < 0) + GOTO(out, rc = npages); + + OBD_ALLOC(local_nb, sizeof(*local_nb) * npages); + if (local_nb == NULL) + GOTO(out_pp_rnb, rc = -ENOMEM); - desc = ptlrpc_prep_bulk(req->rq_connection); + desc = ptlrpc_prep_bulk_exp (req, BULK_PUT_SOURCE, OST_BULK_PORTAL); if (desc == NULL) GOTO(out_local, rc = -ENOMEM); - desc->bd_ptl_ev_hdlr = NULL; - desc->bd_portal = OST_BULK_PORTAL; - for (i = 0; i < niocount; i++) { - struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); + rc = obd_preprw(OBD_BRW_READ, req->rq_export, 1, ioo, npages, + pp_rnb, local_nb, &desc_priv, NULL); + if (rc != 0) + GOTO(out_bulk, rc); - if (bulk == NULL) - GOTO(out_bulk, rc = -ENOMEM); - bulk->bp_xid = remote_nb[i].xid; - bulk->bp_buf = local_nb[i].addr; - bulk->bp_buflen = remote_nb[i].len; - if (body->oa.o_valid & NTOH__u32(OBD_MD_FLCKSUM)) - ost_checksum(&cksum, bulk->bp_buf, bulk->bp_buflen); - } + nob = 0; + for (i = 0; i < npages; i++) { + int page_rc = local_nb[i].rc; - rc = ptlrpc_bulk_put(desc); - if (rc) - GOTO(out_bulk, rc); + if (page_rc < 0) { /* error */ + rc = page_rc; + break; + } - lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, desc->bd_flags & PTL_BULK_FL_SENT, - &lwi); - if (rc) { - LASSERT(rc == -ETIMEDOUT); - GOTO(out_bulk, rc); + LASSERT (page_rc <= pp_rnb[i].len); + nob += page_rc; + if (page_rc != 0) { /* some data! 
*/ + LASSERT (local_nb[i].page != NULL); + rc = ptlrpc_prep_bulk_page(desc, local_nb[i].page, + pp_rnb[i].offset& ~PAGE_MASK, + page_rc); + if (rc != 0) + break; + } + + if (page_rc != pp_rnb[i].len) { /* short read */ + /* All subsequent pages should be 0 */ + while (++i < npages) + LASSERT (local_nb[i].rc == 0); + break; + } } - req->rq_status = obd_commitrw(cmd, conn, objcount, ioo, niocount, - local_nb, desc_priv, NULL); + if (rc == 0) { + rc = ptlrpc_bulk_put(desc); + if (rc == 0) { + lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout, + desc); + rc = l_wait_event(desc->bd_waitq, + ptlrpc_bulk_complete(desc), &lwi); + if (rc) { + LASSERT(rc == -ETIMEDOUT); + CERROR ("timeout waiting for bulk PUT\n"); + ptlrpc_abort_bulk (desc); + } + } + comms_error = rc != 0; + } + + /* Must commit after prep above in all cases */ + rc = obd_commitrw(OBD_BRW_READ, req->rq_export, 1, ioo, npages, + local_nb, desc_priv, NULL); -out_bulk: - ptlrpc_bulk_decref(desc); -out_local: - OBD_FREE(local_nb, sizeof(*local_nb) * niocount); -out: - if (rc) - ptlrpc_error(req->rq_svc, req); - else { #if CHECKSUM_BULK - body = lustre_msg_buf(req->rq_repmsg, 0); - body->oa.o_rdev = HTON__u64(cksum); - body->oa.o_valid |= HTON__u32(OBD_MD_FLCKSUM); + if (rc == 0) { + body = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*body)); + body->oa.o_rdev = ost_checksum_bulk (desc); + body->oa.o_valid |= OBD_MD_FLCKSUM; + } #endif - ptlrpc_reply(req->rq_svc, req); + + out_bulk: + ptlrpc_free_bulk (desc); + out_local: + OBD_FREE(local_nb, sizeof(*local_nb) * npages); + out_pp_rnb: + free_per_page_niobufs (npages, pp_rnb, remote_nb); + out: + LASSERT (rc <= 0); + if (rc == 0) { + req->rq_status = nob; + ptlrpc_reply(req); + } else if (!comms_error) { + /* only reply if comms OK */ + req->rq_status = rc; + ptlrpc_error(req); + } else { + if (req->rq_repmsg != NULL) { + /* reply out callback would free */ + OBD_FREE (req->rq_repmsg, req->rq_replen); + } + CERROR("bulk IO comms error: evicting %s@%s nid 
"LPU64"\n", + req->rq_export->exp_client_uuid.uuid, + req->rq_connection->c_remote_uuid.uuid, + req->rq_connection->c_peer.peer_nid); + ptlrpc_fail_export(req->rq_export); } RETURN(rc); @@ -374,117 +547,117 @@ out: static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) { - struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ptlrpc_bulk_desc *desc; - struct niobuf_remote *remote_nb; - void *end2; - struct niobuf_local *local_nb = NULL; - struct obd_ioobj *ioo; - struct ost_body *body; - struct l_wait_info lwi; - void *desc_priv = NULL; - int cmd, i, j, objcount, niocount, size = sizeof(*body); - int rc = 0; + struct niobuf_remote *remote_nb; + struct niobuf_remote *pp_rnb; + struct niobuf_local *local_nb; + struct obd_ioobj *ioo; + struct ost_body *body; + struct l_wait_info lwi; + void *desc_priv = NULL; + __u32 *rcs; + int size[2] = { sizeof (*body) }; + int objcount, niocount, npages; + int comms_error = 0; + int rc, rc2, swab, i, j; ENTRY; - body = lustre_msg_buf(req->rq_reqmsg, 0); - ioo = lustre_msg_buf(req->rq_reqmsg, 1); - remote_nb = lustre_msg_buf(req->rq_reqmsg, 2); - end2 = (void *)remote_nb + req->rq_reqmsg->buflens[2]; - objcount = req->rq_reqmsg->buflens[1] / sizeof(*ioo); - niocount = req->rq_reqmsg->buflens[2] / sizeof(*remote_nb); - cmd = OBD_BRW_WRITE; - if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK)) - GOTO(out, req->rq_status = -EIO); - - for (i = 0; i < objcount; i++, ioo++) { - ost_unpack_ioo(ioo, ioo); - if ((void *)(remote_nb + ioo->ioo_bufcnt) > end2) { - CERROR("BRW: objid "LPX64" count %u larger than %u\n", - ioo->ioo_id, ioo->ioo_bufcnt, - (int)(end2 - (void *)remote_nb)); - LBUG(); - GOTO(out, rc = -EINVAL); - } - for (j = 0; j < ioo->ioo_bufcnt; j++, remote_nb++) { - ost_unpack_niobuf(remote_nb, remote_nb); - if (remote_nb->len == 0) { - CERROR("zero len BRW: objid "LPX64" buf %u\n", - ioo->ioo_id, j); - GOTO(out, rc = -EINVAL); - } - if (j && remote_nb->offset <= (remote_nb - 
1)->offset) { - CERROR("unordered BRW: objid "LPX64 - " buf %u offset "LPX64" <= "LPX64"\n", - ioo->ioo_id, j, remote_nb->offset, - (remote_nb - 1)->offset); - GOTO(out, rc = -EINVAL); - } + GOTO(out, rc = -EIO); + + swab = lustre_msg_swabbed (req->rq_reqmsg); + body = lustre_swab_reqbuf (req, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) { + CERROR ("Missing/short ost_body\n"); + GOTO(out, rc = -EFAULT); + } + + LASSERT_REQSWAB (req, 1); + objcount = req->rq_reqmsg->buflens[1] / sizeof(*ioo); + if (objcount == 0) { + CERROR ("Missing/short ioobj\n"); + GOTO (out, rc = -EFAULT); + } + ioo = lustre_msg_buf (req->rq_reqmsg, 1, objcount * sizeof (*ioo)); + LASSERT (ioo != NULL); + for (niocount = i = 0; i < objcount; i++) { + if (swab) + lustre_swab_obd_ioobj (&ioo[i]); + if (ioo[i].ioo_bufcnt == 0) { + CERROR ("ioo[%d] has zero bufcnt\n", i); + GOTO (out, rc = -EFAULT); } + niocount += ioo[i].ioo_bufcnt; } - OBD_ALLOC(local_nb, sizeof(*local_nb) * niocount); - if (local_nb == NULL) - GOTO(out, rc = -ENOMEM); + remote_nb = lustre_swab_reqbuf(req, 2, niocount * sizeof (*remote_nb), + lustre_swab_niobuf_remote); + if (remote_nb == NULL) { + CERROR ("Missing/short niobuf\n"); + GOTO(out, rc = -EFAULT); + } + if (swab) { /* swab the remaining niobufs */ + for (i = 1; i < niocount; i++) + lustre_swab_niobuf_remote (&remote_nb[i]); + } - /* The unpackers move ioo and remote_nb, so reset them before using */ - ioo = lustre_msg_buf(req->rq_reqmsg, 1); - remote_nb = lustre_msg_buf(req->rq_reqmsg, 2); + size[1] = niocount * sizeof (*rcs); + rc = lustre_pack_msg(2, size, NULL, &req->rq_replen, + &req->rq_repmsg); + if (rc != 0) + GOTO (out, rc); + rcs = lustre_msg_buf (req->rq_repmsg, 1, niocount * sizeof (*rcs)); - req->rq_status = obd_preprw(cmd, conn, objcount, ioo, niocount, - remote_nb, local_nb, &desc_priv, oti); + /* CAVEAT EMPTOR this sets ioo->ioo_bufcnt to # pages */ + npages = get_per_page_niobufs(ioo, objcount,remote_nb,niocount,&pp_rnb); + if 
(npages < 0) + GOTO (out, rc = npages); - if (req->rq_status) - GOTO(out_local, rc = 0); + OBD_ALLOC(local_nb, sizeof(*local_nb) * npages); + if (local_nb == NULL) + GOTO(out_pp_rnb, rc = -ENOMEM); - desc = ptlrpc_prep_bulk(req->rq_connection); + desc = ptlrpc_prep_bulk_exp (req, BULK_GET_SINK, OST_BULK_PORTAL); if (desc == NULL) GOTO(out_local, rc = -ENOMEM); - desc->bd_ptl_ev_hdlr = NULL; - desc->bd_portal = OSC_BULK_PORTAL; - for (i = 0; i < niocount; i++) { - struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); + rc = obd_preprw(OBD_BRW_WRITE, req->rq_export, objcount, ioo, + npages, pp_rnb, local_nb, &desc_priv, oti); + if (rc != 0) + GOTO (out_bulk, rc); - if (bulk == NULL) - GOTO(out_bulk, rc = -ENOMEM); - bulk->bp_xid = remote_nb[i].xid; - bulk->bp_buf = local_nb[i].addr; - bulk->bp_buflen = remote_nb[i].len; - } + /* NB Having prepped, we must commit... */ - rc = ptlrpc_bulk_get(desc); - if (rc) - GOTO(out_bulk, rc); + for (i = 0; i < npages; i++) { + rc = ptlrpc_prep_bulk_page(desc, local_nb[i].page, + pp_rnb[i].offset & (PAGE_SIZE - 1), + pp_rnb[i].len); + if (rc != 0) + break; + } - lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, desc->bd_flags & PTL_BULK_FL_RCVD, - &lwi); - if (rc) { - LASSERT(rc == -ETIMEDOUT); - ptlrpc_abort_bulk(desc); - recovd_conn_fail(desc->bd_connection); - obd_commitrw(cmd, conn, objcount, ioo, niocount, local_nb, - desc_priv, oti); - GOTO(out_bulk, rc); + if (rc == 0) { + rc = ptlrpc_bulk_get(desc); + if (rc == 0) { + lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout, + desc); + rc = l_wait_event(desc->bd_waitq, + ptlrpc_bulk_complete(desc), &lwi); + if (rc) { + LASSERT(rc == -ETIMEDOUT); + CERROR ("timeout waiting for bulk GET\n"); + ptlrpc_abort_bulk (desc); + } + } + comms_error = rc != 0; } #if CHECKSUM_BULK - if ((body->oa.o_valid & NTOH__u32(OBD_MD_FLCKSUM))) { + if (rc == 0 && (body->oa.o_valid & OBD_MD_FLCKSUM) != 0) { static int cksum_counter; - __u64 
client_cksum = NTOH__u64(body->oa.o_rdev); - __u64 cksum = 0; - - for (i = 0; i < niocount; i++) { - char *ptr = kmap(local_nb[i].page); - int off = local_nb[i].offset & (PAGE_SIZE - 1); - int len = local_nb[i].len; - - LASSERT(off + len <= PAGE_SIZE); - ost_checksum(&cksum, ptr + off, len); - kunmap(local_nb[i].page); - } + __u64 client_cksum = body->oa.o_rdev; + __u64 cksum = ost_checksum_bulk (desc); if (client_cksum != cksum) { CERROR("Bad checksum: client "LPX64", server "LPX64 @@ -501,59 +674,119 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) } } #endif - - req->rq_status = obd_commitrw(cmd, conn, objcount, ioo, niocount, - local_nb, desc_priv, oti); + /* Must commit after prep above in all cases */ + rc2 = obd_commitrw(OBD_BRW_WRITE, req->rq_export, objcount, ioo, + npages, local_nb, desc_priv, oti); + + if (rc == 0) { + /* set per-requested niobuf return codes */ + for (i = j = 0; i < niocount; i++) { + int nob = remote_nb[i].len; + + rcs[i] = 0; + do { + LASSERT (j < npages); + if (local_nb[j].rc < 0) + rcs[i] = local_nb[j].rc; + nob -= pp_rnb[j].len; + j++; + } while (nob > 0); + LASSERT (nob == 0); + } + LASSERT (j == npages); + } + if (rc == 0) + rc = rc2; out_bulk: - ptlrpc_bulk_decref(desc); + ptlrpc_free_bulk (desc); out_local: - OBD_FREE(local_nb, sizeof(*local_nb) * niocount); + OBD_FREE(local_nb, sizeof(*local_nb) * npages); + out_pp_rnb: + free_per_page_niobufs (npages, pp_rnb, remote_nb); out: - if (!rc) - /* Hmm, we don't return anything in this reply buffer? - * We should be returning per-page status codes and also - * per-object size, blocks count, mtime, ctime. 
(bug 593) */ - rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, - &req->rq_repmsg); - if (rc) - ptlrpc_error(req->rq_svc, req); - else { + if (rc == 0) { oti_to_request(oti, req); - rc = ptlrpc_reply(req->rq_svc, req); + rc = ptlrpc_reply(req); + } else if (!comms_error) { + /* Only reply if there was no comms problem with bulk */ + req->rq_status = rc; + ptlrpc_error(req); + } else { + if (req->rq_repmsg != NULL) { + /* reply out callback would free */ + OBD_FREE (req->rq_repmsg, req->rq_replen); + } + CERROR("bulk IO comms error: evicting %s@%s nid "LPU64"\n", + req->rq_export->exp_client_uuid.uuid, + req->rq_connection->c_remote_uuid.uuid, + req->rq_connection->c_peer.peer_nid); + ptlrpc_fail_export(req->rq_export); } RETURN(rc); } -static int ost_san_brw(struct ptlrpc_request *req, int alloc) +static int ost_san_brw(struct ptlrpc_request *req, int cmd) { - struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; + struct lustre_handle *conn = &req->rq_reqmsg->handle; struct niobuf_remote *remote_nb, *res_nb; struct obd_ioobj *ioo; struct ost_body *body; - int cmd, rc, i, j, objcount, niocount, size[2] = {sizeof(*body)}; - void *end2; + int rc, i, j, objcount, niocount, size[2] = {sizeof(*body)}; + int n; + int swab; ENTRY; - body = lustre_msg_buf(req->rq_reqmsg, 0); - ioo = lustre_msg_buf(req->rq_reqmsg, 1); - remote_nb = lustre_msg_buf(req->rq_reqmsg, 2); - end2 = (void *)remote_nb + req->rq_reqmsg->buflens[2]; + /* XXX not set to use latest protocol */ + + swab = lustre_msg_swabbed (req->rq_reqmsg); + body = lustre_swab_reqbuf (req, 0, sizeof (*body), + lustre_swab_ost_body); + if (body == NULL) { + CERROR ("Missing/short ost_body\n"); + GOTO (out, rc = -EFAULT); + } + + ioo = lustre_swab_reqbuf(req, 1, sizeof (*ioo), + lustre_swab_obd_ioobj); + if (ioo == NULL) { + CERROR ("Missing/short ioobj\n"); + GOTO (out, rc = -EFAULT); + } objcount = req->rq_reqmsg->buflens[1] / sizeof(*ioo); - niocount = req->rq_reqmsg->buflens[2] / 
sizeof(*remote_nb); + niocount = ioo[0].ioo_bufcnt; + for (i = 1; i < objcount; i++) { + if (swab) + lustre_swab_obd_ioobj (&ioo[i]); + niocount += ioo[i].ioo_bufcnt; + } - cmd = alloc ? OBD_BRW_WRITE : OBD_BRW_READ; + remote_nb = lustre_swab_reqbuf(req, 2, niocount * sizeof (*remote_nb), + lustre_swab_niobuf_remote); + if (remote_nb == NULL) { + CERROR ("Missing/short niobuf\n"); + GOTO (out, rc = -EFAULT); + } + if (swab) { /* swab the remaining niobufs */ + for (i = 1; i < niocount; i++) + lustre_swab_niobuf_remote (&remote_nb[i]); + } - for (i = 0; i < objcount; i++, ioo++) { - ost_unpack_ioo(ioo, ioo); - if ((void *)(remote_nb + ioo->ioo_bufcnt) > end2) { - CERROR("BRW: objid "LPX64" count %u larger than %u\n", - ioo->ioo_id, ioo->ioo_bufcnt, - (int)(end2 - (void *)remote_nb)); - GOTO(out, rc = -EINVAL); + for (i = n = 0; i < objcount; i++) { + for (j = 0; j < ioo[i].ioo_bufcnt; j++, n++) { + if (remote_nb[n].len == 0) { + CERROR("zero len BRW: objid "LPX64" buf %u\n", + ioo[i].ioo_id, j); + GOTO(out, rc = -EINVAL); + } + if (j && remote_nb[n].offset <= remote_nb[n-1].offset) { + CERROR("unordered BRW: objid "LPX64 + " buf %u offset "LPX64" <= "LPX64"\n", + ioo[i].ioo_id, j, remote_nb[n].offset, + remote_nb[n-1].offset); + GOTO(out, rc = -EINVAL); + } } - for (j = 0; j < ioo->ioo_bufcnt; j++, remote_nb++) - ost_unpack_niobuf(remote_nb, remote_nb); } size[1] = niocount * sizeof(*remote_nb); @@ -561,33 +794,23 @@ static int ost_san_brw(struct ptlrpc_request *req, int alloc) if (rc) GOTO(out, rc); - /* The unpackers move ioo and remote_nb, so reset them before using */ - ioo = lustre_msg_buf(req->rq_reqmsg, 1); - remote_nb = lustre_msg_buf(req->rq_reqmsg, 2); - req->rq_status = obd_san_preprw(cmd, conn, objcount, ioo, niocount, remote_nb); - if (req->rq_status) { - rc = 0; - goto out; - } - - remote_nb = lustre_msg_buf(req->rq_repmsg, 1); - res_nb = lustre_msg_buf(req->rq_reqmsg, 2); - for (i = 0; i < niocount; i++, remote_nb++, res_nb++) - 
ost_pack_niobuf(remote_nb, res_nb->offset, res_nb->len, - res_nb->flags, res_nb->xid); + if (req->rq_status) + GOTO (out, rc = 0); + res_nb = lustre_msg_buf(req->rq_repmsg, 1, size[1]); + memcpy (res_nb, remote_nb, size[1]); rc = 0; - out: if (rc) { OBD_FREE(req->rq_repmsg, req->rq_replen); req->rq_repmsg = NULL; - ptlrpc_error(req->rq_svc, req); + req->rq_status = rc; + ptlrpc_error(req); } else - ptlrpc_reply(req->rq_svc, req); + ptlrpc_reply(req); return rc; } @@ -601,6 +824,7 @@ static int filter_recovery_request(struct ptlrpc_request *req, *process = 1; RETURN(0); + case OBD_PING: case OST_CLOSE: case OST_CREATE: case OST_DESTROY: @@ -617,24 +841,23 @@ static int filter_recovery_request(struct ptlrpc_request *req, DEBUG_REQ(D_ERROR, req, "not permitted during recovery"); *process = 0; /* XXX what should we set rq_status to here? */ - RETURN(ptlrpc_error(req->rq_svc, req)); + req->rq_status = -EAGAIN; + RETURN(ptlrpc_error(req)); } } + + static int ost_handle(struct ptlrpc_request *req) { struct obd_trans_info trans_info = { 0, }, *oti = &trans_info; - int should_process, rc; + int should_process, fail = OBD_FAIL_OST_ALL_REPLY_NET, rc = 0; ENTRY; - rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); - if (rc || OBD_FAIL_CHECK(OBD_FAIL_OST_HANDLE_UNPACK)) { - CERROR("lustre_ost: Invalid request\n"); - GOTO(out, rc); - } - + /* XXX identical to MDS */ if (req->rq_reqmsg->opc != OST_CONNECT) { struct obd_device *obd; + int abort_recovery, recovering; if (req->rq_export == NULL) { CERROR("lustre_ost: operation %d on unconnected OST\n", @@ -645,31 +868,18 @@ static int ost_handle(struct ptlrpc_request *req) obd = req->rq_export->exp_obd; + /* Check for aborted recovery. 
*/ spin_lock_bh(&obd->obd_processing_task_lock); - if (obd->obd_flags & OBD_ABORT_RECOVERY) - target_abort_recovery(obd); + abort_recovery = obd->obd_abort_recovery; + recovering = obd->obd_recovering; spin_unlock_bh(&obd->obd_processing_task_lock); - - if (obd->obd_flags & OBD_RECOVERING) { + if (abort_recovery) { + target_abort_recovery(obd); + } else if (recovering) { rc = filter_recovery_request(req, obd, &should_process); if (rc || !should_process) RETURN(rc); - } else if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) { -#if 0 -/* need to store this reply somewhere... */ - if (req->rq_xid == med->med_last_xid) { - DEBUG_REQ(D_HA, req, "resending reply"); - OBD_ALLOC(req->rq_repmsg, med->med_last_replen); - req->rq_replen = med->med_last_replen; - memcpy(req->rq_repmsg, med->med_last_reply, - req->rq_replen); - ptlrpc_reply(req->rq_svc, req); - return 0; - } - DEBUG_REQ(D_HA, req, "no reply for resend, continuing"); -#endif } - } if (strcmp(req->rq_obd->obd_type->typ_name, "ost") != 0) @@ -731,13 +941,13 @@ static int ost_handle(struct ptlrpc_request *req) case OST_SAN_READ: CDEBUG(D_INODE, "san read\n"); OBD_FAIL_RETURN(OBD_FAIL_OST_BRW_NET, 0); - rc = ost_san_brw(req, 0); + rc = ost_san_brw(req, OBD_BRW_READ); /* ost_san_brw sends its own replies */ RETURN(rc); case OST_SAN_WRITE: CDEBUG(D_INODE, "san write\n"); OBD_FAIL_RETURN(OBD_FAIL_OST_BRW_NET, 0); - rc = ost_san_brw(req, 1); + rc = ost_san_brw(req, OBD_BRW_WRITE); /* ost_san_brw sends its own replies */ RETURN(rc); case OST_PUNCH: @@ -755,11 +965,16 @@ static int ost_handle(struct ptlrpc_request *req) OBD_FAIL_RETURN(OBD_FAIL_OST_SYNCFS_NET, 0); rc = ost_syncfs(req); break; + case OBD_PING: + DEBUG_REQ(D_INODE, req, "ping"); + rc = target_handle_ping(req); + break; case LDLM_ENQUEUE: CDEBUG(D_INODE, "enqueue\n"); OBD_FAIL_RETURN(OBD_FAIL_LDLM_ENQUEUE, 0); rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast, ldlm_server_blocking_ast); + fail = OBD_FAIL_OST_LDLM_REPLY_NET; break; case 
LDLM_CONVERT: CDEBUG(D_INODE, "convert\n"); @@ -775,12 +990,11 @@ static int ost_handle(struct ptlrpc_request *req) case LDLM_CP_CALLBACK: CDEBUG(D_INODE, "callback\n"); CERROR("callbacks should not happen on OST\n"); - LBUG(); - OBD_FAIL_RETURN(OBD_FAIL_LDLM_BL_CALLBACK, 0); - break; + /* fall through */ default: + CERROR("Unexpected opcode %d\n", req->rq_reqmsg->opc); req->rq_status = -ENOTSUPP; - rc = ptlrpc_error(req->rq_svc, req); + rc = ptlrpc_error(req); RETURN(rc); } @@ -788,22 +1002,22 @@ static int ost_handle(struct ptlrpc_request *req) /* If we're DISCONNECTing, the export_data is already freed */ if (!rc && req->rq_reqmsg->opc != OST_DISCONNECT) { struct obd_device *obd = req->rq_export->exp_obd; - if ((obd->obd_flags & OBD_NO_TRANSNO) == 0) { + if (!obd->obd_no_transno) { req->rq_repmsg->last_committed = - HTON__u64(obd->obd_last_committed); + obd->obd_last_committed; } else { DEBUG_REQ(D_IOCTL, req, "not sending last_committed update"); } CDEBUG(D_INFO, "last_committed "LPU64", xid "LPX64"\n", - obd->obd_last_committed, HTON__u64(req->rq_xid)); + obd->obd_last_committed, req->rq_xid); } out: if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_LAST_REPLAY) { struct obd_device *obd = req->rq_export->exp_obd; - if (obd && (obd->obd_flags & OBD_RECOVERING)) { + if (obd && obd->obd_recovering) { DEBUG_REQ(D_HA, req, "LAST_REPLAY, queuing reply"); return target_queue_final_reply(req, rc); } @@ -811,21 +1025,10 @@ out: rc = req->rq_status = -ENOTCONN; } - if (rc) { - CERROR("ost: processing error (opcode=%d): %d\n", - req->rq_reqmsg->opc, rc); - ptlrpc_error(req->rq_svc, req); - } else { - CDEBUG(D_INODE, "sending reply\n"); - if (req->rq_repmsg == NULL) - CERROR("handler for opcode %d returned rc=0 without " - "creating rq_repmsg; needs to return rc != 0!\n", - req->rq_reqmsg->opc); - else - oti_to_request(oti, req); - ptlrpc_reply(req->rq_svc, req); - } + if (!rc) + oti_to_request(oti, req); + target_send_reply(req, rc, fail); return 0; } @@ -839,7 +1042,7 @@ 
static int ost_setup(struct obd_device *obddev, obd_count len, void *buf) ost->ost_service = ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_REQUEST_PORTAL, OSC_REPLY_PORTAL, - ost_handle, "ost"); + ost_handle, "ost", obddev); if (!ost->ost_service) { CERROR("failed to start service\n"); GOTO(error_disc, err = -ENOMEM); @@ -861,13 +1064,15 @@ error_disc: RETURN(err); } -static int ost_cleanup(struct obd_device * obddev) +static int ost_cleanup(struct obd_device *obddev, int force, int failover) { struct ost_obd *ost = &obddev->u.ost; int err = 0; - ENTRY; + if (obddev->obd_recovering) + target_cancel_recovery_timer(obddev); + ptlrpc_stop_all_threads(ost->ost_service); ptlrpc_unregister_service(ost->ost_service); @@ -891,9 +1096,7 @@ int ost_detach(struct obd_device *dev) * connects directly to this module. */ static int ost_connect(struct lustre_handle *conn, - struct obd_device *obd, struct obd_uuid *cluuid, - struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) + struct obd_device *obd, struct obd_uuid *cluuid) { struct obd_export *exp; int rc; @@ -907,6 +1110,7 @@ static int ost_connect(struct lustre_handle *conn, RETURN(rc); exp = class_conn2export(conn); LASSERT(exp); + class_export_put(exp); RETURN(0); } diff --git a/lustre/portals/.cvsignore b/lustre/portals/.cvsignore new file mode 100644 index 0000000..99ac885 --- /dev/null +++ b/lustre/portals/.cvsignore @@ -0,0 +1,8 @@ +Kernelenv +Makefile +Makefile.in +aclocal.m4 +autom4te.cache +config.log +config.status +configure diff --git a/lustre/portals/AUTHORS b/lustre/portals/AUTHORS new file mode 100644 index 0000000..e69de29 diff --git a/lustre/portals/ChangeLog b/lustre/portals/ChangeLog new file mode 100644 index 0000000..e69de29 diff --git a/lustre/portals/Kernelenv.in b/lustre/portals/Kernelenv.in new file mode 100644 index 0000000..29a713f --- /dev/null +++ b/lustre/portals/Kernelenv.in @@ -0,0 +1 @@ +EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include diff --git 
a/lustre/portals/Kernelenv.mk b/lustre/portals/Kernelenv.mk new file mode 100644 index 0000000..29a713f --- /dev/null +++ b/lustre/portals/Kernelenv.mk @@ -0,0 +1 @@ +EXTRA_CFLAGS= -Ifs/lustre/include -Ifs/lustre/portals/include diff --git a/lustre/portals/Makefile.am b/lustre/portals/Makefile.am new file mode 100644 index 0000000..1a223f2 --- /dev/null +++ b/lustre/portals/Makefile.am @@ -0,0 +1,12 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +EXTRA_DIST = Rules.linux archdep.m4 include +DIST_SUBDIRS = libcfs portals knals unals utils tests doc router +if LIBLUSTRE +SUBDIRS = portals unals utils +else +SUBDIRS = libcfs portals knals unals utils tests doc router +endif diff --git a/lustre/portals/Makefile.mk b/lustre/portals/Makefile.mk new file mode 100644 index 0000000..be0e51a --- /dev/null +++ b/lustre/portals/Makefile.mk @@ -0,0 +1,6 @@ +include fs/lustre/portals/Kernelenv + +obj-y += portals/ +obj-y += libcfs/ +obj-y += knals/ +obj-y += router/ diff --git a/lustre/portals/NEWS b/lustre/portals/NEWS new file mode 100644 index 0000000..e69de29 diff --git a/lustre/portals/README b/lustre/portals/README new file mode 100644 index 0000000..e69de29 diff --git a/lustre/portals/Rules.linux b/lustre/portals/Rules.linux new file mode 100644 index 0000000..93943b7 --- /dev/null +++ b/lustre/portals/Rules.linux @@ -0,0 +1,25 @@ +# included in Linux kernel directories +# Rules for module building + +if LINUX25 + +basename=$(shell echo $< | sed -e 's/\.c//g' | sed -e 's/-//g' | sed -e 's/\.o//g') +AM_CPPFLAGS= -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -mpreferred-stack-boundary=2 -DKBUILD_MODNAME=$(MODULE) -DKBUILD_BASENAME=$(basename) + +$(MODULE).o: $($(MODULE)_OBJECTS) + $(LD) -m $(MOD_LINK) -r -o $(MODULE).o $($(MODULE)_OBJECTS) + +else + +$(MODULE).o: $($(MODULE)_OBJECTS) + $(LD) -m $(MOD_LINK) -r -o $(MODULE).o 
$($(MODULE)_OBJECTS) + +endif + +tags: + rm -f $(top_srcdir)/TAGS + rm -f $(top_srcdir)/tags + find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs etags -a + find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs etags -a + find $(top_srcdir)/../portals/ -name '*.[hc]' | xargs ctags -a + find $(top_srcdir) -name '*.[hc]' | grep -v ".orig" | xargs ctags -a diff --git a/lustre/portals/archdep.m4 b/lustre/portals/archdep.m4 new file mode 100644 index 0000000..7cb00cf --- /dev/null +++ b/lustre/portals/archdep.m4 @@ -0,0 +1,317 @@ + +# -------- in kernel compilation? (2.5 only) ------------- +AC_ARG_ENABLE(inkernel, [ --enable-inkernel set up 2.5 kernel makefiles]) +AM_CONDITIONAL(INKERNEL, test x$enable_inkernel = xyes) +echo "Makefile for in kernel build: $INKERNEL" + +# -------- liblustre compilation -------------- +AC_ARG_WITH(lib, [ --with-lib compile lustre library], host_cpu="lib") + +# -------- set linuxdir ------------ + +AC_ARG_WITH(linux, [ --with-linux=[path] set path to Linux source (default=/usr/src/linux)],LINUX=$with_linux,LINUX=/usr/src/linux) +AC_SUBST(LINUX) + +# --------- UML? -------------------- +AC_MSG_CHECKING(if you are running user mode linux for $host_cpu ...) 
+if test $host_cpu = "lib" ; then + host_cpu="lib" + AC_MSG_RESULT(no building Lustre library) +else + if test -e $LINUX/include/asm-um ; then + if test X`ls -id $LINUX/include/asm/ | awk '{print $1}'` = X`ls -id $LINUX/include/asm-um | awk '{print $1}'` ; then + host_cpu="um"; + AC_MSG_RESULT(yes) + else + AC_MSG_RESULT(no (asm doesn't point at asm-um)) + fi + + else + AC_MSG_RESULT(no (asm-um missing)) + fi +fi + +# --------- Linux 25 ------------------ + +AC_MSG_CHECKING(if you are running linux 2.5) +if test -e $LINUX/include/linux/namei.h ; then + linux25="yes" + AC_MSG_RESULT(yes) +else + linux25="no" + AC_MSG_RESULT(no) +fi +AM_CONDITIONAL(LINUX25, test x$linux25 = xyes) +echo "Makefiles for in linux 2.5 build: $LINUX25" + +# ------- Makeflags ------------------ + +AC_MSG_CHECKING(setting make flags system architecture: ) +case ${host_cpu} in + lib ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -Wall ' + KCPPFLAGS='-D__arch_lib__ ' + libdir='${exec_prefix}/lib/lustre' + MOD_LINK=elf_i386 +;; + um ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -Wall -pipe -Wno-trigraphs -Wstrict-prototypes -fno-strict-aliasing -fno-common ' + case ${linux25} in + yes ) + KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/include -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/kernel/skas/include -O2 -nostdinc -iwithprefix include -DKBUILD_BASENAME=$(MODULE) -DKBUILD_MODNAME=$(MODULE) ' + ;; + * ) + KCPPFLAGS='-D__KERNEL__ -U__i386__ -Ui386 -DUM_FASTCALL -D__arch_um__ -DSUBARCH="i386" -DNESTING=0 -D_LARGEFILE64_SOURCE -Derrno=kernel_errno -DPATCHLEVEL=4 -DMODULE -I$(LINUX)/arch/um/kernel/tt/include -I$(LINUX)/arch/um/include ' + ;; + esac + + MOD_LINK=elf_i386 +;; + i*86 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -pipe' + case ${linux25} in + yes ) + KCPPFLAGS='-D__KERNEL__ -DMODULE -march=i686 
-I$(LINUX)/include/asm-i386/mach-default -nostdinc -iwithprefix include ' + ;; + * ) + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + ;; + esac + MOD_LINK=elf_i386 +;; + + alphaev6 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6' + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + MOD_LINK=elf64alpha +;; + + alphaev67 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev6' + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + MOD_LINK=elf64alpha +;; + + alpha* ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-g -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -pipe -mno-fp-regs -ffixed-8 -mcpu=ev5 -Wa,-mev5' + KCPPFLAGS='-D__KERNEL__ -DMODULE ' + MOD_LINK=elf64alpha +;; + + ia64 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-gstabs -O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fno-strict-aliasing -fno-common -pipe -ffixed-r13 -mfixed-range=f10-f15,f32-f127 -falign-functions=32 -mb-step' + KCPPFLAGS='-D__KERNEL__ -DMODULE' + MOD_LINK=elf64_ia64 +;; + + sparc64 ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -Wno-unused -m64 -pipe -mno-fpu -mcpu=ultrasparc -mcmodel=medlow -ffixed-g4 -fcall-used-g5 -fcall-used-g7 -Wno-sign-compare -Wa,--undeclared-regs' + KCPPFLAGS='-D__KERNEL__' + MOD_LINK=elf64_sparc + +;; + + powerpc ) + AC_MSG_RESULT($host_cpu) + KCFLAGS='-O2 -Wall -Wstrict-prototypes -Wno-trigraphs -fomit-frame-pointer -fno-strict-aliasing -fno-common -D__powerpc__ -fsigned-char -msoft-float -pipe -ffixed-r2 -Wno-uninitialized -mmultiple -mstring' + KCPPFLAGS='-D__KERNEL__' + MOD_LINK=elf32ppclinux +;; + + *) + AC_ERROR("Unknown Linux Platform: $host_cpu") +;; +esac + +# ----------- make dep run? 
------------------ + +if test $host_cpu != "lib" ; then + AC_MSG_CHECKING(if make dep has been run in kernel source (host $host_cpu) ) + if test -f $LINUX/include/linux/config.h ; then + AC_MSG_RESULT(yes) + else + AC_MSG_ERROR(** cannot find $LINUX/include/linux/config.h. Run make dep in $LINUX.) + fi +fi + +# ------------ include paths ------------------ + +if test $host_cpu != "lib" ; then + KINCFLAGS="-I\$(top_srcdir)/include -I\$(top_srcdir)/portals/include -I$LINUX/include" +else + KINCFLAGS='-I$(top_srcdir)/include -I$(top_srcdir)/portals/include' +fi +CPPFLAGS="$KINCFLAGS $ARCHCPPFLAGS" + +if test $host_cpu != "lib" ; then +# ------------ autoconf.h ------------------ + AC_MSG_CHECKING(if autoconf.h is in kernel source) + if test -f $LINUX/include/linux/autoconf.h ; then + AC_MSG_RESULT(yes) + else + AC_MSG_ERROR(** cannot find $LINUX/include/linux/autoconf.h. Run make config in $LINUX.) + fi + +# ------------ RELEASE and moduledir ------------------ + AC_MSG_CHECKING(for Linux release) + + dnl We need to rid ourselves of the nasty [ ] quotes. + changequote(, ) + dnl Get release from version.h + RELEASE="`sed -ne 's/.*UTS_RELEASE[ \"]*\([0-9.a-zA-Z_-]*\).*/\1/p' $LINUX/include/linux/version.h`" + changequote([, ]) + + moduledir='$(libdir)/modules/'$RELEASE/kernel + AC_SUBST(moduledir) + + modulefsdir='$(moduledir)/fs/$(PACKAGE)' + AC_SUBST(modulefsdir) + + AC_MSG_RESULT($RELEASE) + AC_SUBST(RELEASE) + +# ---------- modversions? 
-------------------- + AC_MSG_CHECKING(for MODVERSIONS) + if egrep -e 'MODVERSIONS.*1' $LINUX/include/linux/autoconf.h >/dev/null 2>&1; + then + MFLAGS="-DMODULE -DMODVERSIONS -include $LINUX/include/linux/modversions.h -DEXPORT_SYMTAB" + AC_MSG_RESULT(yes) + else + MFLAGS= + AC_MSG_RESULT(no) + fi +fi + +# ---------- Portals flags -------------------- + +#AC_PREFIX_DEFAULT([]) +#if test "x$prefix" = xNONE || test "x$prefix" = x; then +# usrprefix=/usr +#else +# usrprefix='${prefix}' +#fi +#AC_SUBST(usrprefix) + +AC_MSG_CHECKING(if kernel has CPU affinity support) +if test "$target_cpu" != ia64 ; then + enable_affinity_temp="-DCPU_AFFINITY=1" + AC_MSG_RESULT(yes) +else + enable_affinity_temp="" + AC_MSG_RESULT(no) +fi + +AC_MSG_CHECKING(if kernel has zero-copy TCP support) +ZCCD="`grep -c zccd $LINUX/include/linux/skbuff.h`" +if test "$ZCCD" != 0 ; then + enable_zerocopy_temp="-DSOCKNAL_ZC=1" + AC_MSG_RESULT(yes) +else + enable_zerocopy_temp="" + AC_MSG_RESULT(no) +fi + +AC_ARG_ENABLE(zerocopy, [ --enable-zerocopy enable socknal zerocopy],enable_zerocopy=$enable_zerocopy_temp, enable_zerocopy="") + +AC_ARG_ENABLE(affinity, [ --enable-affinity enable process/irq affinity],enable_affinity="-DCPU_AFFINITY=1", enable_affinity=$enable_affinity_temp) +##################################### + +AC_MSG_CHECKING(if quadrics kernel headers are present) +if test -d $LINUX/drivers/net/qsnet ; then + AC_MSG_RESULT(yes) + QSWNAL="qswnal" + with_quadrics="-I$LINUX/drivers/net/qsnet/include" + : +elif test -d $LINUX/drivers/qsnet1 ; then + AC_MSG_RESULT(yes) + QSWNAL="qswnal" + with_quadrics="-I$LINUX/drivers/qsnet1/include -DPROPRIETARY_ELAN" + : +elif test -d $LINUX/drivers/quadrics ; then + AC_MSG_RESULT(yes) + QSWNAL="qswnal" + with_quadrics="-I$LINUX/drivers/quadrics/include -DPROPRIETARY_ELAN" + : +#elif test -d /usr/include/elan3 ; then +# AC_MSG_RESULT(yes) +# QSWNAL="qswnal" +# with_quadrics="" +# : +else + AC_MSG_RESULT(no) + QSWNAL="" + with_quadrics="" + : +fi 
+AC_SUBST(with_quadrics) +AC_SUBST(QSWNAL) + +# R. Read 5/02 +GMNAL="" +echo "checking with-gm=" ${with_gm} +if test "${with_gm+set}" = set; then + if test "${with_gm}" = yes; then + with_gm="-I/usr/local/gm/include" + else + with_gm=-I"$with_gm/include" + fi + GMNAL="gmnal" +else +# default case - no GM + with_gm="" +fi +AC_SUBST(with_gm) +AC_SUBST(GMNAL) + + +def_scamac=/opt/scali/include +AC_ARG_WITH(scamac, [ --with-scamac=[yes/no/path] Path to ScaMAC includes (default=/opt/scali/include)], with_scamac=$withval, with_scamac=$def_scamac) +AC_MSG_CHECKING(if ScaMAC headers are present) +if test "$with_scamac" = yes; then + with_scamac=$def_scamac +fi +if test "$with_scamac" != no -a -f ${with_scamac}/scamac.h; then + AC_MSG_RESULT(yes) + SCIMACNAL="scimacnal" + with_scamac="-I${with_scamac} -I${with_scamac}/icm" +else + AC_MSG_RESULT(no) + SCIMACNAL="" + with_scamac="" +fi + +AC_SUBST(with_scamac) +AC_SUBST(SCIMACNAL) + +CFLAGS="$KCFLAGS" +CPPFLAGS="$KINCFLAGS $KCPPFLAGS $MFLAGS $enable_zerocopy $enable_affinity $with_quadrics $with_gm $with_scamac " + +AC_SUBST(MOD_LINK) +AC_SUBST(LINUX25) +AM_CONDITIONAL(LIBLUSTRE, test x$host_cpu = xlib) + +# ---------- Red Hat 2.4.20 backports some 2.5 bits -------- +# This needs to run after we've defined the KCPPFLAGS + +AC_MSG_CHECKING(for kernel version) +AC_TRY_LINK([#define __KERNEL__ + #include <linux/sched.h>], + [struct task_struct p; + p.sighand = NULL;], + [RH_2_4_20=1], + [RH_2_4_20=0]) + +if test $RH_2_4_20 = 1; then + AC_MSG_RESULT(redhat-2.4.20) + CPPFLAGS="$CPPFLAGS -DCONFIG_RH_2_4_20" +else + AC_MSG_RESULT($RELEASE) +fi diff --git a/lustre/portals/autogen.sh b/lustre/portals/autogen.sh new file mode 100755 index 0000000..9deed73 --- /dev/null +++ b/lustre/portals/autogen.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +aclocal && +automake --add-missing && +${AUTOCONF:-autoconf} diff --git a/lustre/portals/build.m4 b/lustre/portals/build.m4 new file mode 100644 index 0000000..025f243 --- /dev/null +++ 
b/lustre/portals/build.m4 @@ -0,0 +1,95 @@ +# ---------- other tests and settings --------- + + +# --------- unsigned long long sane? ------- + +AC_CHECK_SIZEOF(unsigned long long, 0) +echo "---> size SIZEOF $SIZEOF_unsigned_long_long" +echo "---> size SIZEOF $ac_cv_sizeof_unsigned_long_long" +if test $ac_cv_sizeof_unsigned_long_long != 8 ; then + AC_MSG_ERROR([** we assume that sizeof(long long) == 8. Tell phil@clusterfs.com]) +fi + +# directories for binaries +ac_default_prefix= +bindir='${exec_prefix}/usr/bin' +sbindir='${exec_prefix}/usr/sbin' +includedir='${prefix}/usr/include' + +# Directories for documentation and demos. +docdir='${prefix}/usr/share/doc/$(PACKAGE)' +AC_SUBST(docdir) +demodir='$(docdir)/demo' +AC_SUBST(demodir) +pkgexampledir='${prefix}/usr/lib/$(PACKAGE)/examples' +AC_SUBST(pkgexampledir) +pymoddir='${prefix}/usr/lib/${PACKAGE}/python/Lustre' +AC_SUBST(pymoddir) +modulenetdir='$(moduledir)/net/$(PACKAGE)' +AC_SUBST(modulenetdir) + + +# ---------- BAD gcc? ------------ +AC_PROG_RANLIB +AC_PROG_CC +AC_MSG_CHECKING(for buggy compiler) +CC_VERSION=`$CC -v 2>&1 | grep "^gcc version"` +bad_cc() { + echo + echo " '$CC_VERSION'" + echo " has been known to generate bad code, " + echo " please get an updated compiler." + AC_MSG_ERROR(sorry) +} +TMP_VERSION=`echo $CC_VERSION | cut -c 1-16` +if test "$TMP_VERSION" = "gcc version 2.95"; then + bad_cc +fi +case "$CC_VERSION" in + # ost_pack_niobuf putting 64bit NTOH temporaries on the stack + # without "sub $0xc,%esp" to protect the stack from being + # stomped on by interrupts (bug 606) + "gcc version 2.96 20000731 (Red Hat Linux 7.1 2.96-98)") + bad_cc + ;; + # mandrake's similar sub 0xc compiler bug + # http://marc.theaimsgroup.com/?l=linux-kernel&m=104748366226348&w=2 + "gcc version 2.96 20000731 (Mandrake Linux 8.1 2.96-0.62mdk)") + bad_cc + ;; + *) + AC_MSG_RESULT(no known problems) + ;; +esac +# end ------ BAD gcc? 
------------ + +# -------- Check for required packages -------------- + +# this doesn't seem to work on older autoconf +# AC_CHECK_LIB(readline, readline,,) +AC_ARG_ENABLE(readline, [ --enable-readline use readline library],, + enable_readline="yes") + +if test "$enable_readline" = "yes" ; then + LIBREADLINE="-lreadline -lncurses" + HAVE_LIBREADLINE="-DHAVE_LIBREADLINE=1" +else + LIBREADLINE="" + HAVE_LIBREADLINE="" +fi +AC_SUBST(LIBREADLINE) +AC_SUBST(HAVE_LIBREADLINE) + +AC_ARG_ENABLE(efence, [ --enable-efence use efence library],, + enable_efence="no") + +if test "$enable_efence" = "yes" ; then + LIBEFENCE="-lefence" + HAVE_LIBEFENCE="-DHAVE_LIBEFENCE=1" +else + LIBEFENCE="" + HAVE_LIBEFENCE="" +fi +AC_SUBST(LIBEFENCE) +AC_SUBST(HAVE_LIBEFENCE) + diff --git a/lustre/portals/configure.in b/lustre/portals/configure.in new file mode 100644 index 0000000..31d3492 --- /dev/null +++ b/lustre/portals/configure.in @@ -0,0 +1,34 @@ +# This version is here to make autoconf happy; the name is a file which is +# "unique" to this directory so that configure knows where it should run. +AC_INIT(knals/Makefile.am, 3.0) +AC_CANONICAL_SYSTEM +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +# Automake variables. 
Steal the version number from packaging/intersync.spec +AM_INIT_AUTOMAKE(portals, builtin([esyscmd], [sed -ne '/.*define IVERSION /{ s/.*IVERSION //; p; }' libcfs/module.c])) +# AM_MAINTAINER_MODE + +sinclude(build.m4) +sinclude(archdep.m4) + +if test x$enable_inkernel = xyes ; then +cp Kernelenv.mk Kernelenv.in +cp Makefile.mk Makefile.in +cp libcfs/Makefile.mk libcfs/Makefile.in +cp portals/Makefile.mk portals/Makefile.in +cp knals/Makefile.mk knals/Makefile.in +cp knals/socknal/Makefile.mk knals/socknal/Makefile.in +cp router/Makefile.mk router/Makefile.in +fi + +AM_CONFIG_HEADER(include/config.h) + +AC_OUTPUT([Makefile Kernelenv libcfs/Makefile portals/Makefile \ + unals/Makefile knals/Makefile router/Makefile \ + knals/socknal/Makefile knals/gmnal/Makefile knals/qswnal/Makefile \ + knals/scimacnal/Makefile knals/toenal/Makefile \ + utils/Makefile tests/Makefile doc/Makefile ]) + diff --git a/lustre/portals/doc/.cvsignore b/lustre/portals/doc/.cvsignore new file mode 100644 index 0000000..827dca4 --- /dev/null +++ b/lustre/portals/doc/.cvsignore @@ -0,0 +1,4 @@ +Makefile +Makefile.in +*.eps +*.pdf diff --git a/lustre/portals/doc/Data-structures b/lustre/portals/doc/Data-structures new file mode 100644 index 0000000..b5532b1 --- /dev/null +++ b/lustre/portals/doc/Data-structures @@ -0,0 +1,65 @@ +In this document I will try to draw the data structures and how they +interrelate in the Portals 3 reference implementation. It is probably +best shown with a drawing, so there may be an additional xfig or +Postscript figure. + + +MEMORY POOLS: +------------ + +First, a digression on memory allocation in the library. As mentioned +in the NAL Writer's Guide, the library does not link against any +standard C libraries and as such is unable to dynamically allocate +memory on its own. It requires that the NAL implement a method +for allocation that is appropriate for the protection domain in +which the library lives. 
This is only called when a network +interface is initialized to allocate the Portals object pools. + +These pools are preallocated blocks of objects that the library +can rapidly make active and manage with a minimum of overhead. +It also cuts down on overhead for setting up structures +since the NAL->malloc() callback does not need to be called +for each object. + +The objects are maintained on a per-object type singly linked free +list and contain a pointer to the next free object. This pointer +is NULL if the object is not on the free list and is non-zero +if it is on the list. The special sentinel value of 0xDEADBEEF +is used to mark the end of the free list since NULL could +indicate that the last object in the list is not free. + +When one of the lib_*_alloc() functions is called, the library +returns the head of the free list and advances the head pointer +to the next item on the list. The special case of 0xDEADBEEF is +checked and a NULL pointer is returned if there are no more +objects of this type available. The lib_*_free() functions +are even simpler -- check to ensure that the object is not already +free, set its next pointer to the current head and then set +the head to be this newly freed object. + +Since C does not have templates, I did the next best thing and wrote +the memory pool allocation code as a macro that expands based on the +type of the argument. The mk_alloc(T) macro expands to +write the _lib_T_alloc() and lib_T_free() functions. +It requires that the object have a pointer of the type T named +"next_free". There are also functions that map _lib_T_alloc() +to lib_T_alloc() so that the library can add some extra +functionality to the T constructor. + + + +LINKED LISTS: +------------ + +Many of the active Portals objects are stored in doubly linked lists +when they are active. These are always implemented with the pointer +to the next object and a pointer to the next pointer of the +previous object.
This avoids the "dummy head" object or +special cases for inserting at the beginning or end of the list. +The pointer manipulations are a little hairy at times, but +I hope that they are understandable. + +The actual linked list code is implemented as macros in <lib-p30.h>, +although the object has to know about + + diff --git a/lustre/portals/doc/Makefile.am b/lustre/portals/doc/Makefile.am new file mode 100644 index 0000000..7c65e6c --- /dev/null +++ b/lustre/portals/doc/Makefile.am @@ -0,0 +1,46 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +LYX2PDF = lyx --export pdf +LYX2TXT = lyx --export text +LYX2HTML = lyx --export html +SUFFIXES = .lin .lyx .pdf .sgml .html .txt .fig .eps + +DOCS = portals3.pdf +IMAGES = file.eps flow_new.eps get.eps mpi.eps portals.eps put.eps +LYXFILES= portals3.lyx + +MAINTAINERCLEANFILES = $(IMAGES) $(DOCS) $(GENERATED) +GENERATED = +EXTRA_DIST = $(DOCS) $(IMAGES) $(LYXFILES) + +all: $(DOCS) + +# update date and version in document +date := $(shell date +%x) +tag := $(shell echo '$$Name: $$' | sed -e 's/^\$$Na''me: *\$$$$/HEAD/; s/^\$$Na''me: \(.*\) \$$$$/\1/') +addversion = sed -e 's|@T''AG@|$(tag)|g; s|@VER''SION@|$(VERSION)|g; s|@DA''TE@|$(date)|g' + +# Regenerate when the $(VERSION) or $Name: $ changes. 
+.INTERMEDIATE: $(GENERATED) +$(GENERATED) : %.lyx: %.lin Makefile + $(addversion) $< > $@ + +.lyx.pdf: + @$(LYX2PDF) $< || printf "\n*** Warning: not creating PDF docs; install lyx to rectify this\n" + +.lyx.txt: + @$(LYX2TXT) $< || printf "\n*** Warning: not creating text docs; install lyx to rectify this\n" +.lyx.html: + @$(LYX2HTML) $< || printf "\n*** Warning: not creating HTML docs; install lyx to rectify this\n" +.fig.eps: + -fig2dev -L eps $< > $@ + +portals3.pdf portals3.txt portals3.html: $(IMAGES) portals3.lyx + +syncweb: portals3.pdf +# cp lustre.pdf /usr/src/www/content/lustre/docs/lustre.pdf +# ( cd /usr/src/www ; make lustre ; make synclustre ) + diff --git a/lustre/portals/doc/Message-life-cycle b/lustre/portals/doc/Message-life-cycle new file mode 100644 index 0000000..e8cc7e2 --- /dev/null +++ b/lustre/portals/doc/Message-life-cycle @@ -0,0 +1,118 @@ +This documents the life cycle of a message as it arrives and is handled by +a basic async, packetized NAL. There are four types of messages that have +slightly different life cycles, so they are addressed independently. + + +Put request +----------- + +1. NAL notices that there is an incoming message header on the network +and reads a ptl_hdr_t in from the wire. + +2. It may store additional NAL specific data that provides context +for this event in a void* that it will interpret in some fashion +later. + +3. The NAL calls lib_parse() with a pointer to the header and its +private data structure. + +4. The library decodes the header and may build a message state +object that describes the event to be written and the ACK to be +sent, if any. It then calls nal->recv() with the private data +that the NAL passed in, a pointer to the message state object +and a translated user address. + + The NAL will have been given a chance to pretranslate + all user addresses when the buffers are created. This + process is described in the NAL-HOWTO. + +5.
The NAL should restore whatever context it required from the +private data pointer, begin receiving the bytes and possibly store +some extra state of its own. It should return at this point. + + + +Get request +----------- + +1. As with a Put, the NAL notices the incoming message header and +passes it to lib_parse(). + +2. The library decodes the header and calls nal->recv() with a +zero byte length, offset and destination to instruct it to clean +up the wire after reading the header. The private data will +be passed in as well, allowing the NAL to retrieve any state +or context that it requires. + +3. The library may build a message state object to possibly +write an event log or invalidate a memory region. + +4. The library will build a ptl_msg_t header that specifies the +Portals protocol information for delivery at the remote end. + +5. The library calls nal->send() with the pre-built header, +the optional message state object, the four part address +component, a translated user pointer + offset, and some +other things. + +6. The NAL is to put the header on the wire or copy it at +this point (since it is on the stack). It should store some +amount of state about its current position in the message and +the destination address. + +7. And then return to the library. + + +Reply request +------------- + +1. Starting at "The library decodes the header..." + +2. The library decodes the header and calls nal->recv() +to bring in the rest of the message. Flow continues in +exactly the same fashion as with all other receives. + + +Ack request +----------- + +1. The library decodes the header, builds the appropriate data +structures for the event in a message state object and calls nal->recv() +with a zero byte length, etc. + + +Packet arrival +-------------- + +1.
The NAL should notice the arrival of a packet, retrieve whatever +state it needs from the message ID or other NAL specific header data +and place the data bytes directly into the user address that was +given to nal->recv(). + + How this happens is outside the scope of the Portals library + and solely determined by the NAL... + +2. If this is the last packet in a message, the NAL should retrieve +the lib_msg_t *cookie that it was given in the call to nal->recv() +and pass it to lib_finalize(). lib_finalize() may call nal->send() +to send an ACK, nal->write() to record an entry in the event log, +nal->invalidate() to unregister a region of memory or do nothing at all. + +3. It should then clean up any remaining NAL specific state about +the message and go back into the main loop. + + +Outgoing packets +---------------- + +1. When the NAL has pending output, it should put the packets on +the wire wrapped with whatever implementation specified wrappers. + +2. Once it has output all the packets of a message it should +call lib_finalize() with the message state object that was +handed to nal->send(). This will allow the library to clean +up its state regarding the message and write any pending event +entries. + + + diff --git a/lustre/portals/doc/NAL-HOWTO b/lustre/portals/doc/NAL-HOWTO new file mode 100644 index 0000000..ea38aed --- /dev/null +++ b/lustre/portals/doc/NAL-HOWTO @@ -0,0 +1,293 @@ +This document is a first attempt at describing how to write a NAL +for the Portals 3 library. It also defines the library architecture +and the abstraction of protection domains. + + +First, an overview of the architecture: + + Application + +----|----+-------- + | + API === NAL (User space) + | +---------+---|----- + | + LIB === NAL (Library space) + | +---------+---|----- + + Physical wire (NIC space) + + +Application + API +API-side NAL +------------ +LIB-side NAL + LIB +LIB-side NAL + wire + +Communication is through the indicated paths via well defined +interfaces.
The API and LIB portions are written to be portable +across platforms and do not depend on the network interface. + +Communication between the application and the API code is +defined in the Portals 3 API specification. This is the +user-visible portion of the interface and should be the most +stable. + + + +API-side NAL: +------------ + +The user space NAL needs to implement only a few functions +that are stored in a nal_t data structure and called by the +API-side library: + + int forward( nal_t *nal, + int index, + void *args, + size_t arg_len, + void *ret, + size_t ret_len + ); + +Most of the data structures in the portals library are held in +the LIB section of the code, so it is necessary to forward API +calls across the protection domain to the library. This is +handled by the NAL's forward method. Once the argument and return +blocks are on the remote side the NAL should call lib_dispatch() +to invoke the appropriate API function. + + int validate( nal_t *nal, + void *base, + size_t extent, + void **trans_base, + void **trans_data + ); + +The validate method provides a means for the NAL to prevalidate +and possibly pretranslate user addresses into a form suitable +for fast use by the network card or kernel module. The trans_base +pointer will be used by the library every time it needs to +refer to the block of memory. The trans_data result is a +cookie that will be handed to the NAL along with the trans_base. + +The library never performs calculations on the trans_base value; +it only computes offsets that are then handed to the NAL. + + + int shutdown( nal_t *nal, int interface ); + +Brings down the network interface. The remote NAL side should +call lib_fini() to bring down the library side of the network. + + void yield( nal_t *nal ); + +This allows the user application to gracefully give up the processor +while busy waiting.
Performance critical applications may not +want to take the time to call this function, so it should be an +option to the PtlEQWait call. Right now it is not implemented as such. + +Lastly, the NAL must implement a function named PTL_IFACE_*, where +* is the name of the NAL such as PTL_IFACE_IP or PTL_IFACE_MYR. +This initialization function is to set up communication with the +library-side NAL, which should call lib_init() to bring up the +network interface. + + + +LIB-side NAL: +------------ + +On the library-side, the NAL has much more responsibility. It +is responsible for calling lib_dispatch() on behalf of the user, +it is also responsible for bringing packets off the wire and +pushing bits out. As on the user side, the methods are stored +in a nal_cb_t structure that is defined on a per network +interface basis. + +The calls to lib_dispatch() need to be examined. The prototype: + + void lib_dispatch( + nal_cb_t *nal, + void *private, + int index, + void *arg_block, + void *ret_block + ); + +has two complications. The private field is a NAL-specific +value that will be passed to any callbacks produced as a result +of this API call. Kernel module implementations may use this +for task structures, or perhaps network card data. It is ignored +by the library. + +Secondly, the arg_block and ret_block must be in the same protection +domain as the library. The NAL's two halves must communicate the +sizes and perform the copies. After the call, the buffer pointed +to by ret_block will be filled in and should be copied back to +the user space. How this is to be done is NAL specific. + + int lib_parse( + nal_cb_t *nal, + ptl_hdr_t *hdr, + void *private + ); + +This is the only other entry point into the library from the NAL. +When the NAL detects an incoming message on the wire it should read +sizeof(ptl_hdr_t) bytes and pass a pointer to the header to +lib_parse(). 
It may set private to be anything that it needs to +tie the incoming message to callbacks that are made as a result +of this event. + +The method calls are: + + int (*send)( + nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int nid, + int pid, + int gid, + int rid, + user_ptr trans_base, + user_ptr trans_data, + size_t offset, + size_t len + ); + +This is a tricky function -- it must support async output +of messages as well as properly synchronized event log writing. +The private field is the same that was passed into lib_dispatch() +or lib_parse() and may be used to tie this call to the event +that initiated the entry to the library. + +The cookie is a pointer to a library private value that must +be passed to lib_finalize() once the message has been completely +sent. It should not be examined by the NAL for any meaning. + +The four ID fields are passed in, although some implementations +may not use all of them. + +The single base pointer has been replaced with the translated +address that the API NAL generated in the api_nal->validate() +call. The trans_data is unchanged and the offset is in bytes. + + + int (*recv)( + nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + user_ptr trans_base, + user_ptr trans_data, + size_t offset, + size_t mlen, + size_t rlen + ); + +This callback will only be called in response to lib_parse(). +The cookie, trans_base and trans_data are as discussed in send(). +The NAL should read mlen bytes from the wire, deposit them into +trans_base + offset and then discard (rlen - mlen) bytes. +Once the entire message has been received the NAL should call +lib_finalize() with the lib_msg_t *cookie. + +The special arguments of base=NULL, data=NULL, offset=0, mlen=0, rlen=0 +are used to indicate that the NAL should clean up the wire. This could +be implemented as a blocking call, although having it return as quickly +as possible is desirable.
+ + int (*write)( + nal_cb_t *nal, + void *private, + user_ptr trans_addr, + user_ptr trans_data, + size_t offset, + + void *src_addr, + size_t len + ); + +This is essentially a cross-protection domain memcpy(). The user address +has been pretranslated by the api_nal->validate() call. + + void *(*malloc)( + nal_cb_t *nal, + size_t len + ); + + void (*free)( + nal_cb_t *nal, + void *buf + ); + +Since the NAL may be in a non-standard hosted environment it can +not call malloc(). This allows the library side NAL to implement +the system specific malloc(). In the current reference implementation +the library only calls nal->malloc() when the network interface is +initialized and then calls free when it is brought down. The library +maintains its own pool of objects for allocation so only one call to +malloc is made per object type. + + void (*invalidate)( + nal_cb_t *nal, + user_ptr trans_base, + user_ptr trans_data, + size_t extent + ); + +User addresses are validated/translated at the user-level API NAL +method, which is likely to push them to this level. Meanwhile, +the library NAL will be notified when the library no longer +needs the buffer. Overlapped buffers are not detected by the +library, so the NAL should ref count each page involved. + +Unfortunately we have a few bugs when the invalidate method is +called. It is still in progress... + + void (*printf)( + nal_cb_t *nal, + const char *fmt, + ... + ); + +As with malloc(), the library does not have any way to do printf +or printk. It is not necessary for the NAL to implement this +call, although omitting it will make debugging difficult. + + void (*cli)( + nal_cb_t *nal, + unsigned long *flags + ); + + void (*sti)( + nal_cb_t *nal, + unsigned long *flags + ); + +These are used by the library to mark critical sections.
+ + int (*gidrid2nidpid)( + nal_cb_t *nal, + ptl_id_t gid, + ptl_id_t rid, + ptl_id_t *nid, + ptl_id_t *pid + ); + + + int (*nidpid2gidrid)( + nal_cb_t *nal, + ptl_id_t nid, + ptl_id_t pid, + ptl_id_t *gid, + ptl_id_t *rid + ); + +Rolf added these. I haven't looked at how they have to work yet. diff --git a/lustre/portals/doc/file.fig b/lustre/portals/doc/file.fig new file mode 100644 index 0000000..914c294 --- /dev/null +++ b/lustre/portals/doc/file.fig @@ -0,0 +1,111 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 1200 750 1650 1050 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 1050 1650 750 1200 750 1200 1050 1650 1050 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 952 FS0\001 +-6 +6 1200 2325 1650 2625 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 2625 1650 2325 1200 2325 1200 2625 1650 2625 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 2527 FS3\001 +-6 +6 1200 1800 1650 2100 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 2100 1650 1800 1200 1800 1200 2100 1650 2100 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 2002 FS2\001 +-6 +6 1200 1275 1650 1575 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1650 1575 1650 1275 1200 1275 1200 1575 1650 1575 +4 1 0 100 0 0 10 0.0000 0 105 240 1425 1477 FS1\001 +-6 +6 450 750 900 1200 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 750.000 450 1050 675 1125 900 1050 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 825 225 75 450 900 900 750 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 825 450 1050 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1050 900 825 +-6 +6 450 2325 900 2775 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 2325.000 450 2625 675 2700 900 2625 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 2400 225 75 450 2475 900 2325 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 2400 450 2625 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 2625 900 2400 +-6 +6 450 1800 900 2250 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1800.000 450 2100 675 2175 900 2100 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1875 225 75 450 1950 900 1800 +2 1 
0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 1875 450 2100 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 2100 900 1875 +-6 +6 450 1275 900 1725 +5 1 0 1 0 7 100 0 20 0.000 0 1 0 0 675.000 1275.000 450 1575 675 1650 900 1575 +1 2 0 1 0 7 100 0 20 0.000 1 0.0000 675 1350 225 75 450 1425 900 1275 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 450 1350 450 1575 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1575 900 1350 +-6 +6 2250 750 3450 2625 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 1200 3150 1200 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 1500 3150 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 1800 3150 1800 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 2100 3150 2100 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2550 975 3150 975 3150 2625 2550 2625 2550 975 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 2 + 2550 2400 3150 2400 +4 1 0 100 0 0 10 0.0000 0 135 1185 2850 900 Application Buffer\001 +-6 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 2400 2550 1350 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 1875 2550 1050 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 1425 2550 1950 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 900 2550 1650 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 900 1200 900 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1425 1200 1425 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 1950 1200 1950 +2 1 0 1 0 7 100 0 20 0.000 0 0 -1 0 0 2 + 900 2475 1200 2475 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 2025 2550 2250 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 1 2 + 0 0 1.00 60.00 120.00 + 0 0 1.00 60.00 120.00 + 1650 2550 2550 2475 +2 4 0 1 0 7 100 0 -1 0.000 0 0 7 0 0 5 + 1875 2850 1875 600 225 600 225 2850 1875 2850 +4 1 0 100 0 0 10 0.0000 0 105 1215 1050 525 Parallel File Server\001 diff --git 
a/lustre/portals/doc/flow_new.fig b/lustre/portals/doc/flow_new.fig new file mode 100644 index 0000000..d828dea --- /dev/null +++ b/lustre/portals/doc/flow_new.fig @@ -0,0 +1,213 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 525 2175 1575 2925 +6 675 2287 1425 2812 +4 1 0 50 0 0 10 0.0000 4 105 255 1050 2437 MD\001 +4 1 0 50 0 0 10 0.0000 4 105 645 1050 2587 Exists and\001 +4 1 0 50 0 0 10 0.0000 4 135 555 1050 2737 Accepts?\001 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 1575 2550 1050 2175 525 2550 1050 2925 1575 2550 +-6 +6 3450 1275 4350 1725 +6 3600 1312 4200 1687 +4 1 0 100 0 0 10 0.0000 0 135 525 3900 1612 Message\001 +4 1 0 100 0 0 10 0.0000 0 105 465 3900 1462 Discard\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3450 1275 4350 1275 4350 1725 3450 1725 3450 1275 +-6 +6 4650 1275 5550 1725 +6 4725 1312 5475 1687 +4 1 0 100 0 0 10 0.0000 0 135 735 5100 1612 Drop Count\001 +4 1 0 100 0 0 10 0.0000 0 105 630 5100 1462 Increment\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 4650 1275 5550 1275 5550 1725 4650 1725 4650 1275 +-6 +6 1350 525 2250 975 +6 1350 562 2250 937 +4 1 0 100 0 0 10 0.0000 0 135 795 1800 862 Match Entry\001 +4 1 0 100 0 0 10 0.0000 0 105 585 1800 712 Get Next\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1350 525 2250 525 2250 975 1350 975 1350 525 +-6 +6 525 1125 1575 1875 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 1575 1500 1050 1125 525 1500 1050 1875 1575 1500 +4 1 0 100 0 0 10 0.0000 0 105 465 1049 1552 Match?\001 +-6 +6 2340 1237 2940 1687 +6 2340 1237 2940 1687 +4 1 0 100 0 0 10 0.0000 0 105 345 2640 1387 More\001 +4 1 0 100 0 0 10 0.0000 0 105 405 2640 1537 Match\001 +4 1 0 100 0 0 10 0.0000 0 105 510 2640 1687 Entries?\001 +-6 +-6 +6 525 3225 1575 3975 +6 675 3375 1425 3750 +4 1 0 50 0 0 10 0.0000 4 105 255 1050 3525 MD\001 +4 1 0 50 0 0 10 0.0000 4 105 615 1050 3720 has room?\001 +-6 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 525 3600 1050 3225 1575 3600 1050 3975 525 3600 +-6 +6 3300 
3375 4350 3825 +6 3300 3412 4350 3787 +4 1 0 50 0 0 10 0.0000 4 105 735 3825 3562 Unlink MD\001 +4 1 0 50 0 0 10 0.0000 4 135 945 3825 3712 & Match Entry\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3300 3375 4350 3375 4350 3825 3300 3825 3300 3375 +-6 +6 1950 3225 3000 3975 +6 2250 3450 2700 3750 +4 1 0 50 0 0 10 0.0000 4 105 450 2475 3600 Unlink\001 +4 1 0 50 0 0 10 0.0000 4 105 315 2475 3750 full?\001 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 3000 3600 2475 3225 1950 3600 2475 3975 3000 3600 +-6 +6 3150 4500 4200 4950 +6 3150 4537 4200 4912 +4 1 0 50 0 0 10 0.0000 4 105 735 3675 4687 Unlink MD\001 +4 1 0 50 0 0 10 0.0000 4 135 945 3675 4837 & Match Entry\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3150 4500 4200 4500 4200 4950 3150 4950 3150 4500 +-6 +6 600 4500 1500 4950 +6 675 4537 1425 4912 +4 1 0 50 0 0 10 0.0000 4 135 615 1050 4837 Operation\001 +4 1 0 50 0 0 10 0.0000 4 105 525 1050 4687 Perform\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 600 4500 1500 4500 1500 4950 600 4950 600 4500 +-6 +6 4650 4350 5700 5100 +6 4950 4537 5400 4912 +6 4950 4537 5400 4912 +4 1 0 50 0 0 10 0.0000 4 135 435 5175 4837 Queue?\001 +4 1 0 50 0 0 10 0.0000 4 105 360 5175 4687 Event\001 +-6 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 5700 4725 5175 4350 4650 4725 5175 5100 5700 4725 +-6 +6 6000 4500 6900 4950 +6 6225 4575 6675 4875 +4 1 0 50 0 0 10 0.0000 4 105 360 6450 4875 Event\001 +4 1 0 50 0 0 10 0.0000 4 105 435 6450 4725 Record\001 +-6 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 6000 4500 6900 4500 6900 4950 6000 4950 6000 4500 +-6 +6 1800 4350 2850 5100 +6 2100 4575 2550 4875 +4 1 0 50 0 0 10 0.0000 4 105 450 2325 4725 Unlink\001 +4 1 0 50 0 0 10 0.0000 4 105 450 2325 4875 thresh?\001 +-6 +2 3 0 1 0 7 100 0 -1 0.000 0 0 0 0 0 5 + 2850 4725 2325 4350 1800 4725 2325 5100 2850 4725 +-6 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 1875 1050 2175 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1575 1500 
2100 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 450 1050 1125 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1350 750 1050 750 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 2925 1050 3225 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3150 1500 3450 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 4350 1500 4650 1500 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 2100 1500 2625 1125 3150 1500 2625 1875 2100 1500 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1575 3600 1950 3600 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1050 3975 1050 4500 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3000 3600 3300 3600 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 4725 1800 4725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 5700 4725 6000 4725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2850 4725 3150 4725 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 4200 4725 4650 4725 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 6900 4725 7950 4725 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 1575 2550 1650 2550 1800 2550 1800 2400 1800 1500 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5 + 0 0 1.00 60.00 120.00 + 2250 750 2475 750 2625 750 2625 900 2625 1125 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 1 5 + 0 0 1.00 60.00 120.00 + 7500 4725 7500 1650 7500 1500 7350 1500 5550 1500 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 2475 3225 2475 2400 2475 2250 2325 2250 1800 2250 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 3825 3375 3825 2175 3825 2025 3675 2025 1800 2025 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8 + 0 0 1.00 60.00 120.00 + 
2325 4350 2325 4275 2325 4125 2475 4125 4275 4125 4425 4125 + 4425 4275 4425 4725 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 50 0 -1 0.000 0 1 0 8 + 0 0 1.00 60.00 120.00 + 5175 4350 5175 4275 5175 4125 5325 4125 7125 4125 7275 4125 + 7275 4275 7275 4725 + 0.000 1.000 1.000 1.000 1.000 1.000 1.000 0.000 +4 1 0 100 0 0 10 0.0000 0 75 150 1575 1425 no\001 +4 1 0 100 0 0 10 0.0000 0 135 360 825 525 Entry\001 +4 1 0 100 0 0 10 0.0000 0 75 150 1575 2475 no\001 +4 1 0 100 0 0 10 0.0000 0 105 195 1200 1950 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 1200 3000 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 2775 1050 yes\001 +4 1 0 100 0 0 10 0.0000 0 75 150 3225 1425 no\001 +4 1 0 100 0 0 10 0.0000 0 75 150 1650 3525 no\001 +4 1 0 100 0 0 10 0.0000 0 105 195 1200 4050 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 3150 3525 yes\001 +4 1 0 100 0 0 10 0.0000 0 75 150 2625 3150 no\001 +4 1 0 100 0 0 10 0.0000 0 105 195 3000 4650 yes\001 +4 1 0 100 0 0 10 0.0000 0 105 195 5850 4650 yes\001 +4 1 0 100 0 0 10 0.0000 0 75 150 2475 4275 no\001 +4 1 0 100 0 0 10 0.0000 0 75 150 5325 4275 no\001 +4 1 0 50 0 0 10 0.0000 4 105 285 7800 4650 Exit\001 diff --git a/lustre/portals/doc/get.fig b/lustre/portals/doc/get.fig new file mode 100644 index 0000000..28db949 --- /dev/null +++ b/lustre/portals/doc/get.fig @@ -0,0 +1,33 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 2775 900 3525 1200 +4 0 0 100 0 0 10 0.0000 0 105 720 2775 1200 Translation\001 +4 0 0 100 0 0 10 0.0000 0 105 405 2850 1050 Portal\001 +-6 +6 1350 1725 2175 2025 +4 0 0 100 0 0 10 0.0000 0 105 825 1350 2025 Transmission\001 +4 0 0 100 0 0 10 0.0000 0 105 285 1620 1875 Data\001 +-6 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 900 525 2700 750 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2700 825 2700 1275 +2 1 0 1 0 7 100 0 -1 3.000 0 0 7 1 0 2 + 0 0 1.00 60.00 120.00 + 2700 1350 900 1950 +2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5 + 2400 
300 3600 300 3600 2250 2400 2250 2400 300 +2 2 0 1 0 7 100 0 -1 4.000 0 0 7 0 0 5 + 0 300 1200 300 1200 2250 0 2250 0 300 +4 1 0 100 0 0 10 0.0000 4 135 495 1800 825 Request\001 +4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001 +4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001 diff --git a/lustre/portals/doc/ieee.bst b/lustre/portals/doc/ieee.bst new file mode 100644 index 0000000..4df7c50 --- /dev/null +++ b/lustre/portals/doc/ieee.bst @@ -0,0 +1,1112 @@ +% --------------------------------------------------------------- +% +% by Paolo.Ienne@di.epfl.ch +% +% --------------------------------------------------------------- +% +% no guarantee is given that the format corresponds perfectly to +% IEEE 8.5" x 11" Proceedings, but most features should be ok. +% +% --------------------------------------------------------------- +% +% `ieee' from BibTeX standard bibliography style `abbrv' +% version 0.99a for BibTeX versions 0.99a or later, LaTeX version 2.09. +% Copyright (C) 1985, all rights reserved. +% Copying of this file is authorized only if either +% (1) you make absolutely no changes to your copy, including name, or +% (2) if you do make changes, you name it something other than +% btxbst.doc, plain.bst, unsrt.bst, alpha.bst, and abbrv.bst. +% This restriction helps ensure that all standard styles are identical. +% The file btxbst.doc has the documentation for this style. 
+ +ENTRY + { address + author + booktitle + chapter + edition + editor + howpublished + institution + journal + key + month + note + number + organization + pages + publisher + school + series + title + type + volume + year + } + {} + { label } + +INTEGERS { output.state before.all mid.sentence after.sentence after.block } + +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := +} + +STRINGS { s t } + +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { ", " * write$ } + { output.state after.block = + { add.period$ write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { add.period$ " " * write$ } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} + +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} + +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} + +FUNCTION {output.bibitem} +{ newline$ + "\bibitem{" write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {fin.entry} +{ add.period$ + write$ + newline$ +} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { after.block 'output.state := } + if$ +} + +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} + +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} + +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} + +FUNCTION {new.block.checka} +{ empty$ + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} + +FUNCTION {new.sentence.checka} +{ empty$ + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {new.sentence.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.sentence + if$ +} + +FUNCTION {field.or.null} +{ duplicate$ empty$ 
+ { pop$ "" } + 'skip$ + if$ +} + +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "{\em " swap$ * "}" * } + if$ +} + +INTEGERS { nameptr namesleft numnames } + +FUNCTION {format.names} +{ 's := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't := + nameptr #1 > + { namesleft #1 > + { ", " * t * } + { numnames #2 > + { "," * } + 'skip$ + if$ + t "others" = + { " et~al." * } + { " and " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {format.authors} +{ author empty$ + { "" } + { author format.names } + if$ +} + +FUNCTION {format.editors} +{ editor empty$ + { "" } + { editor format.names + editor num.names$ #1 > + { ", editors" * } + { ", editor" * } + if$ + } + if$ +} + +FUNCTION {format.title} +{ title empty$ + { "" } + { title "t" change.case$ } + if$ +} + +FUNCTION {n.dashify} +{ 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {format.date} +{ year empty$ + { month empty$ + { "" } + { "there's a month but no year in " cite$ * warning$ + month + } + if$ + } + { month empty$ + 'year + { month " " * year * } + if$ + } + if$ +} + +FUNCTION {format.btitle} +{ title emphasize +} + +FUNCTION {tie.or.space.connect} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ * * +} + +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} + +FUNCTION {format.bvolume} +{ volume empty$ + { "" } + { "volume" volume tie.or.space.connect + series empty$ + 'skip$ + { " of " * series emphasize * } + if$ + "volume and number" number 
either.or.check + } + if$ +} + +FUNCTION {format.number.series} +{ volume empty$ + { number empty$ + { series field.or.null } + { output.state mid.sentence = + { "number" } + { "Number" } + if$ + number tie.or.space.connect + series empty$ + { "there's a number but no series in " cite$ * warning$ } + { " in " * series * } + if$ + } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition empty$ + { "" } + { output.state mid.sentence = + { edition "l" change.case$ " edition" * } + { edition "t" change.case$ " edition" * } + if$ + } + if$ +} + +INTEGERS { multiresult } + +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} + +FUNCTION {format.pages} +{ pages empty$ + { "" } + { pages multi.page.check + { "pages" pages n.dashify tie.or.space.connect } + { "page" pages tie.or.space.connect } + if$ + } + if$ +} + +FUNCTION {format.vol.num.pages} +{ volume field.or.null + number empty$ + 'skip$ + { "(" number * ")" * * + volume empty$ + { "there's a number but no volume in " cite$ * warning$ } + 'skip$ + if$ + } + if$ + pages empty$ + 'skip$ + { duplicate$ empty$ + { pop$ format.pages } + { ":" * pages n.dashify * } + if$ + } + if$ +} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { "chapter" } + { type "l" change.case$ } + if$ + chapter tie.or.space.connect + pages empty$ + 'skip$ + { ", " * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.in.ed.booktitle} +{ booktitle empty$ + { "" } + { editor empty$ + { "In " booktitle emphasize * } + { "In " format.editors * ", " * booktitle emphasize * } + if$ + } + if$ +} + +FUNCTION {empty.misc.check} +{ author empty$ title empty$ howpublished empty$ + month empty$ year empty$ note empty$ + and and and and and + key empty$ not and + { 
"all relevant fields are empty in " cite$ * warning$ } + 'skip$ + if$ +} + +FUNCTION {format.thesis.type} +{ type empty$ + 'skip$ + { pop$ + type "t" change.case$ + } + if$ +} + +FUNCTION {format.tr.number} +{ type empty$ + { "Technical Report" } + 'type + if$ + number empty$ + { "t" change.case$ } + { number tie.or.space.connect } + if$ +} + +FUNCTION {format.article.crossref} +{ key empty$ + { journal empty$ + { "need key or journal for " cite$ * " to crossref " * crossref * + warning$ + "" + } + { "In {\em " journal * "\/}" * } + if$ + } + { "In " key * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.crossref.editor} +{ editor #1 "{vv~}{ll}" format.name$ + editor num.names$ duplicate$ + #2 > + { pop$ " et~al." * } + { #2 < + 'skip$ + { editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = + { " et~al." * } + { " and " * editor #2 "{vv~}{ll}" format.name$ * } + if$ + } + if$ + } + if$ +} + +FUNCTION {format.book.crossref} +{ volume empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + "In " + } + { "Volume" volume tie.or.space.connect + " of " * + } + if$ + editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { series empty$ + { "need editor, key, or series for " cite$ * " to crossref " * + crossref * warning$ + "" * + } + { "{\em " * series * "\/}" * } + if$ + } + { key * } + if$ + } + { format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {format.incoll.inproc.crossref} +{ editor empty$ + editor field.or.null author field.or.null = + or + { key empty$ + { booktitle empty$ + { "need editor, key, or booktitle for " cite$ * " to crossref " * + crossref * warning$ + "" + } + { "In {\em " booktitle * "\/}" * } + if$ + } + { "In " key * } + if$ + } + { "In " format.crossref.editor * } + if$ + " \cite{" * crossref * "}" * +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref 
missing$ + { journal emphasize "journal" output.check + format.vol.num.pages output + format.date "year" output.check + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {book} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {booklet} +{ output.bibitem + format.authors output + new.block + format.title "title" output.check + howpublished address new.block.checkb + howpublished output + address output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + format.chapter.pages "chapter and pages" output.check + new.block + format.number.series output + new.sentence + publisher "publisher" output.check + address output + } + { format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + format.edition output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { 
format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.chapter.pages output + new.sentence + publisher "publisher" output.check + address output + format.edition output + format.date "year" output.check + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {inproceedings} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.pages output + address empty$ + { organization publisher new.sentence.checkb + organization output + publisher output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + new.sentence + organization output + publisher output + } + if$ + } + { format.incoll.inproc.crossref output.nonnull + format.pages output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {conference} { inproceedings } + +FUNCTION {manual} +{ output.bibitem + author empty$ + { organization empty$ + 'skip$ + { organization output.nonnull + address output + } + if$ + } + { format.authors output.nonnull } + if$ + new.block + format.btitle "title" output.check + author empty$ + { organization empty$ + { address new.block.checka + address output + } + 'skip$ + if$ + } + { organization address new.block.checkb + organization output + address output + } + if$ + format.edition output + format.date output + new.block + note output + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + "Master's thesis" format.thesis.type output.nonnull + school "school" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {misc} +{ 
output.bibitem + format.authors output + title howpublished new.block.checkb + format.title output + howpublished new.block.checka + howpublished output + format.date output + new.block + note output + fin.entry + empty.misc.check +} + +FUNCTION {phdthesis} +{ output.bibitem + format.authors "author" output.check + new.block + format.btitle "title" output.check + new.block + "PhD thesis" format.thesis.type output.nonnull + school "school" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + editor empty$ + { organization output } + { format.editors output.nonnull } + if$ + new.block + format.btitle "title" output.check + format.bvolume output + format.number.series output + address empty$ + { editor empty$ + { publisher new.sentence.checka } + { organization publisher new.sentence.checkb + organization output + } + if$ + publisher output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + new.sentence + editor empty$ + 'skip$ + { organization output } + if$ + publisher output + } + if$ + new.block + note output + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + format.tr.number output.nonnull + institution "institution" output.check + address output + format.date "year" output.check + new.block + note output + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + note "note" output.check + format.date output + fin.entry +} + +FUNCTION {default.type} { misc } + +MACRO {jan} {"Jan."} + +MACRO {feb} {"Feb."} + +MACRO {mar} {"Mar."} + +MACRO {apr} {"Apr."} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"Aug."} + +MACRO {sep} {"Sept."} + +MACRO {oct} {"Oct."} + +MACRO {nov} 
{"Nov."} + +MACRO {dec} {"Dec."} + +MACRO {acmcs} {"ACM Comput. Surv."} + +MACRO {acta} {"Acta Inf."} + +MACRO {cacm} {"Commun. ACM"} + +MACRO {ibmjrd} {"IBM J. Res. Dev."} + +MACRO {ibmsj} {"IBM Syst.~J."} + +MACRO {ieeese} {"IEEE Trans. Softw. Eng."} + +MACRO {ieeetc} {"IEEE Trans. Comput."} + +MACRO {ieeetcad} + {"IEEE Trans. Comput.-Aided Design Integrated Circuits"} + +MACRO {ipl} {"Inf. Process. Lett."} + +MACRO {jacm} {"J.~ACM"} + +MACRO {jcss} {"J.~Comput. Syst. Sci."} + +MACRO {scp} {"Sci. Comput. Programming"} + +MACRO {sicomp} {"SIAM J. Comput."} + +MACRO {tocs} {"ACM Trans. Comput. Syst."} + +MACRO {tods} {"ACM Trans. Database Syst."} + +MACRO {tog} {"ACM Trans. Gr."} + +MACRO {toms} {"ACM Trans. Math. Softw."} + +MACRO {toois} {"ACM Trans. Office Inf. Syst."} + +MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."} + +MACRO {tcs} {"Theoretical Comput. Sci."} + +READ + +FUNCTION {sortify} +{ purify$ + "l" change.case$ +} + +INTEGERS { len } + +FUNCTION {chop.word} +{ 's := + 'len := + s #1 len substring$ = + { s len #1 + global.max$ substring$ } + 's + if$ +} + +FUNCTION {sort.format.names} +{ 's := + #1 'nameptr := + "" + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { nameptr #1 > + { " " * } + 'skip$ + if$ + s nameptr "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" format.name$ 't := + nameptr numnames = t "others" = and + { "et al" * } + { t sortify * } + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {sort.format.title} +{ 't := + "A " #2 + "An " #3 + "The " #4 t chop.word + chop.word + chop.word + sortify + #1 global.max$ substring$ +} + +FUNCTION {author.sort} +{ author empty$ + { key empty$ + { "to sort, need author or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.editor.sort} +{ author empty$ + { editor empty$ + { key empty$ + { "to sort, need author, editor, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ 
+ } + { editor sort.format.names } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {author.organization.sort} +{ author empty$ + { organization empty$ + { key empty$ + { "to sort, need author, organization, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { "The " #4 organization chop.word sortify } + if$ + } + { author sort.format.names } + if$ +} + +FUNCTION {editor.organization.sort} +{ editor empty$ + { organization empty$ + { key empty$ + { "to sort, need editor, organization, or key in " cite$ * warning$ + "" + } + { key sortify } + if$ + } + { "The " #4 organization chop.word sortify } + if$ + } + { editor sort.format.names } + if$ +} + +FUNCTION {presort} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.sort + { type$ "proceedings" = + 'editor.organization.sort + { type$ "manual" = + 'author.organization.sort + 'author.sort + if$ + } + if$ + } + if$ + " " + * + year field.or.null sortify + * + " " + * + title field.or.null + sort.format.title + * + #1 entry.max$ substring$ + 'sort.key$ := +} + +ITERATE {presort} + +SORT + +STRINGS { longest.label } + +INTEGERS { number.label longest.label.width } + +FUNCTION {initialize.longest.label} +{ "" 'longest.label := + #1 'number.label := + #0 'longest.label.width := +} + +FUNCTION {longest.label.pass} +{ number.label int.to.str$ 'label := + number.label #1 + 'number.label := + label width$ longest.label.width > + { label 'longest.label := + label width$ 'longest.label.width := + } + 'skip$ + if$ +} + +EXECUTE {initialize.longest.label} + +ITERATE {longest.label.pass} + +FUNCTION {begin.bib} +{ preamble$ empty$ + 'skip$ + { preamble$ write$ newline$ } + if$ + "\begin{thebibliography}{" longest.label * + "}\setlength{\itemsep}{-1ex}\small" * write$ newline$ +} + +EXECUTE {begin.bib} + +EXECUTE {init.state.consts} + +ITERATE {call.type$} + +FUNCTION {end.bib} +{ newline$ + "\end{thebibliography}" write$ newline$ +} + +EXECUTE {end.bib} + +% end of file ieee.bst +% 
--------------------------------------------------------------- diff --git a/lustre/portals/doc/mpi.fig b/lustre/portals/doc/mpi.fig new file mode 100644 index 0000000..e1a91b5 --- /dev/null +++ b/lustre/portals/doc/mpi.fig @@ -0,0 +1,117 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 150 1650 900 2025 +4 1 0 100 0 0 10 0.0000 0 135 735 525 1800 Unexpected\001 +4 1 0 100 0 0 10 0.0000 0 135 585 525 1995 Messages\001 +-6 +6 150 150 900 525 +4 1 0 100 0 0 10 0.0000 0 135 615 525 300 Preposted\001 +4 1 0 100 0 0 10 0.0000 0 105 525 525 495 Receives\001 +-6 +6 2550 4125 3150 4725 +4 1 0 100 0 0 10 0.0000 0 135 600 2850 4275 Length=0\001 +4 1 0 100 0 0 10 0.0000 0 105 540 2850 4470 Truncate\001 +4 1 0 100 0 0 10 0.0000 0 105 480 2850 4665 No Ack\001 +-6 +6 1050 1575 1950 1875 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 1575 1950 1575 1950 1875 1050 1875 1050 1575 +4 1 0 100 0 0 10 0.0000 0 105 780 1500 1725 Match Short\001 +-6 +6 5400 1575 6300 2175 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 5400 1575 6300 1575 6300 2175 5400 2175 5400 1575 +4 1 0 100 0 0 10 0.0000 0 105 405 5850 1875 Buffer\001 +-6 +6 5400 2400 6300 3000 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 5400 2400 6300 2400 6300 3000 5400 3000 5400 2400 +4 1 0 100 0 0 10 0.0000 0 105 405 5850 2700 Buffer\001 +-6 +6 1050 2400 1950 2700 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 2400 1950 2400 1950 2700 1050 2700 1050 2400 +4 1 0 100 0 0 10 0.0000 0 105 780 1500 2550 Match Short\001 +-6 +6 1050 825 1950 1125 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 825 1950 825 1950 1125 1050 1125 1050 825 +4 1 0 100 0 0 10 0.0000 0 105 765 1500 975 Match None\001 +-6 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 1125 1500 1575 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 2025 4050 3375 +2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 + 150 675 6600 675 +2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 + 150 1350 6600 1350 +2 2 0 1 0 7 100 0 -1 
0.000 0 0 -1 0 0 5 + 2400 4125 3300 4125 3300 4725 2400 4725 2400 4125 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 4500 4050 3675 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 1725 5400 1725 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 2550 5400 2550 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3225 2850 4050 3450 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 1800 1500 2400 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 825 3300 825 3300 1275 2400 1275 2400 825 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 2625 1500 4125 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1050 4125 1950 4125 1950 4425 1050 4425 1050 4125 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1500 300 1500 825 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 975 2400 975 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 1725 2400 1725 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 2550 2400 2550 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 1875 4275 2400 4275 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 1575 3300 1575 3300 2175 2400 2175 2400 1575 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2400 2400 3300 2400 3300 3000 2400 3000 2400 2400 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 4050 3300 5250 3300 5250 3750 4050 3750 4050 3300 +4 1 0 100 0 0 10 0.0000 0 105 885 1500 150 Match Entries\001 +4 1 0 100 0 0 10 0.0000 0 135 1290 2850 150 Memory Descriptors\001 +4 1 0 100 0 0 10 0.0000 0 135 1065 5850 150 Memory Regions\001 +4 1 0 100 0 0 10 0.0000 0 135 825 4500 150 Event Queues\001 +4 1 0 100 0 0 10 0.0000 0 105 585 525 1050 RcvMark\001 +4 1 0 100 0 0 10 0.0000 0 105 330 2850 1102 None\001 +4 1 0 100 0 0 10 0.0000 0 135 705 1500 4275 Match Any\001 +4 1 0 50 0 0 10 0.0000 0 150 810 2850 1725 max_offset=\001 +4 1 0 50 0 0 10 0.0000 0 150 
840 2850 1875 n - short_len\001 +4 1 0 50 0 0 10 0.0000 0 150 810 2850 2550 max_offset=\001 +4 1 0 50 0 0 10 0.0000 0 150 840 2850 2700 n - short_len\001 +4 1 0 50 0 0 10 0.0000 0 105 405 2850 2100 unlink\001 +4 1 0 50 0 0 10 0.0000 0 105 405 2850 2925 unlink\001 +4 1 0 100 0 0 10 0.0000 0 135 930 4650 3675 Message Queue\001 +4 1 0 100 0 0 10 0.0000 0 135 735 4650 3525 Unexpected\001 diff --git a/lustre/portals/doc/portals.fig b/lustre/portals/doc/portals.fig new file mode 100644 index 0000000..9b1271b --- /dev/null +++ b/lustre/portals/doc/portals.fig @@ -0,0 +1,68 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1350 900 1650 900 1650 1200 1350 1200 1350 900 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 1800 1350 2100 1350 2100 1650 1800 1650 1800 1350 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2250 1800 2550 1800 2550 2100 2250 2100 2250 1800 +2 1 1 1 0 7 100 0 -1 4.000 0 0 -1 0 0 2 + 4200 375 4200 2100 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 525 600 1125 600 1125 2100 525 2100 525 600 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 4425 1275 4875 1275 4875 1950 4425 1950 4425 1275 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 2550 1200 3150 1200 3150 1500 2550 1500 2550 1200 +2 1 0 1 0 7 100 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3000 1425 4425 1425 +2 2 0 1 0 7 100 0 -1 0.000 0 0 -1 0 0 5 + 3600 825 3750 825 3750 1125 3600 1125 3600 825 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2025 1425 2550 1425 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 4425 750 4875 750 4875 1125 4425 1125 4425 750 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 3675 975 4425 975 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 2 + 0 0 1.00 60.00 120.00 + 825 1050 1350 1050 + 0.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 1500 1125 1500 1350 1500 1500 1650 1500 1800 1500 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 5 + 0 0 1.00 60.00 120.00 + 
1950 1575 1950 1800 1950 1950 2100 1950 2250 1950 + 0.000 1.000 1.000 1.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2 + 525 975 1125 975 + 0.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 0 0 2 + 525 1125 1125 1125 + 0.000 0.000 +3 0 0 1 0 7 100 0 -1 0.000 0 1 0 7 + 0 0 1.00 60.00 120.00 + 3000 1275 3150 1275 3300 1275 3300 1125 3300 975 3450 975 + 3600 975 + 0.000 1.000 1.000 1.000 1.000 1.000 0.000 +4 0 0 100 0 0 10 0.0000 0 105 690 1275 750 Match List\001 +4 1 0 100 0 0 10 0.0000 0 105 780 825 525 Portal Table\001 +4 2 0 100 0 0 10 0.0000 0 135 825 4050 2025 Library Space\001 +4 0 0 100 0 0 10 0.0000 0 135 1110 4350 2175 Application Space\001 +4 1 0 100 0 0 10 0.0000 0 135 660 2850 1050 Descriptor\001 +4 1 0 100 0 0 10 0.0000 0 135 540 2850 825 Memory\001 +4 1 0 100 0 0 10 0.0000 0 135 765 3750 675 Event Queue\001 +4 1 0 100 0 0 10 0.0000 0 135 495 4650 675 Regions\001 +4 1 0 100 0 0 10 0.0000 0 135 540 4650 525 Memory\001 diff --git a/lustre/portals/doc/portals3.bib b/lustre/portals/doc/portals3.bib new file mode 100644 index 0000000..323b99f --- /dev/null +++ b/lustre/portals/doc/portals3.bib @@ -0,0 +1,124 @@ +@Article{ Cplant, + title = { {M}assively {P}arallel {C}omputing with + {C}ommodity {C}omponents }, + author = { Ron Brightwell and David S. Greenberg and Arthur + B. 
Maccabe and Rolf Riesen }, + journal = { Parallel Computing }, + volume = { 26 }, + month = { February }, + pages = { 243-266 }, + year = { 2000 } +} + +@Manual{ Portals, + organization = { Sandia National Laboratories }, + title = { {P}uma {P}ortals }, + note = { http://www.cs.sandia.gov/puma/portals }, + year = { 1997 } +} + +@Techreport{ VIA, + title = { {V}irtual {I}nterface {A}rchitecture + {S}pecification {V}ersion 1.0 }, + author = { {Compaq, Microsoft, and Intel} }, + institution = { Compaq, Microsoft, and Intel }, + month = { December }, + year = { 1997 } +} + +@Techreport{ ST, + title = { {I}nformation {T}echnology - {S}cheduled + {T}ransfer {P}rotocol - {W}orking {D}raft 2.0 }, + author = { {Task Group of Technical Committee T11} }, + institution = { Accredited Standards Committee NCITS }, + month = { July }, + year = { 1998 } +} + +@Manual{ TFLOPS, + organization = { Sandia National Laboratories }, + title = { ASCI Red }, + note = { http://www.sandia.gov/ASCI/TFLOP }, + year = { 1996 } +} + +@Techreport{ GM, + title = { The {GM} {M}essage {P}assing {S}ystem }, + author = { {Myricom, Inc.} }, + institution = { {Myricom, Inc.} }, + year = { 1997 }, +} + +@Article{ MPIstandard, + title = { {MPI}: {A} {M}essage-{P}assing {I}nterface standard }, + author = { {Message Passing Interface Forum} }, + journal = { The International Journal of Supercomputer Applications + and High Performance Computing }, + volume = { 8 }, + year = { 1994 } +} + +@Inproceedings{ PumaOS, + author = "Lance Shuler and Chu Jong and Rolf Riesen and + David van Dresser and Arthur B. Maccabe and + Lee Ann Fisk and T. Mack Stallcup", + booktitle = "Proceeding of the 1995 Intel Supercomputer + User's Group Conference", + title = "The {P}uma Operating System for Massively Parallel Computers", + organization = "Intel Supercomputer User's Group", + year = 1995 +} + +@InProceedings{ SUNMOS, +author = "Arthur B. Maccabe and Kevin S. McCurley and Rolf Riesen and + Stephen R. 
Wheat", +title = "{SUNMOS} for the {Intel} {Paragon}: A Brief User's Guide", +booktitle = "Proceedings of the {Intel} Supercomputer Users' Group. 1994 + Annual North America Users' Conference.", +year = 1994, +pages = "245--251", +month = "June", +location = "ftp.cs.sandia.gov /pub/sunmos/papers/ISUG94-1.ps" +} + +@InProceedings { PumaMPI, + title = { Design and Implementation of {MPI} on {P}uma Portals }, + author = { Ron Brightwell and Lance Shuler }, + booktitle = { Proceedings of the Second MPI Developer's Conference }, + pages = { 18-25 }, + month = { July }, + year = { 1996 } +} + +@Inproceedings{ FM2, + author = { Mario Lauria and Scott Pakin and Andrew Chien }, + title = { {E}fficient {L}ayering for {H}igh {S}peed + {C}ommunication: {F}ast {M}essages 2.x }, + Booktitle = { Proceedings of the IEEE International Symposium + on High Performance Distributed Computing }, + year = { 1998 } +} + +@Manual { CraySHMEM, + title = "SHMEM Technical Note for C, SG-2516 2.3", + organization = "Cray Research, Inc.", + month = "October", + year = 1994 +} + +@Manual { MPI2, + title = "{MPI}-2: {E}xtensions to the {M}essage-{P}assing {I}nterface", + organization = "Message Passing Interface Forum", + note = "http://www.mpi-forum.org/docs/mpi-20-html/mpi2-report.html", + month = "July", + year = 1997 +} + +@InProceedings { PMMPI, + title = { {The Design and Implementation of Zero Copy MPI Using + Commodity Hardware with a High Performance Network} }, + author = { Francis O'Carroll and Hiroshi Tezuka and Atsushi Hori + and Yutaka Ishikawa }, + booktitle = { Proceedings of the ICS }, + year = { 1998 } +} diff --git a/lustre/portals/doc/portals3.lyx b/lustre/portals/doc/portals3.lyx new file mode 100644 index 0000000..8429280 --- /dev/null +++ b/lustre/portals/doc/portals3.lyx @@ -0,0 +1,15944 @@ +#LyX 1.2 created this file. 
For more info see http://www.lyx.org/ +\lyxformat 220 +\textclass report +\begin_preamble +\usepackage{fullpage} +\renewenvironment{comment}% +{\begin{quote}\textbf{Discussion}: \slshape}% +{\end{quote}} +\pagestyle{myheadings} +\end_preamble +\language american +\inputencoding auto +\fontscheme pslatex +\graphics default +\paperfontsize 10 +\spacing single +\papersize letterpaper +\paperpackage a4 +\use_geometry 0 +\use_amsmath 0 +\use_natbib 0 +\use_numerical_citations 0 +\paperorientation portrait +\secnumdepth 2 +\tocdepth 2 +\paragraph_separation indent +\defskip medskip +\quotes_language english +\quotes_times 2 +\papercolumns 1 +\papersides 2 +\paperpagestyle headings + +\layout Title + +The Portals 3.2 Message Passing Interface +\newline + Revision 1.1 +\layout Author + +Ron Brightwell +\begin_inset Foot +collapsed true + +\layout Standard + +R. + Brightwell and R. + Riesen are with the Scalable Computing Systems Department, Sandia National + Laboratories, P.O. + Box 5800, Albuquerque, NM\SpecialChar ~ +\SpecialChar ~ +87111-1110, bright@cs.sandia.gov, rolf@cs.sandia.gov. +\end_inset + +, Arthur B. + Maccabe +\begin_inset Foot +collapsed true + +\layout Standard + +A. + B. + Maccabe is with the Computer Science Department, University of New Mexico, + Albuquerque, NM\SpecialChar ~ +\SpecialChar ~ +87131-1386, maccabe@cs.unm.edu. +\end_inset + +, Rolf Riesen and Trammell Hudson +\layout Abstract + +This report presents a specification for the Portals 3.2 message passing + interface. + Portals 3.2 is intended to allow scalable, high-performance network communicatio +n between nodes of a parallel computing system. + Specifically, it is designed to support a parallel computing platform composed + of clusters of commodity workstations connected by a commodity system area + network fabric. + In addition, Portals 3.2 is well suited to massively parallel processing + and embedded systems. 
+ Portals 3.2 represents an adaption of the data movement layer developed + for massively parallel processing platforms, such as the 4500-node Intel + TeraFLOPS machine. + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +clearpage +\backslash +pagenumbering{roman} +\backslash +setcounter{page}{3} +\end_inset + + +\layout Standard + + +\begin_inset LatexCommand \tableofcontents{} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\end_inset + + +\layout Standard + + +\begin_inset FloatList figure + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\end_inset + + +\layout Standard + + +\begin_inset FloatList table + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\end_inset + + +\layout Chapter* + +Summary of Changes for Revision 1.1 +\layout Enumerate + +Updated version number to 3.2 throughout the document +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sub:PtlGetId} + +\end_inset + +: added +\family typewriter +PTL_SEGV +\family default + to error list for +\shape italic +PtlGetId +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +: added +\family typewriter +PTL_ML_TOOLONG +\family default + to error list for +\shape italic +PtlMEAttach +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:meunlink} + +\end_inset + +: removed text referring to a list of associated memory descriptors. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + +: added text to describe unlinking a free-floating memory descriptor. 
+\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:types} + +\end_inset + +: added entry for +\family typewriter +ptl_seq_t +\family default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +added definition of +\family typewriter +max_offset +\family default +. +\layout Enumerate + +added text to clarify +\family typewriter +PTL_MD_MANAGE_REMOTE +\family default +. +\end_deeper +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + +: modified text for +\family typewriter +unlink_op +\family default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +: added text to clarify multiple calls to +\shape italic +PtlNIInit +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + +: added text to clarify +\family typewriter +unlink_nofit +\family default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:receiving} + +\end_inset + +: removed text indicating that an MD will reject a message if the associated + EQ is full. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + +: added +\family typewriter +PTL_MD_INUSE +\family default + error code and text to indicate that only MDs with no pending operations + can be unlinked. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:retcodes} + +\end_inset + +: added +\family typewriter +PTL_MD_INUSE +\family default + return code. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + +: added user id field, MD handle field, and NI specific failure field to + the +\family typewriter +ptl_event_t +\family default + structure. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:types} + +\end_inset + +: added +\family typewriter +ptl_ni_fail_t +\family default +. 
+\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + +: added +\family typewriter +PTL_EVENT_UNLINK +\family default + event type. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:func} + +\end_inset + +: removed +\shape slanted +PtlTransId +\shape default +. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, Section +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + +, Section +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + +: listed allowable constants with relevant fields. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:func} + +\end_inset + +: added +\shape italic +PtlMEAttachAny +\shape default + function. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:retcodes} + +\end_inset + +: added +\family typewriter +PTL_PT_FULL +\family default + return code for +\shape italic +PtlMEAttachAny +\shape default +. +\layout Enumerate + +Table +\begin_inset LatexCommand \ref{tab:oconsts} + +\end_inset + +: updated to reflect new event types. +\layout Enumerate + +Section +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + +: added +\family typewriter +ptl_nid_t +\family default +, +\family typewriter +ptl_pid_t +\family default +, and +\family typewriter +ptl_uid_t +\family default +. +\layout Chapter* + +Summary of Changes for Version 3.1 +\layout Section* + +Thread Issues +\layout Standard + +The most significant change to the interface from version 3.0 to 3.1 involves + the clarification of how the interface interacts with multi-threaded applicatio +ns. + We adopted a generic thread model in which processes define an address + space and threads share the address space. 
+ Consideration of the API in the light of threads led to several clarifications + throughout the document: +\layout Enumerate + +Glossary: +\begin_deeper +\layout Enumerate + +added a definition for +\emph on +thread +\emph default +, +\layout Enumerate + +reworded the definition for +\emph on +process +\emph default +. + +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:apiover} + +\end_inset + +: added section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:threads} + +\end_inset + + to describe the multi-threading model used by the Portals API. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ptlinit} + +\end_inset + +: +\emph on +PtlInit +\emph default + must be called at least once and may be called any number of times. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ptlfini} + +\end_inset + +: +\emph on +PtlFini +\emph default + should be called once as the process is terminating and not as each thread + terminates. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:pid} + +\end_inset + +: Portals does not define thread ids. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + +: network interfaces are associated with processes, not threads. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +: +\emph on +PtlNIInit +\emph default + must be called at least once and may be called any number of times. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:eqget} + +\end_inset + +: +\emph on +PtlEQGet +\emph default + returns +\family typewriter +PTL_EQ_EMPTY +\family default + if a thread is blocked on +\emph on +PtlEQWait +\emph default +. 
+ +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:eqwait} + +\end_inset + +: waiting threads are awakened in FIFO order. + +\layout Standard + +Two functions, +\emph on +PtlNIBarrier +\emph default + and +\emph on +PtlEQCount +\emph default + were removed from the API. + +\emph on +PtlNIBarrier +\emph default + was defined to block the calling process until all of the processes in + the application group had invoked +\emph on +PtlNIBarrier +\emph default +. + We now consider this functionality, along with the concept of groups (see + the discussion under +\begin_inset Quotes eld +\end_inset + +other changes +\begin_inset Quotes erd +\end_inset + +), to be part of the runtime system, not part of the Portals API. + +\emph on +PtlEQCount +\emph default + was defined to return the number of events in an event queue. + Because external operations may lead to new events being added and other + threads may remove events, the value returned by +\emph on +PtlEQCount +\emph default + would have to be a hint about the number of events in the event queue. +\layout Section* + +Handling small, unexpected messages +\layout Standard + +Another set of changes relates to handling small unexpected messages in + MPI. + In designing version 3.0, we assumed that each unexpected message would + be placed in a unique memory descriptor. + To avoid the need to process a long list of memory descriptors, we moved + the memory descriptors out of the match list and hung them off of a single + match list entry. + In this way, large unexpected messages would only encounter a single +\begin_inset Quotes eld +\end_inset + +short message +\begin_inset Quotes erd +\end_inset + + match list entry before encountering the +\begin_inset Quotes eld +\end_inset + +long message +\begin_inset Quotes erd +\end_inset + + match list entry. + Experience with this strategy identified resource management problems with + this approach. 
+ In particular, a long sequence of very short (or zero length) messages + could quickly exhaust the memory descriptors constructed for handling unexpecte +d messages. + Our new strategy involves the use of several very large memory descriptors + for small unexpected messages. + Consecutive unexpected messages will be written into the first of these + memory descriptors until the memory descriptor fills up. + When the first of the +\begin_inset Quotes eld +\end_inset + +small memory +\begin_inset Quotes erd +\end_inset + + descriptors fills up, it will be unlinked and subsequent short messages + will be written into the next +\begin_inset Quotes eld +\end_inset + +short message +\begin_inset Quotes erd +\end_inset + + memory descriptor. + In this case, a +\begin_inset Quotes eld +\end_inset + +short message +\begin_inset Quotes erd +\end_inset + + memory descriptor will be declared full when it does not have sufficient + space for the largest small unexpected message. +\layout Standard + +This led to two significant changes. + First, each match list entry now has a single memory descriptor rather + than a list of memory descriptors. + Second, in addition to exceeding the operation threshold, a memory descriptor + can be unlinked when the local offset exceeds a specified value. + These changes have led to several changes in this document: +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{subsec:paddress} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +removed references to the memory descriptor list, +\layout Enumerate + +changed the portals address translation description to indicate that unlinking + a memory descriptor implies unlinking the associated match list entry--match + list entries can no longer be unlinked independently from the memory descriptor. 
+ +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +removed unlink from argument list, +\layout Enumerate + +removed description of +\family typewriter +ptl_unlink +\family default + type, +\layout Enumerate + +changed wording of the error condition when the Portal table index already + has an associated match list. + +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + +: removed unlink from argument list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + +: added +\family typewriter +max_offset +\family default +. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + +: +\begin_deeper +\layout Enumerate + +added description of +\family typewriter +ptl_unlink +\family default + type, +\layout Enumerate + +removed reference to memory descriptor lists, +\layout Enumerate + +changed wording of the error condition when match list entry already has + an associated memory descriptor, +\layout Enumerate + +changed the description of the +\family typewriter +unlink +\family default + argument. + +\end_deeper +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +: removed +\family typewriter +PtlMDInsert +\family default + operation. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdbind} + +\end_inset + +: removed references to memory descriptor list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + +: removed reference to memory descriptor list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:summary} + +\end_inset + +: removed references to PtlMDInsert. 
+ +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:semantics} + +\end_inset + +: removed reference to memory descriptor list. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:exmpi} + +\end_inset + +: revised the MPI example to reflect the changes to the interface. + +\layout Standard + +Several changes have been made to improve the general documentation of the + interface. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + +: documented the special value +\family typewriter +PTL_EQ_NONE +\family default +. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + +: documented the special value +\family typewriter +PTL_ID_ANY +\family default +. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdbind} + +\end_inset + +: documented the return value +\family typewriter +PTL_INV_EQ +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdupdate} + +\end_inset + +: clarified the description of the +\emph on +PtlMDUpdate +\emph default + function. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:implvals} + +\end_inset + +: introduced a new section to document the implementation defined values. + +\layout Enumerate + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:summary} + +\end_inset + +: modified Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:oconsts} + +\end_inset + + to indicate where each constant is introduced and where it is used. + +\layout Section* + +Other changes +\layout Subsection* + +Implementation defined limits (Section +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +) +\layout Standard + +The earlier version provided implementation defined limits for the maximum + number of match entries, the maximum number of memory descriptors, etc. 
+ Rather than spanning the entire implementation, these limits are now associated + with individual network interfaces. +\layout Subsection* + +Added User Ids (Section +\begin_inset LatexCommand \ref{sec:uid} + +\end_inset + +) +\layout Standard + +Group Ids had been used to simplify access control entries. + In particular, a process could allow access for all of the processes in + a group. + User Ids have been introduced to regain this functionality. + We use user ids to fill this role. +\layout Subsection* + +Removed Group Ids and Rank Ids (Section +\begin_inset LatexCommand \ref{sec:pid} + +\end_inset + +) +\layout Standard + +The earlier version of Portals had two forms for addressing processes: <node + id, process id> and <group id, rank id>. + A process group was defined as the collection of processes created during + application launch. + Each process in the group was given a unique rank id in the range 0 to + +\begin_inset Formula $n-1$ +\end_inset + + where +\begin_inset Formula $n$ +\end_inset + + was the number of processes in the group. + We removed groups because they are better handled in the runtime system. +\layout Subsection* + +Match lists (Section +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +) +\layout Standard + +It is no longer illegal to have an existing match entry when calling PtlMEAttach. + A position argument was added to the list of arguments supplied to +\emph on +PtlMEAttach +\emph default + to specify whether the new match entry is prepended or appended to the + existing list. + If there is no existing match list, the position argument is ignored. +\layout Subsection* + +Unlinking Memory Descriptors (Section +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +) +\layout Standard + +Previously, a memory descriptor could be unlinked if the offset exceeded + a threshold upon the completion of an operation. 
+ In this version, the unlinking is delayed until there is a matching operation + which requires more memory than is currently available in the descriptor. + In addition to changes in section, this led to a revision of Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:flow} + +\end_inset + +. +\layout Subsection* + +Split Phase Operations and Events (Section +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + +) +\layout Standard + +Previously, there were five types of events: +\family typewriter +PTL_EVENT_PUT +\family default +, +\family typewriter +PTL_EVENT_GET +\family default +, +\family typewriter +PTL_EVENT_REPLY +\family default +, +\family typewriter +PTL_EVENT_SENT +\family default +, and +\family typewriter +PTL_EVENT_ACK. + +\family default +The first four of these reflected the completion of potentially long operations. + We have introduced new event types to reflect the fact that long operations + have a distinct starting point and a distinct completion point. + Moreover, the completion may be successful or unsuccessful. +\layout Standard + +In addition to providing a mechanism for reporting failure to higher levels + of software, this split provides an opportunity for improved ordering + semantics. + Previously, if one process initiated two operations (e.g., two put operations) + on a remote process, these operations were guaranteed to complete in the + same order that they were initiated. + Now, we only guarantee that the initiation events are delivered in the + same order. + In particular, the operations do not need to complete in the order that + they were initiated. +\layout Subsection* + +Well known process ids (Section +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + +) +\layout Standard + +To support the notion of +\begin_inset Quotes eld +\end_inset + +well known process ids, +\begin_inset Quotes erd +\end_inset + + we added a process id argument to the arguments for PtlNIInit. 
+\layout Chapter* + +Glossary +\layout Description + +API Application Programming Interface. + A definition of the functions and semantics provided by a library of functions. + +\layout Description + +Initiator A +\emph on +process +\emph default + that initiates a message operation. + +\layout Description + +Message An application-defined unit of data that is exchanged between +\emph on +processes +\emph default +. + +\layout Description + +Message\SpecialChar ~ +Operation Either a put operation, which writes data, or a get operation, + which reads data. + +\layout Description + +Network A network provides point-to-point communication between +\emph on +nodes +\emph default +. + Internally, a network may provide multiple routes between endpoints (to + improve fault tolerance or to improve performance characteristics); however, + multiple paths will not be exposed outside of the network. + +\layout Description + +Node A node is an endpoint in a +\emph on +network +\emph default +. + Nodes provide processing capabilities and memory. + A node may provide multiple processors (an SMP node) or it may act as a + +\emph on +gateway +\emph default + between networks. + +\layout Description + +Process A context of execution. + A process defines a virtual memory (VM) context. + This context is not shared with other processes. + Several threads may share the VM context defined by a process. + +\layout Description + +Target A +\emph on +process +\emph default + that is acted upon by a message operation. + +\layout Description + +Thread A context of execution that shares a VM context with other threads. 
+ +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +cleardoublepage +\layout Standard + +\backslash +setcounter{page}{1} +\backslash +pagenumbering{arabic} +\end_inset + + +\layout Chapter + +Introduction +\begin_inset LatexCommand \label{sec:intro} + +\end_inset + + +\layout Section + +Overview +\layout Standard + +This document describes an application programming interface for message + passing between nodes in a system area network. + The goal of this interface is to improve the scalability and performance + of network communication by defining the functions and semantics of message + passing required for scaling a parallel computing system to ten thousand + nodes. + This goal is achieved by providing an interface that will allow a quality + implementation to take advantage of the inherently scalable design of Portals. +\layout Standard + +This document is divided into several sections: +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:intro} + +\end_inset + +---Introduction This section describes the purpose and scope of the Portals + API. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:apiover} + +\end_inset + +---An\SpecialChar ~ +Overview\SpecialChar ~ +of\SpecialChar ~ +the\SpecialChar ~ +Portals\SpecialChar ~ +3.1\SpecialChar ~ +API This section gives a brief overview of the + Portals API. + The goal is to introduce the key concepts and terminology used in the descripti +on of the API. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:api} + +\end_inset + +---The\SpecialChar ~ +Portals\SpecialChar ~ +3.2\SpecialChar ~ +API This section describes the functions and semantics of + the Portals application programming interface. 
+ +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:semantics} + +\end_inset + +---The\SpecialChar ~ +Semantics\SpecialChar ~ +of\SpecialChar ~ +Message\SpecialChar ~ +Transmission This section describes the semantics + of message transmission. + In particular, the information transmitted in each type of message and + the processing of incoming messages. + +\layout Description + +Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:examples} + +\end_inset + +---Examples This section presents several examples intended to illustrate + the use of the Portals API. + +\layout Section + +Purpose +\layout Standard + +Existing message passing technologies available for commodity cluster networking + hardware do not meet the scalability goals required by the Cplant\SpecialChar ~ + +\begin_inset LatexCommand \cite{Cplant} + +\end_inset + + project at Sandia National Laboratories. + The goal of the Cplant project is to construct a commodity cluster that + can scale to the order of ten thousand nodes. + This number greatly exceeds the capacity for which existing message passing + technologies have been designed and implemented. +\layout Standard + +In addition to the scalability requirements of the network, these technologies + must also be able to support a scalable implementation of the Message Passing + Interface (MPI)\SpecialChar ~ + +\begin_inset LatexCommand \cite{MPIstandard} + +\end_inset + + standard, which has become the +\shape italic +de facto +\shape default + standard for parallel scientific computing. + While MPI does not impose any scalability limitations, existing message + passing technologies do not provide the functionality needed to allow implement +ations of MPI to meet the scalability requirements of Cplant. 
+\layout Standard + +The following are properties of a network architecture that do not impose + any inherent scalability limitations: +\layout Itemize + +Connectionless - Many connection-oriented architectures, such as VIA\SpecialChar ~ + +\begin_inset LatexCommand \cite{VIA} + +\end_inset + + and TCP/IP sockets, have limitations on the number of peer connections + that can be established. + +\layout Itemize + +Network independence - Many communication systems depend on the host processor + to perform operations in order for messages in the network to be consumed. + Message consumption from the network should not be dependent on host processor + activity, such as the operating system scheduler or user-level thread scheduler. + +\layout Itemize + +User-level flow control - Many communication systems manage flow control + internally to avoid depleting resources, which can significantly impact + performance as the number of communicating processes increases. + +\layout Itemize + +OS Bypass - High performance network communication should not involve memory + copies into or out of a kernel-managed protocol stack. + +\layout Standard + +The following are properties of a network architecture that do not impose + scalability limitations for an implementation of MPI: +\layout Itemize + +Receiver-managed - Sender-managed message passing implementations require + a persistent block of memory to be available for every process, requiring + memory resources to increase with job size and requiring user-level flow + control mechanisms to manage these resources. + +\layout Itemize + +User-level Bypass - While OS Bypass is necessary for high-performance, it + alone is not sufficient to support the Progress Rule of MPI asynchronous + operations. + +\layout Itemize + +Unexpected messages - Few communication systems have support for receiving + messages for which there is no prior notification. 
+ Support for these types of messages is necessary to avoid flow control + and protocol overhead. + +\layout Section + +Background +\layout Standard + +Portals was originally designed for and implemented on the nCube machine + as part of the SUNMOS (Sandia/UNM OS)\SpecialChar ~ + +\begin_inset LatexCommand \cite{SUNMOS} + +\end_inset + + and Puma\SpecialChar ~ + +\begin_inset LatexCommand \cite{PumaOS} + +\end_inset + + lightweight kernel development projects. + Portals went through two design phases, the latter of which is used on + the 4500-node Intel TeraFLOPS machine\SpecialChar ~ + +\begin_inset LatexCommand \cite{TFLOPS} + +\end_inset + +. + Portals have been very successful in meeting the needs of such a large + machine, not only as a layer for a high-performance MPI implementation\SpecialChar ~ + +\begin_inset LatexCommand \cite{PumaMPI} + +\end_inset + +, but also for implementing the scalable run-time environment and parallel + I/O capabilities of the machine. +\layout Standard + +The second generation Portals implementation was designed to take full advantage + of the hardware architecture of large MPP machines. + However, efforts to implement this same design on commodity cluster technology + identified several limitations, due to the differences in network hardware + as well as to shortcomings in the design of Portals. +\layout Section + +Scalability +\layout Standard + +The primary goal in the design of Portals is scalability. + Portals are designed specifically for an implementation capable of supporting + a parallel job running on tens of thousands of nodes. + Performance is critical only in terms of scalability. + That is, the level of message passing performance is characterized by how + far it allows an application to scale and not by how it performs in micro-bench +marks (e.g., a two node bandwidth or latency test). +\layout Standard + +The Portals API is designed to allow for scalability, not to guarantee it. 
+ Portals cannot overcome the shortcomings of a poorly designed application + program. + Applications that have inherent scalability limitations, either through + design or implementation, will not be transformed by Portals into scalable + applications. + Scalability must be addressed at all levels. + Portals do not inhibit scalability, but do not guarantee it either. +\layout Standard + +To support scalability, the Portals interface maintains a minimal amount + of state. + Portals provide reliable, ordered delivery of messages between pairs of + processes. + They are connectionless: a process is not required to explicitly establish + a point-to-point connection with another process in order to communicate. + Moreover, all buffers used in the transmission of messages are maintained + in user space. + The target process determines how to respond to incoming messages, and + messages for which there are no buffers are discarded. +\layout Section + +Communication Model +\layout Standard + +Portals combine the characteristics of both one-sided and two-sided communication. + They define a +\begin_inset Quotes eld +\end_inset + +matching put +\begin_inset Quotes erd +\end_inset + + operation and a +\begin_inset Quotes eld +\end_inset + +matching get +\begin_inset Quotes erd +\end_inset + + operation. + The destination of a put (or send) is not an explicit address; instead, + each message contains a set of match bits that allow the receiver to determine + where incoming messages should be placed. + This flexibility allows Portals to support both traditional one-sided operation +s and two-sided send/receive operations. +\layout Standard + +Portals allow the target to determine whether incoming messages are acceptable. + A target process can choose to accept message operations from any specific + process or can choose to ignore message operations from any specific process. 
+\layout Section + +Zero Copy, OS Bypass and Application Bypass +\layout Standard + +In traditional system architectures, network packets arrive at the network + interface card (NIC), are passed through one or more protocol layers in + the operating system, and eventually copied into the address space of the + application. + As network bandwidth began to approach memory copy rates, reduction of + memory copies became a critical concern. + This concern led to the development of zero-copy message passing protocols + in which message copies are eliminated or pipelined to avoid the loss of + bandwidth. +\layout Standard + +A typical zero-copy protocol has the NIC generate an interrupt for the CPU + when a message arrives from the network. + The interrupt handler then controls the transfer of the incoming message + into the address space of the appropriate application. + The interrupt latency, the time from the initiation of an interrupt until + the interrupt handler is running, is fairly significant. + To avoid this cost, some modern NICs have processors that can be programmed + to implement part of a message passing protocol. + Given a properly designed protocol, it is possible to program the NIC to + control the transfer of incoming messages, without needing to interrupt + the CPU. + Because this strategy does not need to involve the OS on every message + transfer, it is frequently called +\begin_inset Quotes eld +\end_inset + +OS Bypass. +\begin_inset Quotes erd +\end_inset + + ST\SpecialChar ~ + +\begin_inset LatexCommand \cite{ST} + +\end_inset + +, VIA\SpecialChar ~ + +\begin_inset LatexCommand \cite{VIA} + +\end_inset + +, FM\SpecialChar ~ + +\begin_inset LatexCommand \cite{FM2} + +\end_inset + +, GM\SpecialChar ~ + +\begin_inset LatexCommand \cite{GM} + +\end_inset + +, and Portals are examples of OS Bypass protocols. 
+\layout Standard + +Many protocols that support OS Bypass still require that the application + actively participate in the protocol to ensure progress. + As an example, the long message protocol of PM requires that the application + receive and reply to a request to put or get a long message. + This complicates the runtime environment, requiring a thread to process + incoming requests, and significantly increases the latency required to + initiate a long message protocol. + The Portals message passing protocol does not require activity on the part + of the application to ensure progress. + We use the term +\begin_inset Quotes eld +\end_inset + +Application Bypass +\begin_inset Quotes erd +\end_inset + + to refer to this aspect of the Portals protocol. +\layout Section + +Faults +\layout Standard + +Given the number of components that we are dealing with and the fact that + we are interested in supporting applications that run for very long times, + failures are inevitable. + The Portals API recognizes that the underlying transport may not be able + to successfully complete an operation once it has been initiated. + This is reflected in the fact that the Portals API reports three types + of events: events indicating the initiation of an operation, events indicating + the successful completion of an operation, and events indicating the unsuccessf +ul completion of an operation. + Every initiation event is eventually followed by a successful completion + event or an unsuccessful completion event. +\layout Standard + +Between the time an operation is started and the time that the operation + completes (successfully or unsuccessfully), any memory associated with + the operation should be considered volatile. + That is, the memory may be changed in unpredictable ways while the operation + is progressing. + Once the operation completes, the memory associated with the operation + will not be subject to further modification (from this operation). 
+ Notice that unsuccessful operations may alter memory in an essentially + unpredictable fashion. +\layout Chapter + +An Overview of the Portals API +\begin_inset LatexCommand \label{sec:apiover} + +\end_inset + + +\layout Standard + +In this section, we give a conceptual overview of the Portals API. + The goal is to provide a context for understanding the detailed description + of the API presented in the next section. +\layout Section + +Data Movement +\begin_inset LatexCommand \label{sec:dmsemantics} + +\end_inset + + +\layout Standard + +A Portal represents an opening in the address space of a process. + Other processes can use a Portal to read (get) or write (put) the memory + associated with the portal. + Every data movement operation involves two processes, the +\series bold +initiator +\series default + and the +\series bold +target +\series default +. + The initiator is the process that initiates the data movement operation. + The target is the process that responds to the operation by either accepting + the data for a put operation, or replying with the data for a get operation. +\layout Standard + +In this discussion, activities attributed to a process may refer to activities + that are actually performed by the process or +\emph on +on behalf of the process +\emph default +. + The inclusiveness of our terminology is important in the context of +\emph on +application bypass +\emph default +. + In particular, when we note that the target sends a reply in the case of + a get operation, it is possible that the reply will be generated by another + component in the system, bypassing the application. +\layout Standard + +Figures\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:put} + +\end_inset + + and +\begin_inset LatexCommand \ref{fig:get} + +\end_inset + + present graphical interpretations of the Portal data movement operations: + put and get. + In the case of a put operation, the initiator sends a put request message + containing the data to the target. 
+ The target translates the Portal addressing information in the request + using its local Portal structures. + When the request has been processed, the target optionally sends an acknowledge +ment message. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename put.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 218pt + lyxheight 119pt +\end_inset + + +\layout Caption + +Portal Put (Send) +\begin_inset LatexCommand \label{fig:put} + +\end_inset + + +\end_inset + + +\layout Standard + +In the case of a get operation, the initiator sends a get request to the + target. + As with the put operation, the target translates the Portal addressing + information in the request using its local Portal structures. + Once it has translated the Portal addressing information, the target sends + a reply that includes the requested data. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename get.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 218pt + lyxheight 119pt +\end_inset + + +\layout Caption + +Portal Get +\begin_inset LatexCommand \label{fig:get} + +\end_inset + + +\end_inset + + +\layout Standard + +We should note that Portal address translations are only performed on nodes + that respond to operations initiated by other nodes. + Acknowledgements and replies to get operations bypass the portals address + translation structures. 
+\layout Section + +Portal Addressing +\begin_inset LatexCommand \label{subsec:paddress} + +\end_inset + + +\layout Standard + +One-sided data movement models (e.g., shmem\SpecialChar ~ + +\begin_inset LatexCommand \cite{CraySHMEM} + +\end_inset + +, ST\SpecialChar ~ + +\begin_inset LatexCommand \cite{ST} + +\end_inset + +, MPI-2\SpecialChar ~ + +\begin_inset LatexCommand \cite{MPI2} + +\end_inset + +) typically use a triple to address memory on a remote node. + This triple consists of a process id, memory buffer id, and offset. + The process id identifies the target process, the memory buffer id specifies + the region of memory to be used for the operation, and the offset specifies + an offset within the memory buffer. +\layout Standard + +In addition to the standard address components (process id, memory buffer + id, and offset), a Portal address includes a set of match bits. + This addressing model is appropriate for supporting one-sided operations + as well as traditional two-sided message passing operations. + Specifically, the Portals API provides the flexibility needed for an efficient + implementation of MPI-1, which defines two-sided operations with one-sided + completion semantics. +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:portals} + +\end_inset + + presents a graphical representation of the structures used by a target + in the interpretation of a Portal address. + The process id is used to route the message to the appropriate node and + is not reflected in this diagram. + The memory buffer id, called the +\series bold +portal id +\series default +, is used as an index into the Portal table. + Each element of the Portal table identifies a match list. 
+ Each element of the match list specifies two bit patterns: a set of +\begin_inset Quotes eld +\end_inset + +don't care +\begin_inset Quotes erd +\end_inset + + bits, and a set of +\begin_inset Quotes eld +\end_inset + +must match +\begin_inset Quotes erd +\end_inset + + bits. + In addition to the two sets of match bits, each match list element has + at most one memory descriptor. + Each memory descriptor identifies a memory region and an optional event + queue. + The memory region specifies the memory to be used in the operation and + the event queue is used to record information about these operations. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename portals.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 305pt + lyxheight 106pt +\end_inset + + +\layout Caption + +Portal Addressing Structures +\begin_inset LatexCommand \label{fig:portals} + +\end_inset + + +\end_inset + + +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:flow} + +\end_inset + + illustrates the steps involved in translating a Portal address, starting + from the first element in a match list. + If the match criteria specified in the match list entry are met and the + memory descriptor accepts the operation +\begin_inset Foot +collapsed true + +\layout Standard + +Memory descriptors can reject operations because a threshold has been exceeded + or because the memory region does not have sufficient space, see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset + +, the operation (put or get) is performed using the memory region specified + in the memory descriptor. 
+ If the memory descriptor specifies that it is to be unlinked when a threshold + has been exceeded, the match list entry is removed from the match list + and the resources associated with the memory descriptor and match list + entry are reclaimed. + Finally, if there is an event queue specified in the memory descriptor, + the operation is logged in the event queue. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename flow_new.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 447pt + lyxheight 282pt +\end_inset + + +\layout Caption + +Portals Address Translation +\begin_inset LatexCommand \label{fig:flow} + +\end_inset + + +\end_inset + + +\layout Standard + +If the match criteria specified in the match list entry are not met, or + there is no memory descriptor associated with the match list entry, or + the memory descriptor associated with the match list entry rejects the + operation, the address translation continues with the next match list entry. + If the end of the match list has been reached, the address translation + is aborted and the incoming request is discarded. +\layout Section + +Access Control +\layout Standard + +A process can control access to its portals using an access control list. + Each entry in the access control list specifies a process id and a Portal + table index. + The access control list is actually an array of entries. + Each incoming request includes an index into the access control list (i.e., + a +\begin_inset Quotes eld +\end_inset + +cookie +\begin_inset Quotes erd +\end_inset + + or hint). + If the id of the process issuing the request doesn't match the id specified + in the access control list entry or the Portal table index specified in + the request doesn't match the Portal table index specified in the access + control list entry, the request is rejected. 
+ Process identifiers and Portal table indexes may include wild card values + to increase the flexibility of this mechanism. + +\layout Standard + +Two aspects of this design merit further discussion. + First, the model assumes that the information in a message header, the + sender's id in particular, is trustworthy. + In most contexts, we assume that the entity that constructs the header + is trustworthy; however, using cryptographic techniques, we could easily + devise a protocol that would ensure the authenticity of the sender. +\layout Standard + +Second, because the access check is performed by the receiver, it is possible + that a malicious process will generate thousands of messages that will + be denied by the receiver. + This could saturate the network and/or the receiver, resulting in a +\emph on +denial of service +\emph default + attack. + Moving the check to the sender using capabilities, would remove the potential + for this form of attack. + However, the solution introduces the complexities of capability management + (exchange of capabilities, revocation, protections, etc). +\layout Section + +Multi-threaded Applications +\begin_inset LatexCommand \label{sec:threads} + +\end_inset + + +\layout Standard + +The Portals API supports a generic view of multi-threaded applications. + From the perspective of the Portals API, an application program is defined + by a set of processes. + Each process defines a unique address space. + The Portals API defines access to this address space from other processes + (using portals addressing and the data movement operations). + A process may have one or more +\emph on +threads +\emph default + executing in its address space. + +\layout Standard + +With the exception of +\emph on +PtlEQWait +\emph default + every function in the Portals API is non-blocking and atomic with respect + to both other threads and external operations that result from data movement + operations. 
+ While individual operations are atomic, sequences of these operations may + be interleaved between different threads and with external operations. + The Portals API does not provide any mechanisms to control this interleaving. + It is expected that these mechanisms will be provided by the API used to + create threads. +\layout Chapter + +The Portals API +\begin_inset LatexCommand \label{sec:api} + +\end_inset + + +\layout Section + +Naming Conventions +\begin_inset LatexCommand \label{sec:conv} + +\end_inset + + +\layout Standard + +The Portals API defines two types of entities: functions and types. + Functions always start with +\emph on +Ptl +\emph default + and use mixed upper and lower case. + When used in the body of this report, function names appear in italic face, + e.g., +\emph on +PtlInit +\emph default +. + The functions associated with an object type will have names that start + with +\emph on +Ptl +\emph default +, followed by the two letter object type code shown in Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:objcodes} + +\end_inset + +. + As an example, the function +\emph on +PtlEQAlloc +\emph default + allocates resources for an event queue. 
+\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Object Type Codes +\begin_inset LatexCommand \label{tab:objcodes} + +\end_inset + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\newline + +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular +<lyxtabular version="3" rows="5" columns="3"> +<features firstHeadEmpty="true"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<row bottomline="true"> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\emph on +xx +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + Name +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + Section +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +EQ +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + Event Queue +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + MD +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + Memory Descriptor +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset +</cell> 
+</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + ME +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + Match list Entry +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + NI +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + Network Interface +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_inset + + +\layout Standard + +Type names use lower case with underscores to separate words. + Each type name starts with +\family typewriter +ptl +\family default +_ and ends with +\family typewriter +_t +\family default +. + When used in the body of this report, type names appear in a fixed font, + e.g., +\family typewriter +ptl_match_bits_t +\family default +. +\layout Standard + +Names for constants use upper case with underscores to separate words. + Each constant name starts with +\family typewriter +PTL_ +\family default +. + When used in the body of this report, constant names appear in a fixed font, + e.g., +\family typewriter +PTL_OK +\family default +. +\layout Section + +Base Types +\layout Standard + +The Portals API defines a variety of base types. + These types represent a simple renaming of the base types provided by the + C programming language. 
+ In most cases these new type names have been introduced to improve type + safety and to avoid issues arising from differences in representation sizes + (e.g., 16-bit or 32-bit integers). +\layout Subsection + +Sizes +\begin_inset LatexCommand \label{sec:size-t} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_size_t +\family default + is an unsigned 64-bit integral type used for representing sizes. +\layout Subsection + +Handles +\begin_inset LatexCommand \label{sec:handle-type} + +\end_inset + + +\layout Standard + +Objects maintained by the API are accessed through handles. + Handle types have names of the form +\family typewriter +ptl_handle_ +\emph on +xx +\emph default +_t +\family default +, where +\emph on +xx +\emph default + is one of the two letter object type codes shown in Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:objcodes} + +\end_inset + +. + For example, the type +\family typewriter +ptl_handle_ni_t +\family default + is used for network interface handles. +\layout Standard + +Each type of object is given a unique handle type to enhance type checking. + The type, +\family typewriter +ptl_handle_any_t +\family default +, can be used when a generic handle is needed. + Every handle value can be converted into a value of type +\family typewriter +ptl_handle_any_t +\family default + without loss of information. +\layout Standard + +Handles are not simple values. + Every portals object is associated with a specific network interface and + an identifier for this interface (along with an object identifier) is part + of the handle for the object. +\layout Standard + +The special value +\family typewriter +PTL_EQ_NONE +\family default +, of type +\family typewriter +ptl_handle_eq_t +\family default +, is used to indicate the absence of an event queue. 
+ See sections +\begin_inset LatexCommand \ref{sec:mdfree} + +\end_inset + + and\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:mdupdate} + +\end_inset + + for uses of this value. +\layout Subsection + +Indexes +\begin_inset LatexCommand \label{sec:index-type} + +\end_inset + + +\layout Standard + +The types +\family typewriter +ptl_pt_index_t +\family default + and +\family typewriter +ptl_ac_index_t +\family default + are integral types used for representing Portal table indexes and access + control table indexes, respectively. + See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:niinit} + +\end_inset + + for limits on values of these types. +\layout Subsection + +Match Bits +\begin_inset LatexCommand \label{sec:mb-type} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_match_bits_t +\family default + is capable of holding unsigned 64-bit integer values. +\layout Subsection + +Network Interfaces +\begin_inset LatexCommand \label{sec:ni-type} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_interface_t +\family default + is an integral type used for identifying different network interfaces. + Users will need to consult the local documentation to determine appropriate + values for the interfaces available. + The special value +\family typewriter +PTL_IFACE_DEFAULT +\family default + identifies the default interface. +\layout Subsection + +Identifiers +\begin_inset LatexCommand \label{sec:id-type} + +\end_inset + + +\layout Standard + +The type +\family typewriter +ptl_nid_t +\family default + is an integral type used for representing node ids +\family typewriter +, ptl_pid_t +\family default + is an integral type for representing process ids, and +\family typewriter +ptl_uid_t +\family default + is an integral type for representing user ids. 
+\layout Standard + +The special values +\family typewriter +PTL_PID_ANY +\family default + matches any process identifier, PTL_NID_ANY matches any node identifier, + and +\family typewriter +PTL_UID_ANY +\family default + matches any user identifier. + See sections +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + + and\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + for uses of these values. +\layout Subsection + +Status Registers +\begin_inset LatexCommand \label{sec:stat-type} + +\end_inset + + +\layout Standard + +Each network interface maintains an array of status registers that can be + accessed using the +\family typewriter +PtlNIStatus +\family default + function (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + +). + The type +\family typewriter +ptl_sr_index_t +\family default + defines the types of indexes that can be used to access the status registers. + The only index defined for all implementations is +\family typewriter +PTL_SR_DROP_COUNT +\family default + which identifies the status register that counts the dropped requests for + the interface. + Other indexes (and registers) may be defined by the implementation. +\layout Standard + +The type +\family typewriter +ptl_sr_value_t +\family default + defines the types of values held in status registers. + This is a signed integer type. + The size is implementation dependent, but must be at least 32 bits. +\layout Section + +Initialization and Cleanup +\begin_inset LatexCommand \label{sec:init} + +\end_inset + + +\layout Standard + +The Portals API includes a function, +\emph on +PtlInit +\emph default +, to initialize the library and a function, +\emph on +PtlFini +\emph default +, to cleanup after the application is done using the library. 
+\layout Subsection + +PtlInit +\begin_inset LatexCommand \label{sec:ptlinit} + +\end_inset + + +\layout LyX-Code + +int PtlInit( int *max_interfaces ); +\layout Standard +\noindent +The +\emph on +PtlInit +\emph default + function initializes the Portals library. + PtlInit must be called at least once by a process before any thread makes + a Portals function call, but may be safely called more than once. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_FAIL Indicates an error during initialization. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +max_interfaces +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="1" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="5in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +max_interfaces +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the maximum number of interfaces + that can be initialized. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Subsection + +PtlFini +\begin_inset LatexCommand \label{sec:ptlfini} + +\end_inset + + +\layout LyX-Code + +void PtlFini( void ); +\layout Standard +\noindent +The +\emph on +PtlFini +\emph default + function cleans up after the Portals library is no longer needed by a process. 
+ After this function is called, calls to any of the functions defined by + the Portal API or use of the structures set up by the Portals API will + result in undefined behavior. + This function should be called once and only once during termination by + a process. + Typically, this function will be called in the exit sequence of a process. + Individual threads should not call PtlFini when they terminate. +\layout Section + +Network Interfaces +\begin_inset LatexCommand \label{sec:ni} + +\end_inset + + +\layout Standard + +The Portals API supports the use of multiple network interfaces. + However, each interface is treated as an independent entity. + Combining interfaces (e.g., +\begin_inset Quotes eld +\end_inset + +bonding +\begin_inset Quotes erd +\end_inset + + to create a higher bandwidth connection) must be implemented by the application + or embedded in the underlying network. + Interfaces are treated as independent entities to make it easier to cache + information on individual network interface cards. +\layout Standard + +Once initialized, each interface provides a Portal table, an access control + table, and a collection of status registers. + See Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + for a discussion of updating Portal table entries using the +\emph on +PtlMEAttach +\emph default + function. + See Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ac} + +\end_inset + + for a discussion of the initialization and updating of entries in the access + control table. + See Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + + for a discussion of the +\emph on +PtlNIStatus +\emph default + function which can be used to determine the value of a status register. +\layout Standard + +Every other type of Portal object (e.g., memory descriptor, event queue, or + match list entry) is associated with a specific network interface. 
+ The association to a network interface is established when the object is + created and is encoded in the handle for the object. +\layout Standard + +Each network interface is initialized and shutdown independently. + The initialization routine, +\emph on +PtlNIInit +\emph default +, returns a handle for an interface object which is used in all subsequent + Portal operations. + The +\emph on +PtlNIFini +\emph default + function is used to shutdown an interface and release any resources that + are associated with the interface. + Network interface handles are associated with processes, not threads. + All threads in a process share all of the network interface handles. +\layout Standard + +The Portals API also defines the +\emph on +PtlNIStatus +\emph default + function to query the status registers for a network interface, the +\emph on +PtlNIDist +\emph default + function to determine the +\begin_inset Quotes eld +\end_inset + +distance +\begin_inset Quotes erd +\end_inset + + to another process, and the +\emph on +PtlNIHandle +\emph default + function to determine the network interface that an object is associated + with. +\layout Subsection + +PtlNIInit +\begin_inset LatexCommand \label{sec:niinit} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + int max_match_entries; +\newline + int max_mem_descriptors; +\newline + int max_event_queues; +\newline + ptl_ac_index_t max_atable_index; +\newline + ptl_pt_index_t max_ptable_index; +\newline +} ptl_ni_limits_t; +\newline + +\newline +int PtlNIInit( ptl_interface_t interface, +\newline + ptl_pid_t pid, +\newline + ptl_ni_limits_t* desired, +\newline + ptl_ni_limits_t* actual, +\newline + ptl_handle_ni_t* handle ); +\layout Standard + +Values of type +\family typewriter +ptl_ni_limits_t +\family default + include the following members: +\layout Description + +max_match_entries Maximum number of match entries that can be allocated + at any one time. 
+\layout Description + +max_mem_descriptors Maximum number of memory descriptors that can be allocated + at any one time. +\layout Description + +max_event_queues Maximum number of event queues that can be allocated at + any one time. +\layout Description + +max_atable_index Largest access control table index for this interface, + valid indexes range from zero to +\family typewriter +max_atable_index +\family default +, inclusive. +\layout Description + +max_ptable_index Largest Portal table index for this interface, valid indexes + range from zero to +\family typewriter +max_ptable_index +\family default +, inclusive. +\layout Standard +\noindent +The +\emph on +PtlNIInit +\emph default + function is used to initialize the Portals API for a network interface. + This function must be called at least once by each process before any other + operations that apply to the interface by any process or thread. + For subsequent calls to +\shape italic +PtlNIInit +\shape default + from within the same process (either by different threads or the same thread), + the desired limits will be ignored and the call will return the existing + NI handle. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INIT_DUP Indicates a duplicate initialization of +\family typewriter +interface +\family default +. + +\layout Description + +PTL_INIT_INV Indicates that +\family typewriter +interface +\family default + is not a valid network interface. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to initialize the + interface. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +pid +\family default + is not a valid process id. 
+\layout Description + +PTL_SEGV Indicates that +\family typewriter +actual +\family default + or +\family typewriter + handle +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="5" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Identifies the network interface to be initialized. + (See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ni-type} + +\end_inset + + for a discussion of values used to identify network interfaces.) +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +pid +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Identifies the desired process id (for well known process ids). + The value +\family typewriter +PTL_PID_ANY +\family default + may be used to have the process id assigned by the underlying library. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +desired +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +If non-NULL, points to a structure that holds the desired limits. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +actual +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, the location pointed to by actual will hold the actual + limits. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the interface. 
+\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Comment + +The use of desired is implementation dependent. + In particular, an implementation may choose to ignore this argument. +\layout Subsection + +PtlNIFini +\begin_inset LatexCommand \label{sec:nifini} + +\end_inset + + +\layout LyX-Code + +int PtlNIFini( ptl_handle_ni_t interface ); +\layout Standard +\noindent +The +\emph on +PtlNIFini +\emph default + function is used to release the resources allocated for a network interface. + Once the +\emph on +PtlNIFini +\emph default + operation has been started, the results of pending API operations (e.g., + operations initiated by another thread) for this interface are undefined. + Similarly, the effects of incoming operations (puts and gets) or return + values (acknowledgements and replies) for this interface are undefined. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="1" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +interface +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +A handle for the interface to shutdown. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Subsection + +PtlNIStatus +\begin_inset LatexCommand \label{sec:nistatus} + +\end_inset + + +\layout LyX-Code + +int PtlNIStatus( ptl_handle_ni_t interface, +\newline + ptl_sr_index_t status_register, +\newline + ptl_sr_value_t* status ); +\layout Standard +\noindent +The +\emph on +PtlNIStatus +\emph default + function returns the value of a status register for the specified interface. + (See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + for more information on status register indexes and status register values.) +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_INV_SR_INDX Indicates that +\family typewriter +status_register +\family default + is not a valid status register. 
+ +\layout Description + +PTL_SEGV Indicates that +\family typewriter +status +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="3" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +status_register +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +An index for the status register to read. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +status +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the current value of the status + register. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Comment + +The only status register that must be defined is a drop count register ( +\family typewriter +PTL_SR_DROP_COUNT +\family default +). + Implementations may define additional status registers. + Identifiers for the indexes associated with these registers should start + with the prefix +\family typewriter +PTL_SR_ +\family default +. +\layout Subsection + +PtlNIDist +\layout LyX-Code + +int PtlNIDist( ptl_handle_ni_t interface, +\newline + ptl_process_id_t process, +\newline + unsigned long* distance ); +\layout Standard +\noindent +The +\emph on +PtlNIDist +\emph default + function returns the distance to another process using the specified interface. + Distances are only defined relative to an interface. + Distance comparisons between different interfaces on the same process may + be meaningless. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. 
+ +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +process +\family default + is not a valid process identifier. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +distance +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="3" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +process +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +An identifier for the process whose distance is being requested. 
+ +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +distance +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the distance to the remote + process. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Comment + +This function should return a static measure of distance. + Examples include minimum latency, the inverse of available bandwidth, or + the number of switches between the two endpoints. +\layout Subsection + +PtlNIHandle +\layout LyX-Code + +int PtlNIHandle( ptl_handle_any_t handle, +\newline + ptl_handle_ni_t* interface ); +\layout Standard +\noindent +The +\emph on +PtlNIHandle +\emph default + function returns a handle for the network interface with which the object + identified by +\family typewriter +handle +\family default + is associated. + If the object identified by +\family typewriter +handle +\family default + is a network interface, this function returns the same value it is passed. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_HANDLE Indicates that +\family typewriter +handle +\family default + is not a valid handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +interface +\family default + is not a legal address. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="2" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the object. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the network interface + associated with +\family typewriter +handle +\family default +. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Comment + +Every handle should encode the network interface and the object id relative + to this handle. + Both are presumably encoded using integer values. +\layout Section + +User Identification +\begin_inset LatexCommand \label{sec:uid} + +\end_inset + + +\layout Standard + +Every process runs on behalf of a user. 
+ +\layout Subsection + +PtlGetUid +\layout LyX-Code + +int PtlGetUid( ptl_handle_ni_t ni_handle, +\newline + ptl_uid_t* uid ); +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +ni_handle +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +uid +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="2" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="5in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ni_handle +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A network interface handle. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +uid +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the user id for the calling + process. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Comment + +Note that user identifiers are dependent on the network interface(s). + In particular, if a node has multiple interfaces, a process may have multiple + user identifiers. +\layout Section + +Process Identification +\begin_inset LatexCommand \label{sec:pid} + +\end_inset + + +\layout Standard + +Processes that use the Portals API can be identified using a node id and + process id. + Every node accessible through a network interface has a unique node identifier + and every process running on a node has a unique process identifier. + As such, any process in the computing system can be identified by its node + id and process id. + +\layout Standard + +The Portals API defines a type, +\family typewriter +ptl_process_id_t +\family default + for representing process ids and a function, +\emph on +PtlGetId +\emph default +, which can be used to obtain the id of the current process. +\layout Comment + +The Portals API does not include thread identifiers. + Messages are delivered to processes (address spaces) not threads (contexts + of execution). 
+\layout Subsection + +The Process Id Type +\begin_inset LatexCommand \label{sec:pid-type} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + ptl_nid_t nid; /* node id */ +\newline + ptl_pid_t pid; /* process id */ +\newline +} ptl_process_id_t; +\layout Standard +\noindent +The +\family typewriter +ptl_process_id_t +\family default + type uses two identifiers to represent a process id: a node id and a process + id. + +\layout Subsection + +PtlGetId +\begin_inset LatexCommand \label{sub:PtlGetId} + +\end_inset + + +\layout LyX-Code + +int PtlGetId( ptl_handle_ni_t ni_handle, +\newline + ptl_process_id_t* id ); +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +ni_handle +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +id +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="2" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="5in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ni_handle +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A network interface handle. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +id +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the id for the calling process. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Comment + +Note that process identifiers are dependent on the network interface(s). + In particular, if a node has multiple interfaces, it may have multiple + node identifiers. +\layout Section + +Match List Entries and Match Lists +\begin_inset LatexCommand \label{sec:me} + +\end_inset + + +\layout Standard + +A match list is a chain of match list entries. + Each match list entry includes a memory descriptor and a set of match criteria. + The match criteria can be used to reject incoming requests based on process + id or the match bits provided in the request. + A match list is created using the +\emph on +PtlMEAttach +\emph default + or +\shape italic +PtlMEAttachAny +\shape default + functions, which create a match list consisting of a single match list + entry, attach the match list to the specified Portal index, and return + a handle for the match list entry. + Match entries can be dynamically inserted and removed from a match list + using the +\emph on +PtlMEInsert +\emph default + and +\emph on +PtlMEUnlink +\emph default + functions. 
+\layout Subsection + +PtlMEAttach +\begin_inset LatexCommand \label{sec:meattach} + +\end_inset + + +\layout LyX-Code + +typedef enum { PTL_RETAIN, PTL_UNLINK } ptl_unlink_t; +\newline + +\layout LyX-Code + +typedef enum { PTL_INS_BEFORE, PTL_INS_AFTER } ptl_ins_pos_t; +\newline + +\layout LyX-Code + +int PtlMEAttach( ptl_handle_ni_t interface, +\newline + ptl_pt_index_t index, +\newline + ptl_process_id_t matchid, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_match_bits_t ignorebits, +\newline + ptl_unlink_t unlink, +\newline + ptl_ins_pos_t position, +\newline + ptl_handle_me_t* handle ); +\layout Standard +\noindent +Values of the type +\family typewriter +ptl_ins_pos_t +\family default + are used to control where a new item is inserted. + The value +\family typewriter +PTL_INS_BEFORE +\family default + is used to insert the new item before the current item or before the head + of the list. + The value +\family typewriter +PTL_INS_AFTER +\family default + is used to insert the new item after the current item or after the last + item in the list. + +\layout Standard + +The +\emph on +PtlMEAttach +\emph default + function creates a match list consisting of a single entry and attaches + this list to the Portal table for +\family typewriter +interface +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_PTINDEX Indicates that +\family typewriter +index +\family default + is not a valid Portal table index. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier. 
+ +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + match list entry. + +\layout Description + +PTL_ML_TOOLONG Indicates that the resulting match list is too long. + The maximum length for a match list is defined by the interface. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="7" columns="3"> +<features> +<column alignment="left" valignment="top" width="0.8in"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.75in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +interface +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +index +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +The Portal table index where the match list should be attached. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +matchid +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Specifies the match criteria for the process id of the requestor. + The constants +\family typewriter +PTL_PID_ANY +\family default + and +\family typewriter +PTL_NID_ANY +\family default + can be used to wildcard either of the ids in the +\family typewriter +ptl_process_id_t +\family default + structure. + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +match_bits, ignorebits +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Specify the match criteria to apply to the match bits in the incoming request. + The +\family typewriter +ignorebits +\family default + are used to mask out insignificant bits in the incoming match bits. + The resulting bits are then compared to the match list entry's match + bits to determine if the incoming request meets the match criteria. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +unlink +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Indicates the match list entry should be unlinked when the last memory descripto +r associated with this match list entry is unlinked. + (Note, the check for unlinking a match entry only occurs when a memory + descriptor is unlinked.) +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +position +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Indicates whether the new match entry should be prepended or appended to + the existing match list. + If there is no existing list, this argument is ignored and the new match + entry becomes the only entry in the list. + Allowed constants: +\family typewriter +PTL_INS_BEFORE +\family default +, +\family typewriter +PTL_INS_AFTER +\family default +. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +handle +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + match list entry. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Subsection + +PtlMEAttachAny +\begin_inset LatexCommand \label{sec:attachany} + +\end_inset + + +\layout LyX-Code + +int PtlMEAttachAny( ptl_handle_ni_t interface, +\newline + ptl_pt_index_t *index, +\newline + ptl_process_id_t matchid, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_match_bits_t ignorebits, +\newline + ptl_unlink_t unlink, +\newline + ptl_handle_me_t* handle ); +\layout Standard + +The +\emph on +PtlMEAttachAny +\emph default + function creates a match list consisting of a single entry and attaches + this list to an unused Portal table entry for +\family typewriter +interface +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + match list entry. 
+ +\layout Description + +PTL_PT_FULL Indicates that there are no free entries in the Portal table. +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="4" columns="3"> +<features> +<column alignment="left" valignment="top" width="0.8in"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.75in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +interface +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface to use. + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +index +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the Portal index where the + match list has been attached. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +matchid, match_bits, ignorebits, unlink +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +See the discussion for +\shape italic +PtlMEAttach +\shape default +. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +handle +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + match list entry. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Subsection + +PtlMEInsert +\begin_inset LatexCommand \label{sec:meinsert} + +\end_inset + + +\layout LyX-Code + +int PtlMEInsert( ptl_handle_me_t current, +\newline + ptl_process_id_t matchid, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_match_bits_t ignorebits, +\newline + ptl_ins_pos_t position, +\newline + ptl_handle_me_t* handle ); +\layout Standard + +The +\emph on +PtlMEInsert +\emph default + function creates a new match list entry and inserts this entry into the + match list containing +\family typewriter +current +\family default +. 
+\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier. + +\layout Description + +PTL_INV_ME Indicates that +\family typewriter +current +\family default + is not a valid match entry handle. + +\layout Description + +PTL_ML_TOOLONG Indicates that the resulting match list is too long. + The maximum length for a match list is defined by the interface. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + match entry. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="4" columns="3"> +<features> +<column alignment="left" valignment="top" width="0.8in"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="left" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +current +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for a match entry. + The new match entry will be inserted immediately before or immediately + after this match entry. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +matchid +\family default +, +\family typewriter +match_bits +\family default +, +\family typewriter +ignorebits +\family default +, +\family typewriter +unlink +\family default + +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +See the discussion for +\emph on +PtlMEAttach +\emph default + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +position +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Indicates whether the new match entry should be inserted before or after + the +\family typewriter +current +\family default + entry. + Allowed constants: +\family typewriter +PTL_INS_BEFORE +\family default +, +\family typewriter +PTL_INS_AFTER +\family default +. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\family typewriter +handle +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +See the discussion for +\emph on +PtlMEAttach +\emph default +. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Subsection + +PtlMEUnlink +\begin_inset LatexCommand \label{sec:meunlink} + +\end_inset + + +\layout LyX-Code + +int PtlMEUnlink( ptl_handle_me_t entry ); +\layout Standard +\noindent +The +\emph on +PtlMEUnlink +\emph default + function can be used to unlink a match entry from a match list. + This operation also releases any resources associated with the match entry + (including the associated memory descriptor). + It is an error to use the match entry handle after calling +\emph on +PtlMEUnlink +\emph default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_ME Indicates that +\family typewriter +entry +\family default + is not a valid match entry handle. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="1" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +entry +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +A handle for the match entry to be unlinked. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Section + +Memory Descriptors +\begin_inset LatexCommand \label{sec:md} + +\end_inset + + +\layout Standard + +A memory descriptor contains information about a region of an application + process' memory and an event queue where information about the operations + performed on the memory descriptor is recorded. + The Portals API provides two operations to create memory descriptors: +\emph on +PtlMDAttach +\emph default +, and +\emph on +PtlMDBind +\emph default +; an operation to update a memory descriptor, +\emph on +PtlMDUpdate +\emph default +; and an operation to unlink and release the resources associated with a + memory descriptor, +\emph on +PtlMDUnlink +\emph default +. 
+\layout Subsection + +The Memory Descriptor Type +\begin_inset LatexCommand \label{sec:md-type} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + void* start; +\newline + ptl_size_t length; +\newline + int threshold; +\newline + unsigned int max_offset; +\newline + unsigned int options; +\newline + void* user_ptr; +\newline + ptl_handle_eq_t eventq; +\newline +} ptl_md_t; +\layout Standard +\noindent +The +\family typewriter +ptl_md_t +\family default + type defines the application view of a memory descriptor. + Values of this type are used to initialize and update the memory descriptors. +\layout Subsubsection + +Members +\layout Description + +start,\SpecialChar ~ +length Specify the memory region associated with the memory descriptor. + The +\family typewriter +start +\family default + member specifies the starting address for the memory region and the +\family typewriter +length +\family default + member specifies the length of the region. + The +\family typewriter +start +\family default + member can be NULL provided that the +\family typewriter +length +\family default + member is zero. + (Zero length buffers are useful to record events.) There are no alignment + restrictions on the starting address or the length of the region; although, + unaligned messages may be slower (i.e., lower bandwidth and/or longer latency) + on some implementations. + +\layout Description + +threshold Specifies the maximum number of operations that can be performed + on the memory descriptor. + An operation is any action that could possibly generate an event (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + for the different types of events). + In the usual case, the threshold value is decremented for each operation + on the memory descriptor. + When the threshold value is zero, the memory descriptor is +\emph on +inactive +\emph default +, and does not respond to operations. 
+ A memory descriptor can have an initial threshold value of zero to allow + for manipulation of an inactive memory descriptor by the local process. + A threshold value of +\family typewriter +PTL_MD_THRESH_INF +\family default + indicates that there is no bound on the number of operations that may be + applied to a memory descriptor. + Note that local operations (e.g., +\emph on +PtlMDUpdate +\emph default +) are not applied to the threshold count. + +\layout Description + +max_offset Specifies the maximum local offset of a memory descriptor. + When the local offset of a memory descriptor exceeds this maximum, the + memory descriptor becomes +\shape italic +inactive +\shape default + and does not respond to further operations. +\layout Description + +options Specifies the behavior of the memory descriptor. + There are five options that can be selected: enable put operations (yes + or no), enable get operations (yes or no), offset management (local or + remote), message truncation (yes or no), and acknowledgement (yes or no). + Values for this argument can be constructed using a bitwise or of the following + values: +\begin_deeper +\begin_deeper +\layout Description + +PTL_MD_OP_PUT Specifies that the memory descriptor will respond to +\emph on +put +\emph default + operations. + By default, memory descriptors reject +\emph on +put +\emph default + operations. + +\layout Description + +PTL_MD_OP_GET Specifies that the memory descriptor will respond to +\emph on +get +\emph default + operations. + By default, memory descriptors reject +\emph on +get +\emph default + operations. + +\layout Description + +PTL_MD_MANAGE_REMOTE Specifies that the offset used in accessing the memory + region is provided by the incoming request. + By default, the offset is maintained locally. + When the offset is maintained locally, the offset is incremented by the + length of the request so that the next operation (put and/or get) will + access the next part of the memory region. 
+\layout Description + +PTL_MD_TRUNCATE Specifies that the length provided in the incoming request + can be reduced to match the memory available in the region. + (The memory available in a memory region is determined by subtracting the + offset from the length of the memory region.) By default, if the length + in the incoming operation is greater than the amount of memory available, + the operation is rejected. + +\layout Description + +PTL_MD_ACK_DISABLE Specifies that an acknowledgement should +\emph on +not +\emph default + be sent for incoming +\emph on +put +\emph default + operations, even if requested. + By default, acknowledgements are sent for +\emph on +put +\emph default + operations that request an acknowledgement. + Acknowledgements are never sent for +\emph on +get +\emph default + operations. + The value sent in the reply serves as an implicit acknowledgement. + +\end_deeper +\layout Standard + + +\series bold +Note +\series default +: It is not considered an error to have a memory descriptor that does not + respond to either +\emph on +put +\emph default + or +\emph on +get +\emph default + operations: Every memory descriptor responds to +\emph on +reply +\emph default + operations. + Nor is it considered an error to have a memory descriptor that responds + to both +\emph on +put +\emph default + and +\emph on +get +\emph default + operations. + +\end_deeper +\layout Description + +user_ptr A user-specified value that is associated with the memory descriptor. + The value does not need to be a pointer, but must fit in the space used + by a pointer. + This value (along with other values) is recorded in events associated with + operations on this memory descriptor. 
+\begin_inset Foot +collapsed true + +\layout Standard + +Tying the memory descriptor to a user-defined value can be useful when multiple + memory descriptors share the same event queue or when the memory descriptor + needs to be associated with a data structure maintained by the application. + For example, an MPI implementation can set the +\family typewriter +user_ptr +\family default + argument to the value of an MPI Request. + This direct association allows for processing of memory descriptors by + the MPI implementation without a table lookup or a search for the appropriate + MPI Request. +\end_inset + + +\layout Description + +eventq A handle for the event queue used to log the operations performed + on the memory region. + If this argument is +\family typewriter +PTL_EQ_NONE +\family default +, operations performed on this memory descriptor are not logged. + +\layout Subsection + +PtlMDAttach +\begin_inset LatexCommand \label{sec:mdattach} + +\end_inset + + +\layout LyX-Code + +int PtlMDAttach( ptl_handle_me_t match, +\newline + ptl_md_t mem_desc, +\newline + ptl_unlink_t unlink_op, +\newline + ptl_unlink_t unlink_nofit, +\newline + ptl_handle_md_t* handle ); +\layout Standard +\noindent +Values of the type +\family typewriter +ptl_unlink_t +\family default + are used to control whether an item is unlinked from a list. + The value +\family typewriter +PTL_UNLINK +\family default + enables unlinking. + The value +\family typewriter +PTL_RETAIN +\family default + disables unlinking. +\layout Standard + +The +\emph on +PtlMDAttach +\emph default + operation is used to create a memory descriptor and attach it to a match + list entry. + An error code is returned if this match list entry already has an associated + memory descriptor. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. 
+ +\layout Description + +PTL_INUSE Indicates that +\family typewriter +match +\family default + already has a memory descriptor attached. + +\layout Description + +PTL_INV_ME Indicates that +\family typewriter +match +\family default + is not a valid match entry handle. + +\layout Description + +PTL_ILL_MD Indicates that +\family typewriter +mem_desc +\family default + is not a legal memory descriptor. + This may happen because the memory region defined in +\family typewriter +mem_desc +\family default + is invalid or because the network interface associated with the +\family typewriter +eventq +\family default + in +\family typewriter +mem_desc +\family default + is not the same as the network interface associated with +\family typewriter +match +\family default +. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + memory descriptor. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="5" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +match +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the match entry that the memory descriptor will be associated + with. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Provides initial values for the application visible parts of a memory descriptor. + Other than its use for initialization, there is no linkage between this + structure and the memory descriptor maintained by the API. + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +unlink_op +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A flag to indicate whether the memory descriptor is unlinked when it becomes + inactive, either because the operation threshold drops to zero or because + the maximum offset has been exceeded. + (Note, the check for unlinking a memory descriptor only occurs after + the completion of a successful operation. + If the threshold is set to zero during initialization or using +\emph on +PtlMDUpdate +\emph default +, the memory descriptor is +\series bold +not +\series default + unlinked.) 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +unlink_nofit +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A flag to indicate whether the memory descriptor is unlinked when the space + remaining in the memory descriptor is not sufficient for a matching operation. + If an incoming message arrives at a memory descriptor that does + not have sufficient space and the +\series bold +PTL_MD_TRUNCATE +\series default + operation is not specified, the memory descriptor will be unlinked. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + memory descriptor. + The +\family typewriter +handle +\family default + argument can be NULL, in which case the handle will not be returned. 
+\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Subsection + +PtlMDBind +\begin_inset LatexCommand \label{sec:mdbind} + +\end_inset + + +\layout LyX-Code + +int PtlMDBind( ptl_handle_ni_t interface, +\newline + ptl_md_t mem_desc, +\newline + ptl_handle_md_t* handle ); +\layout Standard +\noindent +The +\emph on +PtlMDBind +\emph default + operation is used to create a +\begin_inset Quotes eld +\end_inset + +free floating +\begin_inset Quotes erd +\end_inset + + memory descriptor, i.e., a memory descriptor that is not associated with + a match list entry. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_ILL_MD Indicates that +\family typewriter +mem_desc +\family default + is not a legal memory descriptor. + This may happen because the memory region defined in +\family typewriter +mem_desc +\family default + is invalid or because the network interface associated with the +\family typewriter +eventq +\family default + in +\family typewriter +mem_desc +\family default + is not the same as the network interface, +\family typewriter +interface +\family default +. + +\layout Description + +PTL_INV_EQ Indicates that the event queue associated with +\family typewriter +mem_desc +\family default + is not valid. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + memory descriptor. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +handle +\family default + is not a legal address. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="3" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the network interface with which the memory descriptor will + be associated. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Provides initial values for the application visible parts of a memory descriptor. + Other than its use for initialization, there is no linkage between this + structure and the memory descriptor maintained by the API. 
+ +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + memory descriptor. + The +\family typewriter +handle +\family default + argument must be a valid address and cannot be NULL. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Subsection + +PtlMDUnlink +\begin_inset LatexCommand \label{sec:mdfree} + +\end_inset + + +\layout LyX-Code + +int PtlMDUnlink( ptl_handle_md_t mem_desc ); +\layout Standard +\noindent +The +\emph on +PtlMDUnlink +\emph default + function unlinks the memory descriptor from any match list entry it may + be linked to and releases the resources associated with a memory descriptor. + (This function does not free the memory region associated with the memory + descriptor.) This function also releases the resources associated with a + floating memory descriptor. + Only memory descriptors with no pending operations may be unlinked. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor handle. +\layout Description + +PTL_MD_INUSE Indicates that +\family typewriter +mem_desc +\family default + has pending operations and cannot be unlinked. 
+\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="1" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor to be released. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Subsection + +PtlMDUpdate +\begin_inset LatexCommand \label{sec:mdupdate} + +\end_inset + + +\layout LyX-Code + +int PtlMDUpdate( ptl_handle_md_t mem_desc, +\newline + ptl_md_t* old_md, +\newline + ptl_md_t* new_md, +\newline + ptl_handle_eq_t testq ); +\layout Standard +\noindent +The +\emph on +PtlMDUpdate +\emph default + function provides a conditional, atomic update operation for memory descriptors. + The memory descriptor identified by +\family typewriter +mem_desc +\family default + is only updated if the event queue identified by +\family typewriter +testq +\family default + is empty. + The intent is to only enable updates to the memory descriptor when no new + messages have arrived since the last time the queue was checked. + See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:exmpi} + +\end_inset + + for an example of how this function can be used. 
+\layout Standard + +If +\family typewriter +new_md +\family default + is not NULL, the memory descriptor identified by +\family typewriter +mem_desc +\family default + will be updated + to reflect the values in the structure pointed to by +\family typewriter +new_md +\family default + if +\family typewriter +testq +\family default + has the value +\family typewriter +PTL_EQ_NONE +\family default + or if the event queue identified by +\family typewriter +testq +\family default + is empty. + If +\family typewriter +old_md +\family default + is not NULL, the current value of the memory descriptor identified by +\family typewriter +mem_desc +\family default + is recorded in the location identified by +\family typewriter +old_md +\family default +. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_NOUPDATE Indicates that the update was not performed because +\family typewriter +testq +\family default + was not empty. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor handle. + +\layout Description + +PTL_ILL_MD Indicates that the value pointed to by +\family typewriter +new_md +\family default + is not a legal memory descriptor (e.g., the memory region specified by the + memory descriptor may be invalid). + +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +testq +\family default + is not a valid event queue handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +new_md +\family default + or +\family typewriter +old_md +\family default + is not a legal address. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="4" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor to update. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +old_md +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +If +\family typewriter +old_md +\family default + is not the value +\family typewriter +NULL +\family default +, the current value of the memory descriptor will be stored in the location + identified by +\family typewriter +old_md +\family default +. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +new_md +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +If +\family typewriter +new_md +\family default + is not the value +\family typewriter +NULL +\family default +, this argument provides the new values for the memory descriptor, if the + update is performed. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +testq +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for an event queue used to predicate the update. + If +\family typewriter +testq +\family default + is equal to +\family typewriter +PTL_EQ_NONE +\family default +, the update is performed unconditionally. + Otherwise, the update is performed if and only if +\family typewriter +testq +\family default + is empty. + If the update is not performed, the function returns the value +\family typewriter +PTL_NOUPDATE +\family default +. + (Note, the +\family typewriter +testq +\family default + argument does not need to be the same as the event queue associated with + the memory descriptor.) 
+\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Standard + +The conditional update can be used to ensure that the memory descriptor + has not changed between the time it was examined and the time it is updated. + In particular, it is needed to support an MPI implementation where the + activity of searching an unexpected message queue and posting a receive + must be atomic. +\layout Section + +Events and Event Queues +\begin_inset LatexCommand \label{sec:eq} + +\end_inset + + +\layout Standard + +Event queues are used to log operations performed on memory descriptors. + They can also be used to hold acknowledgements for completed +\emph on +put +\emph default + operations and to note when the data specified in a +\emph on +put +\emph default + operation has been sent (i.e., when it is safe to reuse the buffer that holds + this data). + Multiple memory descriptors can share a single event queue. +\layout Standard + +In addition to the +\family typewriter +ptl_handle_eq_t +\family default + type, the Portals API defines two types associated with events: The +\family typewriter + +\newline +ptl_event_kind_t +\family default + type defines the kinds of events that can be stored in an event queue. + The +\family typewriter +ptl_event_t +\family default + type defines a structure that holds the information associated with an + event. +\layout Standard + +The Portals API also provides four functions for dealing with event queues: + The +\emph on +PtlEQAlloc +\emph default + function is used to allocate the API resources needed for an event queue, + the +\emph on +PtlEQFree +\emph default + function is used to release these resources, the +\emph on +PtlEQGet +\emph default + function can be used to get the next event from an event queue, and the + +\emph on +PtlEQWait +\emph default + function can be used to block a process (or thread) until an event queue + has at least one event. 
+\layout Subsection + +Kinds of Events +\begin_inset LatexCommand \label{sec:ek-type} + +\end_inset + + +\layout LyX-Code + +typedef enum { +\newline + PTL_EVENT_GET_START, PTL_EVENT_GET_END, PTL_EVENT_GET_FAIL, +\newline + PTL_EVENT_PUT_START, PTL_EVENT_PUT_END, PTL_EVENT_PUT_FAIL, +\newline + PTL_EVENT_REPLY_START, PTL_EVENT_REPLY_END, PTL_EVENT_REPLY_FAIL, +\newline + PTL_EVENT_SEND_START, PTL_EVENT_SEND_END, PTL_EVENT_SEND_FAIL, +\newline + PTL_EVENT_ACK, +\newline + PTL_EVENT_UNLINK +\newline +} ptl_event_kind_t; +\layout Standard +\noindent +The Portals API defines fourteen types of events that can be logged in an + event queue: +\layout Description + +PTL_EVENT_GET_START A remote +\emph on +get +\emph default + operation has been started on the memory descriptor. + The memory region associated with this descriptor should not be altered + until the corresponding END or FAIL event is logged. +\layout Description + +PTL_EVENT_GET_END A previously initiated +\emph on +get +\emph default + operation completed successfully. + This event is logged after the reply has been sent by the local node. + As such, the process could free the memory descriptor once it sees this + event. + +\layout Description + +PTL_EVENT_GET_FAIL A previously initiated +\emph on +get +\emph default + operation completed unsuccessfully. + This event is logged after the reply has been sent by the local node. + As such, the process could free the memory descriptor once it sees this + event. + +\layout Description + +PTL_EVENT_PUT_START A remote +\emph on +put +\emph default + operation has been started on the memory descriptor. + The memory region associated with this descriptor should be considered + volatile until the corresponding END or FAIL event is logged. +\layout Description + +PTL_EVENT_PUT_END A previously initiated +\emph on +put +\emph default + operation completed successfully. 
+ The underlying layers will not alter the memory (on behalf of this operation) + once this event has been logged. + +\layout Description + +PTL_EVENT_PUT_FAIL A previously initiated +\emph on +put +\emph default + operation completed unsuccessfully. + The underlying layers will not alter the memory (on behalf of this operation) + once this event has been logged. + +\layout Description + +PTL_EVENT_REPLY_START A +\emph on +reply +\emph default + operation has been started on the memory descriptor. + +\layout Description + +PTL_EVENT_REPLY_END A previously initiated +\emph on +reply +\emph default + operation has completed successfully. + This event is logged after the data (if any) from the reply has been written + into the memory descriptor. + +\layout Description + +PTL_EVENT_REPLY_FAIL A previously initiated +\emph on +reply +\emph default + operation has completed unsuccessfully. + This event is logged after the data (if any) from the reply has been written + into the memory descriptor. + +\layout Description + +PTL_EVENT_ACK An +\emph on +acknowledgement +\emph default + was received. + This event is logged when the acknowledgement is received. +\layout Description + +PTL_EVENT_SEND_START An outgoing +\emph on +send +\emph default + operation has been started. + The memory region associated with this descriptor should not be altered + until the corresponding END or FAIL event is logged. +\layout Description + +PTL_EVENT_SEND_END A previously initiated +\emph on +send +\emph default + operation has completed successfully. + This event is logged after the entire buffer has been sent and it is safe + for the application to reuse the buffer. + +\layout Description + +PTL_EVENT_SEND_FAIL A previously initiated +\emph on +send +\emph default + operation has completed unsuccessfully. + The process can safely manipulate the memory or free the memory descriptor + once it sees this event. 
+\layout Description + +PTL_EVENT_UNLINK A memory descriptor associated with this event queue has + been automatically unlinked. + This event is not generated when a memory descriptor is explicitly unlinked + by calling +\shape italic +PtlMDUnlink +\shape default +. + This event does not decrement the threshold count. +\layout Subsection + +Event Ordering +\layout Standard + +The Portals API guarantees that when a process initiates two operations + on a remote process, the operations will be initiated on the remote process + in the same order that they were initiated on the original process. + As an example, if process A initiates two +\emph on +put +\emph default + operations, +\emph on +x +\emph default + and +\emph on +y +\emph default +, on process B, the Portals API guarantees that process A will receive the + +\family typewriter +PTL_EVENT_SEND_START +\family default + events for +\emph on +x +\emph default + and +\emph on +y +\emph default + in the same order that process B receives the +\family typewriter +PTL_EVENT_PUT_START +\family default + events for +\emph on +x +\emph default + and +\emph on +y +\emph default +. + Notice that the API does not guarantee that the start events will be delivered + in the same order that process A initiated the +\emph on +x +\emph default + and +\emph on +y +\emph default + operations. + If process A needs to ensure the ordering of these operations, it should + include code to wait for the initiation of +\emph on +x +\emph default + before it initiates +\emph on +y +\emph default +. +\layout Subsection + +Failure Notification +\layout Standard + +Operations may fail to complete successfully; however, unless the node itself + fails, every operation that is started will eventually complete. + While an operation is in progress, the memory associated with the operation + should not be viewed (in the case of a put or a reply) or altered (in the + case of a send or get). 
+ Operation completion, whether successful or unsuccessful, is final. + That is, when an operation completes, the memory associated with the operation + will no longer be read or altered by the operation. + A network interface can use the +\family typewriter +ptl_ni_fail_t +\family default + to define more specific information regarding the failure of the operation + and record this information in the +\family typewriter +ni_fail_type +\family default + field of the event. +\layout Subsection + +The Event Type +\begin_inset LatexCommand \label{sec:event-type} + +\end_inset + + +\layout LyX-Code + +typedef struct { +\newline + ptl_event_kind_t type; +\newline + ptl_process_id_t initiator; +\newline + ptl_uid_t uid; +\layout LyX-Code + + ptl_pt_index_t portal; +\newline + ptl_match_bits_t match_bits; +\newline + ptl_size_t rlength; +\newline + ptl_size_t mlength; +\newline + ptl_size_t offset; +\newline + ptl_handle_md_t md_handle; +\newline + ptl_md_t mem_desc; +\newline + ptl_hdr_data_t hdr_data; +\newline + ptl_seq_t link; +\newline + ptl_ni_fail_t ni_fail_type; +\newline + volatile ptl_seq_t sequence; +\newline +} ptl_event_t; +\layout Standard +\noindent +An event structure includes the following members: +\layout Description + +type Indicates the type of the event. + +\layout Description + +initiator The id of the initiator. + +\layout Description + +portal The Portal table index specified in the request. + +\layout Description + +match_bits A copy of the match bits specified in the request. + See section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + for more information on match bits. + +\layout Description + +rlength The length (in bytes) specified in the request. + +\layout Description + +mlength The length (in bytes) of the data that was manipulated by the operation. + For truncated operations, the manipulated length will be the number of + bytes specified by the memory descriptor (possibly with an offset) operation. 
+ For all other operations, the manipulated length will be the length of + the requested operation. + +\layout Description + +offset Is the displacement (in bytes) into the memory region that the operation + used. + The offset can be determined by the operation (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:datamovement} + +\end_inset + +) for a remote managed memory descriptor, or by the local memory descriptor + (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +). + +\layout Description + +md_handle Is the handle to the memory descriptor associated with the event. +\layout Description + +mem_desc Is the state of the memory descriptor immediately after the event + has been processed. + +\layout Description + +hdr_data 64 bits of out-of-band user data (see Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + +). + +\layout Description + +link The +\emph on +link +\emph default + member is used to link +\family typewriter +START +\family default + events with the +\family typewriter +END +\family default + or +\family typewriter +FAIL +\family default + event that signifies completion of the operation. + The +\emph on +link +\emph default + member will be the same for the two events associated with an operation. + The link member is also used to link an +\family typewriter +UNLINK +\family default + event with the event that caused the memory descriptor to be unlinked. +\layout Description + +sequence The sequence number for this event. + Sequence numbers are unique to each event. +\layout Comment + +The +\emph on +sequence +\emph default + member is the last member and is volatile to support SMP implementations. + When an event structure is filled in, the +\emph on +sequence +\emph default + member should be written after all other members have been updated. 
+ Moreover, a memory barrier should be inserted between the updating of other + members and the updating of the +\emph on +sequence +\emph default + member. +\layout Subsection + +PtlEQAlloc +\begin_inset LatexCommand \label{sec:eqalloc} + +\end_inset + + +\layout LyX-Code + +int PtlEQAlloc( ptl_handle_ni_t interface, +\newline + ptl_size_t count, +\newline + ptl_handle_eq_t* handle ); +\layout Standard +\noindent +The +\emph on +PtlEQAlloc +\emph default + function is used to build an event queue. + +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_NOSPACE Indicates that there is insufficient memory to allocate the + event queue. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +handle +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="3" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the interface with which the event queue will be associated. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +count +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +The number of events that can be stored in the event queue. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +handle +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold a handle for the newly created + event queue. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Subsection + +PtlEQFree +\begin_inset LatexCommand \label{sec:eqfree} + +\end_inset + + +\layout LyX-Code + +int PtlEQFree( ptl_handle_eq_t eventq ); +\layout Standard +\noindent +The +\emph on +PtlEQFree +\emph default + function releases the resources associated with an event queue. + It is up to the user to ensure that no memory descriptors are associated + with the event queue once it is freed. + +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. 
+ +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +eventq +\family default + is not a valid event queue handle. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="1" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +eventq +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +A handle for the event queue to be released. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Subsection + +PtlEQGet +\begin_inset LatexCommand \label{sec:eqget} + +\end_inset + + +\layout LyX-Code + +int PtlEQGet( ptl_handle_eq_t eventq, +\newline + ptl_event_t* event ); +\layout Standard +\noindent +The +\emph on +PtlEQGet +\emph default + function is a nonblocking function that can be used to get the next event + in an event queue. + The event is removed from the queue. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at + least one event between this event and the last event obtained (using +\emph on +PtlEQGet +\emph default + or +\emph on +PtlEQWait +\emph default +) from this event queue has been dropped due to limited space in the event + queue. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. 
+ +\layout Description + +PTL_EQ_EMPTY Indicates that +\family typewriter +eventq +\family default + is empty or another thread is waiting on +\emph on +PtlEQWait +\emph default +. + +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +eventq +\family default + is not a valid event queue handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +event +\family default + is not a legal address. + +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="2" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.5in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +eventq +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the event queue. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +event +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the values associated with + the next event in the event queue. 
+\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Subsection + +PtlEQWait +\begin_inset LatexCommand \label{sec:eqwait} + +\end_inset + + +\layout LyX-Code + +int PtlEQWait( ptl_handle_eq_t eventq, +\newline + ptl_event_t* event ); +\layout Standard +\noindent +The +\emph on +PtlEQWait +\emph default + function can be used to block the calling process (thread) until there + is an event in an event queue. + This function also returns the next event in the event queue and removes + this event from the queue. + This is the only blocking operation in the Portals 3.2 API. + In the event that multiple threads are waiting on the same event queue, + PtlEQWait is guaranteed to wake exactly one thread, but the order in which + they are awakened is not specified. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_EQ_DROPPED Indicates success (i.e., an event is returned) and that at + least one event between this event and the last event obtained (using +\emph on +PtlEQGet +\emph default + or +\emph on +PtlEQWait +\emph default +) from this event queue has been dropped due to limited space in the event + queue. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_EQ Indicates that +\family typewriter +eventq +\family default + is not a valid event queue handle. + +\layout Description + +PTL_SEGV Indicates that +\family typewriter +event +\family default + is not a legal address. 
+ +\layout Subsubsection + +Arguments +\layout Standard +\noindent + +\begin_inset Tabular +<lyxtabular version="3" rows="2" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +eventq +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the event queue to wait on. + The calling process (thread) will be blocked until +\family typewriter +eventq +\family default + is not empty. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +event +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +output +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +On successful return, this location will hold the values associated with + the next event in the event queue. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Section + +The Access Control Table +\begin_inset LatexCommand \label{sec:ac} + +\end_inset + + +\layout Standard + +Processes can use the access control table to control which processes are + allowed to perform operations on Portal table entries. 
+ Each communication interface has a Portal table and an access control table. + The access control table for the default interface contains an entry at + index zero that allows all processes with the same user id to communicate. + Entries in the access control table can be manipulated using the +\emph on +PtlACEntry +\emph default + function. +\layout Subsection + +PtlACEntry +\begin_inset LatexCommand \label{sec:acentry} + +\end_inset + + +\layout LyX-Code + +int PtlACEntry( ptl_handle_ni_t interface, +\newline + ptl_ac_index_t index, +\newline + ptl_process_id_t matchid, +\newline + ptl_uid_t user_id, +\newline + ptl_pt_index_t portal ); +\layout Standard +\noindent +The +\emph on +PtlACEntry +\emph default + function can be used to update an entry in the access control table for + an interface. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_NI Indicates that +\family typewriter +interface +\family default + is not a valid network interface handle. + +\layout Description + +PTL_AC_INV_INDEX Indicates that +\family typewriter +index +\family default + is not a valid access control table index. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +matchid +\family default + is not a valid process identifier. + +\layout Description + +PTL_PT_INV_INDEX Indicates that +\family typewriter +portal +\family default + is not a valid Portal table index. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="5" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +interface +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Identifies the interface to use. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +index +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +The index of the entry in the access control table to update. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +matchid +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Identifies the process(es) that are allowed to perform operations. + The constants +\family typewriter +PTL_PID_ANY +\family default + and +\family typewriter +PTL_NID_ANY +\family default + can be used to wildcard either of the ids in the +\family typewriter +ptl_process_id_t +\family default + structure. + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +user_id +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Identifies the user that is allowed to perform operations. + The value +\family typewriter +PTL_UID_ANY +\family default + can be used to wildcard the user. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Identifies the Portal index(es) that can be used. + The value +\family typewriter +PTL_PT_INDEX_ANY +\family default + can be used to wildcard the Portal index. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Section + +Data Movement Operations +\begin_inset LatexCommand \label{sec:datamovement} + +\end_inset + + +\layout Standard + +The Portals API provides two data movement operations: +\emph on +PtlPut +\emph default + and +\emph on +PtlGet +\emph default +. +\layout Subsection + +PtlPut +\begin_inset LatexCommand \label{sec:put} + +\end_inset + + +\layout LyX-Code + +typedef enum { PTL_ACK_REQ, PTL_NOACK_REQ } ptl_ack_req_t; +\newline + +\newline +int PtlPut( ptl_handle_md_t mem_desc, +\newline + ptl_ack_req_t ack_req, +\newline + ptl_process_id_t target, +\newline + ptl_pt_index_t portal, +\newline + ptl_ac_index_t cookie, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_size_t offset, +\newline + ptl_hdr_data_t hdr_data ); +\layout Standard +\noindent +Values of the type +\family typewriter +ptl_ack_req_t +\family default + are used to control whether an acknowledgement should be sent when the + operation completes (i.e., when the data has been written to a memory descriptor + of the +\family typewriter +target +\family default + process). 
+ The value +\family typewriter +PTL_ACK_REQ +\family default + requests an acknowledgement, the value +\family typewriter +PTL_NOACK_REQ +\family default + requests that no acknowledgement should be generated. +\layout Standard + +The +\emph on +PtlPut +\emph default + function initiates an asynchronous put operation. + There are several events associated with a put operation: initiation of + the send on the local node ( +\family typewriter +PTL_EVENT_SEND_START +\family default +), completion of the send on the local node ( +\family typewriter +PTL_EVENT_SEND_END +\family default + or +\family typewriter +PTL_EVENT_SEND_FAIL +\family default +), and, when the send completes successfully, the receipt of an acknowledgement + ( +\family typewriter +PTL_EVENT_ACK +\family default +) indicating that the operation was accepted by the target. + These events will be logged in the event queue associated with the memory + descriptor ( +\family typewriter +mem_desc +\family default +) used in the put operation. + Using a memory descriptor that does not have an associated event queue + results in these events being discarded. + In this case, the application must have another mechanism (e.g., a higher + level protocol) for determining when it is safe to modify the memory region + associated with the memory descriptor. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +target +\family default + is not a valid process id. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="8" columns="3"> +<features> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor that describes the memory to be sent. + If the memory descriptor has an event queue associated with it, it will + be used to record events when the message has been sent (PTL_EVENT_SEND_START, + PTL_EVENT_SEND_END). + +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ack_req +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +Controls whether an acknowledgement event is requested. + Acknowledgements are only sent when they are requested by the initiating + process +\series bold +and +\series default + the memory descriptor has an event queue +\series bold +and +\series default + the target memory descriptor enables them. 
+ Allowed constants: +\family typewriter +PTL_ACK_REQ +\family default +, +\family typewriter +PTL_NOACK_REQ +\family default +. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A process id for the target process. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +The index in the remote Portal table. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +The index into the access control table of the target process. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +The match bits to use for message selection at the target process. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +The offset into the target memory descriptor (only used when the target + memory descriptor has the +\family typewriter +PTL_MD_MANAGE_REMOTE +\family default + option set). +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +hdr_data +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +64 bits of user data that can be included in message header. 
+ This data is written to an event queue entry at the target if an event + queue is present on the matching memory descriptor. +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Subsection + +PtlGet +\begin_inset LatexCommand \label{sec:get} + +\end_inset + + +\layout LyX-Code + +int PtlGet( ptl_handle_md_t mem_desc, +\newline + ptl_process_id_t target, +\newline + ptl_pt_index_t portal, +\newline + ptl_ac_index_t cookie, +\newline + ptl_match_bits_t match_bits, +\newline + ptl_size_t offset ); +\layout Standard +\noindent +The +\emph on +PtlGet +\emph default + function initiates a remote read operation. + There are two event pairs associated with a get operation: when the data + is sent from the remote node, a +\family typewriter +PTL_EVENT_GET_{START|END} +\family default + event pair is registered on the remote node; and when the data is returned + from the remote node, a +\family typewriter +PTL_EVENT_REPLY_{START|END} +\family default + event pair is registered on the local node. +\layout Subsubsection + +Return Codes +\layout Description + +PTL_OK Indicates success. + +\layout Description + +PTL_NOINIT Indicates that the Portals API has not been successfully initialized. + +\layout Description + +PTL_INV_MD Indicates that +\family typewriter +mem_desc +\family default + is not a valid memory descriptor. + +\layout Description + +PTL_INV_PROC Indicates that +\family typewriter +target +\family default + is not a valid process id. 
+ +\layout Subsubsection + +Arguments +\layout Standard + + +\begin_inset Tabular +<lyxtabular version="3" rows="6" columns="3"> +<features> +<column alignment="right" valignment="top" width="0pt"> +<column alignment="center" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="4.7in"> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="left" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A handle for the memory descriptor that describes the memory into which + the requested data will be received. + The memory descriptor can have an event queue associated with it to record + events, such as when the message receive has started ( +\family typewriter +PTL_EVENT_REPLY +\family default +_ +\family typewriter +START +\family default +). +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +A process id for the target process. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +The index in the remote Portal table. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +The index into the access control table of the target process. +\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +The match bits to use for message selection at the target process. 
+\end_inset +</cell> +</row> +<row> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +input +\end_inset +</cell> +<cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +The offset into the target memory descriptor (only used when the target + memory descriptor has the +\family typewriter +PTL_MD_MANAGE_REMOTE +\family default + option set). +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\layout Section + +Summary +\layout Standard + + +\begin_inset LatexCommand \label{sec:summary} + +\end_inset + + We conclude this section by summarizing the names introduced by the Portals + 3.2 API. + We start by summarizing the names of the types introduced by the API. + This is followed by a summary of the functions introduced by the API. + That is followed by a summary of the function return codes. + Finally, we conclude with a summary of the other constant values introduced + by the API. +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:types} + +\end_inset + + presents a summary of the types defined by the Portals API. + The first column in this table gives the type name, the second column gives + a brief description of the type, the third column identifies the section + where the type is defined, and the fourth column lists the functions that + have arguments of this type. 
+\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Types Defined by the Portals 3.2 API +\begin_inset LatexCommand \label{tab:types} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\noindent + +\size small + +\begin_inset Tabular +<lyxtabular version="3" rows="25" columns="4"> +<features firstHeadEmpty="true"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="2in"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="2.2in"> +<row bottomline="true"> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold + Name +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold + Meaning +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold + Sect +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold + Functions +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ac_index_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +indexes for an access control table +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:index-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" 
bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlACEntry, PtlPut, PtlGet +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ack_req_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +acknowledgement request types +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlPut +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +kinds of events +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlGet +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +information about events +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" 
usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlEQGet +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_seq_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +event sequence number +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:event-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +PtlEQGet, PtlEQWait +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_any_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +handles for any object +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlNIHandle +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_eq_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard
+\noindent +handles for event queues +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlEQAlloc, PtlEQFree, PtlEQGet, PtlEQWait, PtlMDUpdate +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +handles for memory descriptors +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlMDAlloc, PtlMDUnlink, PtlMDUpdate, PtlMEAttach, PtlMEAttachAny, PtlMEInsert, + PtlPut, PtlGet +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_me_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +handles for match entries +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlMEUnlink +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset 
Text + +\layout Standard + + +\family typewriter +ptl_handle_ni_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +handles for network interfaces +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlNIFini, PtlNIStatus, PtlNIDist, PtlEQAlloc, PtlACEntry, PtlPut, + PtlGet +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_nid_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +node identifiers +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + PtlGetId, PtlACEntry +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pid_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +process identifier +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +PtlGetId, PtlACEntry +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top"
usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +user identifier +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +PtlGetUid, PtlACEntry +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ins_pos_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +insertion position (before or after) +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_interface_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +identifiers for network interfaces +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none">
+\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +match (and ignore) bits +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mb-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlMEInsert, PtlPut, PtlGet +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_md_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +memory descriptors +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach, PtlMDUpdate +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ni_fail_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +network interface-specific failures +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +PtlEQGet, PtlEQWait +\end_inset +</cell> +</row> +<row> +<cell alignment="right" 
valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +process identifiers +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:pid-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlGetId, PtlNIDist, PtlMEAttach, PtlMEAttachAny, PtlACEntry, PtlPut, PtlGet + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +indexes for Portal tables +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:index-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, PtlACEntry +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +sizes +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:size-t} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlEQAlloc, 
PtlPut, PtlGet +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_sr_index_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +indexes for status registers +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlNIStatus +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_sr_value_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +values in status registers +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlNIStatus +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_unlink_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +unlink options +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlMEAttach, PtlMEAttachAny, 
PtlMEInsert, PtlMDAttach +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:func} + +\end_inset + + presents a summary of the functions defined by the Portals API. + The first column in this table gives the name for the function, the second + column gives a brief description of the operation implemented by the function, + and the third column identifies the section where the function is defined. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Functions Defined by the Portals 3.2 API +\begin_inset LatexCommand \label{tab:func} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular +<lyxtabular version="3" rows="24" columns="3"> +<features firstHeadEmpty="true"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<row bottomline="true"> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +Name +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + Operation +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + Section +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +PtlACEntry +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + update an entry in an access control table +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> 
+\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ac} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlEQAlloc +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + create an event queue +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlEQGet +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + get the next event from an event queue +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlEQFree +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + release the resources for an event queue +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlEQWait +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + wait for a new event in an event queue +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:eq} + 
+\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlFini +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + shutdown the Portals API +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:init} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlGet +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + perform a get operation +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:datamovement} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlGetId +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + get the id for the current process +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:pid} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlInit +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + initialize the Portals API +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:init} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> 
+\begin_inset Text + +\layout Standard + + PtlMDAttach +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + create a memory descriptor and attach it to a match entry +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlMDBind +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + create a free-floating memory descriptor +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mdbind} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlMDUnlink +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + remove a memory descriptor from a list and release its resources +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlMDUpdate +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + update a memory descriptor +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + 
+\layout Standard + + PtlMEAttach +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +create a match entry and attach it to a Portal table +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +PtlMEAttachAny +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +create a match entry and attach it to a free Portal table entry +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:attachany} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlMEInsert +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + create a match entry and insert it in a list +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlMEUnlink +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + remove a match entry from a list and release its resources +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:me} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> 
+\begin_inset Text + +\layout Standard + + PtlNIDist +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + get the distance to another process +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlNIFini +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + shutdown a network interface +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlNIHandle +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + get the network interface handle for an object +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlNIInit +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + initialize a network interface +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlNIStatus +\end_inset +</cell> +<cell 
alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + read a network interface status register +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + PtlPut +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + perform a put operation +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:datamovement} + +\end_inset + + +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:retcodes} + +\end_inset + + summarizes the return codes used by functions defined by the Portals API. + All of these constants are integer values. + The first column of this table gives the symbolic name for the constant, + the second column gives a brief description of the value, and the third + column identifies the functions that can return this value. 
+\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Function Return Codes for the Portals 3.2 API +\begin_inset LatexCommand \label{tab:retcodes} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular +<lyxtabular version="3" rows="27" columns="3"> +<features> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="2.6in"> +<row bottomline="true"> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Name +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Meaning +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Functions +\series default + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_AC_INV_INDEX +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +invalid access control table index +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + PtlACEntry +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EQ_DROPPED +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> 
+\begin_inset Text + +\layout Standard + +at least one event has been dropped +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + PtlEQGet, PtlEQWait +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EQ_EMPTY +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +no events available in an event queue +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + PtlEQGet +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_FAIL +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +error during initialization or cleanup +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + PtlInit, PtlFini +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_ILL_MD +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +illegal memory descriptor values +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach, PtlMDBind, PtlMDUpdate +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INIT_DUP +\end_inset +</cell> +<cell
alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +duplicate initialization of an interface +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INIT_INV +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +initialization of an invalid interface +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INUSE +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +the ME already has an MD +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_ASIZE +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +invalid access control table size +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_EQ +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + 
+invalid event queue handle +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlMDUpdate, PtlEQFree, PtlEQGet +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_HANDLE +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +invalid handle +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlNIHandle +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_MD +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +invalid memory descriptor handle +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlMDUnlink, PtlMDUpdate +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_ME +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +invalid match entry handle +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlMDAttach +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_NI +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +invalid network interface handle +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset 
Text + +\layout Standard +\noindent +PtlNIDist, PtlNIFini, PtlMDBind, PtlEQAlloc +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_PROC +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +invalid process identifier +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlNIDist, PtlMEAttach, PtlMEInsert, PtlACEntry, PtlPut, PtlGet + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_PTINDEX +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +invalid Portal table index +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + PtlMEAttach +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_REG +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +invalid status register +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + PtlNIStatus +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INV_SR_INDX +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +invalid status register index +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + 
PtlNIStatus +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_ML_TOOLONG +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +match list too long +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + PtlMEAttach, PtlMEInsert +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_INUSE +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +MD has pending operations +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +PtlMDUnlink +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOINIT +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +uninitialized API +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\emph on +all +\emph default +, except PtlInit +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOSPACE +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +insufficient memory +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlMDAttach, PtlMDBind, PtlEQAlloc, PtlMEAttach, PtlMEInsert + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" 
valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOUPDATE +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + no update was performed +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + PtlMDUpdate +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_PT_FULL +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +Portal table is full +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +PtlMEAttachAny +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_OK +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + success +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent + +\emph on +all +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_SEGV +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +addressing violation +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard +\noindent +PtlNIInit, PtlNIStatus, PtlNIDist, PtlNIHandle, PtlMDBind, PtlMDUpdate, + PtlEQAlloc, PtlEQGet, PtlEQWait +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand 
\ref{tab:oconsts} + +\end_inset + + summarizes the remaining constant values introduced by the Portals API. + The first column in this table presents the symbolic name for the constant, + the second column gives a brief description of the value, the third column + identifies the type for the value, the fourth column identifies the + section in which the value is introduced, and the fifth column identifies + other sections in which the value is referenced. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Other Constants Defined by the Portals 3.2 API +\begin_inset LatexCommand \label{tab:oconsts} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular +<lyxtabular version="3" rows="36" columns="5"> +<features> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<row bottomline="true"> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Name +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Meaning +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Base type +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Intr. +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Ref.
+\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_ACK_REQ +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +request an acknowledgement +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ack_req_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EQ_NONE +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +a NULL event queue handle +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_eq_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:handle-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:mdupdate} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout 
Standard + + +\family typewriter +PTL_EVENT_GET_START +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +get event start +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_GET_END +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +get event end +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_GET_FAIL +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +get event fail +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter 
+ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_PUT_START +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +put event start +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_PUT_END +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +put event end +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset 
LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_PUT_FAIL +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +put event fail +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_REPLY_START +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +reply event start +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_REPLY_END +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout 
Standard + +reply event end +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_REPLY_FAIL +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +reply event fail +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:get} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_ACK_START +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +acknowledgement event start +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + 
+\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_ACK_END +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +acknowledgement event end +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_ACK_FAIL +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +acknowledgement event fail +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" 
valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_SEND_START +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +send event start +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_SEND_END +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +send event end +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_SEND_FAIL +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +send event fail +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> 
+\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_EVENT_UNLINK +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +unlink event +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_event_kind_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ek-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_PID_ANY +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +wildcard for process id fields +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pid_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" 
usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NID_ANY +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +wildcard for node id fields +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_nid_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_UID_ANY +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +wildcard for user id +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:id-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meattach} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset +</cell> 
+</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_IFACE_DEFAULT +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +default interface +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_interface_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:ni-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INS_AFTER +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +insert after +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ins_pos_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_INS_BEFORE +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +insert before +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ins_pos_t +\family 
default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:meinsert} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_ACK_DISABLE +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +a flag to disable acknowledgements +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_MANAGE_REMOTE +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +a flag to enable the use of remote offsets +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + +, +\begin_inset LatexCommand \ref{sec:get} + 
+\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_OP_GET +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +a flag to enable get operations +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_OP_PUT +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +a flag to enable put operations +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_THRESH_INF +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +infinite threshold for a memory descriptor +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> 
+\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_MD_TRUNCATE +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +a flag to enable truncation of a request +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:md-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_NOACK_REQ +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +request no acknowledgement +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ack_req_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:put} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" 
usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_PT_INDEX_ANY +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +wildcard for Portal indexes +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:acentry} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_RETAIN +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +disable unlinking +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_unlink_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_SR_DROP_COUNT +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +index for the dropped count register +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_sr_index_t +\end_inset +</cell> +<cell 
alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:stat-type} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + + +\end_inset +</cell> +</row> +<row> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +PTL_UNLINK +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +enable unlinking +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_unlink_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\begin_inset LatexCommand \ref{sec:mdattach} + +\end_inset + + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_inset + + +\layout Chapter + +The Semantics of Message Transmission +\begin_inset LatexCommand \label{sec:semantics} + +\end_inset + + +\layout Standard + +The portals API uses four types of messages: put requests, acknowledgements, + get requests, and replies. + In this section, we describe the information passed on the wire for each + type of message. + We also describe how this information is used to process incoming messages. +\layout Section + +Sending Messages +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:put-wire} + +\end_inset + + summarizes the information that is transmitted for a put request. 
+ The first column provides a descriptive name for the information, the second + column provides the type for this information, the third column identifies + the source of the information, and the fourth column provides additional + notes. + Most information that is transmitted is obtained directly from the +\emph on +PtlPut +\emph default + operation. + Notice that the handle for the memory descriptor used in the +\emph on +PtlPut +\emph default + operation is transmitted even though this value cannot be interpreted by + the target. + A value of anything other than +\family typewriter +PTL_MD_NONE +\family default + is interpreted as a request for an acknowledgement. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in a Put Request +\begin_inset LatexCommand \label{tab:put-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular +<lyxtabular version="3" rows="12" columns="4"> +<features firstHeadEmpty="true"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<row bottomline="true"> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Information +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +\emph on +PtlPut +\emph default + arg +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" 
usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +operation +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +indicates a put request +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +initiator +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +local information +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +user +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +local information +\end_inset +</cell> +</row> +<row> +<cell 
alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +target +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\family default + +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +portal index +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +cookie +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ac_index_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\family default + +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + 
+match bits +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +offset +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\family default + +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +memory desc +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\family default + +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +no ack if +\family typewriter +PTL_MD_NONE +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +length +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset +</cell> +<cell alignment="left" 
valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +length +\family default + member +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +data +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family roman +\emph on +bytes +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +start +\family default + and +\family typewriter +length +\family default + members +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:ack-wire} + +\end_inset + + summarizes the information transmitted in an acknowledgement. + Most of the information is simply echoed from the put request. + Notice that the initiator and target are obtained directly from the put + request, but are swapped in generating the acknowledgement. + The only new piece of information in the acknowledgement is the manipulated + length which is determined as the put request is satisfied. 
+\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in an Acknowledgement +\begin_inset LatexCommand \label{tab:ack-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular +<lyxtabular version="3" rows="10" columns="4"> +<features firstHeadEmpty="true"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<row bottomline="true"> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Information +\series default + +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Put Information +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +operation +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + 
indicates an acknowledgement +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + initiator +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + target +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + target +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + initiator +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + portal index +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + portal index +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + echo +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + match bits +\end_inset 
+</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + match bits +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + echo +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + offset +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + offset +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + echo +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + memory desc +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter + ptl_handle_md_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + memory desc +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + echo +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + requested length +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter + ptl_size_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + length +\end_inset +</cell> +<cell 
alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + echo +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + manipulated length +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter + ptl_size_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + obtained from the operation +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:get-wire} + +\end_inset + + summarizes the information that is transmitted for a get request. + Like the information transmitted in a put request, most of the information + transmitted in a get request is obtained directly from the +\emph on +PtlGet +\emph default + operation. + Unlike put requests, get requests do not include the event queue handle. + In this case, the reply is generated whenever the operation succeeds and + the memory descriptor must not be unlinked until the reply is received. + As such, there is no advantage to explicitly sending the event queue handle. 
+\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in a Get Request +\begin_inset LatexCommand \label{tab:get-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular +<lyxtabular version="3" rows="11" columns="4"> +<features firstHeadEmpty="true"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<row bottomline="true"> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Information +\series default + +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +\emph on +PtlGet +\emph default + argument +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +operation +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + 
+\layout Standard + +indicates a get operation +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +initiator +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +local information +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +user +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_uid_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +local information +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +target +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +target +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" 
valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +portal index +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +portal +\family default + +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +cookie +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_ac_index_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +cookie +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +match bits +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +match_bits +\family default + +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +offset +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset 
+</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +offset +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +memory desc +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\family default + +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +length +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +mem_desc +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +length +\family default + member +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_inset + + +\layout Standard + +Table\SpecialChar ~ + +\begin_inset LatexCommand \ref{tab:reply-wire} + +\end_inset + + summarizes the information transmitted in a reply. + Like an acknowledgement, most of the information is simply echoed from + the get request. + The initiator and target are obtained directly from the get request, but + are swapped in generating the reply. 
+ The only new pieces of information in the reply are the manipulated length + and the data, which are determined as the get request is satisfied. +\layout Standard + + +\begin_inset Float table +placement htbp +wide false +collapsed false + +\layout Caption + +Information Passed in a Reply +\begin_inset LatexCommand \label{tab:reply-wire} + +\end_inset + + +\layout Standard + + +\begin_inset ERT +status Collapsed + +\layout Standard + +\backslash +medskip +\end_inset + + +\layout Standard +\align center + +\size small + +\begin_inset Tabular +<lyxtabular version="3" rows="11" columns="4"> +<features firstHeadEmpty="true"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<column alignment="left" valignment="top" width="0pt"> +<row bottomline="true"> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Information +\series default + +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Type +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Get Information +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\series bold +Notes +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +operation +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +int +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + 
+\layout Standard + +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +indicates an acknowledgement +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +initiator +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +target +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +target +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_process_id_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +initiator +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +portal index +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_pt_index_t +\end_inset +</cell> +<cell alignment="left" valignment="top" bottomline="true" usebox="none"> +\begin_inset Text + +\layout Standard + +portal index +\end_inset +</cell> +<cell alignment="right" valignment="top" bottomline="true" 
usebox="none"> +\begin_inset Text + +\layout Standard + +echo +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +match bits +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_match_bits_t +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +match bits +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +echo +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +offset +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +offset +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +echo +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +memory desc +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_handle_md_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +memory desc +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +echo +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +requested length +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> 
+\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +length +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +echo +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +manipulated length +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\family typewriter +ptl_size_t +\family default + +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +obtained from the operation +\end_inset +</cell> +</row> +<row> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +data +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + + +\emph on +bytes +\end_inset +</cell> +<cell alignment="left" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +\end_inset +</cell> +<cell alignment="right" valignment="top" usebox="none"> +\begin_inset Text + +\layout Standard + +obtained from the operation +\end_inset +</cell> +</row> +</lyxtabular> + +\end_inset + + +\end_inset + + +\layout Section + +Receiving Messages +\begin_inset LatexCommand \label{sec:receiving} + +\end_inset + + +\layout Standard + +When an incoming message arrives on a network interface, the communication + system first checks that the target process identified in the request is + a valid process that has initialized the network interface (i.e., that the + target process has a valid Portal table). 
+ If this test fails, the communication system discards the message and increment +s the dropped message count for the interface. + The remainder of the processing depends on the type of the incoming message. + Put and get messages are subject to access control checks and translation + (searching a match list), while acknowledgement and reply messages bypass + the access control checks and the translation step. +\layout Standard + +Acknowledgement messages include a handle for the memory descriptor used + in the original +\emph on +PtlPut +\emph default + operation. + This memory descriptor will identify the event queue where the event should + be recorded. + Upon receipt of an acknowledgement, the runtime system only needs to confirm + that the memory descriptor and event queue still exist and that there is + space for another event. + Should any of these conditions fail, the message is simply discarded + and the dropped message count for the interface is incremented. + Otherwise, the system builds an acknowledgement event from the information + in the acknowledgement message and adds it to the event queue. +\layout Standard + +Reception of reply messages is also relatively straightforward. + Each reply message includes a handle for a memory descriptor. + If this descriptor exists, it is used to receive the message. + A reply message will be dropped if the memory descriptor identified in + the request doesn't exist. + In this case, the dropped message count for the interface is + incremented. + This is the only reason for dropping a reply message. + Every memory descriptor accepts and truncates incoming reply messages, + eliminating the other potential reasons for rejecting a reply message. +\layout Standard + +The critical step in processing an incoming put or get request involves + mapping the request to a memory descriptor. + This step starts by using the Portal index in the incoming request to identify + a list of match entries.
+ This list of match entries is searched in order until a match entry is + found whose match criteria match the match bits in the incoming request + and whose memory descriptor accepts the request. +\layout Standard + +Because acknowledgement and reply messages are generated in response to requests + made by the process receiving these messages, the checks performed by the + runtime system for acknowledgements and replies are minimal. + In contrast, put and get messages are generated by remote processes and + the checks performed for these messages are more extensive. + Incoming put or get messages may be rejected because: +\layout Itemize + +the Portal index supplied in the request is not valid; +\layout Itemize + +the cookie supplied in the request is not a valid access control entry; + +\layout Itemize + +the access control entry identified by the cookie does not match the identifier + of the requesting process; +\layout Itemize + +the access control entry identified by the cookie does not + match the Portal index supplied in the request; or +\layout Itemize + +the match bits supplied in the request do not match any of the match entries + with a memory descriptor that accepts the request. + +\layout Standard + +In all cases, if the message is rejected, the incoming message is discarded + and the dropped message count for the interface is incremented. +\layout Standard + +A memory descriptor may reject an incoming request for any of the following + reasons: +\layout Itemize + +the +\family typewriter +PTL_MD_PUT +\family default + or +\family typewriter +PTL_MD_GET +\family default + option has not been enabled and the operation is put or get, respectively; + +\layout Itemize + +the length specified in the request is too long for the memory descriptor + and the +\family typewriter +PTL_MD_TRUNCATE +\family default + option has not been enabled.
+\layout Chapter + +Examples +\begin_inset LatexCommand \label{sec:examples} + +\end_inset + + +\layout Comment + +The examples presented in this chapter have not been updated to reflect + the current API. +\layout Standard + +In this chapter we present several examples to illustrate expected usage + patterns for the Portals 3.2 API. + The first example describes how to implement parallel servers using the + features of the Portals 3.2 API. + This example covers the access control list and the use of remote managed + offsets. + The second example presents an approach to dealing with dropped requests. + This example covers aspects of match lists and memory descriptors. + The final example covers message reception in MPI. + This example illustrates more sophisticated uses of matching and a procedure + to update a memory descriptor. +\layout Section + +Parallel File Servers +\begin_inset LatexCommand \label{sec:expfs} + +\end_inset + + +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:file} + +\end_inset + + illustrates the logical structure of a parallel file server. + In this case, the parallel server consists of four servers that stripe + application data across four disks. + We would like to present applications with the illusion that the file server + is a single entity. + We will assume that all of the processes that constitute the parallel server + have the same user id.
+\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename file.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 196pt + lyxheight 147pt +\end_inset + + +\layout Caption + +Parallel File Server +\begin_inset LatexCommand \label{fig:file} + +\end_inset + + +\end_inset + + +\layout Standard + +When an application establishes a connection to the parallel file server, + it will allocate a Portal and access control list entry for communicating + with the server. + The access control list entry will include the Portal and match any process + in the parallel file server's, so all of the file server processes will + have access to the portal. + The Portal information and access control entry will be sent to the file + server at this time. + If the application and server need to have multiple, concurrent I/O operations, + they can use additional portals or match entries to keep the operations + from interfering with one another. +\layout Standard + +When an application initiates an I/O operation, it first builds a memory + descriptor that describes the memory region involved in the operation. + This memory descriptor will enable the appropriate operation (put for read + operations and get for write operations) and enable the use of remote offsets + (this lets the servers decide where their data should be placed in the + memory region). + After creating the memory descriptor and linking it into the appropriate + Portal entry, the application sends a read or write request (using +\emph on +PtlPut +\emph default +) to one of the file server processes. + The file server processes can then use put or get operations with the appropria +te offsets to fill or retrieve the contents of the application's buffer. 
+ To know when the operation has completed, the application can add an event + queue to the memory descriptor and add up the lengths of the remote operations + until the sum is the size of the requested I/O operation. +\layout Section + +Dealing with Dropped Requests +\begin_inset LatexCommand \label{sec:exdrop} + +\end_inset + + +\layout Standard + +If a process does not anticipate unexpected requests, they will be discarded. + Applications using the Portals API can query the dropped count for the + interface to determine the number of requests that have been dropped (see + Section\SpecialChar ~ + +\begin_inset LatexCommand \ref{sec:nistatus} + +\end_inset + +). + While this approach minimizes resource consumption, it does not provide + information that might be critical in debugging the implementation of a + higher level protocol. +\layout Standard + +To keep track of more information about dropped requests, we use a memory + descriptor that truncates each incoming request to zero bytes and logs + the +\begin_inset Quotes eld +\end_inset + +dropped +\begin_inset Quotes erd +\end_inset + + operations in an event queue. + Note that the operations are not dropped in the Portals sense, because + the operation succeeds. +\layout Standard + +The following code fragment illustrates an implementation of this approach. + In this case, we assume that a thread is launched to execute the function + +\family typewriter +watch_drop +\family default +. + This code starts by building an event queue to log truncated operations + and a memory descriptor to truncate the incoming requests. + This example only captures +\begin_inset Quotes eld +\end_inset + +dropped +\begin_inset Quotes erd +\end_inset + + requests for a single portal. + In a more realistic situation, the memory descriptor would be appended + to the match list for every portal. 
+ We also assume that the thread is capable of keeping up with the +\begin_inset Quotes eld +\end_inset + +dropped +\begin_inset Quotes erd +\end_inset + + requests. + If this is not the case, we could use a finite threshold on the memory + descriptor to capture the first few dropped requests. +\layout LyX-Code + + +\size small +#include <stdio.h> +\newline +#include <stdlib.h> +\newline +#include <portals.h> +\newline + +\newline +#define DROP_SIZE 32 /* number of dropped requests to track */ +\newline + +\newline +int watch_drop( ptl_handle_ni_t ni, ptl_pt_index_t index ) { +\newline + ptl_handle_eq_t drop_events; +\newline + ptl_event_t event; +\newline + ptl_handle_md_t drop_em; +\newline + ptl_md_t drop_desc; +\newline + ptl_process_id_t any_proc; +\newline + ptl_handle_me_t match_any; +\newline + +\newline + /* create the event queue */ +\newline + if( PtlEQAlloc(ni, DROP_SIZE, &drop_events) != PTL_OK ) { +\newline + fprintf( stderr, "Couldn't create the event queue +\backslash +n" ); +\newline + exit( 1 ); +\newline + } +\newline + +\newline + /* build a match entry */ +\newline + any_proc.nid = PTL_ID_ANY; +\newline + any_proc.pid = PTL_ID_ANY; +\newline + PtlMEAttach( index, any_proc, 0, ~(ptl_match_bits_t)0, PTL_RETAIN, +\newline + &match_any ); +\newline + +\newline + /* create the memory descriptor */ +\newline + drop_desc.start = NULL; +\newline + drop_desc.length = 0; +\newline + drop_desc.threshold = PTL_MD_THRESH_INF; +\newline + drop_desc.options = PTL_MD_OP_PUT | PTL_MD_OP_GET | PTL_MD_TRUNCATE; +\newline + drop_desc.user_ptr = NULL; +\newline + drop_desc.eventq = drop_events; +\newline + if( PtlMDAttach(match_any, drop_desc, &drop_em) != PTL_OK ) { +\newline + fprintf( stderr, "Couldn't create the memory descriptor +\backslash +n" ); +\newline + exit( 1 ); +\newline + } +\newline + +\newline + /* watch for "dropped" requests */ +\newline + while( 1 ) { +\newline + if( PtlEQWait( drop_events, &event ) != PTL_OK ) break; +\newline + fprintf( 
stderr, "Dropped request from gid = %d, rid = %d +\backslash +n", event.initiator.gid, + event.initiator.rid ); +\newline + } +\newline +} +\layout Section + +Message Transmission in MPI +\begin_inset LatexCommand \label{sec:exmpi} + +\end_inset + + +\layout Standard + +We conclude this chapter with a fairly extensive example that describes + an approach to implementing message transmission for MPI. + Like many MPI implementations, we distinguish two message transmission + protocols: a short message protocol and a long message protocol. + We use the constant +\family typewriter +MPI_LONG_LENGTH +\family default + to determine the size of a long message. +\layout Standard + +For small messages, the sender simply sends the message and presumes that + the message will be received (i.e., the receiver has allocated a memory region + to receive the message body). + For large messages, the sender also sends the message, but does not presume + that the message body will be saved. + Instead, the sender builds a memory descriptor for the message and enables + get operations on this descriptor. + If the target does not save the body of the message, it will record an + event for the put operation. + When the process later issues a matching MPI receive, it will perform a + get operation to retrieve the body of the message. +\layout Standard + +To facilitate receive side matching based on the protocol, we use the most + significant bit in the match bits to indicate the protocol: 1 for long + messages and 0 for short messages. +\layout Standard + +The following code presents a function that implements the send side of + the protocol. + The global variable +\family typewriter +EndGet +\family default + is the last match entry attached to the Portal index used for posting long + messages. + This entry does not match any incoming requests (i.e., the memory descriptor + rejects all get operations) and is built during initialization of the MPI + library.
+ The other global variable, +\family typewriter +MPI_NI +\family default +, is a handle for the network interface used by the MPI implementation. +\layout LyX-Code + + +\size small +extern ptl_handle_me_t EndGet; +\newline +extern ptl_handle_ni_t MPI_NI; +\newline + +\newline +void MPIsend( void *buf, ptl_size_t len, void *data, ptl_handle_eq_t eventq, +\newline + ptl_process_id_t target, ptl_match_bits_t match ) +\newline +{ +\newline + ptl_handle_md_t send_handle; +\newline + ptl_md_t mem_desc; +\newline + ptl_ack_req_t want_ack; +\newline + +\newline + mem_desc.start = buf; +\newline + mem_desc.length = len; +\newline + mem_desc.threshold = 1; +\newline + mem_desc.options = PTL_MD_GET_OP; +\newline + mem_desc.user_ptr = data; +\newline + mem_desc.eventq = eventq; +\newline + +\newline + if( len >= MPI_LONG_LENGTH ) { +\newline + ptl_handle_me_t me_handle; +\newline + +\newline + /* add a match entry to the end of the get list */ +\newline + PtlMEInsert( target, match, 0, PTL_UNLINK, PTL_INS_BEFORE, EndGet, + &me_handle ); +\newline + PtlMDAttach( me_handle, mem_desc, PTL_UNLINK, NULL ); +\newline + +\newline + /* we want an ack for long messages */ +\newline + want_ack = PTL_ACK_REQ; +\newline + +\newline + /* set the protocol bit to indicate that this is a long message + */ +\newline + match |= (ptl_match_bits_t)1<<63; +\newline + } else { +\newline + /* we don't want an ack for short messages */ +\newline + want_ack = PTL_NOACK_REQ; +\newline + +\newline + /* set the protocol bit to indicate that this is a short message + */ +\newline + match &= ~((ptl_match_bits_t)1<<63); +\newline + } +\newline + +\newline + /* create a memory descriptor and send it */ +\newline + PtlMDBind( MPI_NI, mem_desc, &send_handle ); +\newline + PtlPut( send_handle, want_ack, target, MPI_SEND_PINDEX, MPI_AINDEX, match, + 0 ); +\newline +} +\layout Standard + +The +\emph on +MPIsend +\emph default + function returns as soon as the message has been scheduled for transmission.
+ The event queue argument, +\family typewriter +eventq +\family default +, can be used to determine the disposition of the message. + Assuming that +\family typewriter +eventq +\family default + is not +\family typewriter +PTL_EQ_NONE +\family default +, a +\family typewriter +PTL_EVENT_SENT +\family default + event will be recorded for each message as the message is transmitted. + For small messages, this is the only event that will be recorded in +\family typewriter +eventq +\family default +. + In contrast, long messages include an explicit request for an acknowledgement. + If the +\family typewriter +target +\family default + process has posted a matching receive, the acknowledgement will be sent + as the message is received. + If a matching receive has not been posted, the message will be discarded + and no acknowledgement will be sent. + When the +\family typewriter +target +\family default + process later issues a matching receive, the receive will be translated + into a get operation and a +\family typewriter +PTL_EVENT_GET +\family default + event will be recorded in +\family typewriter +eventq +\family default +. +\layout Standard + +Figure\SpecialChar ~ + +\begin_inset LatexCommand \ref{fig:mpi} + +\end_inset + + illustrates the organization of the match list used for receiving MPI messages. + The initial entries (not shown in this figure) would be used to match the + MPI receives that have been preposted by the application. + The preposted receives are followed by a match entry, +\emph on +RcvMark +\emph default +, that marks the boundary between preposted receives and the memory descriptors + used for +\begin_inset Quotes eld +\end_inset + +unexpected +\begin_inset Quotes erd +\end_inset + + messages. 
+ The +\emph on +RcvMark +\emph default + entry is followed by a small collection of match entries that match unexpected + +\begin_inset Quotes eld +\end_inset + +short +\begin_inset Quotes erd +\end_inset + + messages, i.e., messages that have a 0 in the most significant bit of their + match bits. + The memory descriptors associated with these match entries will append + the incoming message to the associated memory descriptor and record an + event in an event queue for unexpected messages. + The unexpected short message matching entries are followed by a match entry + that will match messages that were not matched by the preceding match entries, + i.e., the unexpected long messages. + The memory descriptor associated with this match entry truncates the message + body and records an event in the event queue for unexpected messages. + Note that all of the memory descriptors used for unexpected messages share + a common event queue. + This makes it possible to process the unexpected messages in the order + in which they arrived, regardless of their length. +\layout Standard + + +\begin_inset Float figure +placement htbp +wide false +collapsed false + +\layout Standard +\align center + +\begin_inset Graphics FormatVersion 1 + filename mpi.eps + display color + size_type 0 + rotateOrigin center + lyxsize_type 1 + lyxwidth 389pt + lyxheight 284pt +\end_inset + + +\layout Caption + +Message Reception in MPI +\begin_inset LatexCommand \label{fig:mpi} + +\end_inset + + +\end_inset + + +\layout Standard + +When the local MPI process posts an MPI receive, we must first search the + events in the unexpected message queue to see if a matching message has already + arrived. + If no matching message is found, a match entry for the receive is inserted + before the +\emph on +RcvMark +\emph default + entry--after the match entries for all of the previously posted receives + and before the match entries for the unexpected messages.
+ This ensures that preposted receives are matched in the order that they + were posted (a requirement of MPI). + +\layout Standard + +While this strategy respects the temporal semantics of MPI, it introduces + a race condition: a matching message might arrive after the events in the + unexpected message queue have been searched, but before the match entry + for the receive has been inserted in the match list. + +\layout Standard + +To avoid this race condition we start by setting the +\family typewriter +threshold +\family default + of the memory descriptor to 0, making the descriptor inactive. + We then insert the match entry into the match list and proceed to search + the events in the unexpected message queue. + A matching message that arrives as we are searching the unexpected message + queue will not be accepted by the memory descriptor and, if not matched + by an earlier match list element, will add an event to the unexpected message + queue. + After searching the events in the unexpected message queue, we update the + memory descriptor, setting the threshold to 1 to activate the memory descriptor. + This update is predicated by the condition that the unexpected message + queue is empty. + We repeat the process of searching the unexpected message queue until the + update succeeds. +\layout Standard + +The following code fragment illustrates this approach. + Because events must be removed from the unexpected message queue to be + examined, this code fragment assumes the existence of a user managed event + list, +\family typewriter +Rcvd +\family default +, for the events that have already been removed from the unexpected message + queue. + In an effort to keep the example focused on the basic protocol, we have + omitted the code that would be needed to manage the memory descriptors + used for unexpected short messages. + In particular, we simply leave messages in these descriptors until they + are received by the application. 
+ In a robust implementation, we would introduce code to ensure that short + unexpected messages are removed from these memory descriptors so that they + can be re-used. +\layout LyX-Code + + +\size small +extern ptl_handle_eq_t UnexpQueue; +\newline +extern ptl_handle_me_t RcvMark; +\newline +extern ptl_handle_me_t ShortMatch; +\newline + +\newline +typedef struct event_list_tag { +\newline + ptl_event_t event; +\newline + struct event_list_tag* next; +\newline +} event_list; +\newline + +\newline +extern event_list Rcvd; +\newline + +\newline +void AppendRcvd( ptl_event_t event ) +\newline +{ +\newline + /* append an event onto the Rcvd list */ +\newline +} +\newline + +\newline +int SearchRcvd( void *buf, ptl_size_t len, ptl_process_id_t sender, ptl_match_bi +ts_t match, +\newline + ptl_match_bits_t ignore, ptl_event_t *event ) +\newline +{ +\newline + /* Search the Rcvd event queue, looking for a message that matches the + requested message. +\newline + * If one is found, remove the event from the Rcvd list and return it. 
+ */ +\newline +} +\newline + +\newline +typedef enum { RECEIVED, POSTED } receive_state; +\newline + +\newline +receive_state CopyMsg( void *buf, ptl_size_t &length, ptl_event_t event, + ptl_md_t md_buf ) +\newline +{ +\newline + ptl_md_t md_buf; +\newline + ptl_handle_me_t me_handle; +\newline + +\newline + if( event.rlength >= MPI_LONG_LENGTH ) { +\newline + PtlMDBind( MPI_NI, md_buf, &md_handle ); +\newline + PtlGet( event.initiator, MPI_GET_PINDEX, 0, event.match_bits, MPI_AINDEX, + md_handle ); +\newline + return POSTED; +\newline + } else { +\newline + /* copy the message */ +\newline + if( event.mlength < *length ) *length = event.mlength; +\newline + memcpy( buf, (char*)event.md_desc.start+event.offset, *length ); +\newline + return RECEIVED; +\newline + } +\newline +} +\newline + +\newline +receive_state MPIreceive( void *buf, ptl_size_t &len, void *MPI_data, ptl_handle +_eq_t eventq, +\newline + ptl_process_id_t sender, ptl_match_bits_t match, + ptl_match_bits_t ignore ) +\newline +{ +\newline + ptl_md_t md_buf; +\newline + ptl_handle_md_t md_handle; +\newline + ptl_handle_me_t me_handle; +\newline + ptl_event_t event; +\newline + +\newline + /* build a memory descriptor for the receive */ +\newline + md_buf.start = buf; +\newline + md_buf.length = *len; +\newline + md_buf.threshold = 0; /* temporarily disabled */ +\newline + md_buf.options = PTL_MD_PUT_OP; +\newline + md_buf.user_ptr = MPI_data; +\newline + md_buf.eventq = eventq; +\newline + +\newline + /* see if we have already received the message */ +\newline + if( SearchRcvd(buf, len, sender, match, ignore, &event) ) +\newline + return CopyMsg( buf, len, event, md_buf ); +\newline + +\newline + /* create the match entry and attach the memory descriptor */ +\newline + PtlMEInsert(sender, match, ignore, PTL_UNLINK, PTL_INS_BEFORE, RcvMark, + &me_handle); +\newline + PtlMDAttach( me_handle, md_buf, PTL_UNLINK, &md_handle ); +\newline + +\newline + md_buf.threshold = 1; +\newline + do +\newline + if( 
PtlEQGet( UnexpQueue, &event ) != PTL_EQ_EMPTY ) { +\newline + if( MPIMatch(event, match, ignore, sender) ) { +\newline + return CopyMsg( buf, len, (char*)event.md_desc.start+event.offset, + md_buf ); +\newline + } else { +\newline + AppendRcvd( event ); +\newline + } +\newline + } +\newline + while( PtlMDUpdate(md_handle, NULL, &md_buf, unexp_queue) == PTL_NOUPDATE + ); +\newline + return POSTED; +\newline +} +\layout Chapter* + +Acknowledgments +\layout Standard + +Several people have contributed to the philosophy, design, and implementation + of the Portals message passing architecture as it has evolved. + We acknowledge the following people for their contributions: Al Audette, + Lee Ann Fisk, David Greenberg, Tramm Hudson, Gabi Istrail, Chu Jong, Mike + Levenhagen, Jim Otto, Mark Sears, Lance Shuler, Mack Stallcup, Jeff VanDyke, + Dave van Dresser, Lee Ward, and Stephen Wheat. + +\layout Standard + + +\begin_inset LatexCommand \BibTeX[ieee]{portals3} + +\end_inset + + +\the_end diff --git a/lustre/portals/doc/put.fig b/lustre/portals/doc/put.fig new file mode 100644 index 0000000..5235b6d --- /dev/null +++ b/lustre/portals/doc/put.fig @@ -0,0 +1,32 @@ +#FIG 3.2 +Landscape +Center +Inches +Letter +100.00 +Single +-2 +1200 2 +6 1350 900 2175 1200 +4 0 0 100 0 0 10 0.0000 0 105 825 1350 1200 Transmission\001 +4 0 0 100 0 0 10 0.0000 0 105 285 1620 1050 Data\001 +-6 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 2700 1275 2700 1725 +2 1 0 1 0 7 100 0 -1 4.000 0 0 -1 1 0 2 + 0 0 1.00 60.00 120.00 + 900 525 2700 1200 +2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5 + 0 300 1200 300 1200 2250 0 2250 0 300 +2 2 0 1 0 7 100 0 -1 3.000 0 0 7 0 0 5 + 2400 300 3600 300 3600 2250 2400 2250 2400 300 +2 1 1 1 0 7 100 0 -1 4.000 0 0 7 1 0 2 + 0 0 1.00 60.00 120.00 + 2699 1788 899 1938 +4 0 0 100 0 0 10 0.0000 0 105 720 2775 1650 Translation\001 +4 1 0 100 0 0 10 0.0000 0 135 555 1800 2025 Optional\001 +4 1 0 100 0 0 10 0.0000 0 135 1170 1800 2175 Acknowledgement\001 
+4 0 0 100 0 0 10 0.0000 0 105 405 2850 1500 Portal\001 +4 1 0 100 0 0 10 0.0000 0 135 405 3000 525 Target\001 +4 1 0 100 0 0 10 0.0000 0 105 540 600 525 Initiator\001 diff --git a/lustre/portals/include/.cvsignore b/lustre/portals/include/.cvsignore new file mode 100644 index 0000000..d45f796 --- /dev/null +++ b/lustre/portals/include/.cvsignore @@ -0,0 +1,4 @@ +config.h +stamp-h +stamp-h1 +stamp-h.in diff --git a/lustre/portals/include/Makefile.am b/lustre/portals/include/Makefile.am new file mode 100644 index 0000000..2cf7f99 --- /dev/null +++ b/lustre/portals/include/Makefile.am @@ -0,0 +1,8 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +SUBDIRS = portals linux +EXTRA_DIST = config.h.in +include $(top_srcdir)/Rules diff --git a/lustre/portals/include/config.h.in b/lustre/portals/include/config.h.in new file mode 100644 index 0000000..b05d0c4 --- /dev/null +++ b/lustre/portals/include/config.h.in @@ -0,0 +1,11 @@ +/* ../include/config.h.in. Generated automatically from configure.in by autoheader. */ + +/* Define if you have the readline library (-lreadline). */ +#undef HAVE_LIBREADLINE + +/* Name of package */ +#undef PACKAGE + +/* Version number of package */ +#undef VERSION + diff --git a/lustre/portals/include/linux/Makefile.am b/lustre/portals/include/linux/Makefile.am new file mode 100644 index 0000000..6a65cb5 --- /dev/null +++ b/lustre/portals/include/linux/Makefile.am @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include $(top_srcdir)/Rules + +linuxincludedir = $(includedir)/linux + +linuxinclude_HEADERS=kp30.h portals_lib.h diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h new file mode 100644 index 0000000..6d7f3f3 --- /dev/null +++ b/lustre/portals/include/linux/kp30.h @@ -0,0 +1,943 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef _KP30_INCLUDED +#define _KP30_INCLUDED + + +#define PORTAL_DEBUG + +#ifndef offsetof +# define offsetof(typ,memb) ((int)((char *)&(((typ *)0)->memb))) +#endif + +#define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) + +#ifndef CONFIG_SMP +# define smp_processor_id() 0 +#endif + +/* + * Debugging + */ +extern unsigned int portal_subsystem_debug; +extern unsigned int portal_stack; +extern unsigned int portal_debug; +extern unsigned int portal_printk; +/* Debugging subsystems (8 bit ID) + * + * If you add debug subsystem #32, you need to send email to phil, because + * you're going to break kernel subsystem debug filtering. */ +#define S_UNDEFINED (0 << 24) +#define S_MDC (1 << 24) +#define S_MDS (2 << 24) +#define S_OSC (3 << 24) +#define S_OST (4 << 24) +#define S_CLASS (5 << 24) +#define S_OBDFS (6 << 24) /* obsolete */ +#define S_LLITE (7 << 24) +#define S_RPC (8 << 24) +#define S_EXT2OBD (9 << 24) /* obsolete */ +#define S_PORTALS (10 << 24) +#define S_SOCKNAL (11 << 24) +#define S_QSWNAL (12 << 24) +#define S_PINGER (13 << 24) +#define S_FILTER (14 << 24) +#define S_TRACE (15 << 24) /* obsolete */ +#define S_ECHO (16 << 24) +#define S_LDLM (17 << 24) +#define S_LOV (18 << 24) +#define S_GMNAL (19 << 24) +#define S_PTLROUTER (20 << 24) +#define S_COBD (21 << 24) +#define S_PTLBD (22 << 24) +#define S_LOG (23 << 24) + +/* If you change these values, please keep portals/linux/utils/debug.c + * up to date! 
*/ + +/* Debugging masks (24 bits, non-overlapping) */ +#define D_TRACE (1 << 0) /* ENTRY/EXIT markers */ +#define D_INODE (1 << 1) +#define D_SUPER (1 << 2) +#define D_EXT2 (1 << 3) /* anything from ext2_debug */ +#define D_MALLOC (1 << 4) /* print malloc, free information */ +#define D_CACHE (1 << 5) /* cache-related items */ +#define D_INFO (1 << 6) /* general information */ +#define D_IOCTL (1 << 7) /* ioctl related information */ +#define D_BLOCKS (1 << 8) /* ext2 block allocation */ +#define D_NET (1 << 9) /* network communications */ +#define D_WARNING (1 << 10) +#define D_BUFFS (1 << 11) +#define D_OTHER (1 << 12) +#define D_DENTRY (1 << 13) +#define D_PORTALS (1 << 14) /* ENTRY/EXIT markers */ +#define D_PAGE (1 << 15) /* bulk page handling */ +#define D_DLMTRACE (1 << 16) +#define D_ERROR (1 << 17) /* CERROR(...) == CDEBUG (D_ERROR, ...) */ +#define D_EMERG (1 << 18) /* CEMERG(...) == CDEBUG (D_EMERG, ...) */ +#define D_HA (1 << 19) /* recovery and failover */ +#define D_RPCTRACE (1 << 20) /* for distributed debugging */ +#define D_VFSTRACE (1 << 21) + +#ifndef __KERNEL__ +#define THREAD_SIZE 8192 +#endif +#ifdef __ia64__ +#define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) +#else +#define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0) & \ + (THREAD_SIZE - 1))) +#endif + +#ifdef __KERNEL__ +#define CHECK_STACK(stack) \ + do { \ + if ((stack) > 3*THREAD_SIZE/4 && (stack) > portal_stack) { \ + portals_debug_msg(DEBUG_SUBSYSTEM, D_ERROR, \ + __FILE__, __FUNCTION__, __LINE__, \ + (stack), \ + "maximum lustre stack %u\n", \ + portal_stack = (stack)); \ + /*panic("LBUG");*/ \ + } \ + } while (0) +#else +#define CHECK_STACK(stack) do { } while(0) +#endif + +#if 1 +#define CDEBUG(mask, format, a...) 
\ +do { \ + CHECK_STACK(CDEBUG_STACK()); \ + if (!(mask) || ((mask) & (D_ERROR | D_EMERG)) || \ + (portal_debug & (mask) && \ + portal_subsystem_debug & (1 << (DEBUG_SUBSYSTEM >> 24)))) \ + portals_debug_msg(DEBUG_SUBSYSTEM, mask, \ + __FILE__, __FUNCTION__, __LINE__, \ + CDEBUG_STACK(), format , ## a); \ +} while (0) + +#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a) +#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a) +#define CEMERG(format, a...) CDEBUG(D_EMERG, format, ## a) + +#define GOTO(label, rc) \ +do { \ + long GOTO__ret = (long)(rc); \ + CDEBUG(D_TRACE,"Process leaving via %s (rc=%lu : %ld : %lx)\n", \ + #label, (unsigned long)GOTO__ret, (signed long)GOTO__ret,\ + (signed long)GOTO__ret); \ + goto label; \ +} while (0) + +#define RETURN(rc) \ +do { \ + typeof(rc) RETURN__ret = (rc); \ + CDEBUG(D_TRACE, "Process leaving (rc=%lu : %ld : %lx)\n", \ + (long)RETURN__ret, (long)RETURN__ret, (long)RETURN__ret);\ + return RETURN__ret; \ +} while (0) + +#define ENTRY \ +do { \ + CDEBUG(D_TRACE, "Process entered\n"); \ +} while (0) + +#define EXIT \ +do { \ + CDEBUG(D_TRACE, "Process leaving\n"); \ +} while(0) +#else +#define CDEBUG(mask, format, a...) do { } while (0) +#define CWARN(format, a...) do { } while (0) +#define CERROR(format, a...) printk("<3>" format, ## a) +#define CEMERG(format, a...) 
printk("<0>" format, ## a) +#define GOTO(label, rc) do { (void)(rc); goto label; } while (0) +#define RETURN(rc) return (rc) +#define ENTRY do { } while (0) +#define EXIT do { } while (0) +#endif + + +#ifdef __KERNEL__ +# include <linux/vmalloc.h> +# include <linux/time.h> +# include <linux/slab.h> +# include <linux/interrupt.h> +# include <linux/highmem.h> +# include <linux/module.h> +# include <linux/version.h> +# include <portals/lib-nal.h> +# include <linux/smp_lock.h> +# include <asm/atomic.h> + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define schedule_work schedule_task +#define prepare_work(wq,cb,cbdata) \ +do { \ + INIT_TQUEUE((wq), 0, 0); \ + PREPARE_TQUEUE((wq), (cb), (cbdata)); \ +} while (0) + +#define ll_invalidate_inode_pages invalidate_inode_pages +#define PageUptodate Page_Uptodate +#define our_recalc_sigpending(current) recalc_sigpending(current) +#define num_online_cpus() smp_num_cpus +static inline void our_cond_resched(void) +{ + if (current->need_resched) + schedule (); +} + +#else + +#define prepare_work(wq,cb,cbdata) \ +do { \ + INIT_WORK((wq), (void *)(cb), (void *)(cbdata)); \ +} while (0) +#define ll_invalidate_inode_pages(inode) invalidate_inode_pages((inode)->i_mapping) +#define wait_on_page wait_on_page_locked +#define our_recalc_sigpending(current) recalc_sigpending() +#define strtok(a,b) strpbrk(a, b) +static inline void our_cond_resched(void) +{ + cond_resched(); +} +#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) */ + +#ifdef PORTAL_DEBUG +extern void kportal_assertion_failed(char *expr,char *file,char *func,int line); +#define LASSERT(e) ((e) ? 
0 : kportal_assertion_failed( #e , __FILE__, \ + __FUNCTION__, __LINE__)) +#else +#define LASSERT(e) +#endif + +#ifdef __arch_um__ +#define LBUG_WITH_LOC(file, func, line) \ +do { \ + CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n"); \ + portals_debug_dumplog(); \ + portals_run_lbug_upcall(file, func, line); \ + panic("LBUG"); \ +} while (0) +#else +#define LBUG_WITH_LOC(file, func, line) \ +do { \ + CEMERG("LBUG\n"); \ + portals_debug_dumplog(); \ + portals_run_lbug_upcall(file, func, line); \ + set_task_state(current, TASK_UNINTERRUPTIBLE); \ + schedule(); \ +} while (0) +#endif /* __arch_um__ */ + +#define LBUG() LBUG_WITH_LOC(__FILE__, __FUNCTION__, __LINE__) + +/* + * Memory + */ +#ifdef PORTAL_DEBUG +extern atomic_t portal_kmemory; + +# define portal_kmem_inc(ptr, size) \ +do { \ + atomic_add(size, &portal_kmemory); \ +} while (0) + +# define portal_kmem_dec(ptr, size) do { \ + atomic_sub(size, &portal_kmemory); \ +} while (0) + +#else +# define portal_kmem_inc(ptr, size) do {} while (0) +# define portal_kmem_dec(ptr, size) do {} while (0) +#endif /* PORTAL_DEBUG */ + +#define PORTAL_VMALLOC_SIZE 16384 + +#define PORTAL_ALLOC(ptr, size) \ +do { \ + long s = size; \ + LASSERT (!in_interrupt()); \ + if (s > PORTAL_VMALLOC_SIZE) \ + (ptr) = vmalloc(s); \ + else \ + (ptr) = kmalloc(s, GFP_NOFS); \ + if ((ptr) == NULL) \ + CERROR("PORTALS: out of memory at %s:%d (tried to alloc" \ + " '" #ptr "' = %ld)\n", __FILE__, __LINE__, s); \ + else { \ + portal_kmem_inc((ptr), s); \ + memset((ptr), 0, s); \ + } \ + CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +#define PORTAL_FREE(ptr, size) \ +do { \ + long s = (size); \ + if ((ptr) == NULL) { \ + CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + if (s > PORTAL_VMALLOC_SIZE) \ + vfree(ptr); \ + else \ + kfree(ptr); \ + portal_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, 
"kfreed '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +#define PORTAL_SLAB_ALLOC(ptr, slab, size) \ +do { \ + long s = (size); \ + LASSERT (!in_interrupt()); \ + (ptr) = kmem_cache_alloc((slab), SLAB_KERNEL); \ + if ((ptr) == NULL) { \ + CERROR("PORTALS: out of memory at %s:%d (tried to alloc" \ + " '" #ptr "' from slab '" #slab "')\n", __FILE__, \ + __LINE__); \ + } else { \ + portal_kmem_inc((ptr), s); \ + memset((ptr), 0, s); \ + } \ + CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +#define PORTAL_SLAB_FREE(ptr, slab, size) \ +do { \ + long s = (size); \ + if ((ptr) == NULL) { \ + CERROR("PORTALS: free NULL '" #ptr "' (%ld bytes) at " \ + "%s:%d\n", s, __FILE__, __LINE__); \ + break; \ + } \ + memset((ptr), 0x5a, s); \ + kmem_cache_free((slab), ptr); \ + portal_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %ld at %p (tot %d).\n", \ + s, (ptr), atomic_read (&portal_kmemory)); \ +} while (0) + +/* ------------------------------------------------------------------- */ + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + +#define PORTAL_SYMBOL_REGISTER(x) inter_module_register(#x, THIS_MODULE, &x) +#define PORTAL_SYMBOL_UNREGISTER(x) inter_module_unregister(#x) + +#define PORTAL_SYMBOL_GET(x) ((typeof(&x))inter_module_get(#x)) +#define PORTAL_SYMBOL_PUT(x) inter_module_put(#x) + +#define PORTAL_MODULE_USE MOD_INC_USE_COUNT +#define PORTAL_MODULE_UNUSE MOD_DEC_USE_COUNT +#else + +#define PORTAL_SYMBOL_REGISTER(x) +#define PORTAL_SYMBOL_UNREGISTER(x) + +#define PORTAL_SYMBOL_GET(x) symbol_get(x) +#define PORTAL_SYMBOL_PUT(x) symbol_put(x) + +#define PORTAL_MODULE_USE try_module_get(THIS_MODULE) +#define PORTAL_MODULE_UNUSE module_put(THIS_MODULE) + +#endif + +/******************************************************************************/ +/* Kernel Portals Router interface */ + +typedef void (*kpr_fwd_callback_t)(void 
*arg, int error); // completion callback + +/* space for routing targets to stash "stuff" in a forwarded packet */ +typedef union { + long long _alignment; + void *_space[16]; /* scale with CPU arch */ +} kprfd_scratch_t; + +/* Kernel Portals Routing Forwarded message Descriptor */ +typedef struct { + struct list_head kprfd_list; /* stash in queues (routing target can use) */ + ptl_nid_t kprfd_target_nid; /* final destination NID */ + ptl_nid_t kprfd_gateway_nid; /* gateway NID */ + int kprfd_nob; /* # message bytes (including header) */ + int kprfd_niov; /* # message frags (including header) */ + struct iovec *kprfd_iov; /* message fragments */ + void *kprfd_router_arg; // originating NAL's router arg + kpr_fwd_callback_t kprfd_callback; /* completion callback */ + void *kprfd_callback_arg; /* completion callback arg */ + kprfd_scratch_t kprfd_scratch; // scratchpad for routing targets +} kpr_fwd_desc_t; + +typedef void (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd); + +/* NAL's routing interface (Kernel Portals Routing Nal Interface) */ +typedef const struct { + int kprni_nalid; /* NAL's id */ + void *kprni_arg; /* Arg to pass when calling into NAL */ + kpr_fwd_t kprni_fwd; /* NAL's forwarding entrypoint */ +} kpr_nal_interface_t; + +/* Router's routing interface (Kernel Portals Routing Router Interface) */ +typedef const struct { + /* register the calling NAL with the router and get back the handle for + * subsequent calls */ + int (*kprri_register) (kpr_nal_interface_t *nal_interface, + void **router_arg); + + /* ask the router to find a gateway that forwards to 'nid' and is a peer + * of the calling NAL */ + int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, + ptl_nid_t *gateway_nid); + + /* hand a packet over to the router for forwarding */ + kpr_fwd_t kprri_fwd_start; + + /* hand a packet back to the router for completion */ + void (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd, + int error); + + /* the calling NAL is shutting down */ + void 
(*kprri_shutdown) (void *router_arg); + + /* deregister the calling NAL with the router */ + void (*kprri_deregister) (void *router_arg); + +} kpr_router_interface_t; + +/* Convenient struct for NAL to stash router interface/args */ +typedef struct { + kpr_router_interface_t *kpr_interface; + void *kpr_arg; +} kpr_router_t; + +/* Router's control interface (Kernel Portals Routing Control Interface) */ +typedef const struct { + int (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid); + int (*kprci_del_route)(ptl_nid_t nid); + int (*kprci_get_route)(int index, int *gateway_nal, + ptl_nid_t *gateway, ptl_nid_t *lo_nid, + ptl_nid_t *hi_nid); +} kpr_control_interface_t; + +extern kpr_control_interface_t kpr_control_interface; +extern kpr_router_interface_t kpr_router_interface; + +static inline int +kpr_register (kpr_router_t *router, kpr_nal_interface_t *nalif) +{ + int rc; + + router->kpr_interface = PORTAL_SYMBOL_GET (kpr_router_interface); + if (router->kpr_interface == NULL) + return (-ENOENT); + + rc = (router->kpr_interface)->kprri_register (nalif, &router->kpr_arg); + if (rc != 0) + router->kpr_interface = NULL; + + PORTAL_SYMBOL_PUT (kpr_router_interface); + return (rc); +} + +static inline int +kpr_routing (kpr_router_t *router) +{ + return (router->kpr_interface != NULL); +} + +static inline int +kpr_lookup (kpr_router_t *router, ptl_nid_t nid, ptl_nid_t *gateway_nid) +{ + if (!kpr_routing (router)) + return (-EHOSTUNREACH); + + return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, + gateway_nid)); +} + +static inline void +kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, + int nob, int niov, struct iovec *iov, + kpr_fwd_callback_t callback, void *callback_arg) +{ + fwd->kprfd_target_nid = nid; + fwd->kprfd_gateway_nid = nid; + fwd->kprfd_nob = nob; + fwd->kprfd_niov = niov; + fwd->kprfd_iov = iov; + fwd->kprfd_callback = callback; + fwd->kprfd_callback_arg = callback_arg; +} + +static inline void 
+kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd) +{ + if (!kpr_routing (router)) + fwd->kprfd_callback (fwd->kprfd_callback_arg, -EHOSTUNREACH); + else + router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd); +} + +static inline void +kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error) +{ + LASSERT (kpr_routing (router)); + router->kpr_interface->kprri_fwd_done (router->kpr_arg, fwd, error); +} + +static inline void +kpr_shutdown (kpr_router_t *router) +{ + if (kpr_routing (router)) + router->kpr_interface->kprri_shutdown (router->kpr_arg); +} + +static inline void +kpr_deregister (kpr_router_t *router) +{ + if (!kpr_routing (router)) + return; + router->kpr_interface->kprri_deregister (router->kpr_arg); + router->kpr_interface = NULL; +} + +/******************************************************************************/ + +#ifdef PORTALS_PROFILING +#define prof_enum(FOO) PROF__##FOO +enum { + prof_enum(our_recvmsg), + prof_enum(our_sendmsg), + prof_enum(socknal_recv), + prof_enum(lib_parse), + prof_enum(conn_list_walk), + prof_enum(memcpy), + prof_enum(lib_finalize), + prof_enum(pingcli_time), + prof_enum(gmnal_send), + prof_enum(gmnal_recv), + MAX_PROFS +}; + +struct prof_ent { + char *str; + /* hrmph. wrap-tastic. 
*/ + u32 starts; + u32 finishes; + cycles_t total_cycles; + cycles_t start; + cycles_t end; +}; + +extern struct prof_ent prof_ents[MAX_PROFS]; + +#define PROF_START(FOO) \ + do { \ + struct prof_ent *pe = &prof_ents[PROF__##FOO]; \ + pe->starts++; \ + pe->start = get_cycles(); \ + } while (0) + +#define PROF_FINISH(FOO) \ + do { \ + struct prof_ent *pe = &prof_ents[PROF__##FOO]; \ + pe->finishes++; \ + pe->end = get_cycles(); \ + pe->total_cycles += (pe->end - pe->start); \ + } while (0) +#else /* !PORTALS_PROFILING */ +#define PROF_START(FOO) do {} while(0) +#define PROF_FINISH(FOO) do {} while(0) +#endif /* PORTALS_PROFILING */ + +/* debug.c */ +void portals_run_lbug_upcall(char * file, char *fn, int line); +void portals_debug_dumplog(void); +int portals_debug_init(unsigned long bufsize); +int portals_debug_cleanup(void); +int portals_debug_clear_buffer(void); +int portals_debug_mark_buffer(char *text); +int portals_debug_set_daemon(unsigned int cmd, unsigned int length, + char *file, unsigned int size); +__s32 portals_debug_copy_to_user(char *buf, unsigned long len); +#if (__GNUC__) +/* Use the special GNU C __attribute__ hack to have the compiler check the + * printf style argument string against the actual argument count and + * types. + */ +#ifdef printf +# warning printf has been defined as a macro... +# undef printf +#endif +void portals_debug_msg (int subsys, int mask, char *file, char *fn, int line, + unsigned long stack, const char *format, ...) + __attribute__ ((format (printf, 7, 8))); +#else +void portals_debug_msg (int subsys, int mask, char *file, char *fn, + int line, unsigned long stack, + const char *format, ...); +#endif /* __GNUC__ */ +void portals_debug_set_level(unsigned int debug_level); + +# define fprintf(a, format, b...) CDEBUG(D_OTHER, format , ## b) +# define printf(format, b...) 
CDEBUG(D_OTHER, format , ## b) +# define time(a) CURRENT_TIME + +extern void kportal_daemonize (char *name); +extern void kportal_blockallsigs (void); + +#else /* !__KERNEL__ */ +# include <stdio.h> +# include <stdlib.h> +#ifndef __CYGWIN__ +# include <stdint.h> +#endif +# include <unistd.h> +# include <time.h> +# include <asm/types.h> +# ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +# endif +# ifdef PORTAL_DEBUG +# undef NDEBUG +# include <assert.h> +# define LASSERT(e) assert(e) +# else +# define LASSERT(e) +# endif +# define printk(format, args...) printf (format, ## args) +# define PORTAL_ALLOC(ptr, size) do { (ptr) = malloc(size); } while (0); +# define PORTAL_FREE(a, b) do { free(a); } while (0); +# define portals_debug_msg(subsys, mask, file, fn, line, stack, format, a...) \ + printf ("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format, \ + (subsys) >> 24, (mask), (long)time(0), file, fn, line, \ + getpid() , stack, ## a); +#endif + +#ifndef CURRENT_TIME +# define CURRENT_TIME time(0) +#endif + +#include <linux/portals_lib.h> + +/* + * USER LEVEL STUFF BELOW + */ + +#define PORTAL_IOCTL_VERSION 0x00010007 +#define PING_SYNC 0 +#define PING_ASYNC 1 + +struct portal_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + __u64 ioc_nid; + __u64 ioc_nid2; + __u64 ioc_nid3; + __u32 ioc_count; + __u32 ioc_nal; + __u32 ioc_nal_cmd; + __u32 ioc_fd; + __u32 ioc_id; + + __u32 ioc_flags; + __u32 ioc_size; + + __u32 ioc_wait; + __u32 ioc_timeout; + __u32 ioc_misc; + + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + + __u32 ioc_plen1; /* buffers in userspace */ + char *ioc_pbuf1; + __u32 ioc_plen2; /* buffers in userspace */ + char *ioc_pbuf2; + + char ioc_bulk[0]; +}; + +struct portal_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +struct portals_debug_ioctl_data +{ + struct portal_ioctl_hdr hdr; + unsigned int subs; + unsigned int debug; +}; + +#define PORTAL_IOC_INIT(data) \ +do { \ + memset(&data, 0, sizeof(data)); 
\ + data.ioc_version = PORTAL_IOCTL_VERSION; \ + data.ioc_len = sizeof(data); \ +} while (0) + +/* FIXME check conflict with lustre_lib.h */ +#define PTL_IOC_DEBUG_MASK _IOWR('f', 250, long) + +static inline int portal_ioctl_packlen(struct portal_ioctl_data *data) +{ + int len = sizeof(*data); + len += size_round(data->ioc_inllen1); + len += size_round(data->ioc_inllen2); + return len; +} + +static inline int portal_ioctl_is_invalid(struct portal_ioctl_data *data) +{ + if (data->ioc_len > (1<<30)) { + CERROR ("PORTALS ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen1 > (1<<30)) { + CERROR ("PORTALS ioctl: ioc_inllen1 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen2 > (1<<30)) { + CERROR ("PORTALS ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inlbuf1 && !data->ioc_inllen1) { + CERROR ("PORTALS ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf2 && !data->ioc_inllen2) { + CERROR ("PORTALS ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf1 && !data->ioc_plen1) { + CERROR ("PORTALS ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf2 && !data->ioc_plen2) { + CERROR ("PORTALS ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_plen1 && !data->ioc_pbuf1) { + CERROR ("PORTALS ioctl: plen1 nonzero but no pbuf1 pointer\n"); + return 1; + } + if (data->ioc_plen2 && !data->ioc_pbuf2) { + CERROR ("PORTALS ioctl: plen2 nonzero but no pbuf2 pointer\n"); + return 1; + } + if (portal_ioctl_packlen(data) != data->ioc_len ) { + CERROR ("PORTALS ioctl: packlen != ioc_len\n"); + return 1; + } + if (data->ioc_inllen1 && + data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') { + CERROR ("PORTALS ioctl: inlbuf1 not 0 terminated\n"); + return 1; + } + if (data->ioc_inllen2 && + data->ioc_bulk[size_round(data->ioc_inllen1) + + data->ioc_inllen2 - 1] != '\0') { + CERROR ("PORTALS ioctl: inlbuf2 not 0 terminated\n"); + return 1; + 
} + return 0; +} + +#ifndef __KERNEL__ +static inline int portal_ioctl_pack(struct portal_ioctl_data *data, char **pbuf, + int max) +{ + char *ptr; + struct portal_ioctl_data *overlay; + data->ioc_len = portal_ioctl_packlen(data); + data->ioc_version = PORTAL_IOCTL_VERSION; + + if (*pbuf && portal_ioctl_packlen(data) > max) + return 1; + if (*pbuf == NULL) { + *pbuf = malloc(data->ioc_len); + } + if (!*pbuf) + return 1; + overlay = (struct portal_ioctl_data *)*pbuf; + memcpy(*pbuf, data, sizeof(*data)); + + ptr = overlay->ioc_bulk; + if (data->ioc_inlbuf1) + LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr); + if (data->ioc_inlbuf2) + LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr); + if (portal_ioctl_is_invalid(overlay)) + return 1; + + return 0; +} +#else +#include <asm/uaccess.h> + +/* buffer MUST be at least the size of portal_ioctl_hdr */ +static inline int portal_ioctl_getdata(char *buf, char *end, void *arg) +{ + struct portal_ioctl_hdr *hdr; + struct portal_ioctl_data *data; + int err; + ENTRY; + + hdr = (struct portal_ioctl_hdr *)buf; + data = (struct portal_ioctl_data *)buf; + + err = copy_from_user(buf, (void *)arg, sizeof(*hdr)); + if ( err ) { + EXIT; + return err; + } + + if (hdr->ioc_version != PORTAL_IOCTL_VERSION) { + CERROR ("PORTALS: version mismatch kernel vs application\n"); + return -EINVAL; + } + + if (hdr->ioc_len + buf >= end) { + CERROR ("PORTALS: user buffer exceeds kernel buffer\n"); + return -EINVAL; + } + + + if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) { + CERROR ("PORTALS: user buffer too small for ioctl\n"); + return -EINVAL; + } + + err = copy_from_user(buf, (void *)arg, hdr->ioc_len); + if ( err ) { + EXIT; + return err; + } + + if (portal_ioctl_is_invalid(data)) { + CERROR ("PORTALS: ioctl not correctly formatted\n"); + return -EINVAL; + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + + size_round(data->ioc_inllen1); + } + + 
EXIT; + return 0; +} +#endif + +/* ioctls for manipulating snapshots 30- */ +#define IOC_PORTAL_TYPE 'e' +#define IOC_PORTAL_MIN_NR 30 + +#define IOC_PORTAL_PING _IOWR('e', 30, long) +#define IOC_PORTAL_GET_DEBUG _IOWR('e', 31, long) +#define IOC_PORTAL_CLEAR_DEBUG _IOWR('e', 32, long) +#define IOC_PORTAL_MARK_DEBUG _IOWR('e', 33, long) +#define IOC_PORTAL_PANIC _IOWR('e', 34, long) +#define IOC_PORTAL_ADD_ROUTE _IOWR('e', 35, long) +#define IOC_PORTAL_DEL_ROUTE _IOWR('e', 36, long) +#define IOC_PORTAL_GET_ROUTE _IOWR('e', 37, long) +#define IOC_PORTAL_NAL_CMD _IOWR('e', 38, long) +#define IOC_PORTAL_GET_NID _IOWR('e', 39, long) +#define IOC_PORTAL_FAIL_NID _IOWR('e', 40, long) +#define IOC_PORTAL_SET_DAEMON _IOWR('e', 41, long) + +#define IOC_PORTAL_MAX_NR 41 + +enum { + QSWNAL = 1, + SOCKNAL, + GMNAL, + TOENAL, + TCPNAL, + SCIMACNAL, + NAL_ENUM_END_MARKER +}; + +#ifdef __KERNEL__ +extern ptl_handle_ni_t kqswnal_ni; +extern ptl_handle_ni_t ksocknal_ni; +extern ptl_handle_ni_t ktoenal_ni; +extern ptl_handle_ni_t kgmnal_ni; +extern ptl_handle_ni_t kscimacnal_ni; +#endif + +#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1) + +#define NAL_CMD_REGISTER_PEER_FD 100 +#define NAL_CMD_CLOSE_CONNECTION 101 +#define NAL_CMD_REGISTER_MYNID 102 +#define NAL_CMD_PUSH_CONNECTION 103 + +enum { + DEBUG_DAEMON_START = 1, + DEBUG_DAEMON_STOP = 2, + DEBUG_DAEMON_PAUSE = 3, + DEBUG_DAEMON_CONTINUE = 4, +}; + +/* XXX remove to lustre ASAP */ +struct lustre_peer { + ptl_nid_t peer_nid; + ptl_handle_ni_t peer_ni; +}; + +/* module.c */ +typedef int (*nal_cmd_handler_t)(struct portal_ioctl_data *, void * private); +int kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private); +int kportal_nal_unregister(int nal); + +ptl_handle_ni_t *kportal_get_ni (int nal); +void kportal_put_ni (int nal); + +#ifdef __CYGWIN__ +#ifndef BITS_PER_LONG +#if (~0UL) == 0xffffffffUL +#define BITS_PER_LONG 32 +#else +#define BITS_PER_LONG 64 +#endif +#endif +#endif + +#if (BITS_PER_LONG == 32 || 
__WORDSIZE == 32) +# define LPU64 "%Lu" +# define LPD64 "%Ld" +# define LPX64 "%#Lx" +# define LPSZ "%u" +# define LPSSZ "%d" +#endif +#if (BITS_PER_LONG == 64 || __WORDSIZE == 64) +# define LPU64 "%lu" +# define LPD64 "%ld" +# define LPX64 "%#lx" +# define LPSZ "%lu" +# define LPSSZ "%ld" +#endif +#ifndef LPU64 +# error "No word size defined" +#endif + +#endif diff --git a/lustre/portals/include/linux/portals_compat25.h b/lustre/portals/include/linux/portals_compat25.h new file mode 100644 index 0000000..e28fbac --- /dev/null +++ b/lustre/portals/include/linux/portals_compat25.h @@ -0,0 +1,13 @@ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) || defined(CONFIG_RH_2_4_20) +# define SIGNAL_MASK_LOCK(task, flags) \ + spin_lock_irqsave(&task->sighand->siglock, flags) +# define SIGNAL_MASK_UNLOCK(task, flags) \ + spin_unlock_irqrestore(&task->sighand->siglock, flags) +# define RECALC_SIGPENDING recalc_sigpending() +#else +# define SIGNAL_MASK_LOCK(task, flags) \ + spin_lock_irqsave(&task->sigmask_lock, flags) +# define SIGNAL_MASK_UNLOCK(task, flags) \ + spin_unlock_irqrestore(&task->sigmask_lock, flags) +# define RECALC_SIGPENDING recalc_sigpending(current) +#endif diff --git a/lustre/portals/include/linux/portals_lib.h b/lustre/portals/include/linux/portals_lib.h new file mode 100644 index 0000000..a528a80 --- /dev/null +++ b/lustre/portals/include/linux/portals_lib.h @@ -0,0 +1,188 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com> + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Basic library routines. + * + */ + +#ifndef _PORTALS_LIB_H +#define _PORTALS_LIB_H + +#ifndef __KERNEL__ +# include <string.h> +#else +# include <asm/types.h> +#endif + +#undef MIN +#define MIN(a,b) (((a)<(b)) ? (a): (b)) +#undef MAX +#define MAX(a,b) (((a)>(b)) ? (a): (b)) +#define MKSTR(ptr) ((ptr))? (ptr) : "" + +static inline int size_round (int val) +{ + return (val + 7) & (~0x7); +} + +static inline int size_round0(int val) +{ + if (!val) + return 0; + return (val + 1 + 7) & (~0x7); +} + +static inline size_t round_strlen(char *fset) +{ + return size_round(strlen(fset) + 1); +} + +#ifdef __KERNEL__ +static inline char *strdup(const char *str) +{ + int len = strlen(str) + 1; + char *tmp = kmalloc(len, GFP_KERNEL); + if (tmp) + memcpy(tmp, str, len); + + return tmp; +} +#endif + +#ifdef __KERNEL__ +# define NTOH__u32(var) le32_to_cpu(var) +# define NTOH__u64(var) le64_to_cpu(var) +# define HTON__u32(var) cpu_to_le32(var) +# define HTON__u64(var) cpu_to_le64(var) +#else +# define expansion_u64(var) \ + ({ __u64 ret; \ + switch (sizeof(var)) { \ + case 8: (ret) = (var); break; \ + case 4: (ret) = (__u32)(var); break; \ + case 2: (ret) = (__u16)(var); break; \ + case 1: (ret) = (__u8)(var); break; \ + }; \ + (ret); \ + }) +# define NTOH__u32(var) (var) +# define NTOH__u64(var) (expansion_u64(var)) +# define HTON__u32(var) (var) +# define HTON__u64(var) (expansion_u64(var)) +#endif + +/* + * copy sizeof(type) bytes from pointer to var and move ptr forward. 
+ * return EFAULT if pointer goes beyond end + */ +#define UNLOGV(var,type,ptr,end) \ +do { \ + var = *(type *)ptr; \ + ptr += sizeof(type); \ + if (ptr > end ) \ + return -EFAULT; \ +} while (0) + +/* the following two macros convert to little endian */ +/* type MUST be __u32 or __u64 */ +#define LUNLOGV(var,type,ptr,end) \ +do { \ + var = NTOH##type(*(type *)ptr); \ + ptr += sizeof(type); \ + if (ptr > end ) \ + return -EFAULT; \ +} while (0) + +/* now log values */ +#define LOGV(var,type,ptr) \ +do { \ + *((type *)ptr) = var; \ + ptr += sizeof(type); \ +} while (0) + +/* and in network order */ +#define LLOGV(var,type,ptr) \ +do { \ + *((type *)ptr) = HTON##type(var); \ + ptr += sizeof(type); \ +} while (0) + + +/* + * set var to point at (type *)ptr, move ptr forward with sizeof(type) + * return from function with EFAULT if ptr goes beyond end + */ +#define UNLOGP(var,type,ptr,end) \ +do { \ + var = (type *)ptr; \ + ptr += sizeof(type); \ + if (ptr > end ) \ + return -EFAULT; \ +} while (0) + +#define LOGP(var,type,ptr) \ +do { \ + memcpy(ptr, var, sizeof(type)); \ + ptr += sizeof(type); \ +} while (0) + +/* + * set var to point at (char *)ptr, move ptr forward by size_round(len); + * return from function with EFAULT if ptr goes beyond end + */ +#define UNLOGL(var,type,len,ptr,end) \ +do { \ + var = (type *)ptr; \ + ptr += size_round(len * sizeof(type)); \ + if (ptr > end ) \ + return -EFAULT; \ +} while (0) + +#define UNLOGL0(var,type,len,ptr,end) \ +do { \ + UNLOGL(var,type,len,ptr,end); \ + if ( *((char *)ptr - size_round(len) + len - 1) != '\0') \ + return -EFAULT; \ +} while (0) + +#define LOGL(var,len,ptr) \ +do { \ + if (var) \ + memcpy((char *)ptr, (const char *)var, len); \ + ptr += size_round(len); \ +} while (0) + +#define LOGU(var,len,ptr) \ +do { \ + if (var) \ + memcpy((char *)var, (const char *)ptr, len); \ + ptr += size_round(len); \ +} while (0) + +#define LOGL0(var,len,ptr) \ +do { \ + if (!len) \ + break; \ + memcpy((char *)ptr, (const char 
*)var, len); \ + *((char *)(ptr) + len) = 0; \ + ptr += size_round(len + 1); \ +} while (0) + +#endif /* _PORTALS_LIB_H */ diff --git a/lustre/portals/include/portals/Makefile.am b/lustre/portals/include/portals/Makefile.am new file mode 100644 index 0000000..c61b084 --- /dev/null +++ b/lustre/portals/include/portals/Makefile.am @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +SUBDIRS = base +include $(top_srcdir)/Rules + +pkginclude_HEADERS=api-support.h api.h arg-blocks.h defines.h errno.h internal.h lib-dispatch.h lib-nal.h lib-p30.h lib-types.h myrnal.h nal.h p30.h ppid.h ptlctl.h stringtab.h types.h nalids.h list.h bridge.h ipmap.h procbridge.h lltrace.h + diff --git a/lustre/portals/include/portals/api-support.h b/lustre/portals/include/portals/api-support.h new file mode 100644 index 0000000..af4a2dc --- /dev/null +++ b/lustre/portals/include/portals/api-support.h @@ -0,0 +1,27 @@ +# define DEBUG_SUBSYSTEM S_PORTALS +# define PORTAL_DEBUG + +#ifndef __KERNEL__ +# include <stdio.h> +# include <stdlib.h> +# include <unistd.h> +# include <time.h> + +/* Lots of POSIX dependencies to support PtlEQWait_timeout */ +# include <signal.h> +# include <setjmp.h> +# include <time.h> +#endif + +#include <portals/types.h> +#include <linux/kp30.h> +#include <portals/p30.h> + +#include <portals/internal.h> +#include <portals/nal.h> +#include <portals/arg-blocks.h> + +/* Hack for 2.4.18 macro name collision */ +#ifdef yield +#undef yield +#endif diff --git a/lustre/portals/include/portals/api.h b/lustre/portals/include/portals/api.h new file mode 100644 index 0000000..a83749b --- /dev/null +++ b/lustre/portals/include/portals/api.h @@ -0,0 +1,159 @@ +#ifndef P30_API_H +#define P30_API_H + +#include <portals/types.h> + +#ifndef PTL_NO_WRAP +int PtlInit(void); +int PtlInitialized(void); +void PtlFini(void); + +int PtlNIInit(ptl_interface_t interface, 
ptl_pt_index_t ptl_size_in, + ptl_ac_index_t acl_size_in, ptl_pid_t requested_pid, + ptl_handle_ni_t * interface_out); + +int PtlNIInitialized(ptl_interface_t); + +int PtlNIFini(ptl_handle_ni_t interface_in); + +#endif + +int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id); + + +/* + * Network interfaces + */ + +#ifndef PTL_NO_WRAP +int PtlNIBarrier(ptl_handle_ni_t interface_in); +#endif + +int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, + ptl_sr_value_t * status_out); + +int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, + unsigned long *distance_out); + +#ifndef PTL_NO_WRAP +int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out); +#endif + + +/* + * PtlNIDebug: + * + * This is not an official Portals 3 API call. It is provided + * by the reference implementation to allow the maintainers an + * easy way to turn on and off debugging information in the + * library. Do not use it in code that is not intended for use + * with any version other than the portable reference library. + */ +unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in); + +/* + * PtlNIFailNid + * + * Not an official Portals 3 API call. It provides a way of simulating + * communications failures to all (nid == PTL_NID_ANY), or specific peers + * (via multiple calls), either until further notice (threshold == -1), or + * for a specific number of messages. Passing a threshold of zero, "heals" + * the given peer. 
+ */ +int PtlFailNid (ptl_handle_ni_t ni, ptl_nid_t nid, unsigned int threshold); + + +/* + * Match entries + */ + +int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in, + ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in, + ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in, + ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out); + +int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, + ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in, + ptl_unlink_t unlink_in, ptl_ins_pos_t position_in, + ptl_handle_me_t * handle_out); + +int PtlMEUnlink(ptl_handle_me_t current_in); + +int PtlMEUnlinkList(ptl_handle_me_t current_in); + +int PtlTblDump(ptl_handle_ni_t ni, int index_in); +int PtlMEDump(ptl_handle_me_t current_in); + + + +/* + * Memory descriptors + */ + +#ifndef PTL_NO_WRAP +int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in, + ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out); + +int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, + ptl_handle_md_t * handle_out); + +int PtlMDUnlink(ptl_handle_md_t md_in); + +int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout, + ptl_md_t * new_inout, ptl_handle_eq_t testq_in); + +#endif + +/* These should not be called by users */ +int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout, + ptl_md_t * new_inout, ptl_handle_eq_t testq_in, + ptl_seq_t sequence_in); + + + + +/* + * Event queues + */ +#ifndef PTL_NO_WRAP + +/* These should be called by users */ +int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in, + int (*callback) (ptl_event_t * event), + ptl_handle_eq_t * handle_out); +int PtlEQFree(ptl_handle_eq_t eventq_in); + +int PtlEQCount(ptl_handle_eq_t eventq_in, ptl_size_t * count_out); + +int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); + + +int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); + +int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out, + int timeout); 
+#endif + +/* + * Access Control Table + */ +int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, + ptl_process_id_t match_id_in, ptl_pt_index_t portal_in); + + +/* + * Data movement + */ + +int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, + ptl_process_id_t target_in, ptl_pt_index_t portal_in, + ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in, + ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in); + +int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, + ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in, + ptl_match_bits_t match_bits_in, ptl_size_t offset_in); + + + +#endif diff --git a/lustre/portals/include/portals/arg-blocks.h b/lustre/portals/include/portals/arg-blocks.h new file mode 100644 index 0000000..3c3b154 --- /dev/null +++ b/lustre/portals/include/portals/arg-blocks.h @@ -0,0 +1,265 @@ +#ifndef PTL_BLOCKS_H +#define PTL_BLOCKS_H + +/* + * blocks.h + * + * Argument block types for the Portals 3.0 library + * Generated by idl + * + */ + +#include <portals/types.h> + +/* put LIB_MAX_DISPATCH last here -- these must match the + assignments to the dispatch table in lib-p30/dispatch.c */ +#define PTL_GETID 1 +#define PTL_NISTATUS 2 +#define PTL_NIDIST 3 +#define PTL_NIDEBUG 4 +#define PTL_MEATTACH 5 +#define PTL_MEINSERT 6 +// #define PTL_MEPREPEND 7 +#define PTL_MEUNLINK 8 +#define PTL_TBLDUMP 9 +#define PTL_MEDUMP 10 +#define PTL_MDATTACH 11 +// #define PTL_MDINSERT 12 +#define PTL_MDBIND 13 +#define PTL_MDUPDATE 14 +#define PTL_MDUNLINK 15 +#define PTL_EQALLOC 16 +#define PTL_EQFREE 17 +#define PTL_ACENTRY 18 +#define PTL_PUT 19 +#define PTL_GET 20 +#define PTL_FAILNID 21 +#define LIB_MAX_DISPATCH 21 + +typedef struct PtlFailNid_in { + ptl_handle_ni_t interface; + ptl_nid_t nid; + unsigned int threshold; +} PtlFailNid_in; + +typedef struct PtlFailNid_out { + int rc; +} PtlFailNid_out; + +typedef struct PtlGetId_in { + ptl_handle_ni_t handle_in; +} PtlGetId_in; + +typedef struct PtlGetId_out { + int rc; + 
ptl_process_id_t id_out; +} PtlGetId_out; + +typedef struct PtlNIStatus_in { + ptl_handle_ni_t interface_in; + ptl_sr_index_t register_in; +} PtlNIStatus_in; + +typedef struct PtlNIStatus_out { + int rc; + ptl_sr_value_t status_out; +} PtlNIStatus_out; + + +typedef struct PtlNIDist_in { + ptl_handle_ni_t interface_in; + ptl_process_id_t process_in; +} PtlNIDist_in; + +typedef struct PtlNIDist_out { + int rc; + unsigned long distance_out; +} PtlNIDist_out; + + +typedef struct PtlNIDebug_in { + unsigned int mask_in; +} PtlNIDebug_in; + +typedef struct PtlNIDebug_out { + unsigned int rc; +} PtlNIDebug_out; + + +typedef struct PtlMEAttach_in { + ptl_handle_ni_t interface_in; + ptl_pt_index_t index_in; + ptl_ins_pos_t position_in; + ptl_process_id_t match_id_in; + ptl_match_bits_t match_bits_in; + ptl_match_bits_t ignore_bits_in; + ptl_unlink_t unlink_in; +} PtlMEAttach_in; + +typedef struct PtlMEAttach_out { + int rc; + ptl_handle_me_t handle_out; +} PtlMEAttach_out; + + +typedef struct PtlMEInsert_in { + ptl_handle_me_t current_in; + ptl_process_id_t match_id_in; + ptl_match_bits_t match_bits_in; + ptl_match_bits_t ignore_bits_in; + ptl_unlink_t unlink_in; + ptl_ins_pos_t position_in; +} PtlMEInsert_in; + +typedef struct PtlMEInsert_out { + int rc; + ptl_handle_me_t handle_out; +} PtlMEInsert_out; + +typedef struct PtlMEUnlink_in { + ptl_handle_me_t current_in; + ptl_unlink_t unlink_in; +} PtlMEUnlink_in; + +typedef struct PtlMEUnlink_out { + int rc; +} PtlMEUnlink_out; + + +typedef struct PtlTblDump_in { + int index_in; +} PtlTblDump_in; + +typedef struct PtlTblDump_out { + int rc; +} PtlTblDump_out; + + +typedef struct PtlMEDump_in { + ptl_handle_me_t current_in; +} PtlMEDump_in; + +typedef struct PtlMEDump_out { + int rc; +} PtlMEDump_out; + + +typedef struct PtlMDAttach_in { + ptl_handle_me_t me_in; + ptl_handle_eq_t eq_in; + ptl_md_t md_in; + ptl_unlink_t unlink_in; +} PtlMDAttach_in; + +typedef struct PtlMDAttach_out { + int rc; + ptl_handle_md_t handle_out; +} 
PtlMDAttach_out; + + +typedef struct PtlMDBind_in { + ptl_handle_ni_t ni_in; + ptl_handle_eq_t eq_in; + ptl_md_t md_in; +} PtlMDBind_in; + +typedef struct PtlMDBind_out { + int rc; + ptl_handle_md_t handle_out; +} PtlMDBind_out; + + +typedef struct PtlMDUpdate_internal_in { + ptl_handle_md_t md_in; + ptl_handle_eq_t testq_in; + ptl_seq_t sequence_in; + + ptl_md_t old_inout; + int old_inout_valid; + ptl_md_t new_inout; + int new_inout_valid; +} PtlMDUpdate_internal_in; + +typedef struct PtlMDUpdate_internal_out { + int rc; + ptl_md_t old_inout; + ptl_md_t new_inout; +} PtlMDUpdate_internal_out; + + +typedef struct PtlMDUnlink_in { + ptl_handle_md_t md_in; +} PtlMDUnlink_in; + +typedef struct PtlMDUnlink_out { + int rc; + ptl_md_t status_out; +} PtlMDUnlink_out; + + +typedef struct PtlEQAlloc_in { + ptl_handle_ni_t ni_in; + ptl_size_t count_in; + void *base_in; + int len_in; + int (*callback_in) (ptl_event_t * event); +} PtlEQAlloc_in; + +typedef struct PtlEQAlloc_out { + int rc; + ptl_handle_eq_t handle_out; +} PtlEQAlloc_out; + + +typedef struct PtlEQFree_in { + ptl_handle_eq_t eventq_in; +} PtlEQFree_in; + +typedef struct PtlEQFree_out { + int rc; +} PtlEQFree_out; + + +typedef struct PtlACEntry_in { + ptl_handle_ni_t ni_in; + ptl_ac_index_t index_in; + ptl_process_id_t match_id_in; + ptl_pt_index_t portal_in; +} PtlACEntry_in; + +typedef struct PtlACEntry_out { + int rc; +} PtlACEntry_out; + + +typedef struct PtlPut_in { + ptl_handle_md_t md_in; + ptl_ack_req_t ack_req_in; + ptl_process_id_t target_in; + ptl_pt_index_t portal_in; + ptl_ac_index_t cookie_in; + ptl_match_bits_t match_bits_in; + ptl_size_t offset_in; + ptl_hdr_data_t hdr_data_in; +} PtlPut_in; + +typedef struct PtlPut_out { + int rc; +} PtlPut_out; + + +typedef struct PtlGet_in { + ptl_handle_md_t md_in; + ptl_process_id_t target_in; + ptl_pt_index_t portal_in; + ptl_ac_index_t cookie_in; + ptl_match_bits_t match_bits_in; + ptl_size_t offset_in; +} PtlGet_in; + +typedef struct PtlGet_out { + int rc; 
+} PtlGet_out; + + +#endif diff --git a/lustre/portals/include/portals/defines.h b/lustre/portals/include/portals/defines.h new file mode 100644 index 0000000..785ce73 --- /dev/null +++ b/lustre/portals/include/portals/defines.h @@ -0,0 +1,116 @@ +/* +** +** This file contains definitions that are used throughout the cplant code. +*/ + +#ifndef CPLANT_H +#define CPLANT_H + +#define TITLE(fname,zmig) + + +/* +** TRUE and FALSE +*/ +#undef TRUE +#define TRUE (1) +#undef FALSE +#define FALSE (0) + + +/* +** Return codes from functions +*/ +#undef OK +#define OK (0) +#undef ERROR +#define ERROR (-1) + + + +/* +** MAX()/MIN() for all arithmetic types; arguments may be evaluated twice. +*/ +#ifndef MAX +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif /* MAX */ + +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif /* MIN */ + +/* +** The rest is from the old qkdefs.h +*/ + +#ifndef __linux__ +#define __inline__ +#endif + +#ifndef NULL +#define NULL ((void *)0) +#endif + +#ifndef __osf__ +#define PRIVATE static +#define PUBLIC +#endif + +#ifndef __osf__ +typedef unsigned char uchar; +#endif + +typedef char CHAR; +typedef unsigned char UCHAR; +typedef char INT8; +typedef unsigned char UINT8; +typedef short int INT16; +typedef unsigned short int UINT16; +typedef int INT32; +typedef unsigned int UINT32; +typedef long LONG32; +typedef unsigned long ULONG32; + +/* long may be 32 or 64, so we can't really append the size to the definition */ +typedef long LONG; +typedef unsigned long ULONG; + +#ifdef __alpha__ +typedef long int_t; +#ifndef __osf__ +typedef unsigned long uint_t; +#endif +#endif + +#ifdef __i386__ +typedef int int_t; +typedef unsigned int uint_t; +#endif + +typedef float FLOAT32; +typedef double FLOAT64; +typedef void VOID; +typedef INT32 BOOLEAN; +typedef void (*FCN_PTR)(void); + +#ifndef off64_t + +#if defined (__alpha__) || defined (__ia64__) +typedef long off64_t; +#else +typedef long long off64_t; +#endif + +#endif + +/* +** Process related 
typedefs +*/ +typedef UINT16 PID_TYPE; /* Type of Local process ID */ +typedef UINT16 NID_TYPE; /* Type of Physical node ID */ +typedef UINT16 GID_TYPE; /* Type of Group ID */ +typedef UINT16 RANK_TYPE; /* Type of Logical rank/process within a group */ + + + +#endif /* CPLANT_H */ diff --git a/lustre/portals/include/portals/errno.h b/lustre/portals/include/portals/errno.h new file mode 100644 index 0000000..817936a --- /dev/null +++ b/lustre/portals/include/portals/errno.h @@ -0,0 +1,61 @@ +#ifndef _P30_ERRNO_H_ +#define _P30_ERRNO_H_ + +/* + * include/portals/errno.h + * + * Shared error number lists + */ + +/* If you change these, you must update the string table in api-errno.c */ +typedef enum { + PTL_OK = 0, + PTL_SEGV = 1, + + PTL_NOSPACE = 2, + PTL_INUSE = 3, + PTL_VAL_FAILED = 4, + + PTL_NAL_FAILED = 5, + PTL_NOINIT = 6, + PTL_INIT_DUP = 7, + PTL_INIT_INV = 8, + PTL_AC_INV_INDEX = 9, + + PTL_INV_ASIZE = 10, + PTL_INV_HANDLE = 11, + PTL_INV_MD = 12, + PTL_INV_ME = 13, + PTL_INV_NI = 14, +/* If you change these, you must update the string table in api-errno.c */ + PTL_ILL_MD = 15, + PTL_INV_PROC = 16, + PTL_INV_PSIZE = 17, + PTL_INV_PTINDEX = 18, + PTL_INV_REG = 19, + + PTL_INV_SR_INDX = 20, + PTL_ML_TOOLONG = 21, + PTL_ADDR_UNKNOWN = 22, + PTL_INV_EQ = 23, + PTL_EQ_DROPPED = 24, + + PTL_EQ_EMPTY = 25, + PTL_NOUPDATE = 26, + PTL_FAIL = 27, + PTL_NOT_IMPLEMENTED = 28, + PTL_NO_ACK = 29, + + PTL_IOV_TOO_MANY = 30, + PTL_IOV_TOO_SMALL = 31, + + PTL_EQ_INUSE = 32, + PTL_MD_INUSE = 33, + + PTL_MAX_ERRNO = 33 +} ptl_err_t; +/* If you change these, you must update the string table in api-errno.c */ + +extern const char *ptl_err_str[]; + +#endif diff --git a/lustre/portals/include/portals/internal.h b/lustre/portals/include/portals/internal.h new file mode 100644 index 0000000..e69de29 diff --git a/lustre/portals/include/portals/lib-dispatch.h b/lustre/portals/include/portals/lib-dispatch.h new file mode 100644 index 0000000..f87ff83 --- /dev/null +++ 
b/lustre/portals/include/portals/lib-dispatch.h @@ -0,0 +1,45 @@ +#ifndef PTL_DISPATCH_H +#define PTL_DISPATCH_H + +/* + * include/dispatch.h + * + * Dispatch table header and externs for remote side + * operations + * + * Generated by idl + * + */ + +#include <portals/lib-p30.h> +#include <portals/arg-blocks.h> + +extern int do_PtlGetId(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlNIStatus(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlNIDist(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlNIDebug(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEAttach(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEInsert(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEPrepend(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlTblDump(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMEDump(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlMDAttach(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMDBind(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *args, + void *ret); +extern int do_PtlPut(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlGet(nal_cb_t * nal, void *private, void *args, void *ret); +extern int do_PtlFailNid (nal_cb_t *nal, void *private, void *args, void *ret); + +extern char *dispatch_name(int index); +#endif diff --git a/lustre/portals/include/portals/lib-nal.h 
b/lustre/portals/include/portals/lib-nal.h new file mode 100644 index 0000000..4052c0c --- /dev/null +++ b/lustre/portals/include/portals/lib-nal.h @@ -0,0 +1,102 @@ +#ifndef _LIB_NAL_H_ +#define _LIB_NAL_H_ + +/* + * nal.h + * + * Library side headers that define the abstraction layer's + * responsibilities and interfaces + */ + +#include <portals/lib-types.h> + +struct nal_cb_t { + /* + * Per interface portal table, access control table + * and NAL private data field; + */ + lib_ni_t ni; + void *nal_data; + /* + * send: Sends a preformatted header and user data to a + * specified remote process. + * Can overwrite iov. + */ + int (*cb_send) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, struct iovec *iov, size_t mlen); + + /* as send, but with a set of page fragments (NULL if not supported) */ + int (*cb_send_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int niov, ptl_kiov_t *iov, size_t mlen); + /* + * recv: Receives an incoming message from a remote process + * Type of iov depends on options. Can overwrite iov. 
+ */ + int (*cb_recv) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + unsigned int niov, struct iovec *iov, size_t mlen, + size_t rlen); + + /* as recv, but with a set of page fragments (NULL if not supported) */ + int (*cb_recv_pages) (nal_cb_t * nal, void *private, lib_msg_t * cookie, + unsigned int niov, ptl_kiov_t *iov, size_t mlen, + size_t rlen); + /* + * read: Reads a block of data from a specified user address + */ + int (*cb_read) (nal_cb_t * nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len); + + /* + * write: Writes a block of data into a specified user address + */ + int (*cb_write) (nal_cb_t * nal, void *private, user_ptr dst_addr, + void *src_addr, size_t len); + + /* + * callback: Calls an event callback + */ + int (*cb_callback) (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev); + + /* + * malloc: Acquire a block of memory in a system independent + * fashion. + */ + void *(*cb_malloc) (nal_cb_t * nal, size_t len); + + void (*cb_free) (nal_cb_t * nal, void *buf, size_t len); + + /* + * (un)map: Tell the NAL about some memory it will access. + * *addrkey passed to cb_unmap() is what cb_map() set it to. + * type of *iov depends on options. + * Set to NULL if not required. 
+ */ + int (*cb_map) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, + void **addrkey); + void (*cb_unmap) (nal_cb_t * nal, unsigned int niov, struct iovec *iov, + void **addrkey); + + /* as (un)map, but with a set of page fragments */ + int (*cb_map_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + void (*cb_unmap_pages) (nal_cb_t * nal, unsigned int niov, ptl_kiov_t *iov, + void **addrkey); + + void (*cb_printf) (nal_cb_t * nal, const char *fmt, ...); + + /* Turn interrupts off (begin of protected area) */ + void (*cb_cli) (nal_cb_t * nal, unsigned long *flags); + + /* Turn interrupts on (end of protected area) */ + void (*cb_sti) (nal_cb_t * nal, unsigned long *flags); + + /* + * Calculate a network "distance" to given node + */ + int (*cb_dist) (nal_cb_t * nal, ptl_nid_t nid, unsigned long *dist); +}; + +#endif diff --git a/lustre/portals/include/portals/lib-p30.h b/lustre/portals/include/portals/lib-p30.h new file mode 100644 index 0000000..b623b93 --- /dev/null +++ b/lustre/portals/include/portals/lib-p30.h @@ -0,0 +1,385 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib-p30.h + * + * Top level include for library side routines + */ + +#ifndef _LIB_P30_H_ +#define _LIB_P30_H_ + +#ifdef __KERNEL__ +# include <asm/page.h> +# include <linux/string.h> +#else +# include <portals/list.h> +# include <string.h> +#endif +#include <portals/types.h> +#include <linux/kp30.h> +#include <portals/p30.h> +#include <portals/errno.h> +#include <portals/lib-types.h> +#include <portals/lib-nal.h> +#include <portals/lib-dispatch.h> + +static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) +{ + return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie && + wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie); +} + +#ifdef __KERNEL__ +#define state_lock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "taking state lock\n"); \ + 
nal->cb_cli(nal, flagsp); \ +} while (0) + +#define state_unlock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "releasing state lock\n"); \ + nal->cb_sti(nal, flagsp); \ +} while (0) +#else +/* not needed in user space until we thread there */ +#define state_lock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "taking state lock\n"); \ + CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \ +} while (0) + +#define state_unlock(nal,flagsp) \ +do { \ + CDEBUG(D_PORTALS, "releasing state lock\n"); \ + CDEBUG(D_PORTALS, "%p:%p\n", nal, flagsp); \ +} while (0) +#endif /* __KERNEL__ */ + +#ifndef PTL_USE_SLAB_CACHE + +#define MAX_MES 2048 +#define MAX_MDS 2048 +#define MAX_MSGS 2048 /* Outstanding messages */ +#define MAX_EQS 512 + +extern int lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int nobj, int objsize); +extern void lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl); + +static inline void * +lib_freelist_alloc (lib_freelist_t *fl) +{ + /* ALWAYS called with statelock held */ + lib_freeobj_t *o; + + if (list_empty (&fl->fl_list)) + return (NULL); + + o = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list); + list_del (&o->fo_list); + return ((void *)&o->fo_contents); +} + +static inline void +lib_freelist_free (lib_freelist_t *fl, void *obj) +{ + /* ALWAYS called with statelock held */ + lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents); + + list_add (&o->fo_list, &fl->fl_list); +} + + +static inline lib_eq_t * +lib_eq_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_eq_t *eq; + + state_lock (nal, &flags); + eq = (lib_eq_t *)lib_freelist_alloc (&nal->ni.ni_free_eqs); + state_unlock (nal, &flags); + + return (eq); +} + +static inline void +lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_eqs, eq); +} + +static inline lib_md_t * +lib_md_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_md_t *md; + + state_lock (nal, 
&flags); + md = (lib_md_t *)lib_freelist_alloc (&nal->ni.ni_free_mds); + state_unlock (nal, &flags); + + return (md); +} + +static inline void +lib_md_free (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_mds, md); +} + +static inline lib_me_t * +lib_me_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + unsigned long flags; + lib_me_t *me; + + state_lock (nal, &flags); + me = (lib_me_t *)lib_freelist_alloc (&nal->ni.ni_free_mes); + state_unlock (nal, &flags); + + return (me); +} + +static inline void +lib_me_free (nal_cb_t *nal, lib_me_t *me) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_mes, me); +} + +static inline lib_msg_t * +lib_msg_alloc (nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + return ((lib_msg_t *)lib_freelist_alloc (&nal->ni.ni_free_msgs)); +} + +static inline void +lib_msg_free (nal_cb_t *nal, lib_msg_t *msg) +{ + /* ALWAYS called with statelock held */ + lib_freelist_free (&nal->ni.ni_free_msgs, msg); +} + +#else + +extern kmem_cache_t *ptl_md_slab; +extern kmem_cache_t *ptl_msg_slab; +extern kmem_cache_t *ptl_me_slab; +extern kmem_cache_t *ptl_eq_slab; +extern atomic_t md_in_use_count; +extern atomic_t msg_in_use_count; +extern atomic_t me_in_use_count; +extern atomic_t eq_in_use_count; + +static inline lib_eq_t * +lib_eq_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_eq_t *eq = kmem_cache_alloc(ptl_eq_slab, GFP_NOFS); + + if (eq == NULL) + return (NULL); + + atomic_inc (&eq_in_use_count); + return (eq); +} + +static inline void +lib_eq_free (nal_cb_t *nal, lib_eq_t *eq) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&eq_in_use_count); + kmem_cache_free(ptl_eq_slab, eq); +} + +static inline lib_md_t * +lib_md_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_md_t *md = kmem_cache_alloc(ptl_md_slab, GFP_NOFS); + + if (md == NULL) + return (NULL); + + 
atomic_inc (&md_in_use_count); + return (md); +} + +static inline void +lib_md_free (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&md_in_use_count); + kmem_cache_free(ptl_md_slab, md); +} + +static inline lib_me_t * +lib_me_alloc (nal_cb_t *nal) +{ + /* NEVER called with statelock held */ + lib_me_t *me = kmem_cache_alloc(ptl_me_slab, GFP_NOFS); + + if (me == NULL) + return (NULL); + + atomic_inc (&me_in_use_count); + return (me); +} + +static inline void +lib_me_free(nal_cb_t *nal, lib_me_t *me) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&me_in_use_count); + kmem_cache_free(ptl_me_slab, me); +} + +static inline lib_msg_t * +lib_msg_alloc(nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_msg_t *msg = kmem_cache_alloc(ptl_msg_slab, GFP_ATOMIC); + + if (msg == NULL) + return (NULL); + + atomic_inc (&msg_in_use_count); + return (msg); +} + +static inline void +lib_msg_free(nal_cb_t *nal, lib_msg_t *msg) +{ + /* ALWAYS called with statelock held */ + atomic_dec (&msg_in_use_count); + kmem_cache_free(ptl_msg_slab, msg); +} +#endif + +extern lib_handle_t *lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type); +extern void lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type); +extern void lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh); + +static inline void +ptl_eq2handle (ptl_handle_eq_t *handle, lib_eq_t *eq) +{ + handle->cookie = eq->eq_lh.lh_cookie; +} + +static inline lib_eq_t * +ptl_handle2eq (ptl_handle_eq_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, + PTL_COOKIE_TYPE_EQ); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_eq_t, eq_lh)); +} + +static inline void +ptl_md2handle (ptl_handle_md_t *handle, lib_md_t *md) +{ + handle->cookie = md->md_lh.lh_cookie; +} + +static inline lib_md_t * +ptl_handle2md (ptl_handle_md_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called 
with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, + PTL_COOKIE_TYPE_MD); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_md_t, md_lh)); +} + +static inline lib_md_t * +ptl_wire_handle2md (ptl_handle_wire_t *wh, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh; + + if (wh->wh_interface_cookie != nal->ni.ni_interface_cookie) + return (NULL); + + lh = lib_lookup_cookie (nal, wh->wh_object_cookie, + PTL_COOKIE_TYPE_MD); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_md_t, md_lh)); +} + +static inline void +ptl_me2handle (ptl_handle_me_t *handle, lib_me_t *me) +{ + handle->cookie = me->me_lh.lh_cookie; +} + +static inline lib_me_t * +ptl_handle2me (ptl_handle_me_t *handle, nal_cb_t *nal) +{ + /* ALWAYS called with statelock held */ + lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, + PTL_COOKIE_TYPE_ME); + if (lh == NULL) + return (NULL); + + return (lh_entry (lh, lib_me_t, me_lh)); +} + +extern int lib_init(nal_cb_t * cb, ptl_nid_t nid, ptl_pid_t pid, int gsize, + ptl_pt_index_t tbl_size, ptl_ac_index_t ac_size); +extern int lib_fini(nal_cb_t * cb); +extern void lib_dispatch(nal_cb_t * cb, void *private, int index, + void *arg_block, void *ret_block); +extern char *dispatch_name(int index); + +/* + * When the NAL detects an incoming message, it should call + * lib_parse() to decode it. The NAL callbacks will be handed + * the private cookie as a way for the NAL to maintain state + * about which transaction is being processed. An extra parameter, + * lib_cookie, will contain the necessary information for + * finalizing the message. + * + * After it has finished handling the message, it should + * call lib_finalize() with the lib_cookie parameter. + * Callbacks will be made to write events, send acks or + * replies and so on. 
+ */ +extern int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private); +extern int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t * msg); +extern void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr); + +extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); +extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len); +extern void lib_copy_buf2iov (int niov, struct iovec *iov, char *src, ptl_size_t len); + +extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov); +extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *iov, ptl_size_t len); +extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *iov, char *src, ptl_size_t len); +extern void lib_assert_wire_constants (void); + +extern void lib_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, + ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); +extern int lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + lib_md_t *md, ptl_size_t offset, ptl_size_t len); + +extern void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md_in, + ptl_md_t * md_out); +extern void lib_md_unlink(nal_cb_t * nal, lib_md_t * md_in); +extern void lib_me_unlink(nal_cb_t * nal, lib_me_t * me_in); +#endif diff --git a/lustre/portals/include/portals/lib-types.h b/lustre/portals/include/portals/lib-types.h new file mode 100644 index 0000000..47c0dd2 --- /dev/null +++ b/lustre/portals/include/portals/lib-types.h @@ -0,0 +1,282 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * p30/lib-types.h + * + * Types used by the library side routines that do not need to be + * exposed to the user application + */ + +#ifndef _LIB_TYPES_H_ +#define _LIB_TYPES_H_ + +#include <portals/types.h> +#ifdef __KERNEL__ +# define PTL_USE_SLAB_CACHE +# include <linux/uio.h> +# include <linux/smp_lock.h> +# include <linux/types.h> +#else +# include <sys/types.h> +#endif + +/* 
struct nal_cb_t is defined in lib-nal.h */ +typedef struct nal_cb_t nal_cb_t; + +typedef char *user_ptr; +typedef struct lib_msg_t lib_msg_t; +typedef struct lib_ptl_t lib_ptl_t; +typedef struct lib_ac_t lib_ac_t; +typedef struct lib_me_t lib_me_t; +typedef struct lib_md_t lib_md_t; +typedef struct lib_eq_t lib_eq_t; + +#define WIRE_ATTR __attribute__((packed)) + +/* The wire handle's interface cookie only matches one network interface in + * one epoch (i.e. new cookie when the interface restarts or the node + * reboots). The object cookie only matches one object on that interface + * during that object's lifetime (i.e. no cookie re-use). */ +typedef struct { + __u64 wh_interface_cookie; + __u64 wh_object_cookie; +} WIRE_ATTR ptl_handle_wire_t; + +/* byte-flip insensitive! */ +#define PTL_WIRE_HANDLE_NONE \ +((const ptl_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1}) + +typedef enum { + PTL_MSG_ACK = 0, + PTL_MSG_PUT, + PTL_MSG_GET, + PTL_MSG_REPLY, + PTL_MSG_HELLO, +} ptl_msg_type_t; + +/* Each of these structs should start with an odd number of + * __u32, or the compiler could add its own padding and confuse + * everyone. + * + * Also, "length" needs to be at offset 28 of each struct. 
+ */ +typedef struct ptl_ack { + ptl_size_t mlength; + ptl_handle_wire_t dst_wmd; + ptl_match_bits_t match_bits; + ptl_size_t length; /* common length (0 for acks) moving out RSN */ +} WIRE_ATTR ptl_ack_t; + +typedef struct ptl_put { + ptl_pt_index_t ptl_index; + ptl_handle_wire_t ack_wmd; + ptl_match_bits_t match_bits; + ptl_size_t length; /* common length moving out RSN */ + ptl_size_t offset; + ptl_hdr_data_t hdr_data; +} WIRE_ATTR ptl_put_t; + +typedef struct ptl_get { + ptl_pt_index_t ptl_index; + ptl_handle_wire_t return_wmd; + ptl_match_bits_t match_bits; + ptl_size_t length; /* common length (0 for gets) moving out RSN */ + ptl_size_t src_offset; + ptl_size_t return_offset; /* unused: going RSN */ + ptl_size_t sink_length; +} WIRE_ATTR ptl_get_t; + +typedef struct ptl_reply { + __u32 unused1; /* unused fields going RSN */ + ptl_handle_wire_t dst_wmd; + ptl_size_t dst_offset; /* unused: going RSN */ + __u32 unused2; + ptl_size_t length; /* common length moving out RSN */ +} WIRE_ATTR ptl_reply_t; + +typedef struct { + ptl_nid_t dest_nid; + ptl_nid_t src_nid; + ptl_pid_t dest_pid; + ptl_pid_t src_pid; + __u32 type; /* ptl_msg_type_t */ + union { + ptl_ack_t ack; + ptl_put_t put; + ptl_get_t get; + ptl_reply_t reply; + } msg; +} WIRE_ATTR ptl_hdr_t; + +/* All length fields in individual unions at same offset */ +/* LASSERT for same in lib-move.c */ +#define PTL_HDR_LENGTH(h) ((h)->msg.ack.length) + +/* A HELLO message contains the portals magic number and protocol version + * code in the header's dest_nid, the peer's NID in the src_nid, and + * PTL_MSG_HELLO in the type field. All other fields are zero (including + * PTL_HDR_LENGTH; i.e. no payload). + * This is for use by byte-stream NALs (e.g. TCP/IP) to check the peer is + * running the same protocol and to find out its NID, so that hosts with + * multiple IP interfaces can have a single NID. These NALs should exchange + * HELLO messages when a connection is first established. 
*/ +typedef struct { + __u32 magic; /* PORTALS_PROTO_MAGIC */ + __u16 version_major; /* increment on incompatible change */ + __u16 version_minor; /* increment on compatible change */ +} WIRE_ATTR ptl_magicversion_t; + +#define PORTALS_PROTO_MAGIC 0xeebc0ded + +#define PORTALS_PROTO_VERSION_MAJOR 0 +#define PORTALS_PROTO_VERSION_MINOR 1 + +typedef struct { + long recv_count, recv_length, send_count, send_length, drop_count, + drop_length, msgs_alloc, msgs_max; +} lib_counters_t; + +/* temporary expedient: limit number of entries in discontiguous MDs */ +#if PTL_LARGE_MTU +# define PTL_MD_MAX_IOV 64 +#else +# define PTL_MD_MAX_IOV 16 +#endif + +struct lib_msg_t { + struct list_head msg_list; + int send_ack; + lib_md_t *md; + ptl_nid_t nid; + ptl_pid_t pid; + ptl_event_t ev; + ptl_handle_wire_t ack_wmd; + union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + } msg_iov; +}; + +struct lib_ptl_t { + ptl_pt_index_t size; + struct list_head *tbl; +}; + +struct lib_ac_t { + int next_free; +}; + +typedef struct { + struct list_head lh_hash_chain; + __u64 lh_cookie; +} lib_handle_t; + +#define lh_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +struct lib_eq_t { + struct list_head eq_list; + lib_handle_t eq_lh; + ptl_seq_t sequence; + ptl_size_t size; + ptl_event_t *base; + int eq_refcount; + int (*event_callback) (ptl_event_t * event); + void *eq_addrkey; +}; + +struct lib_me_t { + struct list_head me_list; + lib_handle_t me_lh; + ptl_process_id_t match_id; + ptl_match_bits_t match_bits, ignore_bits; + ptl_unlink_t unlink; + lib_md_t *md; +}; + +struct lib_md_t { + struct list_head md_list; + lib_handle_t md_lh; + lib_me_t *me; + user_ptr start; + ptl_size_t offset; + ptl_size_t length; + ptl_size_t max_size; + int threshold; + int pending; + ptl_unlink_t unlink; + unsigned int options; + unsigned int md_flags; + void *user_ptr; + lib_eq_t *eq; + void *md_addrkey; + unsigned int md_niov; /* # frags */ 
+ union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + } md_iov; +}; + +#define PTL_MD_FLAG_UNLINK (1 << 0) +#define PTL_MD_FLAG_AUTO_UNLINKED (1 << 1) + +#ifndef PTL_USE_SLAB_CACHE +typedef struct +{ + void *fl_objs; /* single contiguous array of objects */ + int fl_nobjs; /* the number of them */ + int fl_objsize; /* the size (including overhead) of each of them */ + struct list_head fl_list; /* where they are enqueued */ +} lib_freelist_t; + +typedef struct +{ + struct list_head fo_list; /* enqueue on fl_list */ + void *fo_contents; /* aligned contents */ +} lib_freeobj_t; +#endif + +typedef struct { + /* info about peers we are trying to fail */ + struct list_head tp_list; /* stash in ni.ni_test_peers */ + ptl_nid_t tp_nid; /* matching nid */ + unsigned int tp_threshold; /* # failures to simulate */ +} lib_test_peer_t; + +#define PTL_COOKIE_TYPE_MD 1 +#define PTL_COOKIE_TYPE_ME 2 +#define PTL_COOKIE_TYPE_EQ 3 +#define PTL_COOKIE_TYPES 4 +/* PTL_COOKIE_TYPES must be a power of 2, so the cookie type can be + * extracted by masking with (PTL_COOKIE_TYPES - 1) */ + +typedef struct { + int up; + int refcnt; + ptl_nid_t nid; + ptl_pid_t pid; + int num_nodes; + unsigned int debug; + lib_ptl_t tbl; + lib_ac_t ac; + lib_counters_t counters; + + int ni_lh_hash_size; /* size of lib handle hash table */ + struct list_head *ni_lh_hash_table; /* all extant lib handles, this interface */ + __u64 ni_next_object_cookie; /* cookie generator */ + __u64 ni_interface_cookie; /* uniquely identifies this ni in this epoch */ + + struct list_head ni_test_peers; + +#ifndef PTL_USE_SLAB_CACHE + lib_freelist_t ni_free_mes; + lib_freelist_t ni_free_msgs; + lib_freelist_t ni_free_mds; + lib_freelist_t ni_free_eqs; +#endif + struct list_head ni_active_msgs; + struct list_head ni_active_mds; + struct list_head ni_active_eqs; +} lib_ni_t; + +#endif diff --git a/lustre/portals/include/portals/list.h b/lustre/portals/include/portals/list.h new file mode 100644 index 
0000000..2b63312
--- /dev/null
+++ b/lustre/portals/include/portals/list.h
@@ -0,0 +1,245 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+#define prefetch(a) ((void)(a)) /* no-op: only evaluates its argument */
+
+struct list_head {
+        struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+        struct list_head name = LIST_HEAD_INIT(name)
+
+#define INIT_LIST_HEAD(ptr) do { \
+        (ptr)->next = (ptr); (ptr)->prev = (ptr); \
+} while (0)
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_add(struct list_head * new,
+                              struct list_head * prev,
+                              struct list_head * next)
+{
+        next->prev = new;
+        new->next = next;
+        new->prev = prev;
+        prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+        __list_add(new, head, head->next);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+        __list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. 
+ */ +static inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + +/** + * list_for_each_prev - iterate over a list in reverse order + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ + pos = pos->prev, prefetch(pos->prev)) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#endif + +#ifndef list_for_each_entry +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter. 
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member) \
+        for (pos = list_entry((head)->next, typeof(*pos), member), \
+             prefetch(pos->member.next); \
+             &pos->member != (head); \
+             pos = list_entry(pos->member.next, typeof(*pos), member), \
+             prefetch(pos->member.next))
+#endif
+
+#ifndef list_for_each_entry_safe
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos: the type * to use as a loop counter.
+ * @n: another type * to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member) \
+        for (pos = list_entry((head)->next, typeof(*pos), member), \
+             n = list_entry(pos->member.next, typeof(*pos), member); \
+             &pos->member != (head); \
+             pos = n, n = list_entry(n->member.next, typeof(*n), member))
+#endif
diff --git a/lustre/portals/include/portals/lltrace.h b/lustre/portals/include/portals/lltrace.h
new file mode 100644
index 0000000..7d1b304
--- /dev/null
+++ b/lustre/portals/include/portals/lltrace.h
@@ -0,0 +1,187 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Compile with:
+ * cc -I../../portals/include -o fio fio.c -L../../portals/linux/utils -lptlctl
+ */
+#ifndef __LTRACE_H_
+#define __LTRACE_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <portals/types.h>
+#include <portals/ptlctl.h>
+#include <linux/kp30.h>
+#include <linux/limits.h>
+#include <asm/page.h>
+#include <linux/version.h>
+
+static inline int ltrace_write_file(char* fname)
+{
+        char* argv[3];
+
+        argv[0] = "debug_kernel";
+        argv[1] = fname;
+        argv[2] = "1";
+
+        fprintf(stderr, "[ptlctl] %s %s %s\n", argv[0], argv[1], argv[2]);
+
+        return jt_dbg_debug_kernel(3, argv);
+}
+
+static inline int ltrace_clear(void)
+{
+        char* argv[1];
+
+        argv[0] = "clear";
+
+        fprintf(stderr, "[ptlctl] %s\n", argv[0]);
+
+        return jt_dbg_clear_debug_buf(1, argv);
+}
+
+static inline int ltrace_mark(int indent_level, char* text)
+{
+        char* argv[2];
+        char mark_buf[PATH_MAX];
+
+        snprintf(mark_buf, PATH_MAX, "====%d=%s", indent_level, text);
+
+        argv[0] = "mark";
+        argv[1] = mark_buf;
+        return jt_dbg_mark_debug_buf(2, argv);
+}
+
+static inline int ltrace_applymasks(void)
+{
+        char* argv[2];
+        argv[0] = "list";
+        argv[1] = "applymasks";
+
+        fprintf(stderr, "[ptlctl] %s %s\n", argv[0], argv[1]);
+
+        return jt_dbg_list(2, argv);
+}
+
+
+static inline int ltrace_filter(char* subsys_or_mask)
+{
+        char* argv[2];
+        argv[0] = "filter";
+        argv[1] = subsys_or_mask;
+        return jt_dbg_filter(2, argv);
+}
+
+static inline int ltrace_show(char* subsys_or_mask)
+{
+        char* argv[2];
+        argv[0] = "show";
+        argv[1] = subsys_or_mask;
+        return jt_dbg_show(2, argv);
+}
+
+static inline int ltrace_start(void)
+{
+        int rc = 0;
+        dbg_initialize(0, NULL);
+#ifdef PORTALS_DEV_ID
+        rc = register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+#endif
+        ltrace_filter("class");
+        ltrace_filter("socknal");
+        ltrace_filter("qswnal");
+        ltrace_filter("gmnal");
+        ltrace_filter("portals");
+
+        ltrace_show("all_types");
+        ltrace_filter("trace");
+        ltrace_filter("malloc");
+        ltrace_filter("net");
+        ltrace_filter("page");
+        ltrace_filter("other");
+        ltrace_filter("info");
+        ltrace_applymasks();
+
+        return rc;
+}
+
+
+static inline void ltrace_stop(void)
+{
+#ifdef PORTALS_DEV_ID
+        unregister_ioc_dev(PORTALS_DEV_ID);
+#endif
+}
+
+static inline int not_uml(void)
+{
+        /* Return Values:
+         * 0 when run under UML (stat of /dev/ubd succeeds)
+         * 1 when run on host (/dev/ubd does not exist), and also when
+         *   the lookup fails for any other reason (host is assumed)
+         */
+        struct stat buf;
+        int rc = stat("/dev/ubd", &buf);
+        rc = ((rc<0) && (errno == ENOENT)) ? 1 : rc;
+        if (rc<0) {
+                fprintf(stderr, "Cannot stat /dev/ubd: %s\n", strerror(errno));
+                rc = 1; /* Assume host */
+        }
+        return rc;
+}
+
+#define LTRACE_MAX_NOB   256
+/* Run "ps" through the shell and append its output to the file fname,
+ * each line carrying a fixed CDEBUG-style prefix.  fname is pasted into
+ * the shell command unquoted, so it must not contain shell
+ * metacharacters or come from an untrusted source. */
+static inline void ltrace_add_processnames(char* fname)
+{
+        char cmdbuf[LTRACE_MAX_NOB];
+        struct timeval tv;
+        struct timezone tz;
+        int nob;
+        int underuml = !not_uml();
+
+        gettimeofday(&tv, &tz);
+
+        nob = snprintf(cmdbuf, LTRACE_MAX_NOB, "ps --no-headers -eo \"");
+
+        /* Careful - these format strings need to match the CDEBUG
+         * formats in portals/linux/debug.c EXACTLY
+         */
+        /* Every append is bounded by the space left in cmdbuf.  The
+         * fixed-size pieces total well under LTRACE_MAX_NOB, so only the
+         * last append (the one carrying fname) can be truncated. */
+        nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB - nob, "%02x:%06x:%d:%lu.%06lu ",
+                        S_RPC >> 24, D_VFSTRACE, 0, (unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec);
+
+        if (underuml && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))) {
+                nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB - nob,
+                                 "(%s:%d:%s() %d | %d+%lu): ",
+                                 "lltrace.h", __LINE__, __FUNCTION__, 0, 0, 0L);
+        }
+        else {
+                nob += snprintf (cmdbuf+nob, LTRACE_MAX_NOB - nob,
+                                 "(%s:%d:%s() %d+%lu): ",
+                                 "lltrace.h", __LINE__, __FUNCTION__, 0, 0L);
+        }
+
+        nob += snprintf(cmdbuf+nob, LTRACE_MAX_NOB - nob, " %%p %%c\" >> %s", fname);
+        if (nob >= LTRACE_MAX_NOB) {
+                /* never hand a cut-short command line to the shell */
+                fprintf(stderr, "ltrace_add_processnames: file name too long: %s\n", fname);
+                return;
+        }
+        system(cmdbuf);
+}
+
+#endif
diff --git a/lustre/portals/include/portals/myrnal.h b/lustre/portals/include/portals/myrnal.h
new file mode 100644
index 0000000..12b1925
--- /dev/null
+++ b/lustre/portals/include/portals/myrnal.h
@@ -0,0 +1,26 @@
+/*
+*/
+
+#ifndef MYRNAL_H
+#define MYRNAL_H
+
+#define MAX_ARGS_LEN            (256)
+#define MAX_RET_LEN             (128)
+#define MYRNAL_MAX_ACL_SIZE     (64)
+#define MYRNAL_MAX_PTL_SIZE     (64)
+
+#define P3CMD                   (100)
+#define P3SYSCALL               (200)
+#define P3REGISTER              (300)
+
+enum { PTL_MLOCKALL };
+
+typedef struct {
+        void *args;
+        size_t args_len;
+        void *ret;
+        size_t ret_len;
+        int p3cmd;
+} myrnal_forward_t;
+
+#endif                          /* MYRNAL_H */
diff --git a/lustre/portals/include/portals/nal.h b/lustre/portals/include/portals/nal.h
new file mode 100644
index 0000000..88be63c
--- /dev/null
+++ b/lustre/portals/include/portals/nal.h
@@ -0,0 +1,49 @@
+/*
+*/
+#ifndef _NAL_H_
+#define _NAL_H_
+ +/* + * p30/nal.h + * + * The API side NAL declarations + */ + +#include <portals/types.h> + +#ifdef yield +#undef yield +#endif + +typedef struct nal_t nal_t; + +struct nal_t { + ptl_ni_t ni; + int refct; + void *nal_data; + int *timeout; /* for libp30api users */ + int (*forward) (nal_t * nal, int index, /* Function ID */ + void *args, size_t arg_len, void *ret, size_t ret_len); + + int (*shutdown) (nal_t * nal, int interface); + + int (*validate) (nal_t * nal, void *base, size_t extent); + + void (*yield) (nal_t * nal); + + void (*lock) (nal_t * nal, unsigned long *flags); + + void (*unlock) (nal_t * nal, unsigned long *flags); +}; + +typedef nal_t *(ptl_interface_t) (int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid); +extern nal_t *PTL_IFACE_IP(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid); +extern nal_t *PTL_IFACE_MYR(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t requested_pid); + +extern nal_t *ptl_hndl2nal(ptl_handle_any_t * any); + +#ifndef PTL_IFACE_DEFAULT +#define PTL_IFACE_DEFAULT (PTL_IFACE_IP) +#endif + +#endif diff --git a/lustre/portals/include/portals/nalids.h b/lustre/portals/include/portals/nalids.h new file mode 100644 index 0000000..1b837b4 --- /dev/null +++ b/lustre/portals/include/portals/nalids.h @@ -0,0 +1,4 @@ +#define PTL_IFACE_TCP 1 +#define PTL_IFACE_ER 2 +#define PTL_IFACE_SS 3 +#define PTL_IFACE_MAX 4 diff --git a/lustre/portals/include/portals/p30.h b/lustre/portals/include/portals/p30.h new file mode 100644 index 0000000..a4ea39b --- /dev/null +++ b/lustre/portals/include/portals/p30.h @@ -0,0 +1,72 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef _P30_H_ +#define _P30_H_ + +/* + * p30.h + * + * User application interface file + */ + +#if defined (__KERNEL__) +#include <linux/uio.h> +#include <linux/types.h> +#else +#include <sys/types.h> +#include <sys/uio.h> +#endif + +#include <portals/types.h> +#include <portals/nal.h> 
+#include <portals/api.h> +#include <portals/errno.h> +#include <portals/nalids.h> + +extern int __p30_initialized; /* for libraries & test codes */ +extern int __p30_myr_initialized; /* that don't know if p30 */ +extern int __p30_ip_initialized; /* had been initialized yet */ +extern ptl_handle_ni_t __myr_ni_handle, __ip_ni_handle; + +extern int __p30_myr_timeout; /* in seconds, for PtlNIBarrier, */ +extern int __p30_ip_timeout; /* PtlReduce_all, & PtlBroadcast_all */ + +/* + * Debugging flags reserved for the Portals reference library. + * These are not part of the API as described in the SAND report + * but are for the use of the maintainers of the reference implementation. + * + * It is not expected that the real implementations will export + * this functionality. + */ +#define PTL_DEBUG_NONE 0ul +#define PTL_DEBUG_ALL (0x0FFFul) /* Only the Portals flags */ + +#define __bit(x) ((unsigned long) 1<<(x)) +#define PTL_DEBUG_PUT __bit(0) +#define PTL_DEBUG_GET __bit(1) +#define PTL_DEBUG_REPLY __bit(2) +#define PTL_DEBUG_ACK __bit(3) +#define PTL_DEBUG_DROP __bit(4) +#define PTL_DEBUG_REQUEST __bit(5) +#define PTL_DEBUG_DELIVERY __bit(6) +#define PTL_DEBUG_UNLINK __bit(7) +#define PTL_DEBUG_THRESHOLD __bit(8) +#define PTL_DEBUG_API __bit(9) + +/* + * These eight are reserved for the NAL to define + * It should probably give them better names... 
+ */
+#define PTL_DEBUG_NI_ALL        (0xFF000000ul)  /* Only the NAL flags: NI0..NI7, bits 24-31 */
+#define PTL_DEBUG_NI0           __bit(24)
+#define PTL_DEBUG_NI1           __bit(25)
+#define PTL_DEBUG_NI2           __bit(26)
+#define PTL_DEBUG_NI3           __bit(27)
+#define PTL_DEBUG_NI4           __bit(28)
+#define PTL_DEBUG_NI5           __bit(29)
+#define PTL_DEBUG_NI6           __bit(30)
+#define PTL_DEBUG_NI7           __bit(31)
+
+#endif
diff --git a/lustre/portals/include/portals/ppid.h b/lustre/portals/include/portals/ppid.h
new file mode 100644
index 0000000..4727599
--- /dev/null
+++ b/lustre/portals/include/portals/ppid.h
@@ -0,0 +1,52 @@
+/*
+ */
+
+#ifndef _INCppidh_
+#define _INCppidh_
+
+#include "defines.h"
+// #include "idtypes.h"
+
+
+#define MAX_PPID         1000    /* this needs to fit into 16 bits so the
+                                    maximum value is 65535. having it "large"
+                                    can help w/ debugging process accounting
+                                    but there are reasons for making it
+                                    somewhat smaller than the maximum --
+                                    requiring storage for arrays that index
+                                    on the ppid, eg... */
+
+#define MAX_GID          1000    /* this needs to fit into 16 bits... */
+
+#define MAX_FIXED_PPID   100
+#define MAX_FIXED_GID    100
+#define PPID_FLOATING    (MAX_FIXED_PPID+1)   /* Floating area starts here */
+#define GID_FLOATING     (MAX_FIXED_GID+1)    /* Floating area starts here */
+#define NUM_PTL_TASKS    (MAX_FIXED_PPID+80)  /* Maximum no. portals tasks */
+
+#define PPID_AUTO        0
+
+/* Minimum PPID is 1 */
+#define PPID_BEBOPD      1            /* bebopd */
+#define GID_BEBOPD       1            /* bebopd */
+
+#define PPID_PCT         2            /* pct */
+#define GID_PCT          2            /* pct */
+
+#define PPID_FYOD        3            /* fyod */
+#define GID_FYOD         3            /* fyod */
+
+#define PPID_GDBWRAP     11           /* portals proxy for gdb */
+#define GID_GDBWRAP      11           /* portals proxy for gdb */
+
+#define PPID_TEST        15           /* for portals tests */
+#define GID_TEST         15
+
+#define GID_YOD          5            /* yod */
+#define GID_PINGD        6            /* pingd */
+#define GID_BT           7            /* bt */
+#define GID_PTLTEST      8            /* ptltest */
+#define GID_CGDB         9            /* cgdb */
+#define GID_TVDSVR       10           /* start-tvdsvr */
+
+#endif /* _INCppidh_ */
diff --git a/lustre/portals/include/portals/ptlctl.h b/lustre/portals/include/portals/ptlctl.h
new file mode 100644
index 0000000..dc02780
--- /dev/null
+++ b/lustre/portals/include/portals/ptlctl.h
@@ -0,0 +1,75 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * + * header for libptlctl.a + */ +#ifndef _PTLCTL_H_ +#define _PTLCTL_H_ + +#define PORTALS_DEV_ID 0 +#define PORTALS_DEV_PATH "/dev/portals" +#define OBD_DEV_ID 1 +#define OBD_DEV_PATH "/dev/obd" + +int ptl_name2nal(char *str); +int ptl_parse_nid (ptl_nid_t *nidp, char *str); +char * ptl_nid2str (char *buffer, ptl_nid_t nid); + +int ptl_initialize(int argc, char **argv); +int jt_ptl_network(int argc, char **argv); +int jt_ptl_connect(int argc, char **argv); +int jt_ptl_disconnect(int argc, char **argv); +int jt_ptl_push_connection(int argc, char **argv); +int jt_ptl_ping(int argc, char **argv); +int jt_ptl_shownid(int argc, char **argv); +int jt_ptl_mynid(int argc, char **argv); +int jt_ptl_add_uuid(int argc, char **argv); +int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ +int jt_ptl_close_uuid(int argc, char **argv); +int jt_ptl_del_uuid(int argc, char **argv); +int jt_ptl_rxmem (int argc, char **argv); +int jt_ptl_txmem (int argc, char **argv); +int jt_ptl_nagle (int argc, char **argv); +int jt_ptl_add_route (int argc, char **argv); +int jt_ptl_del_route (int argc, char **argv); +int jt_ptl_print_routes (int argc, char **argv); +int jt_ptl_fail_nid (int argc, char **argv); + +int dbg_initialize(int argc, char **argv); +int jt_dbg_filter(int argc, char **argv); +int jt_dbg_show(int argc, char **argv); +int jt_dbg_list(int argc, char **argv); +int jt_dbg_debug_kernel(int argc, char **argv); +int jt_dbg_debug_daemon(int argc, char **argv); +int jt_dbg_debug_file(int argc, char **argv); +int jt_dbg_clear_debug_buf(int argc, char **argv); +int jt_dbg_mark_debug_buf(int argc, char **argv); +int jt_dbg_modules(int argc, char **argv); +int jt_dbg_panic(int argc, char **argv); + +/* l_ioctl.c */ +int register_ioc_dev(int dev_id, const char * dev_name); +void unregister_ioc_dev(int dev_id); +int set_ioctl_dump(char * file); +int l_ioctl(int dev_id, int opc, void *buf); +int parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, 
void *)); +int jt_ioc_dump(int argc, char **argv); + +#endif diff --git a/lustre/portals/include/portals/stringtab.h b/lustre/portals/include/portals/stringtab.h new file mode 100644 index 0000000..c9683f7 --- /dev/null +++ b/lustre/portals/include/portals/stringtab.h @@ -0,0 +1,5 @@ +/* +*/ +/* + * stringtab.h + */ diff --git a/lustre/portals/include/portals/types.h b/lustre/portals/include/portals/types.h new file mode 100644 index 0000000..d4038b6 --- /dev/null +++ b/lustre/portals/include/portals/types.h @@ -0,0 +1,157 @@ +#ifndef _P30_TYPES_H_ +#define _P30_TYPES_H_ + +#ifdef __linux__ +#include <asm/types.h> +#include <asm/timex.h> +#else +#include <sys/types.h> +typedef u_int32_t __u32; +typedef u_int64_t __u64; +typedef unsigned long long cycles_t; +static inline cycles_t get_cycles(void) { return 0; } +#endif + +typedef __u64 ptl_nid_t; +typedef __u32 ptl_pid_t; +typedef __u32 ptl_pt_index_t; +typedef __u32 ptl_ac_index_t; +typedef __u64 ptl_match_bits_t; +typedef __u64 ptl_hdr_data_t; +typedef __u32 ptl_size_t; + +typedef struct { + unsigned long nal_idx; /* which network interface */ + __u64 cookie; /* which thing on that interface */ +} ptl_handle_any_t; + +typedef ptl_handle_any_t ptl_handle_ni_t; +typedef ptl_handle_any_t ptl_handle_eq_t; +typedef ptl_handle_any_t ptl_handle_md_t; +typedef ptl_handle_any_t ptl_handle_me_t; + +#define PTL_HANDLE_NONE \ +((const ptl_handle_any_t){.nal_idx = -1, .cookie = -1}) +#define PTL_EQ_NONE PTL_HANDLE_NONE + +static inline int PtlHandleEqual (ptl_handle_any_t h1, ptl_handle_any_t h2) +{ + return (h1.nal_idx == h2.nal_idx && h1.cookie == h2.cookie); +} + +#define PTL_NID_ANY ((ptl_nid_t) -1) +#define PTL_PID_ANY ((ptl_pid_t) -1) + +typedef struct { + ptl_nid_t nid; + ptl_pid_t pid; /* node id / process id */ +} ptl_process_id_t; + +typedef enum { + PTL_RETAIN = 0, + PTL_UNLINK +} ptl_unlink_t; + +typedef enum { + PTL_INS_BEFORE, + PTL_INS_AFTER +} ptl_ins_pos_t; + +typedef struct { + struct page *kiov_page; + 
unsigned int kiov_len; + unsigned int kiov_offset; +} ptl_kiov_t; + +typedef struct { + void *start; + ptl_size_t length; + int threshold; + int max_size; + unsigned int options; + void *user_ptr; + ptl_handle_eq_t eventq; + unsigned int niov; +} ptl_md_t; + +/* Options for the MD structure */ +#define PTL_MD_OP_PUT (1 << 0) +#define PTL_MD_OP_GET (1 << 1) +#define PTL_MD_MANAGE_REMOTE (1 << 2) +#define PTL_MD_AUTO_UNLINK (1 << 3) +#define PTL_MD_TRUNCATE (1 << 4) +#define PTL_MD_ACK_DISABLE (1 << 5) +#define PTL_MD_IOV (1 << 6) +#define PTL_MD_MAX_SIZE (1 << 7) +#define PTL_MD_KIOV (1 << 8) + +#define PTL_MD_THRESH_INF (-1) + +typedef enum { + PTL_EVENT_GET, + PTL_EVENT_PUT, + PTL_EVENT_REPLY, + PTL_EVENT_ACK, + PTL_EVENT_SENT +} ptl_event_kind_t; + +#define PTL_SEQ_BASETYPE long +typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t; +#define PTL_SEQ_GT(a,b) (((signed PTL_SEQ_BASETYPE)((a) - (b))) > 0) + +typedef struct { + ptl_event_kind_t type; + ptl_process_id_t initiator; + ptl_pt_index_t portal; + ptl_match_bits_t match_bits; + ptl_size_t rlength, mlength, offset; + ptl_handle_me_t unlinked_me; + ptl_md_t mem_desc; + ptl_hdr_data_t hdr_data; + cycles_t arrival_time; + volatile ptl_seq_t sequence; +} ptl_event_t; + + +typedef enum { + PTL_ACK_REQ, + PTL_NOACK_REQ +} ptl_ack_req_t; + + +typedef struct { + volatile ptl_seq_t sequence; + ptl_size_t size; + ptl_event_t *base; + ptl_handle_any_t cb_eq_handle; +} ptl_eq_t; + +typedef struct { + ptl_eq_t *eq; +} ptl_ni_t; + + +typedef struct { + int max_match_entries; /* max number of match entries */ + int max_mem_descriptors; /* max number of memory descriptors */ + int max_event_queues; /* max number of event queues */ + int max_atable_index; /* maximum access control list table index */ + int max_ptable_index; /* maximum portals table index */ +} ptl_ni_limits_t; + +/* + * Status registers + */ +typedef enum { + PTL_SR_DROP_COUNT, + PTL_SR_DROP_LENGTH, + PTL_SR_RECV_COUNT, + PTL_SR_RECV_LENGTH, + PTL_SR_SEND_COUNT, + 
PTL_SR_SEND_LENGTH, + PTL_SR_MSGS_MAX, +} ptl_sr_index_t; + +typedef int ptl_sr_value_t; + +#endif diff --git a/lustre/portals/knals/.cvsignore b/lustre/portals/knals/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lustre/portals/knals/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lustre/portals/knals/Makefile.am b/lustre/portals/knals/Makefile.am new file mode 100644 index 0000000..fed2785 --- /dev/null +++ b/lustre/portals/knals/Makefile.am @@ -0,0 +1,7 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +DIST_SUBDIRS= socknal toenal qswnal gmnal scimacnal +SUBDIRS= socknal toenal @QSWNAL@ @GMNAL@ @SCIMACNAL@ diff --git a/lustre/portals/knals/Makefile.mk b/lustre/portals/knals/Makefile.mk new file mode 100644 index 0000000..ce40a60 --- /dev/null +++ b/lustre/portals/knals/Makefile.mk @@ -0,0 +1,4 @@ +include ../Kernelenv + +obj-y = socknal/ +# more coming... \ No newline at end of file diff --git a/lustre/portals/knals/gmnal/.cvsignore b/lustre/portals/knals/gmnal/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lustre/portals/knals/gmnal/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lustre/portals/knals/gmnal/Makefile.am b/lustre/portals/knals/gmnal/Makefile.am new file mode 100644 index 0000000..1dc6f4e --- /dev/null +++ b/lustre/portals/knals/gmnal/Makefile.am @@ -0,0 +1,13 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = kgmnal +modulenet_DATA = kgmnal.o +EXTRA_PROGRAMS = kgmnal + +DEFS = +kgmnal_SOURCES = gmnal.c gmnal_cb.c gmnal.h diff --git a/lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch b/lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch new file mode 100644 index 0000000..23c80d9 --- /dev/null +++ b/lustre/portals/knals/gmnal/gm-1.5.2.1-exports.patch @@ -0,0 +1,43 @@ +diff -ru gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c +--- gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c Mon Jul 1 10:35:09 2002 ++++ gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c Thu Sep 19 14:19:38 2002 +@@ -30,6 +30,8 @@ + * + ************************************************************************/ + ++#define EXPORT_SYMTAB ++ + #include <linux/config.h> + #include <linux/module.h> + +@@ -4075,6 +4077,28 @@ + return 0; + } + ++EXPORT_SYMBOL(gm_blocking_receive_no_spin); ++EXPORT_SYMBOL(gm_close); ++EXPORT_SYMBOL(gm_dma_free); ++EXPORT_SYMBOL(gm_dma_malloc); ++EXPORT_SYMBOL(gm_drop_sends); ++EXPORT_SYMBOL(gm_finalize); ++EXPORT_SYMBOL(gm_get_node_id); ++EXPORT_SYMBOL(gm_init); ++EXPORT_SYMBOL(gm_initialize_alarm); ++EXPORT_SYMBOL(gm_max_node_id_in_use); ++EXPORT_SYMBOL(gm_min_size_for_length); ++EXPORT_SYMBOL(gm_num_receive_tokens); ++EXPORT_SYMBOL(gm_num_send_tokens); ++EXPORT_SYMBOL(gm_open); ++EXPORT_SYMBOL(gm_provide_receive_buffer); ++EXPORT_SYMBOL(gm_resume_sending); ++EXPORT_SYMBOL(gm_send_with_callback); ++EXPORT_SYMBOL(gm_set_acceptable_sizes); ++EXPORT_SYMBOL(gm_set_alarm); ++EXPORT_SYMBOL(gm_unknown); ++ ++ + /* + This file uses GM standard indentation. 
+ +Only in gm-1.5.2.1_Linux-cfs/drivers/linux/gm: gm_arch.c~ +Only in gm-1.5.2.1_Linux-cfs/: trace diff --git a/lustre/portals/knals/gmnal/gmnal.c b/lustre/portals/knals/gmnal/gmnal.c new file mode 100644 index 0000000..ceeea2a --- /dev/null +++ b/lustre/portals/knals/gmnal/gmnal.c @@ -0,0 +1,284 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Based on ksocknal and qswnal + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Robert Read <rread@datarithm.net> + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include "gmnal.h" + +ptl_handle_ni_t kgmnal_ni; +nal_t kgmnal_api; + +kgmnal_data_t kgmnal_data; +int gmnal_debug = 0; + +kpr_nal_interface_t kqswnal_router_interface = { + kprni_nalid: GMNAL, + kprni_arg: NULL, + kprni_fwd: kgmnal_fwd_packet, +}; + +static int kgmnal_forward(nal_t *nal, + int id, + void *args, size_t args_len, + void *ret, size_t ret_len) +{ + kgmnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kgm_cb; + + LASSERT (nal == &kgmnal_api); + LASSERT (k == &kgmnal_data); + LASSERT (nal_cb == &kgmnal_lib); + + lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */ + return PTL_OK; +} + +static void kgmnal_lock(nal_t *nal, unsigned long *flags) +{ + kgmnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kgm_cb; + + + LASSERT (nal == &kgmnal_api); + LASSERT (k == &kgmnal_data); + LASSERT (nal_cb == &kgmnal_lib); + + nal_cb->cb_cli(nal_cb,flags); +} + +static void kgmnal_unlock(nal_t *nal, unsigned long *flags) +{ + kgmnal_data_t *k = nal->nal_data; + nal_cb_t *nal_cb = k->kgm_cb; + + + LASSERT (nal == &kgmnal_api); + LASSERT (k == &kgmnal_data); + LASSERT (nal_cb == &kgmnal_lib); + + nal_cb->cb_sti(nal_cb,flags); +} + +static int kgmnal_shutdown(nal_t *nal, int ni) +{ + LASSERT (nal == &kgmnal_api); + return 0; +} + +static void kgmnal_yield( nal_t *nal ) +{ + LASSERT (nal == &kgmnal_api); + + if (current->need_resched) + schedule(); + return; +} + +kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *data,int ndx) +{ + kgmnal_rx_t *conn; + + PORTAL_ALLOC(conn, sizeof(kgmnal_rx_t)); + /* Check for out of mem here */ + if (conn==NULL) { + printk("kgm_add_recv: memory alloc failed\n"); + return NULL; + } + + list_add(&conn->krx_item,(struct list_head *)&data->kgm_list); + // conn->ndx=ndx; + // conn->len=conn->ptlhdr_copied=0; + // conn->loopback=0; + return conn; +} + +static nal_t *kgmnal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + unsigned int nnids; + + 
gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids); + + CDEBUG(D_NET, "calling lib_init with nid 0x%Lx of %d\n", + kgmnal_data.kgm_nid, nnids); + lib_init(&kgmnal_lib, kgmnal_data.kgm_nid, 0, nnids,ptl_size, ac_size); + return &kgmnal_api; +} + +/* + * Module exit: unregister the NI symbol, shut down the portals NI and + * library, close the GM port, then free the transmit descriptors and any + * receive descriptors still queued on kgm_list. + */ +static void __exit +kgmnal_finalize(void) +{ + PORTAL_SYMBOL_UNREGISTER (kgmnal_ni); + PtlNIFini(kgmnal_ni); + /* lib_init() was handed &kgmnal_lib, so lib_fini() must get it too */ + lib_fini(&kgmnal_lib); + + if (kgmnal_data.kgm_port) { + gm_close(kgmnal_data.kgm_port); + } + + /* FIXME: free dma buffers */ + /* FIXME: kill receiver thread */ + + if (kgmnal_data.kgm_trans != NULL) { + PORTAL_FREE (kgmnal_data.kgm_trans, + sizeof(kgmnal_tx_t)*TXMSGS); + kgmnal_data.kgm_trans = NULL; + } + + /* Pop entries off the head until the list is empty: unlinking + * inside list_for_each() skips every other entry and leaks it. */ + while (!list_empty(&kgmnal_data.kgm_list)) { + kgmnal_rx_t *conn; + conn = list_entry(kgmnal_data.kgm_list.next, + kgmnal_rx_t, krx_item); + CDEBUG(D_IOCTL, "freeing conn %p\n",conn); + list_del(&conn->krx_item); + PORTAL_FREE(conn, sizeof(*conn)); + } + + CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory)); + + return; +} + +static int __init +kgmnal_initialize(void) +{ + int rc; + int ntok; + unsigned long sizemask; + unsigned int nid; + + CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory)); + + kgmnal_api.forward = kgmnal_forward; + kgmnal_api.shutdown = kgmnal_shutdown; + kgmnal_api.yield = kgmnal_yield; + kgmnal_api.validate = NULL; /* our api validate is a NOOP */ + kgmnal_api.lock= kgmnal_lock; + kgmnal_api.unlock= kgmnal_unlock; + kgmnal_api.nal_data = &kgmnal_data; + + kgmnal_lib.nal_data = &kgmnal_data; + + memset(&kgmnal_data, 0, sizeof(kgmnal_data)); + + INIT_LIST_HEAD(&kgmnal_data.kgm_list); + kgmnal_data.kgm_cb = &kgmnal_lib; + + /* Allocate transmit descriptors */ + PORTAL_ALLOC (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS); + if (kgmnal_data.kgm_trans==NULL) { + printk("kgmnal: init: failed to allocate transmit " + "descriptors\n"); + return -1; + } + memset(kgmnal_data.kgm_trans,-1,sizeof(kgmnal_tx_t)*(TXMSGS)); + + spin_lock_init(&kgmnal_data.kgm_dispatch_lock); + 
spin_lock_init(&kgmnal_data.kgm_update_lock); + spin_lock_init(&kgmnal_data.kgm_send_lock); + + /* Do the receiver and xmtr allocation */ + + rc = gm_init(); + if (rc != GM_SUCCESS) { + CERROR("gm_init failed: %d\n", rc); + return -1; + } + + rc = gm_open(&kgmnal_data.kgm_port, 0 , KGM_PORT_NUM, KGM_HOSTNAME, + GM_API_VERSION_1_1); + if (rc != GM_SUCCESS) { + gm_finalize(); + kgmnal_data.kgm_port = NULL; + CERROR("gm_open failed: %d\n", rc); + return -1; + } + gm_get_node_id(kgmnal_data.kgm_port, &nid); + kgmnal_data.kgm_nid = nid; + /* Allocate 2 different sizes of buffers. For now, use half + the tokens for each. */ + ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2; + CDEBUG(D_NET, "gmnal_init: creating %d large %d byte recv buffers\n", + ntok, MSG_LEN_LARGE); + while (ntok-- > 0) { + void * buffer = gm_dma_malloc(kgmnal_data.kgm_port, + MSG_LEN_LARGE); + if (buffer == NULL) { + /* rc still holds GM_SUCCESS here, so don't print it */ + CERROR("gm_dma_malloc failed (large recv buffer)\n"); + return (-ENOMEM); + } + CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d " + "pri %d\n ", kgmnal_data.kgm_port, buffer, + MSG_LEN_LARGE, MSG_SIZE_LARGE, GM_LOW_PRIORITY); + + gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer, + MSG_SIZE_LARGE, GM_LOW_PRIORITY); + } + + ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2; + CDEBUG(D_NET, "gmnal_init: creating %d small %d byte recv buffers\n", + ntok, MSG_LEN_SMALL); + while (ntok-- > 0) { + void * buffer = gm_dma_malloc(kgmnal_data.kgm_port, + MSG_LEN_SMALL); + if (buffer == NULL) { + /* rc still holds GM_SUCCESS here, so don't print it */ + CERROR("gm_dma_malloc failed (small recv buffer)\n"); + return (-ENOMEM); + } + CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d " + "pri %d\n ", kgmnal_data.kgm_port, buffer, + MSG_LEN_SMALL, MSG_SIZE_SMALL, GM_LOW_PRIORITY); + + gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer, + MSG_SIZE_SMALL, GM_LOW_PRIORITY); + + } + /* one bit per GM size class we are willing to receive */ + sizemask = (1 << MSG_SIZE_LARGE) | (1 << MSG_SIZE_SMALL); + CDEBUG(D_NET, "gm_set_acceptable_sizes port %p pri %d mask 0x%lx\n", + kgmnal_data.kgm_port, GM_LOW_PRIORITY, sizemask); + 
gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_LOW_PRIORITY, + sizemask); + gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_HIGH_PRIORITY, 0); + + /* Initialize Network Interface */ + rc = PtlNIInit(kgmnal_init, 32, 4, 0, &kgmnal_ni); + if (rc) { + CERROR("PtlNIInit failed %d\n", rc); + return (-ENOMEM); + } + + /* Start receiver thread */ + kernel_thread(kgmnal_recv_thread, &kgmnal_data, 0); + + PORTAL_SYMBOL_REGISTER(kgmnal_ni); + + kgmnal_data.kgm_init = 1; + + return 0; +} + +MODULE_AUTHOR("Robert Read <rread@datarithm.net>"); +MODULE_DESCRIPTION("Kernel Myrinet GM NAL v0.1"); +MODULE_LICENSE("GPL"); + +module_init (kgmnal_initialize); +module_exit (kgmnal_finalize); + +EXPORT_SYMBOL (kgmnal_ni); diff --git a/lustre/portals/knals/gmnal/gmnal.h b/lustre/portals/knals/gmnal/gmnal.h new file mode 100644 index 0000000..47e8c3c --- /dev/null +++ b/lustre/portals/knals/gmnal/gmnal.h @@ -0,0 +1,101 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef _GMNAL_H +#define _GMNAL_H + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/locks.h> +#include <linux/unistd.h> +#include <linux/init.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/stat.h> +#include <linux/list.h> +#include <asm/uaccess.h> +#include <asm/segment.h> + +#define DEBUG_SUBSYSTEM S_GMNAL + +#include <linux/kp30.h> +#include <portals/p30.h> +#include <portals/lib-p30.h> + +#include <gm.h> + + +/* + * Myrinet GM NAL + */ +#define NPAGES_LARGE 16 +#define NPAGES_SMALL 1 +#define MSG_LEN_LARGE NPAGES_LARGE*PAGE_SIZE +#define MSG_LEN_SMALL NPAGES_SMALL*PAGE_SIZE +#define MSG_SIZE_LARGE (gm_min_size_for_length(MSG_LEN_LARGE)) +#define MSG_SIZE_SMALL (gm_min_size_for_length(MSG_LEN_SMALL)) + +#define TXMSGS 64 /* Number of 
Transmit Messages */ +#define ENVELOPES 8 /* Number of outstanding receive msgs */ + +#define KGM_PORT_NUM 3 +#define KGM_HOSTNAME "kgmnal" + + +typedef struct { + char *krx_buffer; + unsigned long krx_len; + unsigned int krx_size; + unsigned int krx_priority; + struct list_head krx_item; +} kgmnal_rx_t; + + +typedef struct { + nal_cb_t *ktx_nal; + void *ktx_private; + lib_msg_t *ktx_cookie; + char *ktx_buffer; + size_t ktx_len; + unsigned long ktx_size; + int ktx_ndx; + unsigned int ktx_priority; + unsigned int ktx_tgt_node; + unsigned int ktx_tgt_port_id; +} kgmnal_tx_t; + + +typedef struct { + char kgm_init; + char kgm_shuttingdown; + struct gm_port *kgm_port; + struct list_head kgm_list; + ptl_nid_t kgm_nid; + nal_cb_t *kgm_cb; + struct kgm_trans *kgm_trans; + struct tq_struct kgm_ready_tq; + spinlock_t kgm_dispatch_lock; + spinlock_t kgm_update_lock; + spinlock_t kgm_send_lock; +} kgmnal_data_t; + +int kgm_init(kgmnal_data_t *kgm_data); +int kgmnal_recv_thread(void *); +int gm_return_mynid(void); +void kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); + +extern kgmnal_data_t kgmnal_data; +extern nal_t kgmnal_api; +extern nal_cb_t kgmnal_lib; + +#endif /* _GMNAL_H */ + diff --git a/lustre/portals/knals/gmnal/gmnal_cb.c b/lustre/portals/knals/gmnal/gmnal_cb.c new file mode 100644 index 0000000..3d4c86d --- /dev/null +++ b/lustre/portals/knals/gmnal/gmnal_cb.c @@ -0,0 +1,517 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Based on ksocknal and qswnal + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Robert Read <rread@datarithm.net> + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* TODO + * preallocate send buffers, store on list + * put receive buffers on queue, handle with receive threads + * use routing + */ + +#include "gmnal.h" + +extern kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *,int); + +static kgmnal_tx_t * +get_trans(void) +{ + kgmnal_tx_t *t; + PORTAL_ALLOC(t, (sizeof(kgmnal_tx_t))); + return t; +} + +static void +put_trans(kgmnal_tx_t *t) +{ + PORTAL_FREE(t, sizeof(kgmnal_tx_t)); +} + +int +kgmnal_ispeer (ptl_nid_t nid) +{ + unsigned int gmnid = (unsigned int)nid; + unsigned int nnids; + + gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids); + + return ((ptl_nid_t)gmnid == nid &&/* didn't lose high bits on conversion ? 
*/ + gmnid < nnids); /* it's in this machine */ +} + +/* + * LIB functions follow + * + */ +static int +kgmnal_read (nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr, + size_t len) +{ + CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + +static int +kgmnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr, + size_t len) +{ + CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + +static void * +kgmnal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + return buf; +} + +static void +kgmnal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + +static void +kgmnal_printf(nal_cb_t *nal, const char *fmt, ...) +{ + va_list ap; + char msg[256]; + + if (portal_debug & D_NET) { + va_start( ap, fmt ); + vsnprintf( msg, sizeof(msg), fmt, ap ); + va_end( ap ); + + printk("CPUId: %d %s",smp_processor_id(), msg); + } +} + + +static void +kgmnal_cli(nal_cb_t *nal, unsigned long *flags) +{ + kgmnal_data_t *data= nal->nal_data; + + spin_lock_irqsave(&data->kgm_dispatch_lock,*flags); +} + + +static void +kgmnal_sti(nal_cb_t *nal, unsigned long *flags) +{ + kgmnal_data_t *data= nal->nal_data; + + spin_unlock_irqrestore(&data->kgm_dispatch_lock,*flags); +} + + +static int +kgmnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* network distance doesn't mean much for this nal */ + if ( nal->ni.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +/* FIXME rmr: add rounting code here */ +static void +kgmnal_tx_done(kgmnal_tx_t *trans, int error) +{ + lib_finalize(trans->ktx_nal, trans->ktx_private, trans->ktx_cookie); + + gm_dma_free(kgmnal_data.kgm_port, trans->ktx_buffer); + + trans->ktx_buffer = NULL; + trans->ktx_len = 0; + + put_trans(trans); +} 
+static char * gm_error_strings[GM_NUM_STATUS_CODES] = { + [GM_SUCCESS] = "GM_SUCCESS", + [GM_SEND_TIMED_OUT] = "GM_SEND_TIMED_OUT", + [GM_SEND_REJECTED] = "GM_SEND_REJECTED", + [GM_SEND_TARGET_PORT_CLOSED] = "GM_SEND_TARGET_PORT_CLOSED", + [GM_SEND_TARGET_NODE_UNREACHABLE] = "GM_SEND_TARGET_NODE_UNREACHABLE", + [GM_SEND_DROPPED] = "GM_SEND_DROPPED", + [GM_SEND_PORT_CLOSED] = "GM_SEND_PORT_CLOSED", +}; + +inline char * get_error(int status) +{ + if (gm_error_strings[status] != NULL) + return gm_error_strings[status]; + else + return "Unknown error"; +} + +static void +kgmnal_errhandler(struct gm_port *p, void *context, gm_status_t status) +{ + CDEBUG(D_NET,"error callback: ktx %p status %d\n", context, status); +} + +static void +kgmnal_txhandler(struct gm_port *p, void *context, gm_status_t status) +{ + kgmnal_tx_t *ktx = (kgmnal_tx_t *)context; + int err = 0; + + LASSERT (p != NULL); + LASSERT (ktx != NULL); + + CDEBUG(D_NET,"ktx %p status %d nid 0x%x pid %d\n", ktx, status, + ktx->ktx_tgt_node, ktx->ktx_tgt_port_id); + + switch((int)status) { + case GM_SUCCESS: /* normal */ + break; + case GM_SEND_TIMED_OUT: /* application error */ + case GM_SEND_REJECTED: /* size of msg unacceptable */ + case GM_SEND_TARGET_PORT_CLOSED: + CERROR("%s (%d):\n", get_error(status), status); + gm_resume_sending(kgmnal_data.kgm_port, ktx->ktx_priority, + ktx->ktx_tgt_node, ktx->ktx_tgt_port_id, + kgmnal_errhandler, NULL); + err = -EIO; + break; + case GM_SEND_TARGET_NODE_UNREACHABLE: + case GM_SEND_PORT_CLOSED: + CERROR("%s (%d):\n", get_error(status), status); + gm_drop_sends(kgmnal_data.kgm_port, ktx->ktx_priority, + ktx->ktx_tgt_node, ktx->ktx_tgt_port_id, + kgmnal_errhandler, NULL); + err = -EIO; + break; + case GM_SEND_DROPPED: + CERROR("%s (%d):\n", get_error(status), status); + err = -EIO; + break; + default: + CERROR("Unknown status: %d\n", status); + err = -EIO; + break; + } + + kgmnal_tx_done(ktx, err); +} + +/* + */ + +static int +kgmnal_send(nal_cb_t *nal, + void 
*private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + int options, + unsigned int niov, + lib_md_iov_t *iov, + size_t len) +{ + /* + * ipnal assumes that this is the private as passed to lib_dispatch.. + * so do we :/ + */ + kgmnal_tx_t *ktx=NULL; + int rc=0; + void * buf; + int buf_len = sizeof(ptl_hdr_t) + len; + int buf_size = 0; + + LASSERT ((options & PTL_MD_KIOV) == 0); + + PROF_START(gmnal_send); + + + CDEBUG(D_NET, "sending %d bytes from %p to nid: 0x%Lx pid %d\n", + len, iov, nid, KGM_PORT_NUM); + + /* ensure there is an available tx handle */ + + /* save transaction info to trans for later finalize and cleanup */ + ktx = get_trans(); + if (ktx == NULL) { + rc = -ENOMEM; + goto send_exit; + } + + /* hmmm... GM doesn't support vectored write, so need to allocate buffer to coalesce + header and data. + Also, memory must be dma'able or registered with GM. */ + + if (buf_len <= MSG_LEN_SMALL) { + buf_size = MSG_SIZE_SMALL; + } else if (buf_len <= MSG_LEN_LARGE) { + buf_size = MSG_SIZE_LARGE; + } else { + printk("kgmnal:request exceeds TX MTU size (%d).\n", + MSG_SIZE_LARGE); + /* nothing was queued: give the descriptor back */ + put_trans(ktx); + rc = -1; + goto send_exit; + } + + buf = gm_dma_malloc(kgmnal_data.kgm_port, buf_len); + if (buf == NULL) { + /* nothing was queued: give the descriptor back */ + put_trans(ktx); + rc = -ENOMEM; + goto send_exit; + } + /* coalesce the portals header and the payload into one buffer */ + memcpy(buf, hdr, sizeof(ptl_hdr_t)); + + if (len != 0) + lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t), + options, niov, iov, len); + + /* state that kgmnal_txhandler needs to finalize and free */ + ktx->ktx_nal = nal; + ktx->ktx_private = private; + ktx->ktx_cookie = cookie; + ktx->ktx_len = buf_len; + ktx->ktx_size = buf_size; + ktx->ktx_buffer = buf; + ktx->ktx_priority = GM_LOW_PRIORITY; + ktx->ktx_tgt_node = nid; + ktx->ktx_tgt_port_id = KGM_PORT_NUM; + + CDEBUG(D_NET, "gm_send %d bytes (size %d) from %p to nid: 0x%Lx " + "pid %d pri %d\n", buf_len, buf_size, iov, nid, KGM_PORT_NUM, + GM_LOW_PRIORITY); + + gm_send_with_callback(kgmnal_data.kgm_port, buf, buf_size, + buf_len, GM_LOW_PRIORITY, + nid, KGM_PORT_NUM, + kgmnal_txhandler, ktx); + + 
PROF_FINISH(gmnal_send); + send_exit: + return rc; +} +void +kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + CERROR ("forwarding not implemented\n"); +} + +void +kqswnal_fwd_callback (void *arg, int error) +{ + CERROR ("forwarding not implemented\n"); +} + + +static inline void +kgmnal_requeue_rx(kgmnal_rx_t *krx) +{ + gm_provide_receive_buffer(kgmnal_data.kgm_port, krx->krx_buffer, + krx->krx_size, krx->krx_priority); +} + +/* Process a received portals packet */ + +/* Receive Interrupt Handler */ +static void kgmnal_rx(kgmnal_data_t *kgm, unsigned long len, unsigned int size, + void * buf, unsigned int pri) +{ + ptl_hdr_t *hdr = buf; + kgmnal_rx_t krx; + + CDEBUG(D_NET,"buf %p, len %ld\n", buf, len); + + if ( len < sizeof( ptl_hdr_t ) ) { + /* XXX what's this for? */ + if (kgm->kgm_shuttingdown) + return; + CERROR("kgmnal: did not receive complete portal header, " + "len= %ld", len); + gm_provide_receive_buffer(kgm->kgm_port, buf, size, pri); + return; + } + + /* might want to use seperate threads to handle receive */ + krx.krx_buffer = buf; + krx.krx_len = len; + krx.krx_size = size; + krx.krx_priority = pri; + + if ( hdr->dest_nid == kgmnal_lib.ni.nid ) { + PROF_START(lib_parse); + lib_parse(&kgmnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx); + PROF_FINISH(lib_parse); + } else if (kgmnal_ispeer(hdr->dest_nid)) { + /* should have gone direct to peer */ + CERROR("dropping packet from 0x%llx to 0x%llx: target is " + "a peer", hdr->src_nid, hdr->dest_nid); + kgmnal_requeue_rx(&krx); + } else { + /* forward to gateway */ + CERROR("forwarding not implemented yet"); + kgmnal_requeue_rx(&krx); + } + + return; +} + + +static int kgmnal_recv(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + int options, + unsigned int niov, + lib_md_iov_t *iov, + size_t mlen, + size_t rlen) +{ + kgmnal_rx_t *krx = private; + + LASSERT ((options & PTL_MD_KIOV) == 0); + + CDEBUG(D_NET,"mlen=%d, rlen=%d\n", mlen, rlen); + + /* What was actually received must be >= what sender 
claims to + * have sent. This is an LASSERT, since lib-move doesn't + * check cb return code yet. */ + LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen); + LASSERT (mlen <= rlen); + + PROF_START(gmnal_recv); + + if(mlen != 0) { + PROF_START(memcpy); + lib_copy_buf2iov (options, niov, iov, + krx->krx_buffer + sizeof (ptl_hdr_t), mlen); + PROF_FINISH(memcpy); + } + + PROF_START(lib_finalize); + lib_finalize(nal, private, cookie); + PROF_FINISH(lib_finalize); + + kgmnal_requeue_rx(krx); + + PROF_FINISH(gmnal_recv); + + return rlen; +} + + +static void kgmnal_shutdown(void * none) +{ + CERROR("called\n"); + return; +} + +/* + * Set terminate and use alarm to wake up the recv thread. + */ +static void recv_shutdown(kgmnal_data_t *kgm) +{ + /* Static, not automatic: this function returns before the alarm + * can fire, so the alarm must not live in this stack frame. + * NOTE(review): assumes GM keeps a pointer to the alarm until it + * fires -- confirm against the GM API documentation. */ + static gm_alarm_t alarm; + + kgm->kgm_shuttingdown = 1; + gm_initialize_alarm(&alarm); + gm_set_alarm(kgm->kgm_port, &alarm, 1, kgmnal_shutdown, NULL); +} + +int kgmnal_end(kgmnal_data_t *kgm) +{ + + /* wait for sends to finish ? */ + /* remove receive buffers */ + /* shutdown receive thread */ + + recv_shutdown(kgm); + + return 0; +} + +/* Used only for the spinner */ +int kgmnal_recv_thread(void *arg) +{ + kgmnal_data_t *kgm = arg; + + LASSERT(kgm != NULL); + + kportal_daemonize("kgmnal_rx"); + + while(1) { + gm_recv_event_t *e; + int priority = GM_LOW_PRIORITY; + if (kgm->kgm_shuttingdown) + break; + + e = gm_blocking_receive_no_spin(kgm->kgm_port); + if (e == NULL) { + CERROR("gm_blocking_receive returned NULL\n"); + break; + } + + switch(gm_ntohc(e->recv.type)) { + case GM_HIGH_RECV_EVENT: + priority = GM_HIGH_PRIORITY; + /* fall through */ + case GM_RECV_EVENT: + kgmnal_rx(kgm, gm_ntohl(e->recv.length), + gm_ntohc(e->recv.size), + gm_ntohp(e->recv.buffer), priority); + break; + case GM_ALARM_EVENT: + CERROR("received alarm"); + gm_unknown(kgm->kgm_port, e); + break; + case GM_BAD_SEND_DETECTED_EVENT: /* ?? 
*/ + CERROR("received bad send!\n"); + break; + default: + gm_unknown(kgm->kgm_port, e); + } + } + + CERROR("shuttting down.\n"); + return 0; +} + +nal_cb_t kgmnal_lib = { + nal_data: &kgmnal_data, /* NAL private data */ + cb_send: kgmnal_send, + cb_recv: kgmnal_recv, + cb_read: kgmnal_read, + cb_write: kgmnal_write, + cb_malloc: kgmnal_malloc, + cb_free: kgmnal_free, + cb_printf: kgmnal_printf, + cb_cli: kgmnal_cli, + cb_sti: kgmnal_sti, + cb_dist: kgmnal_dist +}; diff --git a/lustre/portals/knals/qswnal/.cvsignore b/lustre/portals/knals/qswnal/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lustre/portals/knals/qswnal/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lustre/portals/knals/qswnal/Makefile.am b/lustre/portals/knals/qswnal/Makefile.am new file mode 100644 index 0000000..3eb4dd5 --- /dev/null +++ b/lustre/portals/knals/qswnal/Makefile.am @@ -0,0 +1,17 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = kqswnal +modulenet_DATA = kqswnal.o +EXTRA_PROGRAMS = kqswnal + + +#CFLAGS:= @KCFLAGS@ +#CPPFLAGS:=@KCPPFLAGS@ +DEFS = +CPPFLAGS=@CPPFLAGS@ @with_quadrics@ +kqswnal_SOURCES = qswnal.c qswnal_cb.c qswnal.h diff --git a/lustre/portals/knals/qswnal/qswnal.c b/lustre/portals/knals/qswnal/qswnal.c new file mode 100644 index 0000000..1a8fb74 --- /dev/null +++ b/lustre/portals/knals/qswnal/qswnal.c @@ -0,0 +1,608 @@ +/* + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Eric Barton <eric@bartonsoftware.com> + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * W. 
Marcus Miller - Based on ksocknal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/lustre/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "qswnal.h"
+
+ptl_handle_ni_t         kqswnal_ni;
+nal_t                   kqswnal_api;
+kqswnal_data_t          kqswnal_data;
+
+kpr_nal_interface_t kqswnal_router_interface = {
+        kprni_nalid:    QSWNAL,
+        kprni_arg:      NULL,
+        kprni_fwd:      kqswnal_fwd_packet,
+};
+
+
+/* API 'forward' method: hand request 'id' and its argument/return blocks
+ * to the library via lib_dispatch().  The length arguments are not used.
+ * Always returns PTL_OK. */
+static int
+kqswnal_forward(nal_t   *nal,
+                int     id,
+                void    *args,  size_t args_len,
+                void    *ret,   size_t ret_len)
+{
+        kqswnal_data_t *k = nal->nal_data;
+        nal_cb_t       *nal_cb = k->kqn_cb;
+
+        LASSERT (nal == &kqswnal_api);
+        LASSERT (k == &kqswnal_data);
+        LASSERT (nal_cb == &kqswnal_lib);
+
+        lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */
+        return (PTL_OK);
+}
+
+/* API 'lock' method: delegates to the library's cb_cli callback. */
+static void
+kqswnal_lock (nal_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *k = nal->nal_data;
+        nal_cb_t       *nal_cb = k->kqn_cb;
+
+        LASSERT (nal == &kqswnal_api);
+        LASSERT (k == &kqswnal_data);
+        LASSERT (nal_cb == &kqswnal_lib);
+
+        nal_cb->cb_cli(nal_cb,flags);
+}
+
+/* API 'unlock' method: delegates to the library's cb_sti callback. */
+static void
+kqswnal_unlock(nal_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *k = nal->nal_data;
+        nal_cb_t       *nal_cb = k->kqn_cb;
+
+        LASSERT (nal == &kqswnal_api);
+        LASSERT (k == &kqswnal_data);
+        LASSERT (nal_cb == &kqswnal_lib);
+
+        nal_cb->cb_sti(nal_cb,flags);
+}
+
+/* API 'shutdown' method: only logs; teardown is done by kqswnal_finalise().
+ * Always returns 0. */
+static int
+kqswnal_shutdown(nal_t *nal, int ni)
+{
+        CDEBUG (D_NET, "shutdown\n");
+
+        LASSERT (nal == &kqswnal_api);
+        return (0);
+}
+
+/* API 'yield' method: give up the CPU if a reschedule is pending. */
+static void
+kqswnal_yield( nal_t *nal )
+{
+        CDEBUG (D_NET, "yield\n");
+
+        if (current->need_resched)
+                schedule();
+        return;
+}
+
+/* Start-up callback passed to PtlNIInit(): initialises the library with
+ * this node's NID (derived from its elan ID) and the cluster size, then
+ * returns the API object.  'interface' and 'requested_pid' are unused. */
+static nal_t *
+kqswnal_init(int interface, ptl_pt_index_t ptl_size, ptl_ac_index_t ac_size,
+             ptl_pid_t requested_pid)
+{
+        ptl_nid_t mynid = kqswnal_elanid2nid (kqswnal_data.kqn_elanid);
+        int       nnids = kqswnal_data.kqn_nnodes;
+
+        CDEBUG(D_NET, "calling lib_init with nid "LPX64" of %d\n", mynid, nnids);
+
+        lib_init(&kqswnal_lib, mynid, 0, nnids, ptl_size, ac_size);
+
+        return (&kqswnal_api);
+}
+
+/* NAL command handler.  NAL_CMD_REGISTER_MYNID re-bases the NID offset so
+ * that this node's elan ID maps to data->ioc_nid; anything else is -EINVAL. */
+int
+kqswnal_cmd (struct portal_ioctl_data *data, void *private)
+{
+        LASSERT (data != NULL);
+
+        switch (data->ioc_nal_cmd) {
+        case NAL_CMD_REGISTER_MYNID:
+                CDEBUG (D_IOCTL, "setting NID offset to "LPX64" (was "LPX64")\n",
+                        data->ioc_nid - kqswnal_data.kqn_elanid,
+                        kqswnal_data.kqn_nid_offset);
+                kqswnal_data.kqn_nid_offset =
+                        data->ioc_nid - kqswnal_data.kqn_elanid;
+                kqswnal_lib.ni.nid = data->ioc_nid;
+                return (0);
+
+        default:
+                return (-EINVAL);
+        }
+}
+
+/* Module teardown, also used to unwind a partially completed
+ * kqswnal_initialise().  kqn_init records how far initialisation got; the
+ * switch undoes the later stages and everything after it is guarded by
+ * NULL checks, so it is safe from any state >= KQN_INIT_DATA. */
+void __exit
+kqswnal_finalise (void)
+{
+        switch (kqswnal_data.kqn_init)
+        {
+        default:
+                LASSERT (0);
+
+        case KQN_INIT_ALL:
+                /* NOTE(review): the command handler registered with
+                 * kportal_nal_register() in kqswnal_initialise() is not
+                 * unregistered anywhere visible here - confirm whether an
+                 * unregister call is needed before unload. */
+                PORTAL_SYMBOL_UNREGISTER (kqswnal_ni);
+                /* fall through */
+
+        case KQN_INIT_PTL:
+                PtlNIFini (kqswnal_ni);
+                lib_fini (&kqswnal_lib);
+                /* fall through */
+
+        case KQN_INIT_DATA:
+                break;
+
+        case KQN_INIT_NOTHING:
+                return;
+        }
+
+        /**********************************************************************/
+        /* Make the router stop calling me and fail any more call-ins */
+        kpr_shutdown (&kqswnal_data.kqn_router);
+
+        /**********************************************************************/
+        /* flag threads to terminate, wake them and wait for them to die */
+
+        kqswnal_data.kqn_shuttingdown = 1;
+        wake_up_all (&kqswnal_data.kqn_sched_waitq);
+
+        while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) {
+                CDEBUG(D_NET, "waiting for %d threads to terminate\n",
+                       atomic_read (&kqswnal_data.kqn_nthreads));
+                set_current_state (TASK_UNINTERRUPTIBLE);
+                schedule_timeout (HZ);
+        }
+
+        /**********************************************************************/
+        /* close elan comms */
+
+        if (kqswnal_data.kqn_eprx_small != NULL)
+                ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small);
+
+        if (kqswnal_data.kqn_eprx_large != NULL)
+                ep_remove_large_rcvr (kqswnal_data.kqn_eprx_large);
+
+        if (kqswnal_data.kqn_eptx != NULL)
+                ep_free_large_xmtr (kqswnal_data.kqn_eptx);
+
+        /**********************************************************************/
+        /* No more threads. No more portals, router or comms callbacks!
+         * I control the horizontals and the verticals...
+         */
+
+        /**********************************************************************/
+        /* Complete any blocked forwarding packets with error
+         */
+
+        while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq))
+        {
+                kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
+                                                  kpr_fwd_desc_t, kprfd_list);
+                list_del (&fwd->kprfd_list);
+                kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+        }
+
+        while (!list_empty (&kqswnal_data.kqn_delayedfwds))
+        {
+                kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_delayedfwds.next,
+                                                  kpr_fwd_desc_t, kprfd_list);
+                list_del (&fwd->kprfd_list);
+                kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -EHOSTUNREACH);
+        }
+
+        /**********************************************************************/
+        /* Wait for router to complete any packets I sent her
+         */
+
+        kpr_deregister (&kqswnal_data.kqn_router);
+
+
+        /**********************************************************************/
+        /* Unmap message buffers and free all descriptors and buffers
+         */
+
+        if (kqswnal_data.kqn_eprxdmahandle != NULL)
+        {
+                elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                                  kqswnal_data.kqn_eprxdmahandle, 0,
+                                  KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
+                                  KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE);
+
+                elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+                                  kqswnal_data.kqn_eprxdmahandle);
+        }
+
+        if (kqswnal_data.kqn_eptxdmahandle != NULL)
+        {
+                elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                                  kqswnal_data.kqn_eptxdmahandle, 0,
+                                  KQSW_NTXMSGPAGES * (KQSW_NTXMSGS +
+                                                      KQSW_NNBLK_TXMSGS));
+
+                elan3_dma_release(kqswnal_data.kqn_epdev->DmaState,
+                                  kqswnal_data.kqn_eptxdmahandle);
+        }
+
+        if (kqswnal_data.kqn_txds != NULL)
+        {
+                int   i;
+
+                for (i = 0; i < KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS; i++)
+                {
+                        kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
+
+                        /* descriptors are zeroed on allocation, so a NULL
+                         * buffer means initialisation stopped before it */
+                        if (ktx->ktx_buffer != NULL)
+                                PORTAL_FREE(ktx->ktx_buffer,
+                                            KQSW_TX_BUFFER_SIZE);
+                }
+
+                PORTAL_FREE(kqswnal_data.kqn_txds,
+                            sizeof (kqswnal_tx_t) * (KQSW_NTXMSGS +
+                                                     KQSW_NNBLK_TXMSGS));
+        }
+
+        if (kqswnal_data.kqn_rxds != NULL)
+        {
+                int   i;
+                int   j;
+
+                for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+                {
+                        kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+                        for (j = 0; j < krx->krx_npages; j++)
+                                if (krx->krx_pages[j] != NULL)
+                                        __free_page (krx->krx_pages[j]);
+                }
+
+                PORTAL_FREE(kqswnal_data.kqn_rxds,
+                            sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL +
+                                                    KQSW_NRXMSGS_LARGE));
+        }
+
+        /* resets flags, pointers to NULL etc */
+        memset(&kqswnal_data, 0, sizeof (kqswnal_data));
+
+        CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory));
+
+        printk (KERN_INFO "Routing QSW NAL unloaded (final mem %d)\n",
+                atomic_read(&portal_kmemory));
+}
+
+/* Module initialisation.  Sets up state, elan comms, DMA address space,
+ * tx/rx descriptors, the portals NI, the scheduler threads and the
+ * router/command interfaces, in that order.  Every failure after state
+ * setup unwinds through kqswnal_finalise().
+ * Returns 0 on success or a negative errno. */
+static int __init
+kqswnal_initialise (void)
+{
+        ELAN3_DMA_REQUEST dmareq;
+        int               rc;
+        int               i;
+        int               elan_page_idx;
+        int               pkmem = atomic_read(&portal_kmemory);
+
+        LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING);
+
+        CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory));
+
+        kqswnal_api.forward  = kqswnal_forward;
+        kqswnal_api.shutdown = kqswnal_shutdown;
+        kqswnal_api.yield    = kqswnal_yield;
+        kqswnal_api.validate = NULL;         /* our api validate is a NOOP */
+        kqswnal_api.lock     = kqswnal_lock;
+        kqswnal_api.unlock   = kqswnal_unlock;
+        kqswnal_api.nal_data = &kqswnal_data;
+
+        kqswnal_lib.nal_data = &kqswnal_data;
+
+        /* ensure all pointers NULL etc */
+        memset (&kqswnal_data, 0, sizeof (kqswnal_data));
+
+        kqswnal_data.kqn_cb = &kqswnal_lib;
+
+        INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds);
+        INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds);
+        spin_lock_init (&kqswnal_data.kqn_idletxd_lock);
+        init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq);
+        INIT_LIST_HEAD (&kqswnal_data.kqn_idletxd_fwdq);
+
+        INIT_LIST_HEAD (&kqswnal_data.kqn_delayedfwds);
+        INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds);
+        INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds);
+
+        spin_lock_init (&kqswnal_data.kqn_sched_lock);
+        init_waitqueue_head (&kqswnal_data.kqn_sched_waitq);
+
+        spin_lock_init (&kqswnal_data.kqn_statelock);
+
+        /* pointers/lists/locks initialised */
+        kqswnal_data.kqn_init = KQN_INIT_DATA;
+
+        /**********************************************************************/
+        /* Find the first Elan device */
+
+        kqswnal_data.kqn_epdev = ep_device (0);
+        if (kqswnal_data.kqn_epdev == NULL)
+        {
+                CERROR ("Can't get elan device 0\n");
+                /* unwind like every other failure path so kqn_init
+                 * returns to KQN_INIT_NOTHING */
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        kqswnal_data.kqn_nid_offset = 0;
+        kqswnal_data.kqn_nnodes     = ep_numnodes (kqswnal_data.kqn_epdev);
+        kqswnal_data.kqn_elanid     = ep_nodeid (kqswnal_data.kqn_epdev);
+
+        /**********************************************************************/
+        /* Get the transmitter */
+
+        kqswnal_data.kqn_eptx = ep_alloc_large_xmtr (kqswnal_data.kqn_epdev);
+        if (kqswnal_data.kqn_eptx == NULL)
+        {
+                CERROR ("Can't allocate transmitter\n");
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        /**********************************************************************/
+        /* Get the receivers */
+
+        kqswnal_data.kqn_eprx_small = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
+                                                             EP_SVC_LARGE_PORTALS_SMALL,
+                                                             KQSW_EP_ENVELOPES_SMALL);
+        if (kqswnal_data.kqn_eprx_small == NULL)
+        {
+                CERROR ("Can't install small msg receiver\n");
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        kqswnal_data.kqn_eprx_large = ep_install_large_rcvr (kqswnal_data.kqn_epdev,
+                                                             EP_SVC_LARGE_PORTALS_LARGE,
+                                                             KQSW_EP_ENVELOPES_LARGE);
+        if (kqswnal_data.kqn_eprx_large == NULL)
+        {
+                CERROR ("Can't install large msg receiver\n");
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        /**********************************************************************/
+        /* Reserve Elan address space for transmit buffers */
+
+        dmareq.Waitfn   = DDI_DMA_SLEEP;
+        dmareq.ElanAddr = (E3_Addr) 0;
+        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
+        dmareq.Perm     = ELAN_PERM_REMOTEREAD;
+
+        rc = elan3_dma_reserve(kqswnal_data.kqn_epdev->DmaState,
+                              KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS),
+                              &dmareq, &kqswnal_data.kqn_eptxdmahandle);
+        if (rc != DDI_SUCCESS)
+        {
+                CERROR ("Can't reserve tx dma space\n");
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        /**********************************************************************/
+        /* Reserve Elan address space for receive buffers */
+
+        dmareq.Waitfn   = DDI_DMA_SLEEP;
+        dmareq.ElanAddr = (E3_Addr) 0;
+        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
+        dmareq.Perm     = ELAN_PERM_REMOTEWRITE;
+
+        rc = elan3_dma_reserve (kqswnal_data.kqn_epdev->DmaState,
+                                KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
+                                KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE,
+                                &dmareq, &kqswnal_data.kqn_eprxdmahandle);
+        if (rc != DDI_SUCCESS)
+        {
+                CERROR ("Can't reserve rx dma space\n");
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        /**********************************************************************/
+        /* Allocate/Initialise transmit descriptors */
+
+        PORTAL_ALLOC(kqswnal_data.kqn_txds,
+                     sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+        if (kqswnal_data.kqn_txds == NULL)
+        {
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        /* clear flags, null pointers etc */
+        memset(kqswnal_data.kqn_txds, 0,
+               sizeof(kqswnal_tx_t) * (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS));
+        for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++)
+        {
+                int           premapped_pages;
+                kqswnal_tx_t *ktx = &kqswnal_data.kqn_txds[i];
+                int           basepage = i * KQSW_NTXMSGPAGES;
+
+                PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
+                if (ktx->ktx_buffer == NULL)
+                {
+                        kqswnal_finalise ();
+                        return (-ENOMEM);
+                }
+
+                /* Map pre-allocated buffer NOW, to save latency on transmit */
+                premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer,
+                                                        KQSW_TX_BUFFER_SIZE);
+
+                elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                       kqswnal_data.kqn_eptxdmahandle,
+                                       ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE,
+                                       basepage, &ktx->ktx_ebuffer);
+
+                ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */
+                ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */
+
+                /* the first KQSW_NTXMSGS descriptors form the normal pool;
+                 * the rest are the reserve for non-blocking senders */
+                if (i < KQSW_NTXMSGS)
+                        ktx->ktx_idle = &kqswnal_data.kqn_idletxds;
+                else
+                        ktx->ktx_idle = &kqswnal_data.kqn_nblk_idletxds;
+
+                list_add_tail (&ktx->ktx_list, ktx->ktx_idle);
+        }
+
+        /**********************************************************************/
+        /* Allocate/Initialise receive descriptors */
+
+        PORTAL_ALLOC (kqswnal_data.kqn_rxds,
+                      sizeof (kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE));
+        if (kqswnal_data.kqn_rxds == NULL)
+        {
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        memset(kqswnal_data.kqn_rxds, 0, /* clear flags, null pointers etc */
+               sizeof(kqswnal_rx_t) * (KQSW_NRXMSGS_SMALL+KQSW_NRXMSGS_LARGE));
+
+        elan_page_idx = 0;
+        for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+        {
+                E3_Addr       elanaddr;
+                int           j;
+                kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+                if (i < KQSW_NRXMSGS_SMALL)
+                {
+                        krx->krx_npages = KQSW_NRXMSGPAGES_SMALL;
+                        krx->krx_eprx   = kqswnal_data.kqn_eprx_small;
+                }
+                else
+                {
+                        krx->krx_npages = KQSW_NRXMSGPAGES_LARGE;
+                        krx->krx_eprx   = kqswnal_data.kqn_eprx_large;
+                }
+
+                LASSERT (krx->krx_npages > 0);
+                for (j = 0; j < krx->krx_npages; j++)
+                {
+                        krx->krx_pages[j] = alloc_page(GFP_KERNEL);
+                        if (krx->krx_pages[j] == NULL)
+                        {
+                                kqswnal_finalise ();
+                                return (-ENOMEM);
+                        }
+
+                        LASSERT(page_address(krx->krx_pages[j]) != NULL);
+
+                        elan3_dvma_kaddr_load(kqswnal_data.kqn_epdev->DmaState,
+                                              kqswnal_data.kqn_eprxdmahandle,
+                                              page_address(krx->krx_pages[j]),
+                                              PAGE_SIZE, elan_page_idx,
+                                              &elanaddr);
+                        elan_page_idx++;
+
+                        if (j == 0)
+                                krx->krx_elanaddr = elanaddr;
+
+                        /* NB we assume the pages of one buffer are
+                         * contiguous in elan address space */
+                        LASSERT (elanaddr == krx->krx_elanaddr + j * PAGE_SIZE);
+                }
+        }
+        LASSERT (elan_page_idx ==
+                 (KQSW_NRXMSGS_SMALL * KQSW_NRXMSGPAGES_SMALL) +
+                 (KQSW_NRXMSGS_LARGE * KQSW_NRXMSGPAGES_LARGE));
+
+        /**********************************************************************/
+        /* Network interface ready to initialise */
+
+        rc = PtlNIInit(kqswnal_init, 32, 4, 0, &kqswnal_ni);
+        if (rc != 0)
+        {
+                CERROR ("PtlNIInit failed %d\n", rc);
+                kqswnal_finalise ();
+                return (-ENOMEM);
+        }
+
+        kqswnal_data.kqn_init = KQN_INIT_PTL;
+
+        /**********************************************************************/
+        /* Queue receives, now that it's OK to run their completion callbacks */
+
+        for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+        {
+                kqswnal_rx_t *krx = &kqswnal_data.kqn_rxds[i];
+
+                /* NB this enqueue can allocate/sleep (attr == 0) */
+                rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
+                                      krx->krx_elanaddr,
+                                      krx->krx_npages * PAGE_SIZE, 0);
+                if (rc != 0)
+                {
+                        CERROR ("failed ep_queue_receive %d\n", rc);
+                        kqswnal_finalise ();
+                        return (-ENOMEM);
+                }
+        }
+
+        /**********************************************************************/
+        /* Spawn scheduling threads */
+        for (i = 0; i < smp_num_cpus; i++)
+        {
+                rc = kqswnal_thread_start (kqswnal_scheduler, NULL);
+                if (rc != 0)
+                {
+                        CERROR ("failed to spawn scheduling thread: %d\n", rc);
+                        kqswnal_finalise ();
+                        return (rc);
+                }
+        }
+
+        /**********************************************************************/
+        /* Connect to the router; failure is not fatal, we just don't route */
+        rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface);
+        if (rc != 0)
+                CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n",rc);
+
+        rc = kportal_nal_register (QSWNAL, &kqswnal_cmd, NULL);
+        if (rc != 0) {
+                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                kqswnal_finalise ();
+                return (rc);
+        }
+
+        PORTAL_SYMBOL_REGISTER(kqswnal_ni);
+        kqswnal_data.kqn_init = KQN_INIT_ALL;
+
+        printk(KERN_INFO "Routing QSW NAL loaded on node %d of %d "
+               "(Routing %s, initial mem %d)\n",
+               kqswnal_data.kqn_elanid, kqswnal_data.kqn_nnodes,
+               kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled",
+               pkmem);
+
+        return (0);
+}
+
+
+MODULE_AUTHOR("W. Marcus Miller <marcusm@llnl.gov>");
+MODULE_DESCRIPTION("Kernel Quadrics Switch NAL v1.00");
+MODULE_LICENSE("GPL");
+
+module_init (kqswnal_initialise);
+module_exit (kqswnal_finalise);
+
+EXPORT_SYMBOL (kqswnal_ni);
diff --git a/lustre/portals/knals/qswnal/qswnal.h b/lustre/portals/knals/qswnal/qswnal.h
new file mode 100644
index 0000000..88ab74f
--- /dev/null
+++ b/lustre/portals/knals/qswnal/qswnal.h
@@ -0,0 +1,270 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines.
+ *
+ */
+
+#ifndef _QSWNAL_H
+#define _QSWNAL_H
+#define EXPORT_SYMTAB
+
+#ifdef PROPRIETARY_ELAN
+# include <qsw/kernel.h>
+#else
+# include <qsnet/kernel.h>
+#endif
+
+#undef printf                                   /* nasty QSW #define */
+
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <elan3/elanregs.h>
+#include <elan3/elandev.h>
+#include <elan3/elanvp.h>
+#include <elan3/elan3mmu.h>
+#include <elan3/elanctxt.h>
+#include <elan3/elandebug.h>
+#include <elan3/urom_addrs.h>
+#include <elan3/busops.h>
+#include <elan3/kcomm.h>
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/locks.h>
+#include <linux/unistd.h>
+#include <net/sock.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/segment.h>
+
+#define DEBUG_SUBSYSTEM S_QSWNAL
+
+#include <linux/kp30.h>
+#include <portals/p30.h>
+#include <portals/lib-p30.h>
+
+/* Optional payload checksumming; disabled, so no checksum bytes follow
+ * the portals header on the wire */
+#define KQSW_CHECKSUM   0
+#if KQSW_CHECKSUM
+typedef unsigned long kqsw_csum_t;
+#define KQSW_CSUM_SIZE  (2 * sizeof (kqsw_csum_t))
+#else
+#define KQSW_CSUM_SIZE  0
+#endif
+#define KQSW_HDR_SIZE   (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE)
+
+/*
+ *  Elan NAL
+ */
+#define EP_SVC_LARGE_PORTALS_SMALL      (0x10)  /* Portals over elan port number (small payloads) */
+#define EP_SVC_LARGE_PORTALS_LARGE      (0x11)  /* Portals over elan port number (large payloads) */
+/* NB small/large message sizes are GLOBAL constants */
+
+/*
+ * Performance Tuning defines
+ * NB no mention of PAGE_SIZE for interoperability
+ */
+#if PTL_LARGE_MTU
+# define KQSW_MAXPAYLOAD                (256<<10) /* biggest message this NAL will cope with */
+#else
+# define KQSW_MAXPAYLOAD                (64<<10) /* biggest message this NAL will cope with */
+#endif
+
+#define KQSW_SMALLPAYLOAD               ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */
+
+#define KQSW_TX_MAXCONTIG               (1<<10) /* largest payload that gets made contiguous on transmit */
+
+#define KQSW_NTXMSGS                    8       /* # normal transmit messages */
+#define KQSW_NNBLK_TXMSGS               128     /* # reserved transmit messages if can't block */
+
+#define KQSW_NRXMSGS_LARGE              64      /* # large receive buffers */
+#define KQSW_EP_ENVELOPES_LARGE         128     /* # large ep envelopes */
+
+#define KQSW_NRXMSGS_SMALL              256     /* # small receive buffers */
+#define KQSW_EP_ENVELOPES_SMALL         2048    /* # small ep envelopes */
+
+#define KQSW_RESCHED                    100     /* # busy loops that forces scheduler to yield */
+
+/*
+ * derived constants
+ */
+
+#define KQSW_TX_BUFFER_SIZE     (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG)
+/* The pre-allocated tx buffer (hdr + small payload) */
+
+#define KQSW_NTXMSGPAGES        (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1)
+/* Reserve elan address space for pre-allocated and pre-mapped transmit
+ * buffer and a full payload too.  Extra pages allow for page alignment */
+
+#define KQSW_NRXMSGPAGES_SMALL  (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD))
+/* receive hdr/payload always contiguous and page aligned */
+#define KQSW_NRXMSGBYTES_SMALL  (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE)
+
+#define KQSW_NRXMSGPAGES_LARGE  (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD))
+/* receive hdr/payload always contiguous and page aligned */
+#define KQSW_NRXMSGBYTES_LARGE  (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE)
+/* biggest complete packet we can receive (or transmit) */
+
+
+/* Receive descriptor: one pre-posted receive buffer and its pages */
+typedef struct
+{
+        struct list_head krx_list;              /* enqueue -> thread */
+        EP_RCVR         *krx_eprx;              /* port to post receives to */
+        EP_RXD          *krx_rxd;               /* receive descriptor (for repost) */
+        E3_Addr          krx_elanaddr;          /* Elan address of buffer (contiguous in elan vm) */
+        int              krx_npages;            /* # pages in receive buffer */
+        int              krx_nob;               /* Number Of Bytes received into buffer */
+        kpr_fwd_desc_t   krx_fwd;               /* embedded forwarding descriptor */
+        struct page     *krx_pages[KQSW_NRXMSGPAGES_LARGE]; /* pages allocated */
+        struct iovec     krx_iov[KQSW_NRXMSGPAGES_LARGE]; /* iovec for forwarding */
+}  kqswnal_rx_t;
+
+/* Transmit descriptor: pre-mapped small buffer plus a window of reserved
+ * elan pages for mapping the rest of a message */
+typedef struct
+{
+        struct list_head  ktx_list;             /* enqueue idle/delayed */
+        struct list_head *ktx_idle;             /* where to put when idle */
+        char              ktx_state;            /* What I'm doing */
+        uint32_t          ktx_basepage;         /* page offset in reserved elan tx vaddrs for mapping pages */
+        int               ktx_npages;           /* pages reserved for mapping messages */
+        int               ktx_nmappedpages;     /* # pages mapped for current message */
+        EP_IOVEC          ktx_iov[EP_MAXFRAG];  /* msg frags (elan vaddrs) */
+        int               ktx_niov;             /* # message frags */
+        int               ktx_port;             /* destination ep port */
+        ptl_nid_t         ktx_nid;              /* destination node */
+        void             *ktx_args[2];          /* completion passthru */
+        E3_Addr           ktx_ebuffer;          /* elan address of ktx_buffer */
+        char             *ktx_buffer;           /* pre-allocated contiguous buffer for hdr + small payloads */
+} kqswnal_tx_t;
+
+#define KTX_IDLE        0                       /* MUST BE ZERO (so zeroed ktx is idle) */
+#define KTX_SENDING     1                       /* local send */
+#define KTX_FORWARDING  2                       /* routing a packet */
+
+/* All of this NAL's global state */
+typedef struct
+{
+        char               kqn_init;            /* what's been initialised */
+        char               kqn_shuttingdown;    /* I'm trying to shut down */
+        atomic_t           kqn_nthreads;        /* # threads still running */
+
+        kqswnal_rx_t      *kqn_rxds;            /* all the receive descriptors */
+        kqswnal_tx_t      *kqn_txds;            /* all the transmit descriptors */
+
+        struct list_head   kqn_idletxds;        /* transmit descriptors free to use */
+        struct list_head   kqn_nblk_idletxds;   /* reserve of idle txds for senders that can't block */
+        spinlock_t         kqn_idletxd_lock;    /* serialise idle txd access */
+        wait_queue_head_t  kqn_idletxd_waitq;   /* sender blocks here waiting for idle txd */
+        struct list_head   kqn_idletxd_fwdq;    /* forwarded packets block here waiting for idle txd */
+
+        spinlock_t         kqn_sched_lock;      /* serialise packet schedulers */
+        wait_queue_head_t  kqn_sched_waitq;     /* scheduler blocks here */
+
+        struct list_head   kqn_readyrxds;       /* rxds full of data */
+        struct list_head   kqn_delayedfwds;     /* delayed forwards */
+        struct list_head   kqn_delayedtxds;     /* delayed transmits */
+
+        spinlock_t         kqn_statelock;       /* cb_cli/cb_sti */
+        nal_cb_t          *kqn_cb;              /* -> kqswnal_lib */
+        EP_DEV            *kqn_epdev;           /* elan device */
+        EP_XMTR           *kqn_eptx;            /* elan transmitter */
+        EP_RCVR           *kqn_eprx_small;      /* elan receiver (small messages) */
+        EP_RCVR           *kqn_eprx_large;      /* elan receiver (large messages) */
+        ELAN3_DMA_HANDLE  *kqn_eptxdmahandle;   /* elan reserved tx vaddrs */
+        ELAN3_DMA_HANDLE  *kqn_eprxdmahandle;   /* elan reserved rx vaddrs */
+        kpr_router_t       kqn_router;          /* connection to Kernel Portals Router module */
+
+        ptl_nid_t          kqn_nid_offset;      /* this cluster's NID offset */
+        int                kqn_nnodes;          /* this cluster's size */
+        int                kqn_elanid;          /* this node's elan ID */
+}  kqswnal_data_t;
+
+/* kqn_init state */
+#define KQN_INIT_NOTHING        0               /* MUST BE ZERO so zeroed state is initialised OK */
+#define KQN_INIT_DATA           1
+#define KQN_INIT_PTL            2
+#define KQN_INIT_ALL            3
+
+extern nal_cb_t                kqswnal_lib;
+extern nal_t                   kqswnal_api;
+extern kqswnal_data_t          kqswnal_data;
+
+extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg);
+extern void kqswnal_rxhandler(EP_RXD *rxd);
+extern int kqswnal_scheduler (void *);
+extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+
+/* Map an elan node ID to a portals NID by adding the cluster's offset */
+static inline ptl_nid_t
+kqswnal_elanid2nid (int elanid)
+{
+        return (kqswnal_data.kqn_nid_offset + elanid);
+}
+
+/* Map a portals NID back to an elan node ID; -1 if the NID lies outside
+ * [offset, offset + nnodes), i.e. it is not in this cluster */
+static inline int
+kqswnal_nid2elanid (ptl_nid_t nid)
+{
+        ptl_nid_t first = kqswnal_data.kqn_nid_offset;
+        ptl_nid_t limit = first + kqswnal_data.kqn_nnodes;
+
+        if (nid >= first && nid < limit)
+                return (nid - first);
+
+        return (-1);
+}
+
+/* Re-post a receive descriptor using its whole buffer */
+static inline void
+kqswnal_requeue_rx (kqswnal_rx_t *krx)
+{
+        ep_requeue_receive (krx->krx_rxd, kqswnal_rxhandler, krx,
+                            krx->krx_elanaddr, krx->krx_npages * PAGE_SIZE);
+}
+
+/* Number of pages touched by the 'nob' bytes starting at 'base' */
+static inline int
+kqswnal_pages_spanned (void *base, int nob)
+{
+        unsigned long start      = (unsigned long)base;
+        unsigned long end        = start + (nob - 1);   /* last byte */
+        unsigned long first_page = start >> PAGE_SHIFT;
+        unsigned long last_page  = end >> PAGE_SHIFT;
+
+        LASSERT (last_page >= first_page);      /* can't wrap address space */
+        return (last_page - first_page + 1);
+}
+
+#if KQSW_CHECKSUM
+/* Add the 'nob' bytes at 'base' into the running byte-sum 'sum' */
+static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob)
+{
+        unsigned char *ptr = (unsigned char *)base;
+
+        for (; nob > 0; nob--)
+                sum += *ptr++;
+
+        return (sum);
+}
+#endif
+
+#endif /* _QSWNAL_H */
diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c
new file mode 100644
index 0000000..3b47a25
--- /dev/null
+++ b/lustre/portals/knals/qswnal/qswnal_cb.c
@@ -0,0 +1,1239 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ * Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
+ * W. Marcus Miller - Based on ksocknal
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "qswnal.h"
+
+atomic_t kqswnal_packets_launched;
+atomic_t kqswnal_packets_transmitted;
+atomic_t kqswnal_packets_received;
+
+
+/*
+ *  LIB functions follow
+ *
+ */
+
+/* Library 'read' callback: plain memcpy from src_addr to dst_addr.
+ * Always returns 0. */
+static int
+kqswnal_read(nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
+             size_t len)
+{
+        CDEBUG (D_NET, LPX64": reading "LPSZ" bytes from %p -> %p\n",
+                nal->ni.nid, len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+
+        return (0);
+}
+
+/* Library 'write' callback: plain memcpy from src_addr to dst_addr.
+ * Always returns 0. */
+static int
+kqswnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
+              size_t len)
+{
+        CDEBUG (D_NET, LPX64": writing "LPSZ" bytes from %p -> %p\n",
+                nal->ni.nid, len, src_addr, dst_addr );
+        memcpy( dst_addr, src_addr, len );
+
+        return (0);
+}
+
+/* Library 'malloc' callback: returns whatever PORTAL_ALLOC produced */
+static void *
+kqswnal_malloc(nal_cb_t *nal, size_t len)
+{
+        void *buf;
+
+        PORTAL_ALLOC(buf, len);
+        return (buf);
+}
+
+/* Library 'free' callback: counterpart of kqswnal_malloc() */
+static void
+kqswnal_free(nal_cb_t *nal, void *buf, size_t len)
+{
+        PORTAL_FREE(buf, len);
+}
+
+/* Library 'printf' callback: format into a bounded local buffer and emit
+ * it as a D_NET debug message (longer output is truncated) */
+static void
+kqswnal_printf (nal_cb_t * nal, const char *fmt, ...)
+{
+        va_list ap;
+        char msg[256];
+
+        va_start (ap, fmt);
+        vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */
+        va_end (ap);
+
+        msg[sizeof (msg) - 1] = 0;              /* ensure terminated */
+
+        CDEBUG (D_NET, "%s", msg);
+}
+
+
+/* Library 'cli' callback: take the state lock, saving IRQ state in *flags */
+static void
+kqswnal_cli(nal_cb_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *data= nal->nal_data;
+
+        spin_lock_irqsave(&data->kqn_statelock, *flags);
+}
+
+
+/* Library 'sti' callback: drop the state lock, restoring IRQ state */
+static void
+kqswnal_sti(nal_cb_t *nal, unsigned long *flags)
+{
+        kqswnal_data_t *data= nal->nal_data;
+
+        spin_unlock_irqrestore(&data->kqn_statelock, *flags);
+}
+
+
+/* Library 'dist' callback: 0 for this node, 1 for a node in this cluster,
+ * 2 for anything that must go via the router.  Always returns 0. */
+static int
+kqswnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+{
+        if (nid == nal->ni.nid)
+                *dist = 0;                      /* it's me */
+        else if (kqswnal_nid2elanid (nid) >= 0)
+                *dist = 1;                      /* it's my peer */
+        else
+                *dist = 2;                      /* via router */
+        return (0);
+}
+
+/* Unload any per-message elan mappings held by 'ktx' (the pre-mapped
+ * buffer below ktx_basepage is left alone) */
+void
+kqswnal_unmap_tx (kqswnal_tx_t *ktx)
+{
+        if (ktx->ktx_nmappedpages == 0)
+                return;
+
+        CDEBUG (D_NET, "%p[%d] unloading pages %d for %d\n",
+                ktx, ktx->ktx_niov, ktx->ktx_basepage, ktx->ktx_nmappedpages);
+
+        LASSERT (ktx->ktx_nmappedpages <= ktx->ktx_npages);
+        LASSERT (ktx->ktx_basepage + ktx->ktx_nmappedpages <=
+                 kqswnal_data.kqn_eptxdmahandle->NumDvmaPages);
+
+        elan3_dvma_unload(kqswnal_data.kqn_epdev->DmaState,
+                          kqswnal_data.kqn_eptxdmahandle,
+                          ktx->ktx_basepage, ktx->ktx_nmappedpages);
+        ktx->ktx_nmappedpages = 0;
+}
+
+/* Map 'nob' bytes described by the page-based 'kiov' array into elan
+ * address space, appending to ktx's fragment list (elan-contiguous frags
+ * are merged).  Returns 0, or -EMSGSIZE if the message needs more pages or
+ * fragments than the descriptor has.  On failure ktx_nmappedpages still
+ * covers what was loaded, so kqswnal_unmap_tx() can release it. */
+int
+kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int nob, int niov, ptl_kiov_t *kiov)
+{
+        int       nfrags    = ktx->ktx_niov;
+        const int maxfrags  = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]);
+        int       nmapped   = ktx->ktx_nmappedpages;
+        int       maxmapped = ktx->ktx_npages;
+        uint32_t  basepage  = ktx->ktx_basepage + nmapped;
+        char     *ptr;
+
+        LASSERT (nmapped <= maxmapped);
+        LASSERT (nfrags <= maxfrags);
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+
+        do {
+                int  fraglen = kiov->kiov_len;
+
+                /* nob exactly spans the iovs */
+                LASSERT (fraglen <= nob);
+                /* each frag fits in a page */
+                LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE);
+
+                nmapped++;
+                if (nmapped > maxmapped) {
+                        CERROR("Can't map message in %d pages (max %d)\n",
+                               nmapped, maxmapped);
+                        return (-EMSGSIZE);
+                }
+
+                if (nfrags == maxfrags) {
+                        CERROR("Message too fragmented in Elan VM (max %d frags)\n",
+                               maxfrags);
+                        return (-EMSGSIZE);
+                }
+
+                /* XXX this is really crap, but we'll have to kmap until
+                 * EKC has a page (rather than vaddr) mapping interface */
+
+                ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
+
+                CDEBUG(D_NET,
+                       "%p[%d] loading %p for %d, page %d, %d total\n",
+                        ktx, nfrags, ptr, fraglen, basepage, nmapped);
+
+                elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                       kqswnal_data.kqn_eptxdmahandle,
+                                       ptr, fraglen,
+                                       basepage, &ktx->ktx_iov[nfrags].Base);
+
+                kunmap (kiov->kiov_page);
+
+                /* keep in loop for failure case */
+                ktx->ktx_nmappedpages = nmapped;
+
+                if (nfrags > 0 &&                /* previous frag mapped */
+                    ktx->ktx_iov[nfrags].Base == /* contiguous with this one */
+                    (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len))
+                        /* just extend previous */
+                        ktx->ktx_iov[nfrags - 1].Len += fraglen;
+                else {
+                        ktx->ktx_iov[nfrags].Len = fraglen;
+                        nfrags++;                /* new frag */
+                }
+
+                basepage++;
+                kiov++;
+                niov--;
+                nob -= fraglen;
+
+                /* iov must not run out before end of data */
+                LASSERT (nob == 0 || niov > 0);
+
+        } while (nob > 0);
+
+        ktx->ktx_niov = nfrags;
+        CDEBUG (D_NET, "%p got %d frags over %d pages\n",
+                ktx, ktx->ktx_niov, ktx->ktx_nmappedpages);
+
+        return (0);
+}
+
+/* As kqswnal_map_tx_kiov(), but for kernel-virtual 'iov' fragments, each
+ * of which may span several pages */
+int
+kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int nob, int niov, struct iovec *iov)
+{
+        int       nfrags    = ktx->ktx_niov;
+        const int maxfrags  = sizeof (ktx->ktx_iov)/sizeof (ktx->ktx_iov[0]);
+        int       nmapped   = ktx->ktx_nmappedpages;
+        int       maxmapped = ktx->ktx_npages;
+        uint32_t  basepage  = ktx->ktx_basepage + nmapped;
+
+        LASSERT (nmapped <= maxmapped);
+        LASSERT (nfrags <= maxfrags);
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+
+        do {
+                int  fraglen = iov->iov_len;
+                long npages  = kqswnal_pages_spanned (iov->iov_base, fraglen);
+
+                /* nob exactly spans the iovs */
+                LASSERT (fraglen <= nob);
+
+                nmapped += npages;
+                if (nmapped > maxmapped) {
+                        CERROR("Can't map message in %d pages (max %d)\n",
+                               nmapped, maxmapped);
+                        return (-EMSGSIZE);
+                }
+
+                if (nfrags == maxfrags) {
+                        CERROR("Message too fragmented in Elan VM (max %d frags)\n",
+                               maxfrags);
+                        return (-EMSGSIZE);
+                }
+
+                CDEBUG(D_NET,
+                       "%p[%d] loading %p for %d, pages %d for %ld, %d total\n",
+                        ktx, nfrags, iov->iov_base, fraglen, basepage, npages,
+                        nmapped);
+
+                elan3_dvma_kaddr_load (kqswnal_data.kqn_epdev->DmaState,
+                                       kqswnal_data.kqn_eptxdmahandle,
+                                       iov->iov_base, fraglen,
+                                       basepage, &ktx->ktx_iov[nfrags].Base);
+                /* keep in loop for failure case */
+                ktx->ktx_nmappedpages = nmapped;
+
+                if (nfrags > 0 &&                /* previous frag mapped */
+                    ktx->ktx_iov[nfrags].Base == /* contiguous with this one */
+                    (ktx->ktx_iov[nfrags-1].Base + ktx->ktx_iov[nfrags-1].Len))
+                        /* just extend previous */
+                        ktx->ktx_iov[nfrags - 1].Len += fraglen;
+                else {
+                        ktx->ktx_iov[nfrags].Len = fraglen;
+                        nfrags++;                /* new frag */
+                }
+
+                basepage += npages;
+                iov++;
+                niov--;
+                nob -= fraglen;
+
+                /* iov must not run out before end of data */
+                LASSERT (nob == 0 || niov > 0);
+
+        } while (nob > 0);
+
+        ktx->ktx_niov = nfrags;
+        CDEBUG (D_NET, "%p got %d frags over %d pages\n",
+                ktx, ktx->ktx_niov, ktx->ktx_nmappedpages);
+
+        return (0);
+}
+
+/* Return 'ktx' to its idle pool.  When it goes back to the normal pool,
+ * also hand one blocked forwarding request (if any) to the scheduler and
+ * wake any local sender waiting for a descriptor. */
+void
+kqswnal_put_idle_tx (kqswnal_tx_t *ktx)
+{
+        kpr_fwd_desc_t   *fwd = NULL;
+        struct list_head *idle = ktx->ktx_idle;
+        unsigned long     flags;
+
+        kqswnal_unmap_tx (ktx);                /* release temporary mappings */
+        ktx->ktx_state = KTX_IDLE;
+
+        spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        list_add (&ktx->ktx_list, idle);
+
+        /* reserved for non-blocking tx */
+        if (idle == &kqswnal_data.kqn_nblk_idletxds) {
+                spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+                return;
+        }
+
+        /* anything blocking for a tx descriptor? */
+        if (!list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */
+        {
+                CDEBUG(D_NET,"wakeup fwd\n");
+
+                fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
+                                  kpr_fwd_desc_t, kprfd_list);
+                list_del (&fwd->kprfd_list);
+        }
+
+        if (waitqueue_active (&kqswnal_data.kqn_idletxd_waitq))  /* process? */
+        {
+                /* local sender waiting for tx desc */
+                CDEBUG(D_NET,"wakeup process\n");
+                wake_up (&kqswnal_data.kqn_idletxd_waitq);
+        }
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        if (fwd == NULL)
+                return;
+
+        /* schedule packet for forwarding again */
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&fwd->kprfd_list, &kqswnal_data.kqn_delayedfwds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+}
+
+/* Get an idle transmit descriptor.
+ *   fwd != NULL: forwarding; if the normal pool is empty, 'fwd' is queued
+ *                on kqn_idletxd_fwdq and NULL is returned.
+ *   !may_block:  local send that can't sleep; falls back to the reserved
+ *                pool and returns NULL if that is empty too.
+ *   otherwise:   sleeps until a normal descriptor is free. */
+kqswnal_tx_t *
+kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block)
+{
+        unsigned long  flags;
+        kqswnal_tx_t  *ktx = NULL;
+
+        for (;;) {
+                spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
+
+                /* "normal" descriptor is free */
+                if (!list_empty (&kqswnal_data.kqn_idletxds)) {
+                        ktx = list_entry (kqswnal_data.kqn_idletxds.next,
+                                          kqswnal_tx_t, ktx_list);
+                        list_del (&ktx->ktx_list);
+                        break;
+                }
+
+                /* "normal" descriptor pool is empty */
+
+                if (fwd != NULL) { /* forwarded packet => queue for idle txd */
+                        CDEBUG (D_NET, "blocked fwd [%p]\n", fwd);
+                        list_add_tail (&fwd->kprfd_list,
+                                       &kqswnal_data.kqn_idletxd_fwdq);
+                        break;
+                }
+
+                /* doing a local transmit */
+                if (!may_block) {
+                        if (list_empty (&kqswnal_data.kqn_nblk_idletxds)) {
+                                CERROR ("intr tx desc pool exhausted\n");
+                                break;
+                        }
+
+                        ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next,
+                                          kqswnal_tx_t, ktx_list);
+                        list_del (&ktx->ktx_list);
+                        break;
+                }
+
+                /* block for idle tx */
+
+                spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+                CDEBUG (D_NET, "blocking for tx desc\n");
+                wait_event (kqswnal_data.kqn_idletxd_waitq,
+                            !list_empty (&kqswnal_data.kqn_idletxds));
+        }
+
+        /* every 'break' above leaves the loop with the lock still held */
+        spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
+
+        /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */
+        LASSERT (ktx == NULL || ktx->ktx_nmappedpages == 0);
+        return (ktx);
+}
+
+/* Complete a transmit: notify the router (forwarded packet) or the library
+ * (locally sourced packet), then recycle the descriptor.  'error' is only
+ * passed on in the forwarding case. */
+void
+kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
+{
+        switch (ktx->ktx_state) {
+        case KTX_FORWARDING:       /* router asked me to forward this packet */
+                kpr_fwd_done (&kqswnal_data.kqn_router,
+                              (kpr_fwd_desc_t *)ktx->ktx_args[0], error);
+                break;
+
+        case KTX_SENDING:          /* packet sourced locally */
+                lib_finalize (&kqswnal_lib, ktx->ktx_args[0],
+                              (lib_msg_t *)ktx->ktx_args[1]);
+                break;
+
+        default:
+                LASSERT (0);
+        }
+
+        kqswnal_put_idle_tx (ktx);
+}
+
+/* Elan transmit completion callback: count successes, turn any failure
+ * status into -EIO, and complete the descriptor passed as 'arg' */
+static void
+kqswnal_txhandler(EP_TXD *txd, void *arg, int status)
+{
+        kqswnal_tx_t      *ktx = (kqswnal_tx_t *)arg;
+
+        LASSERT (txd != NULL);
+        LASSERT (ktx != NULL);
+
+        CDEBUG(D_NET, "txd %p, arg %p status %d\n", txd, arg, status);
+
+        if (status == EP_SUCCESS) {
+                atomic_inc (&kqswnal_packets_transmitted);
+        } else {
+                CERROR ("kqswnal: Transmit failed with %d\n", status);
+                status = -EIO;
+        }
+
+        kqswnal_tx_done (ktx, status);
+}
+
+/* Hand a fully mapped descriptor to the elan transmitter.  Returns the
+ * ep_transmit_large() result, except that ENOMEM is absorbed: the
+ * descriptor is queued on kqn_delayedtxds for the scheduler and 0 is
+ * returned. */
+int
+kqswnal_launch (kqswnal_tx_t *ktx)
+{
+        /* Don't block for transmit descriptor if we're in interrupt context */
+        int           attr = in_interrupt() ? (EP_NO_SLEEP | EP_NO_ALLOC) : 0;
+        int           dest = kqswnal_nid2elanid (ktx->ktx_nid);
+        unsigned long flags;   /* spin_lock_irqsave() needs unsigned long */
+        int           rc;
+
+        LASSERT (dest >= 0);                    /* must be a peer */
+        rc = ep_transmit_large(kqswnal_data.kqn_eptx, dest,
+                               ktx->ktx_port, attr, kqswnal_txhandler,
+                               ktx, ktx->ktx_iov, ktx->ktx_niov);
+        if (rc == 0)
+                atomic_inc (&kqswnal_packets_launched);
+
+        if (rc != ENOMEM)
+                return (rc);
+
+        /* can't allocate ep txd => queue for later */
+
+        LASSERT (in_interrupt());      /* not called by thread (not looping) */
+
+        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+
+        list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_delayedtxds);
+        if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+                wake_up (&kqswnal_data.kqn_sched_waitq);
+
+        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+
+        return (0);
+}
+
+
+/* Name of the message type in 'hdr', for diagnostics.  The type field is
+ * converted with NTOH__u32() first, matching the switch in
+ * kqswnal_cerror_hdr(). */
+static char *
+hdr_type_string (ptl_hdr_t *hdr)
+{
+        switch (NTOH__u32(hdr->type)) {
+        case PTL_MSG_ACK:
+                return ("ACK");
+        case PTL_MSG_PUT:
+                return ("PUT");
+        case PTL_MSG_GET:
+                return ("GET");
+        case PTL_MSG_REPLY:
+                return ("REPLY");
+        default:
+                return ("<UNKNOWN>");
+        }
+}
+
+static void
+kqswnal_cerror_hdr(ptl_hdr_t * hdr)
+{
+        char *type_str = hdr_type_string (hdr);
+
+        CERROR("P3 Header at %p of type %s\n", hdr, type_str);
+        CERROR(" From nid/pid "LPU64"/%u", NTOH__u64(hdr->src_nid),
+               NTOH__u32(hdr->src_pid));
+        CERROR(" To nid/pid "LPU64"/%u\n", NTOH__u64(hdr->dest_nid),
+               NTOH__u32(hdr->dest_pid));
+
+        switch (NTOH__u32(hdr->type)) {
+        case PTL_MSG_PUT:
+                CERROR(" Ptl index %d, ack md "LPX64"."LPX64", "
+                       "match bits "LPX64"\n",
+                       NTOH__u32 (hdr->msg.put.ptl_index),
+                       hdr->msg.put.ack_wmd.wh_interface_cookie,
+                       hdr->msg.put.ack_wmd.wh_object_cookie,
+                       NTOH__u64 (hdr->msg.put.match_bits));
+                CERROR(" Length %d, offset %d, hdr data "LPX64"\n",
+                       NTOH__u32(PTL_HDR_LENGTH(hdr)),
+                       NTOH__u32(hdr->msg.put.offset),
+                       hdr->msg.put.hdr_data);
+                break;
+
+        case PTL_MSG_GET:
+                CERROR(" Ptl index %d, return md "LPX64"."LPX64", "
+                       "match bits "LPX64"\n",
+                       NTOH__u32
(hdr->msg.get.ptl_index), + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + CERROR(" Length %d, src offset %d\n", + NTOH__u32 (hdr->msg.get.sink_length), + NTOH__u32 (hdr->msg.get.src_offset)); + break; + + case PTL_MSG_ACK: + CERROR(" dst md "LPX64"."LPX64", manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + NTOH__u32 (hdr->msg.ack.mlength)); + break; + + case PTL_MSG_REPLY: + CERROR(" dst md "LPX64"."LPX64", length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + NTOH__u32 (PTL_HDR_LENGTH(hdr))); + } + +} /* end of print_hdr() */ + +static int +kqswnal_sendmsg (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + ptl_kiov_t *payload_kiov, + size_t payload_nob) +{ + kqswnal_tx_t *ktx; + int rc; + ptl_nid_t gatewaynid; +#if KQSW_CHECKSUM + int i; + kqsw_csum_t csum; + int sumnob; +#endif + + /* NB, the return code from this procedure is ignored. + * If we can't send, we must still complete with lib_finalize(). + * We'll have to wait for 3.2 to return an error event. + */ + + CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64 + " pid %u\n", payload_nob, payload_niov, nid, pid); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + /* It must be OK to kmap() if required */ + LASSERT (payload_kiov == NULL || !in_interrupt ()); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + if (payload_nob > KQSW_MAXPAYLOAD) { + CERROR ("request exceeds MTU size "LPSZ" (max %u).\n", + payload_nob, KQSW_MAXPAYLOAD); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + + if (kqswnal_nid2elanid (nid) < 0) { /* Can't send direct: find gateway? 
*/ + rc = kpr_lookup (&kqswnal_data.kqn_router, nid, &gatewaynid); + if (rc != 0) { + CERROR("Can't route to "LPX64": router error %d\n", + nid, rc); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + if (kqswnal_nid2elanid (gatewaynid) < 0) { + CERROR("Bad gateway "LPX64" for "LPX64"\n", + gatewaynid, nid); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + nid = gatewaynid; + } + + /* I may not block for a transmit descriptor if I might block the + * receiver, or an interrupt handler. */ + ktx = kqswnal_get_idle_tx(NULL, !(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt())); + if (ktx == NULL) { + kqswnal_cerror_hdr (hdr); + lib_finalize (&kqswnal_lib, private, cookie); + } + + memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */ + +#if KQSW_CHECKSUM + csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr)); + memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum)); + for (csum = 0, i = 0, sumnob = payload_nob; sumnob > 0; i++) { + if (payload_kiov != NULL) { + ptl_kiov_t *kiov = &payload_kiov[i]; + char *addr = ((char *)kmap (kiov->kiov_page)) + + kiov->kiov_offset; + + csum = kqsw_csum (csum, addr, MIN (sumnob, kiov->kiov_len)); + sumnob -= kiov->kiov_len; + } else { + struct iovec *iov = &payload_iov[i]; + + csum = kqsw_csum (csum, iov->iov_base, MIN (sumnob, kiov->iov_len)); + sumnob -= iov->iov_len; + } + } + memcpy(ktx->ktx_buffer +sizeof(*hdr) +sizeof(csum), &csum,sizeof(csum)); +#endif + + /* Set up first frag from pre-mapped buffer (it's at least the + * portals header) */ + ktx->ktx_iov[0].Base = ktx->ktx_ebuffer; + ktx->ktx_iov[0].Len = KQSW_HDR_SIZE; + ktx->ktx_niov = 1; + + if (payload_nob > 0) { /* got some payload (something more to do) */ + /* make a single contiguous message? 
*/ + if (payload_nob <= KQSW_TX_MAXCONTIG) { + /* copy payload to ktx_buffer, immediately after hdr */ + if (payload_kiov != NULL) + lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, + payload_niov, payload_kiov, payload_nob); + else + lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, + payload_niov, payload_iov, payload_nob); + /* first frag includes payload */ + ktx->ktx_iov[0].Len += payload_nob; + } else { + if (payload_kiov != NULL) + rc = kqswnal_map_tx_kiov (ktx, payload_nob, + payload_niov, payload_kiov); + else + rc = kqswnal_map_tx_iov (ktx, payload_nob, + payload_niov, payload_iov); + if (rc != 0) { + kqswnal_put_idle_tx (ktx); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + } + } + + ktx->ktx_port = (payload_nob <= KQSW_SMALLPAYLOAD) ? + EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE; + ktx->ktx_nid = nid; + ktx->ktx_state = KTX_SENDING; /* => lib_finalize() on completion */ + ktx->ktx_args[0] = private; + ktx->ktx_args[1] = cookie; + + rc = kqswnal_launch (ktx); + if (rc != 0) { /* failed? 
*/ + CERROR ("Failed to send packet to "LPX64": %d\n", nid, rc); + lib_finalize (&kqswnal_lib, private, cookie); + return (-1); + } + + CDEBUG(D_NET, "send to "LPSZ" bytes to "LPX64"\n", payload_nob, nid); + return (0); +} + +static int +kqswnal_send (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + size_t payload_nob) +{ + return (kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid, + payload_niov, payload_iov, NULL, payload_nob)); +} + +static int +kqswnal_send_pages (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + ptl_kiov_t *payload_kiov, + size_t payload_nob) +{ + return (kqswnal_sendmsg (nal, private, cookie, hdr, type, nid, pid, + payload_niov, NULL, payload_kiov, payload_nob)); +} + +int kqswnal_fwd_copy_contig = 0; + +void +kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + int rc; + kqswnal_tx_t *ktx; + struct iovec *iov = fwd->kprfd_iov; + int niov = fwd->kprfd_niov; + int nob = fwd->kprfd_nob; + ptl_nid_t nid = fwd->kprfd_gateway_nid; + +#if KQSW_CHECKSUM + CERROR ("checksums for forwarded packets not implemented\n"); + LBUG (); +#endif + /* The router wants this NAL to forward a packet */ + CDEBUG (D_NET, "forwarding [%p] to "LPX64", %d frags %d bytes\n", + fwd, nid, niov, nob); + + LASSERT (niov > 0); + + ktx = kqswnal_get_idle_tx (fwd, FALSE); + if (ktx == NULL) /* can't get txd right now */ + return; /* fwd will be scheduled when tx desc freed */ + + if (nid == kqswnal_lib.ni.nid) /* gateway is me */ + nid = fwd->kprfd_target_nid; /* target is final dest */ + + if (kqswnal_nid2elanid (nid) < 0) { + CERROR("Can't forward [%p] to "LPX64": not a peer\n", fwd, nid); + rc = -EHOSTUNREACH; + goto failed; + } + + if (nob > KQSW_NRXMSGBYTES_LARGE) { + CERROR ("Can't forward [%p] to "LPX64 + ": size %d bigger than max 
packet size %ld\n", + fwd, nid, nob, (long)KQSW_NRXMSGBYTES_LARGE); + rc = -EMSGSIZE; + goto failed; + } + + if ((kqswnal_fwd_copy_contig || niov > 1) && + nob <= KQSW_TX_BUFFER_SIZE) + { + /* send from ktx's pre-allocated/mapped contiguous buffer? */ + lib_copy_iov2buf (ktx->ktx_buffer, niov, iov, nob); + ktx->ktx_iov[0].Base = ktx->ktx_ebuffer; /* already mapped */ + ktx->ktx_iov[0].Len = nob; + ktx->ktx_niov = 1; + } + else + { + /* zero copy */ + ktx->ktx_niov = 0; /* no frags mapped yet */ + rc = kqswnal_map_tx_iov (ktx, nob, niov, iov); + if (rc != 0) + goto failed; + } + + ktx->ktx_port = (nob <= (sizeof (ptl_hdr_t) + KQSW_SMALLPAYLOAD)) ? + EP_SVC_LARGE_PORTALS_SMALL : EP_SVC_LARGE_PORTALS_LARGE; + ktx->ktx_nid = nid; + ktx->ktx_state = KTX_FORWARDING; /* kpr_put_packet() on completion */ + ktx->ktx_args[0] = fwd; + + rc = kqswnal_launch (ktx); + if (rc == 0) + return; + + failed: + LASSERT (rc != 0); + CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc); + + kqswnal_put_idle_tx (ktx); + /* complete now (with failure) */ + kpr_fwd_done (&kqswnal_data.kqn_router, fwd, rc); +} + +void +kqswnal_fwd_callback (void *arg, int error) +{ + kqswnal_rx_t *krx = (kqswnal_rx_t *)arg; + + /* The router has finished forwarding this packet */ + + if (error != 0) + { + ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]); + + CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n", + NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid),error); + } + + kqswnal_requeue_rx (krx); +} + +void +kqswnal_rx (kqswnal_rx_t *krx) +{ + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address (krx->krx_pages[0]); + ptl_nid_t dest_nid = NTOH__u64 (hdr->dest_nid); + int nob; + int niov; + + if (dest_nid == kqswnal_lib.ni.nid) { /* It's for me :) */ + /* NB krx requeued when lib_parse() calls back kqswnal_recv */ + lib_parse (&kqswnal_lib, hdr, krx); + return; + } + +#if KQSW_CHECKSUM + CERROR ("checksums for forwarded packets not implemented\n"); + LBUG (); +#endif + if 
(kqswnal_nid2elanid (dest_nid) >= 0) /* should have gone direct to peer */ + { + CERROR("dropping packet from "LPX64" for "LPX64 + ": target is peer\n", NTOH__u64(hdr->src_nid), dest_nid); + kqswnal_requeue_rx (krx); + return; + } + + /* NB forwarding may destroy iov; rebuild every time */ + for (nob = krx->krx_nob, niov = 0; nob > 0; nob -= PAGE_SIZE, niov++) + { + LASSERT (niov < krx->krx_npages); + krx->krx_iov[niov].iov_base= page_address(krx->krx_pages[niov]); + krx->krx_iov[niov].iov_len = MIN(PAGE_SIZE, nob); + } + + kpr_fwd_init (&krx->krx_fwd, dest_nid, + krx->krx_nob, niov, krx->krx_iov, + kqswnal_fwd_callback, krx); + + kpr_fwd_start (&kqswnal_data.kqn_router, &krx->krx_fwd); +} + +/* Receive Interrupt Handler: posts to schedulers */ +void +kqswnal_rxhandler(EP_RXD *rxd) +{ + long flags; + int nob = ep_rxd_len (rxd); + int status = ep_rxd_status (rxd); + kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg (rxd); + + CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n", + rxd, krx, nob, status); + + LASSERT (krx != NULL); + + krx->krx_rxd = rxd; + krx->krx_nob = nob; + + /* must receive a whole header to be able to parse */ + if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t)) + { + /* receives complete with failure when receiver is removed */ + if (kqswnal_data.kqn_shuttingdown) + return; + + CERROR("receive status failed with status %d nob %d\n", + ep_rxd_status(rxd), nob); + kqswnal_requeue_rx (krx); + return; + } + + atomic_inc (&kqswnal_packets_received); + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds); + if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) + wake_up (&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); +} + +#if KQSW_CHECKSUM +void +kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) +{ + ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_pages[0]); + + CERROR ("%s checksum mismatch %p: dnid 
"LPX64", snid "LPX64 + ", dpid %d, spid %d, type %d\n", + ishdr ? "Header" : "Payload", krx, + NTOH__u64(hdr->dest_nid), NTOH__u64(hdr->src_nid) + NTOH__u32(hdr->dest_pid), NTOH__u32(hdr->src_pid), + NTOH__u32(hdr->type)); + + switch (NTOH__u32 (hdr->type)) + { + case PTL_MSG_ACK: + CERROR("ACK: mlen %d dmd "LPX64"."LPX64" match "LPX64 + " len %u\n", + NTOH__u32(hdr->msg.ack.mlength), + hdr->msg.ack.dst_wmd.handle_cookie, + hdr->msg.ack.dst_wmd.handle_idx, + NTOH__u64(hdr->msg.ack.match_bits), + NTOH__u32(hdr->msg.ack.length)); + break; + case PTL_MSG_PUT: + CERROR("PUT: ptl %d amd "LPX64"."LPX64" match "LPX64 + " len %u off %u data "LPX64"\n", + NTOH__u32(hdr->msg.put.ptl_index), + hdr->msg.put.ack_wmd.handle_cookie, + hdr->msg.put.ack_wmd.handle_idx, + NTOH__u64(hdr->msg.put.match_bits), + NTOH__u32(hdr->msg.put.length), + NTOH__u32(hdr->msg.put.offset), + hdr->msg.put.hdr_data); + break; + case PTL_MSG_GET: + CERROR ("GET: <>\n"); + break; + case PTL_MSG_REPLY: + CERROR ("REPLY: <>\n"); + break; + default: + CERROR ("TYPE?: <>\n"); + } +} +#endif + +static int +kqswnal_recvmsg (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + ptl_kiov_t *kiov, + size_t mlen, + size_t rlen) +{ + kqswnal_rx_t *krx = (kqswnal_rx_t *)private; + int page; + char *page_ptr; + int page_nob; + char *iov_ptr; + int iov_nob; + int frag; +#if KQSW_CHECKSUM + kqsw_csum_t senders_csum; + kqsw_csum_t payload_csum = 0; + kqsw_csum_t hdr_csum = kqsw_csum(0, page_address(krx->krx_pages[0]), + sizeof(ptl_hdr_t)); + size_t csum_len = mlen; + int csum_frags = 0; + int csum_nob = 0; + static atomic_t csum_counter; + int csum_verbose = (atomic_read(&csum_counter)%1000001) == 0; + + atomic_inc (&csum_counter); + + memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) + + sizeof (ptl_hdr_t), sizeof (kqsw_csum_t)); + if (senders_csum != hdr_csum) + kqswnal_csum_error (krx, 1); +#endif + CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", 
mlen, rlen); + + /* What was actually received must be >= payload. + * This is an LASSERT, as lib_finalize() doesn't have a completion status. */ + LASSERT (krx->krx_nob >= KQSW_HDR_SIZE + mlen); + LASSERT (mlen <= rlen); + + /* It must be OK to kmap() if required */ + LASSERT (kiov == NULL || !in_interrupt ()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + if (mlen != 0) + { + page = 0; + page_ptr = ((char *) page_address(krx->krx_pages[0])) + + KQSW_HDR_SIZE; + page_nob = PAGE_SIZE - KQSW_HDR_SIZE; + + LASSERT (niov > 0); + if (kiov != NULL) { + iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + iov_nob = kiov->kiov_len; + } else { + iov_ptr = iov->iov_base; + iov_nob = iov->iov_len; + } + + for (;;) + { + /* We expect the iov to exactly match mlen */ + LASSERT (iov_nob <= mlen); + + frag = MIN (page_nob, iov_nob); + memcpy (iov_ptr, page_ptr, frag); +#if KQSW_CHECKSUM + payload_csum = kqsw_csum (payload_csum, iov_ptr, frag); + csum_nob += frag; + csum_frags++; +#endif + mlen -= frag; + if (mlen == 0) + break; + + page_nob -= frag; + if (page_nob != 0) + page_ptr += frag; + else + { + page++; + LASSERT (page < krx->krx_npages); + page_ptr = page_address(krx->krx_pages[page]); + page_nob = PAGE_SIZE; + } + + iov_nob -= frag; + if (iov_nob != 0) + iov_ptr += frag; + else if (kiov != NULL) { + kunmap (kiov->kiov_page); + kiov++; + niov--; + LASSERT (niov > 0); + iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + iov_nob = kiov->kiov_len; + } else { + iov++; + niov--; + LASSERT (niov > 0); + iov_ptr = iov->iov_base; + iov_nob = iov->iov_len; + } + } + + if (kiov != NULL) + kunmap (kiov->kiov_page); + } + +#if KQSW_CHECKSUM + memcpy (&senders_csum, ((char *)page_address (krx->krx_pages[0])) + + sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), sizeof(kqsw_csum_t)); + + if (csum_len != rlen) + CERROR("Unable to checksum data in user's buffer\n"); + else if (senders_csum != payload_csum) + kqswnal_csum_error 
(krx, 0); + + if (csum_verbose) + CERROR("hdr csum %lx, payload_csum %lx, csum_frags %d, " + "csum_nob %d\n", + hdr_csum, payload_csum, csum_frags, csum_nob); +#endif + lib_finalize(nal, private, cookie); + + kqswnal_requeue_rx (krx); + + return (rlen); +} + +/* cb_recv entry point: destination described by an iovec array */ +static int +kqswnal_recv(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + size_t mlen, + size_t rlen) +{ + return (kqswnal_recvmsg (nal, private, cookie, niov, iov, NULL, mlen, rlen)); +} + +/* cb_recv_pages entry point: destination described by a page (kiov) array */ +static int +kqswnal_recv_pages (nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + ptl_kiov_t *kiov, + size_t mlen, + size_t rlen) +{ + return (kqswnal_recvmsg (nal, private, cookie, niov, NULL, kiov, mlen, rlen)); +} + +/* Start a kernel thread running fn(arg) and count it in kqn_nthreads; returns 0 or the negative value from kernel_thread() */ +int +kqswnal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&kqswnal_data.kqn_nthreads); + return (0); +} + +/* Called by a thread as it exits: drops the kqn_nthreads count */ +void +kqswnal_thread_fini (void) +{ + atomic_dec (&kqswnal_data.kqn_nthreads); +} + +/* Scheduler thread body: until kqn_shuttingdown is set, services ready receives, delayed transmits and delayed forwards, dropping kqn_sched_lock around each piece of work */ +int +kqswnal_scheduler (void *arg) +{ + kqswnal_rx_t *krx; + kqswnal_tx_t *ktx; + kpr_fwd_desc_t *fwd; + unsigned long flags; /* spin_lock_irqsave() saves the IRQ state in an unsigned long */ + int rc; + int counter = 0; + int did_something; + + kportal_daemonize ("kqswnal_sched"); + kportal_blockallsigs (); + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + + while (!kqswnal_data.kqn_shuttingdown) + { + did_something = FALSE; + + if (!list_empty (&kqswnal_data.kqn_readyrxds)) + { + krx = list_entry(kqswnal_data.kqn_readyrxds.next, + kqswnal_rx_t, krx_list); + list_del (&krx->krx_list); + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); + + kqswnal_rx (krx); + + did_something = TRUE; + spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); + } + + if (!list_empty (&kqswnal_data.kqn_delayedtxds)) + { + ktx = list_entry(kqswnal_data.kqn_delayedtxds.next, + kqswnal_tx_t, ktx_list); + list_del (&ktx->ktx_list); + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); + + rc = kqswnal_launch 
(ktx); + if (rc != 0) /* failed: ktx_nid down? */ + { + CERROR("Failed delayed transmit to "LPX64 + ": %d\n", ktx->ktx_nid, rc); + kqswnal_tx_done (ktx, rc); + } + + did_something = TRUE; + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + } + + if (!list_empty (&kqswnal_data.kqn_delayedfwds)) + { + fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, kpr_fwd_desc_t, kprfd_list); + list_del (&fwd->kprfd_list); + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + + kqswnal_fwd_packet (NULL, fwd); + + did_something = TRUE; + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + } + + /* nothing to do or hogging CPU */ + if (!did_something || counter++ == KQSW_RESCHED) { + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); + + counter = 0; + + if (!did_something) { + rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq, + kqswnal_data.kqn_shuttingdown || + !list_empty(&kqswnal_data.kqn_readyrxds) || + !list_empty(&kqswnal_data.kqn_delayedtxds) || + !list_empty(&kqswnal_data.kqn_delayedfwds)); + LASSERT (rc == 0); + } else if (current->need_resched) + schedule (); + + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + } + } + + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + + kqswnal_thread_fini (); + return (0); +} + +nal_cb_t kqswnal_lib = +{ + nal_data: &kqswnal_data, /* NAL private data */ + cb_send: kqswnal_send, + cb_send_pages: kqswnal_send_pages, + cb_recv: kqswnal_recv, + cb_recv_pages: kqswnal_recv_pages, + cb_read: kqswnal_read, + cb_write: kqswnal_write, + cb_malloc: kqswnal_malloc, + cb_free: kqswnal_free, + cb_printf: kqswnal_printf, + cb_cli: kqswnal_cli, + cb_sti: kqswnal_sti, + cb_dist: kqswnal_dist +}; diff --git a/lustre/portals/knals/scimacnal/.cvsignore b/lustre/portals/knals/scimacnal/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lustre/portals/knals/scimacnal/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git 
a/lustre/portals/knals/scimacnal/Makefile.am b/lustre/portals/knals/scimacnal/Makefile.am new file mode 100644 index 0000000..6da31f0 --- /dev/null +++ b/lustre/portals/knals/scimacnal/Makefile.am @@ -0,0 +1,11 @@ +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = kscimacnal +modulenet_DATA = kscimacnal.o +EXTRA_PROGRAMS = kscimacnal + +DEFS = +kscimacnal_SOURCES = scimacnal.c scimacnal_cb.c scimacnal.h diff --git a/lustre/portals/knals/scimacnal/README.scimacnal b/lustre/portals/knals/scimacnal/README.scimacnal new file mode 100644 index 0000000..d4c6a49 --- /dev/null +++ b/lustre/portals/knals/scimacnal/README.scimacnal @@ -0,0 +1,14 @@ + +scimacnal - A NAL for the Scali ScaMAC midlayer. + +The ScaMAC midlayer is a simplified API to the SCI high performance +interconnect. + +In order to use this NAL you'll need to tune scimac to use larger buffers. +See scimac.conf in this directory for an example. + +Overall performance and stability aren't great, but this can be attributed +to the scimac driver, which apparently is in need of some development. + +TODO: +Routing isn't yet implemented. diff --git a/lustre/portals/knals/scimacnal/scimac.conf b/lustre/portals/knals/scimacnal/scimac.conf new file mode 100644 index 0000000..bfb6d02 --- /dev/null +++ b/lustre/portals/knals/scimacnal/scimac.conf @@ -0,0 +1,35 @@ +# Configuration file for the scimac driver - lustre friendly settings +# + +# The maximal number of message headers to use in the system. +scimac_max_no_hdrs = 32 + +# The maximal number of eager buffers to use in the system. +scimac_max_no_ebufs = 8 + +# The maximal size in bytes of each eager buffer. +scimac_max_ebuf_size = 65536 + +# Enable use of a kernel thread to defer reception of packets. +# Default is to use a tasklet (sw interrupt). +scimac_use_ulevel_recv = 1 + +# The maximal number of packets queued for transfer per path at any one time. 
+scimac_max_send_queuelen = 2000 + +# The packet retransmit time in milliseconds. +# The time elapsed since a packet was attempted sent until the packet is resent. +scimac_pkt_rexmit_time = 200 + +# The packet's maximal retransmit time in milliseconds. +# The total time that a packet will be attempted sent before it is dropped. +scimac_max_rexmit_time = 5000 + +# The lowest valid node identifier in the system. +scimac_min_nodeid_number = 0x100 + +# The largest valid node identifier in the system. +scimac_max_nodeid_number = 0xff00 + +# The incremental nodeid step in the system. +scimac_nodeid_increment = 0x100 diff --git a/lustre/portals/knals/scimacnal/scimacnal.c b/lustre/portals/knals/scimacnal/scimacnal.c new file mode 100644 index 0000000..1066d69 --- /dev/null +++ b/lustre/portals/knals/scimacnal/scimacnal.c @@ -0,0 +1,219 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8:cindent: + * + * Copyright (C) 2003 High Performance Computing Center North (HPC2N) + * Author: Niklas Edmundsson <nikke@hpc2n.umu.se> + + * Based on gmnal, which is based on ksocknal and qswnal + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + + +#include "scimacnal.h" + +ptl_handle_ni_t kscimacnal_ni; +nal_t kscimacnal_api; + +kscimacnal_data_t kscimacnal_data; + +kpr_nal_interface_t kscimacnal_router_interface = { + kprni_nalid: SCIMACNAL, + kprni_arg: NULL, + kprni_fwd: kscimacnal_fwd_packet, +}; + +/* API 'forward' hook: checks that nal is the single scimacnal instance, passes the request to lib_dispatch() and always returns PTL_OK */ +static int kscimacnal_forward(nal_t *nal, + int id, + void *args, size_t args_len, + void *ret, size_t ret_len) +{ + kscimacnal_data_t *ksci = nal->nal_data; + nal_cb_t *nal_cb = ksci->ksci_cb; + + LASSERT (nal == &kscimacnal_api); + LASSERT (ksci == &kscimacnal_data); + LASSERT (nal_cb == &kscimacnal_lib); + + lib_dispatch(nal_cb, ksci, id, args, ret); /* nal needs ksci */ + return PTL_OK; +} + +/* API 'lock' hook: delegates to the library's cb_cli callback, passing flags through */ +static void kscimacnal_lock(nal_t *nal, unsigned long *flags) +{ + kscimacnal_data_t *ksci = nal->nal_data; + nal_cb_t *nal_cb = ksci->ksci_cb; + + + LASSERT (nal == &kscimacnal_api); + LASSERT (ksci == &kscimacnal_data); + LASSERT (nal_cb == &kscimacnal_lib); + + nal_cb->cb_cli(nal_cb,flags); +} + +/* API 'unlock' hook: delegates to the library's cb_sti callback, passing flags through */ +static void kscimacnal_unlock(nal_t *nal, unsigned long *flags) +{ + kscimacnal_data_t *ksci = nal->nal_data; + nal_cb_t *nal_cb = ksci->ksci_cb; + + + LASSERT (nal == &kscimacnal_api); + LASSERT (ksci == &kscimacnal_data); + LASSERT (nal_cb == &kscimacnal_lib); + + nal_cb->cb_sti(nal_cb,flags); +} + +/* API 'shutdown' hook: nothing to tear down here; only checks nal and returns 0 */ +static int kscimacnal_shutdown(nal_t *nal, int ni) +{ + LASSERT (nal == &kscimacnal_api); + return 0; +} + +/* API 'yield' hook: calls schedule() when current->need_resched is set */ +static void kscimacnal_yield( nal_t *nal ) +{ + LASSERT (nal == &kscimacnal_api); + + if (current->need_resched) + schedule(); + return; +} + +/* NAL init routine handed to PtlNIInit(): calls lib_init() with the nid stored in kscimacnal_data and a fixed nnids of 512, then returns the API object. The interface and requested_pid arguments are not used. */ +static nal_t *kscimacnal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + int nnids = 512; /* FIXME: Need ScaMac function to get #nodes */ + + CDEBUG(D_NET, "calling lib_init with nid 0x%Lx nnids %d\n", kscimacnal_data.ksci_nid, nnids); + lib_init(&kscimacnal_lib, kscimacnal_data.ksci_nid, 0, nnids,ptl_size, ac_size); + return &kscimacnal_api; +} + + +/* Called by kernel at module unload time */ 
+static void __exit +kscimacnal_finalize(void) +{ + /* FIXME: How should the shutdown procedure really look? */ + kscimacnal_data.ksci_shuttingdown=1; + + PORTAL_SYMBOL_UNREGISTER(kscimacnal_ni); + + PtlNIFini(kscimacnal_ni); + lib_fini(&kscimacnal_lib); + + mac_finish(kscimacnal_data.ksci_machandle); + + CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory)); + + return; +} + + +/* Called by kernel at module insertion time */ +static int __init +kscimacnal_initialize(void) +{ + int rc; + unsigned long nid=0; + mac_handle_t *machandle = NULL; + + + CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory)); + + kscimacnal_api.forward = kscimacnal_forward; + kscimacnal_api.shutdown = kscimacnal_shutdown; + kscimacnal_api.yield = kscimacnal_yield; + kscimacnal_api.validate = NULL; /* our api validate is a NOOP */ + kscimacnal_api.lock= kscimacnal_lock; + kscimacnal_api.unlock= kscimacnal_unlock; + kscimacnal_api.nal_data = &kscimacnal_data; + + kscimacnal_lib.nal_data = &kscimacnal_data; + + memset(&kscimacnal_data, 0, sizeof(kscimacnal_data)); + + kscimacnal_data.ksci_cb = &kscimacnal_lib; + + /* We're not using this, but cli/sti callbacks does... ??? 
*/ + spin_lock_init(&kscimacnal_data.ksci_dispatch_lock); + + /* FIXME: We only support one adapter for now */ + machandle = mac_init(0, MAC_SAPID_LUSTRE, kscimacnal_rx, + &kscimacnal_data); + + if(!machandle) { + CERROR("mac_init() failed\n"); + return -1; + } + + kscimacnal_data.ksci_machandle = machandle; + + /* Make sure the scimac MTU is tuned */ + if(mac_get_mtusize(machandle) < SCIMACNAL_MTU) { + CERROR("scimac mtu of %ld smaller than SCIMACNAL MTU of %d\n", + mac_get_mtusize(machandle), SCIMACNAL_MTU); + CERROR("Consult README.scimacnal for more information\n"); + mac_finish(machandle); + return -1; + } + + /* Get the node ID */ + /* mac_get_physaddrlen() is a function instead of define, sigh */ + LASSERT(mac_get_physaddrlen(machandle) <= sizeof(nid)); + if(mac_get_physaddr(machandle, (mac_physaddr_t *) &nid)) { + CERROR("mac_get_physaddr() failed\n"); + mac_finish(machandle); + return -1; + } + nid = ntohl(nid); + kscimacnal_data.ksci_nid = nid; + + + /* Initialize Network Interface */ + /* FIXME: What do the magic numbers mean? Documentation anyone? 
*/ + rc = PtlNIInit(kscimacnal_init, 32, 4, 0, &kscimacnal_ni); + if (rc) { + CERROR("PtlNIInit failed %d\n", rc); + mac_finish(machandle); + return (-ENOMEM); + } + + PORTAL_SYMBOL_REGISTER(kscimacnal_ni); + + /* We're done now, it's OK for the RX callback to do stuff */ + kscimacnal_data.ksci_init = 1; + + return 0; +} + + +MODULE_AUTHOR("Niklas Edmundsson <nikke@hpc2n.umu.se>"); +MODULE_DESCRIPTION("Kernel Scali ScaMAC SCI NAL v0.0"); +MODULE_LICENSE("GPL"); + +module_init (kscimacnal_initialize); +module_exit (kscimacnal_finalize); + +EXPORT_SYMBOL(kscimacnal_ni); diff --git a/lustre/portals/knals/scimacnal/scimacnal.h b/lustre/portals/knals/scimacnal/scimacnal.h new file mode 100644 index 0000000..1ff180e --- /dev/null +++ b/lustre/portals/knals/scimacnal/scimacnal.h @@ -0,0 +1,85 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8:cindent: + * + * Copyright (C) 2003 High Performance Computing Center North (HPC2N) + * Author: Niklas Edmundsson <nikke@hpc2n.umu.se> + */ + + +#ifndef _SCIMACNAL_H +#define _SCIMACNAL_H + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/locks.h> +#include <linux/unistd.h> +#include <linux/init.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/stat.h> +#include <linux/list.h> +#include <asm/uaccess.h> +#include <asm/segment.h> +#include <asm/page.h> /* For PAGE_SIZE */ + +#define DEBUG_SUBSYSTEM S_UNDEFINED + +#include <linux/kp30.h> +#include <portals/p30.h> +#include <portals/lib-p30.h> + +#include <scamac.h> + +#ifndef MAC_SAPID_LUSTRE +#define MAC_SAPID_LUSTRE MAC_SAPID_TEST1 +#endif /* MAC_SAPID_LUSTRE */ + +#define SCIMACNAL_MTU 65536 +/* FIXME: What is really the MTU of lustre? 
*/ +#if PTL_MD_MAX_IOV*PAGE_SIZE > SCIMACNAL_MTU +#error Max MTU of ScaMAC is 64k, PTL_MD_MAX_IOV*PAGE_SIZE is bigger. +#endif + +typedef struct { + mac_handle_t *handle; + mac_mblk_t *msg; + mac_msg_type_t type; + void *userdata; +} kscimacnal_rx_t; + + +typedef struct { + nal_cb_t *ktx_nal; + void *ktx_private; + lib_msg_t *ktx_cookie; + ptl_hdr_t ktx_hdr; +} kscimacnal_tx_t; + + +typedef struct { + char ksci_init; + char ksci_shuttingdown; + ptl_nid_t ksci_nid; + nal_cb_t *ksci_cb; + spinlock_t ksci_dispatch_lock; + mac_handle_t *ksci_machandle; +} kscimacnal_data_t; + +extern kscimacnal_data_t kscimacnal_data; +extern nal_t kscimacnal_api; +extern nal_cb_t kscimacnal_lib; + +void kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +void kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type, void *userdata); + + +#endif /* _SCIMACNAL_H */ diff --git a/lustre/portals/knals/scimacnal/scimacnal_cb.c b/lustre/portals/knals/scimacnal/scimacnal_cb.c new file mode 100644 index 0000000..7e4a2e8 --- /dev/null +++ b/lustre/portals/knals/scimacnal/scimacnal_cb.c @@ -0,0 +1,468 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8:cindent: + * + * Copyright (C) 2003 High Performance Computing Center North (HPC2N) + * Author: Niklas Edmundsson <nikke@hpc2n.umu.se> + + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "scimacnal.h" + +static int +kscimacnal_read (nal_cb_t *nal, void *private, + void *dst_addr, user_ptr src_addr, size_t len) +{ + CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + + +static int +kscimacnal_write(nal_cb_t *nal, void *private, + user_ptr dst_addr, void *src_addr, size_t len) +{ + CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr ); + memcpy( dst_addr, src_addr, len ); + return 0; +} + + +static void * +kscimacnal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + return buf; +} + + +static void +kscimacnal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + + +static void +kscimacnal_printf(nal_cb_t *nal, const char *fmt, ...) 
+{ + va_list ap; + char msg[256]; + + if (portal_debug & D_NET) { + va_start( ap, fmt ); + vsnprintf( msg, sizeof(msg), fmt, ap ); + va_end( ap ); + + printk("CPUId: %d %s",smp_processor_id(), msg); + } +} + + +static void +kscimacnal_cli(nal_cb_t *nal, unsigned long *flags) +{ + kscimacnal_data_t *data= nal->nal_data; + + spin_lock_irqsave(&data->ksci_dispatch_lock,*flags); +} + + +static void +kscimacnal_sti(nal_cb_t *nal, unsigned long *flags) +{ + kscimacnal_data_t *data= nal->nal_data; + + spin_unlock_irqrestore(&data->ksci_dispatch_lock,*flags); +} + + +static int +kscimacnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* FIXME: Network distance has a meaning, but is there no easy + * way to figure it out (depends on routing) */ + + if ( nal->ni.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + + +static +char * get_mac_error(mac_status_t status) +{ + switch(status) { + case MAC_MSG_STAT_OK: + return "MAC_MSG_STAT_OK"; + case MAC_MSG_STAT_FREED: + return "MAC_MSG_STAT_FREED"; + case MAC_MSG_STAT_ABORTED: + return "MAC_MSG_STAT_ABORTED"; + case MAC_MSG_STAT_TIMEDOUT: + return "MAC_MSG_STAT_TIMEDOUT"; + case MAC_MSG_STAT_NODEUNREACH: + return "MAC_MSG_STAT_NODEUNREACH"; + case MAC_MSG_STAT_NETDOWN: + return "MAC_MSG_STAT_NETDOWN"; + case MAC_MSG_STAT_RESET: + return "MAC_MSG_STAT_RESET"; + case MAC_MSG_STAT_INITFAILED: + return "MAC_MSG_STAT_INITFAILED"; + case MAC_MSG_STAT_SYNCFAILED: + return "MAC_MSG_STAT_SYNCFAILED"; + case MAC_MSG_STAT_BADPROTO: + return "MAC_MSG_STAT_BADPROTO"; + case MAC_MSG_STAT_NOBUFSPACE: + return "MAC_MSG_STAT_NOBUFSPACE"; + case MAC_MSG_STAT_CONGESTION: + return "MAC_MSG_STAT_CONGESTION"; + case MAC_MSG_STAT_OTHER: + return "MAC_MSG_STAT_OTHER"; + default: + return "Unknown error"; + } +} + + +/* FIXME add routing code here ? */ + +/* Called by ScaMac when transmission is complete (ie. 
message is released) */ +static void +kscimacnal_txrelease(mac_mblk_t *msg, mac_msg_status_t status, void *context) +{ + kscimacnal_tx_t *ktx = (kscimacnal_tx_t *)context; + int err=0; + + LASSERT (ktx != NULL); + + /* Euh, there is no feedback when transmission fails?! */ + switch(status) { + case MAC_MSG_STAT_OK: /* normal */ + break; + default: + CERROR("%s (%d):\n", get_mac_error(status), status); + err = -EIO; + break; + } + + lib_finalize(ktx->ktx_nal, ktx->ktx_private, ktx->ktx_cookie); + + PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); +} + + +/* Called by portals when it wants to send a message. + * Since ScaMAC has it's own TX thread we don't bother setting up our own. */ +static int +kscimacnal_send(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + size_t payload_len) +{ + kscimacnal_tx_t *ktx=NULL; + kscimacnal_data_t *ksci = nal->nal_data; + int rc=0; + int buf_len = sizeof(ptl_hdr_t) + payload_len; + mac_mblk_t *msg=NULL, *lastblk, *newblk; + unsigned long physaddr; + + + CDEBUG(D_NET, "sending %d bytes from %p to nid 0x%Lx niov: %d\n", + payload_len, payload_iov, nid, payload_niov); + + LASSERT(ksci != NULL); + + LASSERT(hdr != NULL); + + /* Do real check if we can send this */ + if (buf_len > mac_get_mtusize(ksci->ksci_machandle)) { + CERROR("kscimacnal:request exceeds TX MTU size (%ld).\n", + mac_get_mtusize(ksci->ksci_machandle)); + return -EINVAL; + } + + + /* save transaction info for later finalize and cleanup */ + PORTAL_ALLOC(ktx, (sizeof(kscimacnal_tx_t))); + if (!ktx) { + return -ENOMEM; + } + + /* *SIGH* hdr is a stack variable in the calling function, so we + * need to copy it to a buffer. Zerocopy magic (or is it just + * deferred memcpy?) is annoying sometimes. 
*/ + memcpy(&ktx->ktx_hdr, hdr, sizeof(ptl_hdr_t)); + + /* First, put the header in the main message mblk */ + msg = mac_alloc_mblk(&ktx->ktx_hdr, sizeof(ptl_hdr_t), + kscimacnal_txrelease, ktx); + if (!msg) { + PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); + return -ENOMEM; + } + mac_put_mblk(msg, sizeof(ptl_hdr_t)); + lastblk=msg; + + /* Allocate additional mblks for each iov as needed. + * Essentially lib_copy_iov2buf with a twist or two */ + while (payload_len > 0) + { + ptl_size_t nob; + + LASSERT (payload_niov > 0); + + nob = MIN (payload_iov->iov_len, payload_len); + + /* We don't need a callback on the additional mblks, since + * all release callbacks seems to be called when the entire + * message has been sent */ + newblk=mac_alloc_mblk(payload_iov->iov_base, nob, NULL, NULL); + if(!newblk) { + mac_free_msg(msg); + PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); + return -ENOMEM; + } + mac_put_mblk(newblk, nob); + mac_link_mblk(lastblk, newblk); + lastblk=newblk; + + payload_len -= nob; + payload_niov--; + payload_iov++; + } + + ktx->ktx_nal = nal; + ktx->ktx_private = private; + ktx->ktx_cookie = cookie; + + CDEBUG(D_NET, "mac_send %d bytes to nid: 0x%Lx\n", buf_len, nid); + + physaddr = htonl(nid); + + if((rc=mac_send(ksci->ksci_machandle, msg, + (mac_physaddr_t *) &physaddr))) { + CERROR("kscimacnal: mac_send() failed, rc=%d\n", rc); + mac_free_msg(msg); + PORTAL_FREE(ktx, (sizeof(kscimacnal_tx_t))); + return rc; + } + + return 0; +} + + +void +kscimacnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + CERROR ("forwarding not implemented\n"); +} + + +/* Process a received portals packet */ +/* Called by the ScaMac RX thread when a packet is received */ +void +kscimacnal_rx(mac_handle_t *handle, mac_mblk_t *msg, mac_msg_type_t type, + void *userdata) +{ + ptl_hdr_t *hdr = NULL; + kscimacnal_rx_t krx; + mac_size_t size; + kscimacnal_data_t *ksci = userdata; + + LASSERT(ksci != NULL); + + if ( !ksci->ksci_init || ksci->ksci_shuttingdown || + type == 
MAC_MSG_TYPE_CTRL || type == MAC_MSG_TYPE_OTHER ) { + /* We're not interested in messages not for us, ignore */ + mac_free_msg(msg); + return; + } + + size = mac_msg_size(msg); + + CDEBUG(D_NET,"msg %p type %d, size %ld bytes (%ld mblks)\n", + msg, type, size, mac_msg_mblks(msg)); + + if( size < sizeof( ptl_hdr_t ) ) { + /* XXX what's this for? */ + if (ksci->ksci_shuttingdown) + return; + CERROR("kscimacnal: did not receive complete portal header," + "size= %ld\n", size); + /* Free the message before exiting */ + mac_free_msg(msg); + return; + } + + /* Provide everything we know */ + krx.handle = handle; + krx.msg = msg; + krx.type = type; + krx.userdata = userdata; + + /* mac_msg_next returns the next mblk with unread data */ + hdr = mac_get_mblk(mac_msg_next(msg), sizeof(ptl_hdr_t) ); + + if(!hdr) { + CERROR("kscimacnal: no data block in message %p\n", msg); + mac_free_msg(msg); + return; + } + + if ( hdr->dest_nid == kscimacnal_lib.ni.nid ) { + PROF_START(lib_parse); + /* sets wanted_len, iovs etc and calls our callback */ + lib_parse(&kscimacnal_lib, hdr, &krx); + PROF_FINISH(lib_parse); +#if 0 /* FIXME: Is it possible to detect this? 
*/ + } else if (kgmnal_ispeer(hdr->dest_nid)) { + /* should have gone direct to peer */ + CERROR("dropping packet from 0x%llx to 0x%llx:" + "target is a peer\n", + hdr->src_nid, hdr->dest_nid); + kgmnal_requeue_rx(&krx); +#endif /* if 0 FIXME */ + } else { + /* forward to gateway */ + CERROR("forwarding not implemented, mynid=0x%llx dest=0x%llx\n", + kscimacnal_lib.ni.nid, hdr->dest_nid); + } + + mac_free_msg(msg); + + CDEBUG(D_NET, "msg %p: Done\n", msg); +} + + +/* Called by portals to process a recieved packet */ +static int kscimacnal_recv(nal_cb_t *nal, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + size_t mlen, + size_t rlen) +{ + kscimacnal_rx_t *krx = private; + mac_mblk_t *mblk; + void *src; + mac_size_t pkt_len; + ptl_size_t iovused=0; + + LASSERT (krx != NULL); + LASSERT (krx->msg != NULL); + + CDEBUG(D_NET,"msg %p: mlen=%d, rlen=%d, niov=%d\n", + krx->msg, mlen, rlen, niov); + + /* What was actually received must be >= what sender claims to have + * sent. This is an LASSERT, since lib-move doesn't check cb return + * code yet. Also, rlen seems to be negative when mlen==0 so don't + * assert on that. + */ + LASSERT (mlen==0 || mac_msg_size(krx->msg) >= sizeof(ptl_hdr_t)+rlen); + LASSERT (mlen==0 || mlen <= rlen); + + PROF_START(memcpy); + + /* mac_msg_next returns next mblk with unread data (ie. 
can + * be same mblk */ + while (mlen != 0 && (mblk = mac_msg_next(krx->msg))) { + pkt_len = mac_mblk_len(mblk); + src = mac_get_mblk(mblk, pkt_len); /* Next unread block */ + + CDEBUG(D_NET,"msg %p: mblk: %p pkt_len: %ld src: %p\n", + krx->msg, mblk, pkt_len, src); + + LASSERT(src != NULL); + + /* Essentially lib_copy_buf2iov but with continuation support, + * we "gracefully" thrash the argument vars ;) */ + while (pkt_len > 0) { + ptl_size_t nob; + + LASSERT (niov > 0); + + LASSERT(iovused < iov->iov_len); + + nob = MIN (iov->iov_len-iovused, pkt_len); + CDEBUG(D_NET, "iovbase: %p iovlen: %d src: %p nob: %d " + "iovused: %d\n", + iov->iov_base, iov->iov_len, + src, nob, iovused); + + memcpy (iov->iov_base+iovused, src, nob); + pkt_len -= nob; + src += nob; + + if(nob+iovused < iov->iov_len) { + /* We didn't use all of the iov */ + iovused+=nob; + } + else { + niov--; + iov++; + iovused=0; + } + } + } + PROF_FINISH(memcpy); + + CDEBUG(D_NET, "Calling lib_finalize.\n"); + + PROF_START(lib_finalize); + lib_finalize(nal, private, cookie); + PROF_FINISH(lib_finalize); + + CDEBUG(D_NET, "Done.\n"); + + return rlen; +} + + +nal_cb_t kscimacnal_lib = { + nal_data: &kscimacnal_data, /* NAL private data */ + cb_send: kscimacnal_send, + cb_send_pages: NULL, /* Ignore for now */ + cb_recv: kscimacnal_recv, + cb_recv_pages: NULL, + cb_read: kscimacnal_read, + cb_write: kscimacnal_write, + cb_malloc: kscimacnal_malloc, + cb_free: kscimacnal_free, + cb_printf: kscimacnal_printf, + cb_cli: kscimacnal_cli, + cb_sti: kscimacnal_sti, + cb_dist: kscimacnal_dist +}; diff --git a/lustre/portals/knals/socknal/.cvsignore b/lustre/portals/knals/socknal/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lustre/portals/knals/socknal/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lustre/portals/knals/socknal/Makefile.am b/lustre/portals/knals/socknal/Makefile.am new file mode 100644 index 0000000..437d7fc --- /dev/null +++ 
b/lustre/portals/knals/socknal/Makefile.am @@ -0,0 +1,13 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = ksocknal +modulenet_DATA = ksocknal.o +EXTRA_PROGRAMS = ksocknal + +DEFS = +ksocknal_SOURCES = socknal.c socknal_cb.c socknal.h diff --git a/lustre/portals/knals/socknal/Makefile.mk b/lustre/portals/knals/socknal/Makefile.mk new file mode 100644 index 0000000..46edf01 --- /dev/null +++ b/lustre/portals/knals/socknal/Makefile.mk @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../../Kernelenv + +obj-y += ksocknal.o +ksocknal-objs := socknal.o socknal_cb.o + diff --git a/lustre/portals/knals/socknal/socknal.c b/lustre/portals/knals/socknal/socknal.c new file mode 100644 index 0000000..91d971c --- /dev/null +++ b/lustre/portals/knals/socknal/socknal.c @@ -0,0 +1,860 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown <zab@zabbo.net> + * Author: Peter J. Braam <braam@clusterfs.com> + * Author: Phil Schwan <phil@clusterfs.com> + * Author: Eric Barton <eric@bartonsoftware.com> + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socknal.h" + +ptl_handle_ni_t ksocknal_ni; +static nal_t ksocknal_api; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +ksock_nal_data_t ksocknal_data; +#else +static ksock_nal_data_t ksocknal_data; +#endif + +kpr_nal_interface_t ksocknal_router_interface = { + kprni_nalid: SOCKNAL, + kprni_arg: &ksocknal_data, + kprni_fwd: ksocknal_fwd_packet, +}; + + +int +ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len, + void *ret, size_t ret_len) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + + lib_dispatch(nal_cb, k, id, args, ret); /* ksocknal_send needs k */ + return PTL_OK; +} + +int +ksocknal_api_shutdown(nal_t *nal, int ni) +{ + CDEBUG (D_NET, "closing all connections\n"); + + return ksocknal_close_sock(0); /* close all sockets */ +} + +void +ksocknal_api_yield(nal_t *nal) +{ + our_cond_resched(); + return; +} + +void +ksocknal_api_lock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_cli(nal_cb,flags); +} + +void +ksocknal_api_unlock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_sti(nal_cb,flags); +} + +nal_t * +ksocknal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", (ptl_nid_t)0); + lib_init(&ksocknal_lib, (ptl_nid_t)0, 0, 10, ptl_size, ac_size); + return (&ksocknal_api); +} + +/* + * EXTRA functions follow + */ + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define SOCKET_I(inode) (&(inode)->u.socket_i) +#endif +static __inline__ struct socket * +socki_lookup(struct inode *inode) +{ + 
return SOCKET_I(inode); +} + +int +ksocknal_set_mynid(ptl_nid_t nid) +{ + lib_ni_t *ni = &ksocknal_lib.ni; + + /* FIXME: we have to do this because we call lib_init() at module + * insertion time, which is before we have 'mynid' available. lib_init + * sets the NAL's nid, which it uses to tell other nodes where packets + * are coming from. This is not a very graceful solution to this + * problem. */ + + CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", + nid, ni->nid); + + ni->nid = nid; + return (0); +} + +void +ksocknal_bind_irq (unsigned int irq, int cpu) +{ +#if (defined(CONFIG_SMP) && CPU_AFFINITY) + char cmdline[64]; + char *argv[] = {"/bin/sh", + "-c", + cmdline, + NULL}; + char *envp[] = {"HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + + snprintf (cmdline, sizeof (cmdline), + "echo %d > /proc/irq/%u/smp_affinity", 1 << cpu, irq); + + printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n", + irq, cpu, cmdline); + + /* FIXME: Find a better method of setting IRQ affinity... 
+ */ + + call_usermodehelper (argv[0], argv, envp); +#endif +} + +int +ksocknal_add_sock (ptl_nid_t nid, int fd, int bind_irq) +{ + unsigned long flags; + ksock_conn_t *conn; + struct file *file = NULL; + struct socket *sock = NULL; + ksock_sched_t *sched = NULL; + unsigned int irq = 0; + struct net_device *dev = NULL; + int ret; + int idx; + ENTRY; + + LASSERT (!in_interrupt()); + + file = fget(fd); + if (file == NULL) + RETURN(-EINVAL); + + ret = -EINVAL; + sock = socki_lookup(file->f_dentry->d_inode); + if (sock == NULL) + GOTO(error, ret); + + ret = -ENOMEM; + PORTAL_ALLOC(conn, sizeof(*conn)); + if (!conn) + GOTO(error, ret); + + sock->sk->allocation = GFP_NOFS; /* don't call info fs for alloc */ + + conn->ksnc_file = file; + conn->ksnc_sock = sock; + conn->ksnc_saved_data_ready = sock->sk->data_ready; + conn->ksnc_saved_write_space = sock->sk->write_space; + conn->ksnc_peernid = nid; + atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for socklist */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + ksocknal_new_packet (conn, 0); + + INIT_LIST_HEAD (&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + +#warning check it is OK to derefence sk->dst_cache->dev like this... 
+ lock_sock (conn->ksnc_sock->sk); + + if (conn->ksnc_sock->sk->dst_cache != NULL) { + dev = conn->ksnc_sock->sk->dst_cache->dev; + if (dev != NULL) { + irq = dev->irq; + if (irq >= NR_IRQS) { + CERROR ("Unexpected IRQ %x\n", irq); + irq = 0; + } + } + } + + release_sock (conn->ksnc_sock->sk); + + write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + + if (irq == 0 || + ksocknal_data.ksnd_irq_info[irq] == SOCKNAL_IRQ_UNASSIGNED) { + /* This is a software NIC, or we haven't associated it with + * a CPU yet */ + + /* Choose the CPU with the fewest connections */ + sched = ksocknal_data.ksnd_schedulers; + for (idx = 1; idx < SOCKNAL_N_SCHED; idx++) + if (sched->kss_nconns > + ksocknal_data.ksnd_schedulers[idx].kss_nconns) + sched = &ksocknal_data.ksnd_schedulers[idx]; + + if (irq != 0) { /* Hardware NIC */ + /* Remember which scheduler we chose */ + idx = sched - ksocknal_data.ksnd_schedulers; + + LASSERT (idx < SOCKNAL_IRQ_SCHED_MASK); + + if (bind_irq) /* remember if we will bind below */ + idx |= SOCKNAL_IRQ_BOUND; + + ksocknal_data.ksnd_irq_info[irq] = idx; + } + } else { + /* This is a hardware NIC, associated with a CPU */ + idx = ksocknal_data.ksnd_irq_info[irq]; + + /* Don't bind again if we've bound already */ + if ((idx & SOCKNAL_IRQ_BOUND) != 0) + bind_irq = 0; + + sched = &ksocknal_data.ksnd_schedulers[idx & SOCKNAL_IRQ_SCHED_MASK]; + } + + sched->kss_nconns++; + conn->ksnc_scheduler = sched; + + list_add(&conn->ksnc_list, &ksocknal_data.ksnd_socklist); + + write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags); + + if (bind_irq && /* irq binding required */ + irq != 0) /* hardware NIC */ + ksocknal_bind_irq (irq, sched - ksocknal_data.ksnd_schedulers); + + /* NOW it's safe to get called back when socket is ready... 
*/ + sock->sk->user_data = conn; + sock->sk->data_ready = ksocknal_data_ready; + sock->sk->write_space = ksocknal_write_space; + + /* ...which I call right now to get things going */ + ksocknal_data_ready (sock->sk, 0); + ksocknal_write_space (sock->sk); + + CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n", + conn, conn->ksnc_peernid); + + /* Can't unload while connection active */ + PORTAL_MODULE_USE; + RETURN(0); + +error: + fput(file); + return (ret); +} + +/* Passing in a zero nid will close all connections */ +int +ksocknal_close_sock(ptl_nid_t nid) +{ + long flags; + ksock_conn_t *conn; + LIST_HEAD (death_row); + struct list_head *tmp; + + LASSERT (!in_interrupt()); + write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + + if (nid == 0) { /* close ALL connections */ + /* insert 'death row' into the socket list... */ + list_add (&death_row, &ksocknal_data.ksnd_socklist); + /* ...extract and reinitialise the socket list itself... */ + list_del_init (&ksocknal_data.ksnd_socklist); + /* ...and voila, death row is the proud owner of all conns */ + } else list_for_each (tmp, &ksocknal_data.ksnd_socklist) { + + conn = list_entry (tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) { + list_del (&conn->ksnc_list); + list_add (&conn->ksnc_list, &death_row); + break; + } + } + + write_unlock_irqrestore (&ksocknal_data.ksnd_socklist_lock, flags); + + if (nid && list_empty (&death_row)) + return (-ENOENT); + + while (!list_empty (&death_row)) { + conn = list_entry (death_row.next, ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + /* NB I _have_ to restore the callback, rather than storing + * a noop, since the socket could survive past this module + * being unloaded!! */ + conn->ksnc_sock->sk->data_ready = conn->ksnc_saved_data_ready; + conn->ksnc_sock->sk->write_space = conn->ksnc_saved_write_space; + + /* OK; no more callbacks, but they could be in progress now, + * so wait for them to complete... 
*/ + write_lock_irqsave (&ksocknal_data.ksnd_socklist_lock, flags); + + /* ...however if I get the lock before a callback gets it, + * this will make them noop + */ + conn->ksnc_sock->sk->user_data = NULL; + + /* And drop the scheduler's connection count while I've got + * the exclusive lock */ + conn->ksnc_scheduler->kss_nconns--; + + write_unlock_irqrestore(&ksocknal_data.ksnd_socklist_lock, + flags); + + ksocknal_put_conn (conn); /* drop ref for ksnd_socklist */ + } + + return (0); +} + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +struct tcp_opt *sock2tcp_opt(struct sock *sk) +{ + return &(sk->tp_pinfo.af_tcp); +} +#else +struct tcp_opt *sock2tcp_opt(struct sock *sk) +{ + struct tcp_sock *s = (struct tcp_sock *)sk; + return &s->tcp; +} +#endif + +void +ksocknal_push_conn (ksock_conn_t *conn) +{ + struct sock *sk = conn->ksnc_sock->sk; + struct tcp_opt *tp = sock2tcp_opt(sk); + int nonagle; + int val = 1; + int rc; + mm_segment_t oldmm; + + lock_sock (sk); + nonagle = tp->nonagle; + tp->nonagle = 1; + release_sock (sk); + + oldmm = get_fs (); + set_fs (KERNEL_DS); + + rc = sk->prot->setsockopt (sk, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof (val)); + LASSERT (rc == 0); + + set_fs (oldmm); + + lock_sock (sk); + tp->nonagle = nonagle; + release_sock (sk); +} + +/* Passing in a zero nid pushes all connections */ +int +ksocknal_push_sock (ptl_nid_t nid) +{ + ksock_conn_t *conn; + struct list_head *tmp; + int index; + int i; + + if (nid != 0) { + conn = ksocknal_get_conn (nid); + + if (conn == NULL) + return (-ENOENT); + + ksocknal_push_conn (conn); + ksocknal_put_conn (conn); + + return (0); + } + + /* NB we can't remove connections from the socket list so we have to + * cope with them being removed from under us... 
+ */ + for (index = 0; ; index++) { + read_lock (&ksocknal_data.ksnd_socklist_lock); + + i = 0; + conn = NULL; + + list_for_each (tmp, &ksocknal_data.ksnd_socklist) { + if (i++ == index) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + atomic_inc (&conn->ksnc_refcount); // take a ref + break; + } + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + if (conn == NULL) + break; + + ksocknal_push_conn (conn); + ksocknal_put_conn (conn); + } + + return (0); +} + +ksock_conn_t * +ksocknal_get_conn (ptl_nid_t nid) +{ + struct list_head *tmp; + ksock_conn_t *conn; + + PROF_START(conn_list_walk); + + read_lock (&ksocknal_data.ksnd_socklist_lock); + + list_for_each(tmp, &ksocknal_data.ksnd_socklist) { + + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) { + /* caller is referencing */ + atomic_inc (&conn->ksnc_refcount); + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n", + conn, nid, atomic_read (&conn->ksnc_refcount)); + + PROF_FINISH(conn_list_walk); + return (conn); + } + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", + nid); + PROF_FINISH(conn_list_walk); + return (NULL); +} + +void +ksocknal_close_conn (ksock_conn_t *conn) +{ + CDEBUG (D_NET, "connection [%p] closed \n", conn); + + fput (conn->ksnc_file); + PORTAL_FREE (conn, sizeof (*conn)); + + /* One less connection keeping us hanging on */ + PORTAL_MODULE_UNUSE; +} + +void +_ksocknal_put_conn (ksock_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn); + + /* "But what is the black spot, captain?" I asked. + * "That's a summons, mate..." 
*/ + + LASSERT (atomic_read (&conn->ksnc_refcount) == 0); + LASSERT (conn->ksnc_sock->sk->data_ready != ksocknal_data_ready); + LASSERT (conn->ksnc_sock->sk->write_space != ksocknal_write_space); + LASSERT (conn->ksnc_sock->sk->user_data == NULL); + LASSERT (!conn->ksnc_rx_scheduled); + + if (!in_interrupt()) { + ksocknal_close_conn (conn); + return; + } + + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + + list_add (&conn->ksnc_list, &ksocknal_data.ksnd_reaper_list); + wake_up (&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); +} + +int +ksocknal_cmd(struct portal_ioctl_data * data, void * private) +{ + int rc = -EINVAL; + + LASSERT (data != NULL); + + switch(data->ioc_nal_cmd) { + case NAL_CMD_REGISTER_PEER_FD: { + rc = ksocknal_add_sock(data->ioc_nid, data->ioc_fd, + data->ioc_flags); + break; + } + case NAL_CMD_CLOSE_CONNECTION: { + rc = ksocknal_close_sock(data->ioc_nid); + break; + } + case NAL_CMD_REGISTER_MYNID: { + rc = ksocknal_set_mynid (data->ioc_nid); + break; + } + case NAL_CMD_PUSH_CONNECTION: { + rc = ksocknal_push_sock (data->ioc_nid); + break; + } + } + + return rc; +} + +void +ksocknal_free_buffers (void) +{ + if (ksocknal_data.ksnd_fmbs != NULL) { + ksock_fmb_t *fmb = (ksock_fmb_t *)ksocknal_data.ksnd_fmbs; + int i; + int j; + + for (i = 0; + i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); + i++, fmb++) + for (j = 0; j < fmb->fmb_npages; j++) + if (fmb->fmb_pages[j] != NULL) + __free_page (fmb->fmb_pages[j]); + + PORTAL_FREE (ksocknal_data.ksnd_fmbs, + sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS)); + } + + if (ksocknal_data.ksnd_ltxs != NULL) + PORTAL_FREE (ksocknal_data.ksnd_ltxs, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + + SOCKNAL_NNBLK_LTXS)); + + if (ksocknal_data.ksnd_schedulers != NULL) + PORTAL_FREE (ksocknal_data.ksnd_schedulers, + sizeof (ksock_sched_t) * SOCKNAL_N_SCHED); +} + +void __exit +ksocknal_module_fini (void) +{ + int 
i; + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + switch (ksocknal_data.ksnd_init) { + default: + LASSERT (0); + + case SOCKNAL_INIT_ALL: + kportal_nal_unregister(SOCKNAL); + PORTAL_SYMBOL_UNREGISTER (ksocknal_ni); + /* fall through */ + + case SOCKNAL_INIT_PTL: + PtlNIFini(ksocknal_ni); + lib_fini(&ksocknal_lib); + /* fall through */ + + case SOCKNAL_INIT_DATA: + /* Module refcount only gets to zero when all connections + * have been closed so all lists must be empty */ + LASSERT (list_empty (&ksocknal_data.ksnd_socklist)); + LASSERT (list_empty (&ksocknal_data.ksnd_reaper_list)); + LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns)); + LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns)); + + if (ksocknal_data.ksnd_schedulers != NULL) + for (i = 0; i < SOCKNAL_N_SCHED; i++) { + ksock_sched_t *kss = + &ksocknal_data.ksnd_schedulers[i]; + + LASSERT (list_empty (&kss->kss_tx_conns)); + LASSERT (list_empty (&kss->kss_rx_conns)); + LASSERT (kss->kss_nconns == 0); + } + + /* stop router calling me */ + kpr_shutdown (&ksocknal_data.ksnd_router); + + /* flag threads to terminate; wake and wait for them to die */ + ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all (&ksocknal_data.ksnd_reaper_waitq); + + for (i = 0; i < SOCKNAL_N_SCHED; i++) + wake_up_all(&ksocknal_data.ksnd_schedulers[i].kss_waitq); + + while (atomic_read (&ksocknal_data.ksnd_nthreads) != 0) { + CDEBUG (D_NET, "waitinf for %d threads to terminate\n", + atomic_read (&ksocknal_data.ksnd_nthreads)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + + kpr_deregister (&ksocknal_data.ksnd_router); + + ksocknal_free_buffers(); + /* fall through */ + + case SOCKNAL_INIT_NOTHING: + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); +} + + +int __init 
+ksocknal_module_init (void) +{ + int pkmem = atomic_read(&portal_kmemory); + int rc; + int i; + int j; + + /* packet descriptor must fit in a router descriptor's scratchpad */ + LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + + LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + + ksocknal_api.forward = ksocknal_api_forward; + ksocknal_api.shutdown = ksocknal_api_shutdown; + ksocknal_api.yield = ksocknal_api_yield; + ksocknal_api.validate = NULL; /* our api validate is a NOOP */ + ksocknal_api.lock = ksocknal_api_lock; + ksocknal_api.unlock = ksocknal_api_unlock; + ksocknal_api.nal_data = &ksocknal_data; + + ksocknal_lib.nal_data = &ksocknal_data; + + memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ + + INIT_LIST_HEAD(&ksocknal_data.ksnd_socklist); + rwlock_init(&ksocknal_data.ksnd_socklist_lock); + + ksocknal_data.ksnd_nal_cb = &ksocknal_lib; + spin_lock_init (&ksocknal_data.ksnd_nal_cb_lock); + + spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns); + + spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns); + + spin_lock_init(&ksocknal_data.ksnd_idle_ltx_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_nblk_ltx_list); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_ltx_list); + init_waitqueue_head(&ksocknal_data.ksnd_idle_ltx_waitq); + + spin_lock_init (&ksocknal_data.ksnd_reaper_lock); + INIT_LIST_HEAD (&ksocknal_data.ksnd_reaper_list); + init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); + + memset (&ksocknal_data.ksnd_irq_info, SOCKNAL_IRQ_UNASSIGNED, + sizeof (ksocknal_data.ksnd_irq_info)); + + /* flag lists/ptrs/locks initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + + PORTAL_ALLOC(ksocknal_data.ksnd_schedulers, + sizeof(ksock_sched_t) * 
SOCKNAL_N_SCHED); + if (ksocknal_data.ksnd_schedulers == NULL) + RETURN(-ENOMEM); + + for (i = 0; i < SOCKNAL_N_SCHED; i++) { + ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i]; + + spin_lock_init (&kss->kss_lock); + INIT_LIST_HEAD (&kss->kss_rx_conns); + INIT_LIST_HEAD (&kss->kss_tx_conns); +#if SOCKNAL_ZC + INIT_LIST_HEAD (&kss->kss_zctxdone_list); +#endif + init_waitqueue_head (&kss->kss_waitq); + } + + CERROR ("ltx "LPSZ", total "LPSZ"\n", sizeof (ksock_ltx_t), + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + + PORTAL_ALLOC(ksocknal_data.ksnd_ltxs, + sizeof(ksock_ltx_t) * (SOCKNAL_NLTXS +SOCKNAL_NNBLK_LTXS)); + if (ksocknal_data.ksnd_ltxs == NULL) { + ksocknal_module_fini (); + return (-ENOMEM); + } + + /* Deterministic bugs please */ + memset (ksocknal_data.ksnd_ltxs, 0xeb, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + + for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) { + ksock_ltx_t *ltx = &((ksock_ltx_t *)ksocknal_data.ksnd_ltxs)[i]; + + ltx->ltx_idle = i < SOCKNAL_NLTXS ? 
+ &ksocknal_data.ksnd_idle_ltx_list : + &ksocknal_data.ksnd_idle_nblk_ltx_list; + list_add (&ltx->ltx_tx.tx_list, ltx->ltx_idle); + } + + rc = PtlNIInit(ksocknal_init, 32, 4, 0, &ksocknal_ni); + if (rc != 0) { + CERROR("ksocknal: PtlNIInit failed: error %d\n", rc); + ksocknal_module_fini (); + RETURN (rc); + } + PtlNIDebug(ksocknal_ni, ~0); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_PTL; // flag PtlNIInit() called + + for (i = 0; i < SOCKNAL_N_SCHED; i++) { + rc = ksocknal_thread_start (ksocknal_scheduler, + &ksocknal_data.ksnd_schedulers[i]); + if (rc != 0) { + CERROR("Can't spawn socknal scheduler[%d]: %d\n", + i, rc); + ksocknal_module_fini (); + RETURN (rc); + } + } + + rc = ksocknal_thread_start (ksocknal_reaper, NULL); + if (rc != 0) { + CERROR("Can't spawn socknal reaper: %d\n", rc); + ksocknal_module_fini (); + RETURN (rc); + } + + rc = kpr_register(&ksocknal_data.ksnd_router, + &ksocknal_router_interface); + if (rc != 0) { + CDEBUG(D_NET, "Can't initialise routing interface " + "(rc = %d): not routing\n", rc); + } else { + /* Only allocate forwarding buffers if I'm on a gateway */ + + PORTAL_ALLOC(ksocknal_data.ksnd_fmbs, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS)); + if (ksocknal_data.ksnd_fmbs == NULL) { + ksocknal_module_fini (); + RETURN(-ENOMEM); + } + + /* NULL out buffer pointers etc */ + memset(ksocknal_data.ksnd_fmbs, 0, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS)); + + for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + + SOCKNAL_LARGE_FWD_NMSGS); i++) { + ksock_fmb_t *fmb = + &((ksock_fmb_t *)ksocknal_data.ksnd_fmbs)[i]; + + if (i < SOCKNAL_SMALL_FWD_NMSGS) { + fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES; + fmb->fmb_pool = &ksocknal_data.ksnd_small_fmp; + } else { + fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES; + fmb->fmb_pool = &ksocknal_data.ksnd_large_fmp; + } + + LASSERT (fmb->fmb_npages > 0); + for (j = 0; j < fmb->fmb_npages; j++) { + fmb->fmb_pages[j] = alloc_page(GFP_KERNEL); + + if 
(fmb->fmb_pages[j] == NULL) { + ksocknal_module_fini (); + return (-ENOMEM); + } + + LASSERT(page_address (fmb->fmb_pages[j]) != + NULL); + } + + list_add(&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + } + } + + rc = kportal_nal_register(SOCKNAL, &ksocknal_cmd, NULL); + if (rc != 0) { + CERROR ("Can't initialise command interface (rc = %d)\n", rc); + ksocknal_module_fini (); + return (rc); + } + + PORTAL_SYMBOL_REGISTER(ksocknal_ni); + + /* flag everything initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + + printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial " + "mem %d)\n", + kpr_routing (&ksocknal_data.ksnd_router) ? + "enabled" : "disabled", pkmem); + + return (0); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>"); +MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01"); +MODULE_LICENSE("GPL"); + +module_init(ksocknal_module_init); +module_exit(ksocknal_module_fini); + +EXPORT_SYMBOL (ksocknal_ni); diff --git a/lustre/portals/knals/socknal/socknal.h b/lustre/portals/knals/socknal/socknal.h new file mode 100644 index 0000000..86cdeb0 --- /dev/null +++ b/lustre/portals/knals/socknal/socknal.h @@ -0,0 +1,292 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown <zab@zabbo.net> + * Author: Peter J. Braam <braam@clusterfs.com> + * Author: Phil Schwan <phil@clusterfs.com> + * Author: Eric Barton <eric@bartonsoftware.com> + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_PORTAL_ALLOC +#define EXPORT_SYMTAB + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/smp_lock.h> +#include <linux/unistd.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <linux/uio.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/stat.h> +#include <linux/list.h> +#include <linux/kmod.h> +#include <asm/uaccess.h> +#include <asm/segment.h> + +#define DEBUG_SUBSYSTEM S_SOCKNAL + +#include <linux/kp30.h> +#include <portals/p30.h> +#include <portals/lib-p30.h> + +#define SOCKNAL_N_SCHED num_online_cpus() /* # socknal schedulers */ + +#if PTL_LARGE_MTU +# define SOCKNAL_MAX_FWD_PAYLOAD (256<<10) /* biggest payload I can forward */ +#else +# define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */ +#endif + +#define SOCKNAL_NLTXS 128 /* # normal transmit messages */ +#define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */ + +#define SOCKNAL_SMALL_FWD_NMSGS 128 /* # small messages I can be forwarding at any time */ +#define SOCKNAL_LARGE_FWD_NMSGS 64 /* # large messages I can be forwarding at any time */ + +#define SOCKNAL_SMALL_FWD_PAGES 1 /* # pages in a small message fwd buffer */ + +#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT) + /* # pages in a large message fwd buffer */ + +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ + +#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10) + +typedef struct /* pool of forwarding buffers */ +{ + 
spinlock_t fmp_lock; /* serialise */ + struct list_head fmp_idle_fmbs; /* buffers waiting for a connection */ + struct list_head fmp_blocked_conns; /* connections waiting for a buffer */ +} ksock_fmb_pool_t; + + +typedef struct /* per scheduler state */ +{ + spinlock_t kss_lock; /* serialise */ + struct list_head kss_rx_conns; /* conn waiting to be read */ + struct list_head kss_tx_conns; /* conn waiting to be written */ +#if SOCKNAL_ZC + struct list_head kss_zctxdone_list; /* completed ZC transmits */ +#endif + wait_queue_head_t kss_waitq; /* where scheduler sleeps */ + int kss_nconns; /* # connections assigned to this scheduler */ +} ksock_sched_t; + +typedef struct { + int ksnd_init; /* initialisation state */ + + struct list_head ksnd_socklist; /* all my connections */ + rwlock_t ksnd_socklist_lock; /* stabilise add/find/remove */ + + nal_cb_t *ksnd_nal_cb; + spinlock_t ksnd_nal_cb_lock; /* lib cli/sti lock */ + + atomic_t ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + ksock_sched_t *ksnd_schedulers; /* scheduler state */ + + kpr_router_t ksnd_router; /* THE router */ + + void *ksnd_fmbs; /* all the pre-allocated FMBs */ + ksock_fmb_pool_t ksnd_small_fmp; /* small message forwarding buffers */ + ksock_fmb_pool_t ksnd_large_fmp; /* large message forwarding buffers */ + + void *ksnd_ltxs; /* all the pre-allocated LTXs */ + spinlock_t ksnd_idle_ltx_lock; /* serialise ltx alloc/free */ + struct list_head ksnd_idle_ltx_list; /* where to get an idle LTX */ + struct list_head ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */ + wait_queue_head_t ksnd_idle_ltx_waitq; /* where to block for an idle LTX */ + + struct list_head ksnd_reaper_list; /* conn waiting to be reaped */ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + spinlock_t ksnd_reaper_lock; /* serialise */ + unsigned char ksnd_irq_info[NR_IRQS]; /* irq->scheduler lookup */ +} ksock_nal_data_t; + +#define SOCKNAL_INIT_NOTHING 0 
+#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_PTL 2 +#define SOCKNAL_INIT_ALL 3 + +#define SOCKNAL_IRQ_BOUND 0x80 /* flag we _did_ bind already */ +#define SOCKNAL_IRQ_SCHED_MASK 0x7f /* we assume < 127 CPUs */ +#define SOCKNAL_IRQ_UNASSIGNED 0xff /* flag unassigned */ + +/* A packet just assembled for transmission is represented by 1 or more + * struct iovec fragments and 0 or more ptl_kiov_t fragments. Forwarded + * messages, or messages from an MD with PTL_MD_KIOV _not_ set have 0 + * ptl_kiov_t fragments. Messages from an MD with PTL_MD_KIOV set, have 1 + * struct iovec fragment (the header) and up to PTL_MD_MAX_IOV ptl_kiov_t + * fragments. + * + * On the receive side, initially 1 struct iovec fragment is posted for + * receive (the header). Once the header has been received, if the message + * requires forwarding or will be received into mapped memory, up to + * PTL_MD_MAX_IOV struct iovec fragments describe the target memory. + * Otherwise up to PTL_MD_MAX_IOV ptl_kiov_t fragments are used. + */ + +typedef struct /* transmit packet */ +{ + struct list_head tx_list; /* queue on conn for transmission etc */ + char tx_isfwd; /* forwarding / sourced here */ + int tx_nob; /* # packet bytes */ + int tx_niov; /* # packet iovec frags */ + struct iovec *tx_iov; /* packet iovec frags */ + int tx_nkiov; /* # packet page frags */ + ptl_kiov_t *tx_kiov; /* packet page frags */ +#if SOCKNAL_ZC + ksock_sched_t *tx_sched; /* who to wake on callback */ + zccd_t tx_zccd; /* zero copy callback descriptor */ +#endif +} ksock_tx_t; + +#define KSOCK_ZCCD_2_TX(ptr) list_entry (ptr, ksock_tx_t, tx_zccd) +/* network zero copy callback descriptor embedded in ksock_tx_t */ + +/* space for the tx frag descriptors: hdr is always 1 iovec + * and payload is PTL_MD_MAX of either type. 
*/ +typedef struct +{ + struct iovec hdr; + union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + } payload; +} ksock_txiovspace_t; + +typedef struct /* locally transmitted packet */ +{ + ksock_tx_t ltx_tx; /* send info */ + struct list_head *ltx_idle; /* where to put when idle */ + void *ltx_private; /* lib_finalize() callback arg */ + void *ltx_cookie; /* lib_finalize() callback arg */ + ksock_txiovspace_t ltx_iov_space; /* where to stash frag descriptors */ + ptl_hdr_t ltx_hdr; /* buffer for packet header */ +} ksock_ltx_t; + +#define KSOCK_TX_2_KPR_FWD_DESC(ptr) list_entry ((kprfd_scratch_t *)ptr, kpr_fwd_desc_t, kprfd_scratch) +/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */ + +#define KSOCK_TX_2_KSOCK_LTX(ptr) list_entry (ptr, ksock_ltx_t, ltx_tx) +/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */ + +/* NB list_entry() is used here as convenient macro for calculating a + * pointer to a struct from the address of a member. + */ + +typedef struct /* Kernel portals Socket Forwarding message buffer */ +{ /* (socknal->router) */ + struct list_head fmb_list; /* queue idle */ + kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ + int fmb_npages; /* # pages allocated */ + ksock_fmb_pool_t *fmb_pool; /* owning pool */ + struct page *fmb_pages[SOCKNAL_LARGE_FWD_PAGES]; + struct iovec fmb_iov[SOCKNAL_LARGE_FWD_PAGES]; +} ksock_fmb_t; + +/* space for the rx frag descriptors; we either read a single contiguous + * header, or PTL_MD_MAX_IOV frags of payload of either type. 
*/ +typedef union { + struct iovec iov[PTL_MD_MAX_IOV]; + ptl_kiov_t kiov[PTL_MD_MAX_IOV]; +} ksock_rxiovspace_t; + +#define SOCKNAL_RX_HEADER 1 /* reading header */ +#define SOCKNAL_RX_BODY 2 /* reading body (to deliver here) */ +#define SOCKNAL_RX_BODY_FWD 3 /* reading body (to forward) */ +#define SOCKNAL_RX_SLOP 4 /* skipping body */ +#define SOCKNAL_RX_GET_FMB 5 /* scheduled for forwarding */ +#define SOCKNAL_RX_FMB_SLEEP 6 /* blocked waiting for a fwd desc */ + +typedef struct +{ + struct list_head ksnc_list; /* stash on global socket list */ + struct file *ksnc_file; /* socket filp */ + struct socket *ksnc_sock; /* actual socket */ + void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ + void *ksnc_saved_write_space; /* socket's original write_space() callback */ + ptl_nid_t ksnc_peernid; /* who's on the other end */ + atomic_t ksnc_refcount; /* # users */ + ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ + + /* READER */ + struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + volatile int ksnc_rx_ready; /* data ready to read */ + int ksnc_rx_scheduled; /* being progressed */ + int ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # iovec frags */ + struct iovec *ksnc_rx_iov; /* the iovec frags */ + int ksnc_rx_nkiov; /* # page frags */ + ptl_kiov_t *ksnc_rx_kiov; /* the page frags */ + ksock_rxiovspace_t ksnc_rx_iov_space; /* space for frag descriptors */ + void *ksnc_cookie; /* rx lib_finalize passthru arg */ + ptl_hdr_t ksnc_hdr; /* where I read headers into */ + + /* WRITER */ + struct list_head ksnc_tx_list; /* where I enq waiting for output space */ + struct list_head ksnc_tx_queue; /* packets waiting to be sent */ + volatile int ksnc_tx_ready; /* write space */ + int ksnc_tx_scheduled; /* being progressed */ + +} ksock_conn_t; + +extern int 
ksocknal_add_sock (ptl_nid_t nid, int fd, int client); +extern int ksocknal_close_sock(ptl_nid_t nid); +extern int ksocknal_set_mynid(ptl_nid_t nid); +extern int ksocknal_push_sock(ptl_nid_t nid); +extern ksock_conn_t *ksocknal_get_conn (ptl_nid_t nid); +extern void _ksocknal_put_conn (ksock_conn_t *conn); +extern void ksocknal_close_conn (ksock_conn_t *conn); + +static inline void +ksocknal_put_conn (ksock_conn_t *conn) +{ + CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", + conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount)); + + if (atomic_dec_and_test (&conn->ksnc_refcount)) + _ksocknal_put_conn (conn); +} + +extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg); +extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); +extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +extern int ksocknal_scheduler (void *arg); +extern int ksocknal_reaper (void *arg); +extern void ksocknal_data_ready(struct sock *sk, int n); +extern void ksocknal_write_space(struct sock *sk); + + +extern nal_cb_t ksocknal_lib; +extern ksock_nal_data_t ksocknal_data; diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c new file mode 100644 index 0000000..6147d8a --- /dev/null +++ b/lustre/portals/knals/socknal/socknal_cb.c @@ -0,0 +1,1613 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown <zab@zabbo.net> + * Author: Peter J. Braam <braam@clusterfs.com> + * Author: Phil Schwan <phil@clusterfs.com> + * Author: Eric Barton <eric@bartonsoftware.com> + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socknal.h" + +atomic_t ksocknal_packets_received; +atomic_t ksocknal_packets_launched; +atomic_t ksocknal_packets_being_sent; + +#if SOCKNAL_ZC +int ksocknal_do_zc = 1; +int ksocknal_zc_min_frag = 2048; +#endif + +/* + * LIB functions follow + * + */ +int +ksocknal_read(nal_cb_t *nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ksocknal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, + void *src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ksocknal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev) +{ + CDEBUG(D_NET, LPX64": callback eq %p ev %p\n", + nal->ni.nid, eq, ev); + + if (eq->event_callback != NULL) + eq->event_callback(ev); + + return 0; +} + +void * +ksocknal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + + if (buf != NULL) + memset(buf, 0, len); + + return (buf); +} + +void +ksocknal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + +void +ksocknal_printf(nal_cb_t *nal, const char *fmt, ...) 
+{ + va_list ap; + char msg[256]; + + va_start (ap, fmt); + vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ + va_end (ap); + + msg[sizeof (msg) - 1] = 0; /* ensure terminated */ + + CDEBUG (D_NET, "%s", msg); +} + +void +ksocknal_cli(nal_cb_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *data = nal->nal_data; + + spin_lock(&data->ksnd_nal_cb_lock); +} + +void +ksocknal_sti(nal_cb_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *data; + data = nal->nal_data; + + spin_unlock(&data->ksnd_nal_cb_lock); +} + +int +ksocknal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* I would guess that if ksocknal_get_conn(nid) == NULL, + and we're not routing, then 'nid' is very distant :) */ + if ( nal->ni.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +ksock_ltx_t * +ksocknal_get_ltx (int may_block) +{ + long flags; + ksock_ltx_t *ltx = NULL; + + for (;;) { + spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags); + + if (!list_empty (&ksocknal_data.ksnd_idle_ltx_list)) { + ltx = list_entry(ksocknal_data.ksnd_idle_ltx_list.next, + ksock_ltx_t, ltx_tx.tx_list); + list_del (&ltx->ltx_tx.tx_list); + break; + } + + if (!may_block) { + if (!list_empty(&ksocknal_data.ksnd_idle_nblk_ltx_list)) { + ltx = list_entry(ksocknal_data.ksnd_idle_nblk_ltx_list.next, + ksock_ltx_t, ltx_tx.tx_list); + list_del (&ltx->ltx_tx.tx_list); + } + break; + } + + spin_unlock_irqrestore(&ksocknal_data.ksnd_idle_ltx_lock, + flags); + + wait_event (ksocknal_data.ksnd_idle_ltx_waitq, + !list_empty (&ksocknal_data.ksnd_idle_ltx_list)); + } + + spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags); + + return (ltx); +} + +#if SOCKNAL_ZC +struct page * +ksocknal_kvaddr_to_page (unsigned long vaddr) +{ + struct page *page; + + if (vaddr >= VMALLOC_START && + vaddr < VMALLOC_END) + page = vmalloc_to_page ((void *)vaddr); +#if CONFIG_HIGHMEM + else if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) + page = 
vmalloc_to_page ((void *)vaddr); + /* in 2.4 ^ just walks the page tables */ +#endif + else + page = virt_to_page (vaddr); + + if (page == NULL || + !VALID_PAGE (page)) + return (NULL); + + return (page); +} +#endif + +int +ksocknal_send_iov (struct socket *sock, ksock_tx_t *tx, int more) +{ + struct iovec *iov = tx->tx_iov; + int fragsize = iov->iov_len; + unsigned long vaddr = (unsigned long)iov->iov_base; +#if SOCKNAL_ZC + int offset = vaddr & (PAGE_SIZE - 1); + int zcsize = MIN (fragsize, PAGE_SIZE - offset); + struct page *page; +#endif + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only send 1 frag at a time. */ + LASSERT (fragsize <= tx->tx_nob); + LASSERT (tx->tx_niov > 0); + more |= (tx->tx_niov > 1); + +#if SOCKNAL_ZC + if (ksocknal_do_zc && + (sock->sk->route_caps & NETIF_F_SG) && + (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && + zcsize >= ksocknal_zc_min_frag && + (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) { + + CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n", + (void *)vaddr, page, page_address(page), offset, zcsize); + + more |= (zcsize < fragsize); + + rc = tcp_sendpage_zccd(sock, page, offset, zcsize, + more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT, + &tx->tx_zccd); + } else +#endif + { + /* NB don't pass tx's iov; sendmsg may or may not update it */ + struct iovec fragiov = { .iov_base = (void *)vaddr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = more ? 
(MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT + }; + mm_segment_t oldmm = get_fs(); + + set_fs (KERNEL_DS); + rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize); + set_fs (oldmm); + } + + if (rc <= 0) + return (rc); + + tx->tx_nob -= rc; + + if (rc < fragsize) { + /* didn't send whole frag */ + iov->iov_base = (void *)(vaddr + rc); + iov->iov_len = fragsize - rc; + return (-EAGAIN); + } + + /* everything went */ + LASSERT (rc == fragsize); + tx->tx_iov++; + tx->tx_niov--; + return (1); +} + +int +ksocknal_send_kiov (struct socket *sock, ksock_tx_t *tx, int more) +{ + ptl_kiov_t *kiov = tx->tx_kiov; + int fragsize = kiov->kiov_len; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only send 1 frag at a time. */ + LASSERT (fragsize <= tx->tx_nob); + LASSERT (offset + fragsize <= PAGE_SIZE); + LASSERT (tx->tx_nkiov > 0); + more |= (tx->tx_nkiov > 1); + +#if SOCKNAL_ZC + if (ksocknal_do_zc && + (sock->sk->route_caps & NETIF_F_SG) && + (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && + fragsize >= ksocknal_zc_min_frag) { + + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, fragsize); + + rc = tcp_sendpage_zccd(sock, page, offset, fragsize, + more ? (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT, + &tx->tx_zccd); + } else +#endif + { + char *addr = ((char *)kmap (page)) + offset; + struct iovec fragiov = {.iov_base = addr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = more ? 
(MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT + }; + mm_segment_t oldmm = get_fs(); + + set_fs (KERNEL_DS); + rc = sock->sk->prot->sendmsg(sock->sk, &msg, fragsize); + set_fs (oldmm); + kunmap (page); + } + + if (rc <= 0) + return (rc); + + tx->tx_nob -= rc; + + if (rc < fragsize) { + /* didn't send whole frag */ + kiov->kiov_offset = offset + rc; + kiov->kiov_len = fragsize - rc; + return (-EAGAIN); + } + + /* everything went */ + LASSERT (rc == fragsize); + tx->tx_kiov++; + tx->tx_nkiov--; + return (1); +} + +int +ksocknal_sendmsg (struct socket *sock, ksock_tx_t *tx, int more) +{ + int rc; + int sent_some = 0; + ENTRY; + + LASSERT (!in_interrupt()); + + for (;;) { + if (tx->tx_niov != 0) + rc = ksocknal_send_iov (sock, tx, more || tx->tx_nkiov != 0); + else + rc = ksocknal_send_kiov (sock, tx, more); + + /* Interpret a zero rc the same as -EAGAIN (Adaptech TOE) */ + if (rc <= 0) /* error or partial send */ + RETURN ((sent_some || rc == -EAGAIN) ? 0 : rc); + + if (tx->tx_nob == 0) /* sent everything */ + RETURN (0); + + sent_some = 1; + } +} + +int +ksocknal_recv_iov (ksock_conn_t *conn) +{ + struct iovec *iov = conn->ksnc_rx_iov; + int fragsize = iov->iov_len; + unsigned long vaddr = (unsigned long)iov->iov_base; + struct iovec fragiov = { .iov_base = (void *)vaddr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + mm_segment_t oldmm = get_fs(); + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only receive 1 frag at a time. 
*/ + LASSERT (conn->ksnc_rx_niov > 0); + LASSERT (fragsize <= conn->ksnc_rx_nob_wanted); + + set_fs (KERNEL_DS); + rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT); + /* NB this is just a boolean............................^ */ + set_fs (oldmm); + + if (rc <= 0) + return (rc); + + conn->ksnc_rx_nob_wanted -= rc; + conn->ksnc_rx_nob_left -= rc; + + if (rc < fragsize) { + iov->iov_base = (void *)(vaddr + rc); + iov->iov_len = fragsize - rc; + return (-EAGAIN); + } + + LASSERT (rc == fragsize); + conn->ksnc_rx_iov++; + conn->ksnc_rx_niov--; + return (1); +} + +int +ksocknal_recv_kiov (ksock_conn_t *conn) +{ + ptl_kiov_t *kiov = conn->ksnc_rx_kiov; + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int fragsize = kiov->kiov_len; + unsigned long vaddr = ((unsigned long)kmap (page)) + offset; + struct iovec fragiov = { .iov_base = (void *)vaddr, + .iov_len = fragsize}; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &fragiov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + mm_segment_t oldmm = get_fs(); + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone, so we only receive 1 frag at a time. 
*/ + LASSERT (fragsize <= conn->ksnc_rx_nob_wanted); + LASSERT (conn->ksnc_rx_nkiov > 0); + LASSERT (offset + fragsize <= PAGE_SIZE); + + set_fs (KERNEL_DS); + rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT); + /* NB this is just a boolean............................^ */ + set_fs (oldmm); + kunmap (page); + + if (rc <= 0) + return (rc); + + conn->ksnc_rx_nob_wanted -= rc; + conn->ksnc_rx_nob_left -= rc; + + if (rc < fragsize) { + kiov->kiov_offset = offset + rc; + kiov->kiov_len = fragsize - rc; + return (-EAGAIN); + } + + LASSERT (rc == fragsize); + conn->ksnc_rx_kiov++; + conn->ksnc_rx_nkiov--; + return (1); +} + +int +ksocknal_recvmsg (ksock_conn_t *conn) +{ + int rc; + int got_some = 0; + ENTRY; + + LASSERT (!in_interrupt ()); + + for (;;) { + LASSERT (conn->ksnc_rx_nob_wanted > 0); + + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov (conn); + else + rc = ksocknal_recv_kiov (conn); + + /* CAVEAT EMPTOR: we return... + * <= 0 for error (0 == EOF) and > 0 for success (unlike sendmsg()) */ + + if (rc <= 0) /* error/EOF or partial receive */ + RETURN ((got_some || rc == -EAGAIN) ? 1 : rc); + + if (conn->ksnc_rx_nob_wanted == 0) + RETURN (1); + + got_some = 0; + } +} + +#if SOCKNAL_ZC +void +ksocknal_zc_callback (zccd_t *zcd) +{ + ksock_tx_t *tx = KSOCK_ZCCD_2_TX(zcd); + ksock_sched_t *sched = tx->tx_sched; + unsigned long flags; + ENTRY; + + /* Schedule tx for cleanup (can't do it now due to lock conflicts) */ + + spin_lock_irqsave (&sched->kss_lock, flags); + + list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list); + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + + spin_unlock_irqrestore (&sched->kss_lock, flags); + EXIT; +} +#endif + +void +ksocknal_tx_done (ksock_tx_t *tx) +{ + long flags; + ksock_ltx_t *ltx; + ENTRY; + + atomic_dec (&ksocknal_packets_being_sent); + + if (tx->tx_isfwd) { /* was a forwarded packet? 
*/ + kpr_fwd_done (&ksocknal_data.ksnd_router, + KSOCK_TX_2_KPR_FWD_DESC (tx), 0); + EXIT; + return; + } + + /* local send */ + ltx = KSOCK_TX_2_KSOCK_LTX (tx); + + lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie); + + spin_lock_irqsave (&ksocknal_data.ksnd_idle_ltx_lock, flags); + + list_add_tail (&ltx->ltx_tx.tx_list, ltx->ltx_idle); + + /* normal tx desc => wakeup anyone blocking for one */ + if (ltx->ltx_idle == &ksocknal_data.ksnd_idle_ltx_list && + waitqueue_active (&ksocknal_data.ksnd_idle_ltx_waitq)) + wake_up (&ksocknal_data.ksnd_idle_ltx_waitq); + + spin_unlock_irqrestore (&ksocknal_data.ksnd_idle_ltx_lock, flags); + EXIT; +} + +void +ksocknal_process_transmit (ksock_sched_t *sched, long *irq_flags) +{ + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + + LASSERT (!list_empty (&sched->kss_tx_conns)); + conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list); + list_del (&conn->ksnc_tx_list); + + LASSERT (conn->ksnc_tx_scheduled); + LASSERT (conn->ksnc_tx_ready); + LASSERT (!list_empty (&conn->ksnc_tx_queue)); + tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list); + /* assume transmit will complete now, so dequeue while I've got lock */ + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&sched->kss_lock, *irq_flags); + + LASSERT (tx->tx_nob > 0); + + conn->ksnc_tx_ready = 0;/* write_space may race with me and set ready */ + mb(); /* => clear BEFORE trying to write */ + + rc = ksocknal_sendmsg (conn->ksnc_sock, tx, + !list_empty (&conn->ksnc_tx_queue)); /* more to come? */ + + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc); + + if (rc != 0) { +#warning FIXME: handle socket errors properly + CERROR("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc); + /* kid on for now the whole packet went. + * NB when we handle the error better, we'll still need to + * block for zccd completion. 
+ */ + tx->tx_nob = 0; + } + + if (tx->tx_nob == 0) /* nothing left to send */ + { + /* everything went; assume more can go, so prevent write_space locking */ + conn->ksnc_tx_ready = 1; + + ksocknal_put_conn (conn); /* release packet's ref */ + atomic_inc (&ksocknal_packets_being_sent); +#if SOCKNAL_ZC + if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { + /* zccd skbufs are still in-flight. Release my + * initial ref on zccd, so callback can occur */ + zccd_put (&tx->tx_zccd); + } else +#endif + ksocknal_tx_done (tx); + + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + } else { + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + + /* back onto HEAD of tx_queue */ + list_add (&tx->tx_list, &conn->ksnc_tx_queue); + } + + if (!conn->ksnc_tx_ready || /* no space to write now */ + list_empty (&conn->ksnc_tx_queue)) {/* nothing to write */ + conn->ksnc_tx_scheduled = 0; /* not being scheduled */ + ksocknal_put_conn (conn); /* release scheduler's ref */ + } else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); +} + +void +ksocknal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx) +{ + unsigned long flags; + ksock_sched_t *sched = conn->ksnc_scheduler; + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. + * + * We always expect at least 1 mapped fragment containing the + * complete portals header. 
+ */ + LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) + + lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob); + LASSERT (tx->tx_niov >= 1); + LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t)); + + CDEBUG (D_NET, "type %d, nob %d niov %d nkiov %d\n", + ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, tx->tx_nob, + tx->tx_niov, tx->tx_nkiov); + +#if SOCKNAL_ZC + zccd_init (&tx->tx_zccd, ksocknal_zc_callback); + /* NB this sets 1 ref on zccd, so the callback can only occur + * after I've released this ref */ + tx->tx_sched = sched; +#endif + spin_lock_irqsave (&sched->kss_lock, flags); + + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) { /* not scheduled to send */ + list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */ + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + + atomic_inc (&ksocknal_packets_launched); +} + +ksock_conn_t * +ksocknal_send_target (ptl_nid_t nid) +{ + ptl_nid_t gatewaynid; + ksock_conn_t *conn; + int rc; + + if ((conn = ksocknal_get_conn (nid)) == NULL) { + /* It's not a peer; try to find a gateway */ + rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &gatewaynid); + if (rc != 0) { + CERROR("Can't route to "LPX64": router error %d\n", + nid, rc); + return (NULL); + } + + if ((conn = ksocknal_get_conn (gatewaynid)) == NULL) { + CERROR ("Can't route to "LPX64": gateway "LPX64 + " is not a peer\n", nid, gatewaynid); + return (NULL); + } + } + + return (conn); +} + +ksock_ltx_t * +ksocknal_setup_hdr (nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type) +{ + ksock_ltx_t *ltx; + + /* I may not block for a transmit descriptor if I might block the + * receiver, or an interrupt handler. 
*/ + ltx = ksocknal_get_ltx (!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt ())); + if (ltx == NULL) { + CERROR ("Can't allocate tx desc\n"); + return (NULL); + } + + /* Init local send packet (storage for hdr, finalize() args) */ + ltx->ltx_hdr = *hdr; + ltx->ltx_private = private; + ltx->ltx_cookie = cookie; + + /* Init common ltx_tx */ + ltx->ltx_tx.tx_isfwd = 0; + ltx->ltx_tx.tx_nob = sizeof (*hdr); + + /* We always have 1 mapped frag for the header */ + ltx->ltx_tx.tx_niov = 1; + ltx->ltx_tx.tx_iov = <x->ltx_iov_space.hdr; + ltx->ltx_tx.tx_iov[0].iov_base = <x->ltx_hdr; + ltx->ltx_tx.tx_iov[0].iov_len = sizeof (ltx->ltx_hdr); + + ltx->ltx_tx.tx_kiov = NULL; + ltx->ltx_tx.tx_nkiov = 0; + + return (ltx); +} + +int +ksocknal_send (nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, struct iovec *payload_iov, + size_t payload_len) +{ + ksock_ltx_t *ltx; + ksock_conn_t *conn; + + /* NB 'private' is different depending on what we're sending. + * Just ignore it until we can rely on it + * + * Also, the return code from this procedure is ignored. + * If we can't send, we must still complete with lib_finalize(). + * We'll have to wait for 3.2 to return an error event. 
+ */ + + CDEBUG(D_NET, + "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64 + " pid %d\n", payload_len, payload_niov, nid, pid); + + conn = ksocknal_send_target (nid); + if (conn == NULL) { + lib_finalize (&ksocknal_lib, private, cookie); + return (-1); + } + + ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type); + if (ltx == NULL) { + ksocknal_put_conn (conn); + lib_finalize (&ksocknal_lib, private, cookie); + return (-1); + } + + /* append the payload_iovs to the one pointing at the header */ + LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + memcpy (ltx->ltx_tx.tx_iov + 1, payload_iov, + payload_niov * sizeof (*payload_iov)); + ltx->ltx_tx.tx_niov = 1 + payload_niov; + ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len; + + ksocknal_launch_packet (conn, <x->ltx_tx); + return (0); +} + +int +ksocknal_send_pages (nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, ptl_kiov_t *payload_iov, size_t payload_len) +{ + ksock_ltx_t *ltx; + ksock_conn_t *conn; + + /* NB 'private' is different depending on what we're sending. 
+ * Just ignore it until we can rely on it */ + + CDEBUG(D_NET, + "sending "LPSZ" bytes in %d mapped frags to nid: "LPX64" pid %d\n", + payload_len, payload_niov, nid, pid); + + conn = ksocknal_send_target (nid); + if (conn == NULL) + return (-1); + + ltx = ksocknal_setup_hdr (nal, private, cookie, hdr, type); + if (ltx == NULL) { + ksocknal_put_conn (conn); + return (-1); + } + + LASSERT (ltx->ltx_tx.tx_niov == 1 && ltx->ltx_tx.tx_nkiov == 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + ltx->ltx_tx.tx_kiov = ltx->ltx_iov_space.payload.kiov; + memcpy (ltx->ltx_tx.tx_kiov, payload_iov, + payload_niov * sizeof (*payload_iov)); + ltx->ltx_tx.tx_nkiov = payload_niov; + ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len; + + ksocknal_launch_packet (conn, <x->ltx_tx); + return (0); +} + +void +ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + ksock_conn_t *conn; + ptl_nid_t nid = fwd->kprfd_gateway_nid; + ksock_tx_t *tx = (ksock_tx_t *)&fwd->kprfd_scratch; + + CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, + fwd->kprfd_gateway_nid, fwd->kprfd_target_nid); + + /* I'm the gateway; must be the last hop */ + if (nid == ksocknal_lib.ni.nid) + nid = fwd->kprfd_target_nid; + + conn = ksocknal_get_conn (nid); + if (conn == NULL) { + CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid); + kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, -EHOSTUNREACH); + return; + } + + /* This forward has now got a ref on conn */ + + tx->tx_isfwd = 1; /* This is a forwarding packet */ + tx->tx_nob = fwd->kprfd_nob; + tx->tx_niov = fwd->kprfd_niov; + tx->tx_iov = fwd->kprfd_iov; + tx->tx_nkiov = 0; + tx->tx_kiov = NULL; + + ksocknal_launch_packet (conn, tx); +} + +int +ksocknal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&ksocknal_data.ksnd_nthreads); + return (0); +} + +void +ksocknal_thread_fini (void) +{ + atomic_dec (&ksocknal_data.ksnd_nthreads); +} + +void 
+ksocknal_fmb_callback (void *arg, int error) +{ + ksock_fmb_t *fmb = (ksock_fmb_t *)arg; + ksock_fmb_pool_t *fmp = fmb->fmb_pool; + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]); + ksock_conn_t *conn = NULL; + ksock_sched_t *sched; + long flags; + + if (error != 0) + CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n", + NTOH__u64(hdr->src_nid), NTOH__u64(hdr->dest_nid), + error); + else + CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n", + NTOH__u64 (hdr->src_nid), NTOH__u64 (hdr->dest_nid)); + + spin_lock_irqsave (&fmp->fmp_lock, flags); + + list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs); + + if (!list_empty (&fmp->fmp_blocked_conns)) { + conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, + ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + } + + spin_unlock_irqrestore (&fmp->fmp_lock, flags); + + if (conn == NULL) + return; + + CDEBUG (D_NET, "Scheduling conn %p\n", conn); + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP); + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; + + sched = conn->ksnc_scheduler; + + spin_lock_irqsave (&sched->kss_lock, flags); + + list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns); + + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + + spin_unlock_irqrestore (&sched->kss_lock, flags); +} + +ksock_fmb_t * +ksocknal_get_idle_fmb (ksock_conn_t *conn) +{ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + long flags; + ksock_fmb_pool_t *pool; + ksock_fmb_t *fmb; + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + LASSERT (ksocknal_data.ksnd_fmbs != NULL); + + if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) + pool = &ksocknal_data.ksnd_small_fmp; + else + pool = &ksocknal_data.ksnd_large_fmp; + + spin_lock_irqsave (&pool->fmp_lock, flags); + + if (!list_empty (&pool->fmp_idle_fmbs)) { + fmb = list_entry(pool->fmp_idle_fmbs.next, + ksock_fmb_t, 
fmb_list); + list_del (&fmb->fmb_list); + spin_unlock_irqrestore (&pool->fmp_lock, flags); + + return (fmb); + } + + /* deschedule until fmb free */ + + conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP; + + list_add_tail (&conn->ksnc_rx_list, + &pool->fmp_blocked_conns); + + spin_unlock_irqrestore (&pool->fmp_lock, flags); + return (NULL); +} + + +/* Prepare fmb to forward the packet whose header is in conn->ksnc_hdr. + * Returns 1 if the packet has no payload and was passed to the router + * immediately; returns 0 after setting conn up to read the payload into + * fmb (state SOCKNAL_RX_BODY_FWD). */ +int +ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) +{ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid); + int niov; /* at least the header */ + int nob; + + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left); + LASSERT (payload_nob >= 0); + LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE); + LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE); + + /* Got a forwarding buffer; copy the header we just read into the + * forwarding buffer. If there's payload, start reading it + * into the buffer, otherwise the forwarding buffer can be kicked + * off immediately. + * + * NB fmb->fmb_iov spans the WHOLE packet. + * conn->ksnc_rx_iov spans just the payload. 
+ */ + + fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]); + + /* copy header */ + memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); + + if (payload_nob == 0) { /* got complete packet already */ + atomic_inc (&ksocknal_packets_received); + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", + conn, NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid, packet_nob); + + fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t); + + kpr_fwd_init (&fmb->fmb_fwd, dest_nid, + packet_nob, 1, fmb->fmb_iov, + ksocknal_fmb_callback, fmb); + + /* forward it now */ + kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd); + + ksocknal_new_packet (conn, 0); /* on to next packet */ + return (1); + } + + niov = 1; + if (packet_nob <= PAGE_SIZE) { /* whole packet fits in first page */ + fmb->fmb_iov[0].iov_len = packet_nob; + } else { + fmb->fmb_iov[0].iov_len = PAGE_SIZE; + nob = packet_nob - PAGE_SIZE; + + do { + LASSERT (niov < fmb->fmb_npages); + fmb->fmb_iov[niov].iov_base = + page_address (fmb->fmb_pages[niov]); + fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob); + nob -= PAGE_SIZE; + niov++; + } while (nob > 0); + } + + kpr_fwd_init (&fmb->fmb_fwd, dest_nid, + packet_nob, niov, fmb->fmb_iov, + ksocknal_fmb_callback, fmb); + + /* stash router's descriptor ready for call to kpr_fwd_start */ + conn->ksnc_cookie = &fmb->fmb_fwd; + + conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ + + /* payload is desc's iov-ed buffer, but skipping the hdr */ + LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) / + sizeof (struct iovec)); + + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = + (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + + sizeof (ptl_hdr_t)); + conn->ksnc_rx_iov[0].iov_len = + fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t); + + if (niov > 1) + memcpy(&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], + (niov - 1) * sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + + CDEBUG 
(D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn, + NTOH__u64 (conn->ksnc_hdr.src_nid), dest_nid, payload_nob); + return (0); +} + +/* Parse the header of a packet that is not addressed to this node. + * Either drop it (setting conn up to skip its body) or move conn to + * SOCKNAL_RX_GET_FMB so the caller can fetch a forwarding buffer. + * Header NIDs are in wire order; always convert with NTOH__u64 before + * printing them or looking up a connection. */ +void +ksocknal_fwd_parse (ksock_conn_t *conn) +{ + ksock_conn_t *conn2; + ptl_nid_t dest_nid = NTOH__u64 (conn->ksnc_hdr.dest_nid); + int body_len = NTOH__u32 (PTL_HDR_LENGTH(&conn->ksnc_hdr)); + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn, + NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid, conn->ksnc_rx_nob_left); + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER); + LASSERT (conn->ksnc_rx_scheduled); + + if (body_len < 0) { /* length corrupt (overflow) */ + CERROR("dropping packet from "LPX64" for "LPX64": packet " + "size %d illegal\n", NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid, body_len); + ksocknal_new_packet (conn, 0); /* on to new packet */ + return; + } + + if (ksocknal_data.ksnd_fmbs == NULL) { /* not forwarding */ + CERROR("dropping packet from "LPX64" for "LPX64": not " + "forwarding\n", NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid); + /* on to new packet (skip this one's body) */ + ksocknal_new_packet (conn, body_len); + return; + } + + if (body_len > SOCKNAL_MAX_FWD_PAYLOAD) { /* too big to forward */ + CERROR ("dropping packet from "LPX64" for "LPX64 + ": packet size %d too big\n", NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid, body_len); + /* on to new packet (skip this one's body) */ + ksocknal_new_packet (conn, body_len); + return; + } + + /* should have gone direct (look up by host-order nid) */ + conn2 = ksocknal_get_conn (dest_nid); + if (conn2 != NULL) { + CERROR ("dropping packet from "LPX64" for "LPX64 + ": target is a peer\n", NTOH__u64 (conn->ksnc_hdr.src_nid), + dest_nid); + ksocknal_put_conn (conn2); /* drop ref from get above */ + + /* on to next packet (skip this one's body) */ + ksocknal_new_packet (conn, body_len); + return; + } + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; /* Getting FMB now */ + conn->ksnc_rx_nob_left = body_len; /* stash packet size */ + 
conn->ksnc_rx_nob_wanted = body_len; /* (no slop) */ +} + +int +ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) +{ + static char ksocknal_slop_buffer[4096]; + + int nob; + int niov; + int skipped; + + if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_state = SOCKNAL_RX_HEADER; + conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t); + conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t); + + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr; + conn->ksnc_rx_iov[0].iov_len = sizeof (ptl_hdr_t); + conn->ksnc_rx_niov = 1; + + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + return (1); + } + + /* Set up to skip as much a possible now. If there's more left + * (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + skipped = 0; + niov = 0; + + do { + nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_nob_wanted = skipped; + return (0); +} + +void +ksocknal_process_receive (ksock_sched_t *sched, long *irq_flags) +{ + ksock_conn_t *conn; + ksock_fmb_t *fmb; + int rc; + + /* NB: sched->ksnc_lock lock held */ + + LASSERT (!list_empty (&sched->kss_rx_conns)); + conn = list_entry(sched->kss_rx_conns.next, ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + + spin_unlock_irqrestore (&sched->kss_lock, *irq_flags); + + CDEBUG(D_NET, "sched %p conn %p\n", sched, conn); + LASSERT (atomic_read (&conn->ksnc_refcount) > 0); + LASSERT 
(conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_ready); + + /* doesn't need a forwarding buffer */ + if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB) + goto try_read; + + get_fmb: + fmb = ksocknal_get_idle_fmb (conn); + if (fmb == NULL) { /* conn descheduled waiting for idle fmb */ + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + return; + } + + if (ksocknal_init_fmb (conn, fmb)) /* packet forwarded ? */ + goto out; /* come back later for next packet */ + + try_read: + /* NB: sched lock NOT held */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_BODY || + conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + + LASSERT (conn->ksnc_rx_nob_wanted > 0); + + conn->ksnc_rx_ready = 0;/* data ready may race with me and set ready */ + mb(); /* => clear BEFORE trying to read */ + + rc = ksocknal_recvmsg(conn); + + if (rc == 0) + goto out; + if (rc < 0) { +#warning FIXME: handle socket errors properly + CERROR ("Error socknal read %p: %d\n", conn, rc); + goto out; + } + + if (conn->ksnc_rx_nob_wanted != 0) /* short read */ + goto out; /* try again later */ + + /* got all I wanted, assume there's more - prevent data_ready locking */ + conn->ksnc_rx_ready = 1; + + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_HEADER: + /* It's not for me */ + if (conn->ksnc_hdr.type != PTL_MSG_HELLO && + NTOH__u64(conn->ksnc_hdr.dest_nid) != ksocknal_lib.ni.nid) { + ksocknal_fwd_parse (conn); + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_HEADER: /* skipped (zero payload) */ + goto out; /* => come back later */ + case SOCKNAL_RX_SLOP: /* skipping packet's body */ + goto try_read; /* => go read it */ + case SOCKNAL_RX_GET_FMB: /* forwarding */ + goto get_fmb; /* => go get a fwd msg buffer */ + default: + LBUG (); + } + /* Not Reached */ + } + + PROF_START(lib_parse); + /* sets wanted_len, iovs etc */ + lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); + PROF_FINISH(lib_parse); + + if 
(conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */ + conn->ksnc_rx_state = SOCKNAL_RX_BODY; + goto try_read; /* go read the payload */ + } + /* Fall through (completed packet for me) */ + + case SOCKNAL_RX_BODY: + atomic_inc (&ksocknal_packets_received); + /* packet is done now */ + lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie); + /* Fall through */ + + case SOCKNAL_RX_SLOP: + /* starting new packet? */ + if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left)) + goto out; /* come back later */ + goto try_read; /* try to finish reading slop now */ + + case SOCKNAL_RX_BODY_FWD: + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", + conn, NTOH__u64 (conn->ksnc_hdr.src_nid), + NTOH__u64 (conn->ksnc_hdr.dest_nid), + conn->ksnc_rx_nob_left); + + atomic_inc (&ksocknal_packets_received); + + /* ksocknal_init_fmb() put router desc. in conn->ksnc_cookie */ + kpr_fwd_start (&ksocknal_data.ksnd_router, + (kpr_fwd_desc_t *)conn->ksnc_cookie); + + /* no slop in forwarded packets */ + LASSERT (conn->ksnc_rx_nob_left == 0); + + ksocknal_new_packet (conn, 0); /* on to next packet */ + goto out; /* (later) */ + + default: + /* a label needs a statement; unexpected states drop to LBUG() */ + break; + } + + /* Not Reached */ + LBUG (); + + out: + spin_lock_irqsave (&sched->kss_lock, *irq_flags); + + /* no data there to read? 
*/ + if (!conn->ksnc_rx_ready) { + /* let socket callback schedule again */ + conn->ksnc_rx_scheduled = 0; + ksocknal_put_conn (conn); /* release scheduler's ref */ + } else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns); +} + +int +ksocknal_recv (nal_cb_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + + LASSERT (mlen <= rlen); + LASSERT (niov <= PTL_MD_MAX_IOV); + + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + memcpy (conn->ksnc_rx_iov, iov, niov * sizeof (*iov)); + + LASSERT (mlen == + lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + return (rlen); +} + +int +ksocknal_recv_pages (nal_cb_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, ptl_kiov_t *kiov, size_t mlen, size_t rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + + LASSERT (mlen <= rlen); + LASSERT (niov <= PTL_MD_MAX_IOV); + + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_iov = NULL; + conn->ksnc_rx_nkiov = niov; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + memcpy (conn->ksnc_rx_kiov, kiov, niov * sizeof (*kiov)); + + LASSERT (mlen == + lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + return (rlen); +} + +int ksocknal_scheduler (void *arg) +{ + ksock_sched_t *sched = (ksock_sched_t *)arg; + unsigned long flags; + int rc; + int nloops = 0; + int id = sched - ksocknal_data.ksnd_schedulers; + char name[16]; +#if (CONFIG_SMP && CPU_AFFINITY) +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + int cpu = 
cpu_logical_map(id % num_online_cpus()); +#else +#warning "Take care of architecure specific logical APIC map" + int cpu = 1; /* Have to change later. */ +#endif /* LINUX_VERSION_CODE */ + + set_cpus_allowed (current, 1 << cpu); + id = cpu; +#endif /* CONFIG_SMP && CPU_AFFINITY */ + + snprintf (name, sizeof (name),"ksocknald[%d]", id); + kportal_daemonize (name); + kportal_blockallsigs (); + + spin_lock_irqsave (&sched->kss_lock, flags); + + while (!ksocknal_data.ksnd_shuttingdown) { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty (&sched->kss_rx_conns)) { + did_something = 1; + /* drops & regains kss_lock */ + ksocknal_process_receive (sched, &flags); + } + + if (!list_empty (&sched->kss_tx_conns)) { + did_something = 1; + /* drops and regains kss_lock */ + ksocknal_process_transmit (sched, &flags); + } +#if SOCKNAL_ZC + if (!list_empty (&sched->kss_zctxdone_list)) { + ksock_tx_t *tx = + list_entry(sched->kss_zctxdone_list.next, + ksock_tx_t, tx_list); + did_something = 1; + + list_del (&tx->tx_list); + spin_unlock_irqrestore (&sched->kss_lock, flags); + + ksocknal_tx_done (tx); + + spin_lock_irqsave (&sched->kss_lock, flags); + } +#endif + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? 
*/ + spin_unlock_irqrestore (&sched->kss_lock, flags); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ +#if SOCKNAL_ZC + rc = wait_event_interruptible (sched->kss_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty(&sched->kss_rx_conns) || + !list_empty(&sched->kss_tx_conns) || + !list_empty(&sched->kss_zctxdone_list)); +#else + rc = wait_event_interruptible (sched->kss_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty(&sched->kss_rx_conns) || + !list_empty(&sched->kss_tx_conns)); +#endif + LASSERT (rc == 0); + } else + our_cond_resched(); + + spin_lock_irqsave (&sched->kss_lock, flags); + } + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + ksocknal_thread_fini (); + return (0); +} + +void +ksocknal_data_ready (struct sock *sk, int n) +{ + unsigned long flags; + ksock_conn_t *conn; + ksock_sched_t *sched; + ENTRY; + + /* interleave correctly with closing sockets... */ + read_lock (&ksocknal_data.ksnd_socklist_lock); + + conn = sk->user_data; + if (conn == NULL) { /* raced with ksocknal_close_sock */ + LASSERT (sk->data_ready != &ksocknal_data_ready); + sk->data_ready (sk, n); + } else if (!conn->ksnc_rx_ready) { /* new news */ + /* Set ASAP in case of concurrent calls to me */ + conn->ksnc_rx_ready = 1; + + sched = conn->ksnc_scheduler; + + spin_lock_irqsave (&sched->kss_lock, flags); + + /* Set again (process_receive may have cleared while I blocked for the lock) */ + conn->ksnc_rx_ready = 1; + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); + + EXIT; +} + +void +ksocknal_write_space (struct sock *sk) +{ + unsigned long flags; + ksock_conn_t *conn; + 
ksock_sched_t *sched; + + /* interleave correctly with closing sockets... */ + read_lock (&ksocknal_data.ksnd_socklist_lock); + + conn = sk->user_data; + + CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", + sk, tcp_wspace(sk), SOCKNAL_TX_LOW_WATER(sk), conn, + (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ? + " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? + " scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? + " empty" : " queued")); + + if (conn == NULL) { /* raced with ksocknal_close_sock */ + LASSERT (sk->write_space != &ksocknal_write_space); + sk->write_space (sk); + } else if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */ + clear_bit (SOCK_NOSPACE, &sk->socket->flags); + + if (!conn->ksnc_tx_ready) { /* new news */ + /* Set ASAP in case of concurrent calls to me */ + conn->ksnc_tx_ready = 1; + + sched = conn->ksnc_scheduler; + + spin_lock_irqsave (&sched->kss_lock, flags); + + /* Set again (process_transmit may have + cleared while I blocked for the lock) */ + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && // not being progressed + !list_empty(&conn->ksnc_tx_queue)){//packets to send + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + if (waitqueue_active (&sched->kss_waitq)) + wake_up (&sched->kss_waitq); + } + + spin_unlock_irqrestore (&sched->kss_lock, flags); + } + } + + read_unlock (&ksocknal_data.ksnd_socklist_lock); +} + +int +ksocknal_reaper (void *arg) +{ + unsigned long flags; + ksock_conn_t *conn; + int rc; + + kportal_daemonize ("ksocknal_reaper"); + kportal_blockallsigs (); + + while (!ksocknal_data.ksnd_shuttingdown) { + spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + + if (list_empty (&ksocknal_data.ksnd_reaper_list)) { + conn = NULL; + } else { + conn = list_entry 
(ksocknal_data.ksnd_reaper_list.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + } + + spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); + + if (conn != NULL) + ksocknal_close_conn (conn); + else { + rc = wait_event_interruptible (ksocknal_data.ksnd_reaper_waitq, + ksocknal_data.ksnd_shuttingdown || + !list_empty(&ksocknal_data.ksnd_reaper_list)); + LASSERT (rc == 0); + } + } + + ksocknal_thread_fini (); + return (0); +} + +nal_cb_t ksocknal_lib = { + nal_data: &ksocknal_data, /* NAL private data */ + cb_send: ksocknal_send, + cb_send_pages: ksocknal_send_pages, + cb_recv: ksocknal_recv, + cb_recv_pages: ksocknal_recv_pages, + cb_read: ksocknal_read, + cb_write: ksocknal_write, + cb_callback: ksocknal_callback, + cb_malloc: ksocknal_malloc, + cb_free: ksocknal_free, + cb_printf: ksocknal_printf, + cb_cli: ksocknal_cli, + cb_sti: ksocknal_sti, + cb_dist: ksocknal_dist +}; diff --git a/lustre/portals/knals/toenal/.cvsignore b/lustre/portals/knals/toenal/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lustre/portals/knals/toenal/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lustre/portals/knals/toenal/Makefile.am b/lustre/portals/knals/toenal/Makefile.am new file mode 100644 index 0000000..9bfff64 --- /dev/null +++ b/lustre/portals/knals/toenal/Makefile.am @@ -0,0 +1,13 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include ../../Rules.linux + +MODULE = ktoenal +modulenet_DATA = ktoenal.o +EXTRA_PROGRAMS = ktoenal + +DEFS = +ktoenal_SOURCES = toenal.c toenal_cb.c toenal.h diff --git a/lustre/portals/knals/toenal/toenal.c b/lustre/portals/knals/toenal/toenal.c new file mode 100644 index 0000000..1f5dc38 --- /dev/null +++ b/lustre/portals/knals/toenal/toenal.c @@ -0,0 +1,629 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown <zab@zabbo.net> + * Author: Peter J. Braam <braam@clusterfs.com> + * Author: Phil Schwan <phil@clusterfs.com> + * Author: Eric Barton <eric@bartonsoftware.com> + * Author: Kedar Sovani <kedar@calsoftinc.com> + * Author: Amey Inamdar <amey@calsoftinc.com> + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ +#include <linux/poll.h> +#include "toenal.h" + +ptl_handle_ni_t ktoenal_ni; +static nal_t ktoenal_api; +static ksock_nal_data_t ktoenal_data; + +/* +ksocknal_interface_t ktoenal_interface = { + ksni_add_sock: ktoenal_add_sock, + ksni_close_sock: ktoenal_close_sock, + ksni_set_mynid: ktoenal_set_mynid, +}; +*/ + +kpr_nal_interface_t ktoenal_router_interface = { + kprni_nalid: TOENAL, + kprni_arg: &ktoenal_data, + kprni_fwd: ktoenal_fwd_packet, +}; + + +int +ktoenal_api_forward(nal_t *nal, int id, void *args, size_t args_len, + void *ret, size_t ret_len) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + + lib_dispatch(nal_cb, k, id, args, ret); /* ktoenal_send needs k */ + return PTL_OK; +} + +int +ktoenal_api_shutdown(nal_t *nal, int ni) +{ + CDEBUG (D_NET, "closing all connections\n"); + + return ktoenal_close_sock(0); /* close all sockets */ +} + +void +ktoenal_api_yield(nal_t *nal) +{ + our_cond_resched(); + return; +} + +void +ktoenal_api_lock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_cli(nal_cb,flags); +} + +void +ktoenal_api_unlock(nal_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *k; + nal_cb_t *nal_cb; + + k = nal->nal_data; + nal_cb = k->ksnd_nal_cb; + nal_cb->cb_sti(nal_cb,flags); +} + +nal_t * +ktoenal_init(int interface, ptl_pt_index_t ptl_size, + ptl_ac_index_t ac_size, ptl_pid_t requested_pid) +{ + CDEBUG(D_NET, "calling lib_init with nid "LPX64"\n", + ktoenal_data.ksnd_mynid); + lib_init(&ktoenal_lib, ktoenal_data.ksnd_mynid, 0, 10, ptl_size, + ac_size); + return (&ktoenal_api); +} + +/* + * EXTRA functions follow + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define SOCKET_I(inode) (&(inode)->u.socket_i) +#endif +static __inline__ struct socket * +socki_lookup(struct inode *inode) +{ + return SOCKET_I(inode); +} + +int +ktoenal_set_mynid(ptl_nid_t nid) +{ + lib_ni_t *ni = 
&ktoenal_lib.ni; + + /* FIXME: we have to do this because we call lib_init() at module + * insertion time, which is before we have 'mynid' available. lib_init + * sets the NAL's nid, which it uses to tell other nodes where packets + * are coming from. This is not a very graceful solution to this + * problem. */ + + CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", nid, ni->nid); + + ktoenal_data.ksnd_mynid = nid; + ni->nid = nid; + return (0); +} + +int +ktoenal_add_sock (ptl_nid_t nid, int fd) +{ + unsigned long flags; + ksock_conn_t *conn; + struct file *file = NULL; + struct socket *sock = NULL; + int ret; + ENTRY; + + file = fget(fd); + if (file == NULL) + RETURN(-EINVAL); + + ret = -EINVAL; + sock = socki_lookup(file->f_dentry->d_inode); + if (sock == NULL) + GOTO(error, ret); + + ret = -ENOMEM; + PORTAL_ALLOC(conn, sizeof(*conn)); + if (!conn) + GOTO(error, ret); + + memset (conn, 0, sizeof (conn)); /* zero for consistency */ + file->f_flags |= O_NONBLOCK; /* Does this have any conflicts */ + conn->ksnc_file = file; + conn->ksnc_sock = sock; + conn->ksnc_peernid = nid; + atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for socklist */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + ktoenal_new_packet (conn, 0); + + INIT_LIST_HEAD (&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + + LASSERT (!in_interrupt()); + write_lock_irqsave (&ktoenal_data.ksnd_socklist_lock, flags); + + list_add(&conn->ksnc_list, &ktoenal_data.ksnd_socklist); + write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags); + + ktoenal_data_ready(conn); + ktoenal_write_space(conn); + + ktoenal_data.ksnd_slistchange = 1; + wake_up_process(ktoenal_data.ksnd_pollthread_tsk); + /* Schedule pollthread so that it will poll + * for newly created socket + */ + + + CDEBUG(D_IOCTL, "conn [%p] registered for nid "LPX64"\n", + conn, conn->ksnc_peernid); + + /* Can't unload while connection active */ + PORTAL_MODULE_USE; + RETURN(0); + 
+error: + fput(file); + return (ret); +} + +/* Passing in a zero nid will close all connections */ +int +ktoenal_close_sock(ptl_nid_t nid) +{ + long flags; + ksock_conn_t *conn; + LIST_HEAD (death_row); + struct list_head *tmp; + + LASSERT (!in_interrupt()); + write_lock_irqsave (&ktoenal_data.ksnd_socklist_lock, flags); + + if (nid == 0) /* close ALL connections */ + { + /* insert 'death row' into the socket list... */ + list_add (&death_row, &ktoenal_data.ksnd_socklist); + /* ...extract and reinitialise the socket list itself... */ + list_del_init (&ktoenal_data.ksnd_socklist); + /* ...and voila, death row is the proud owner of all conns */ + } else list_for_each (tmp, &ktoenal_data.ksnd_socklist) { + + conn = list_entry (tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) + { + list_del (&conn->ksnc_list); + list_add (&conn->ksnc_list, &death_row); + break; + } + } + + + write_unlock_irqrestore (&ktoenal_data.ksnd_socklist_lock, flags); + + if (list_empty (&death_row)) + return (-ENOENT); + + do { + conn = list_entry (death_row.next, ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + ktoenal_put_conn (conn); /* drop ref for ksnd_socklist */ + } while (!list_empty (&death_row)); + + ktoenal_data.ksnd_slistchange = 1; + wake_up_process(ktoenal_data.ksnd_pollthread_tsk); + + return (0); +} + + +ksock_conn_t * +ktoenal_get_conn (ptl_nid_t nid) +{ + struct list_head *tmp; + ksock_conn_t *conn; + + PROF_START(conn_list_walk); + + read_lock (&ktoenal_data.ksnd_socklist_lock); + + list_for_each(tmp, &ktoenal_data.ksnd_socklist) { + + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_peernid == nid) + { + /* caller is referencing */ + atomic_inc (&conn->ksnc_refcount); + + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + CDEBUG(D_NET, "got conn [%p] -> "LPX64" (%d)\n", + conn, nid, atomic_read (&conn->ksnc_refcount)); + + PROF_FINISH(conn_list_walk); + return (conn); + } + } + + read_unlock (&ktoenal_data.ksnd_socklist_lock); + 
+ CDEBUG(D_NET, "No connection found when looking for nid "LPX64"\n", nid); + PROF_FINISH(conn_list_walk); + return (NULL); +} + +void +ktoenal_close_conn (ksock_conn_t *conn) +{ + CDEBUG (D_NET, "connection [%p] closed \n", conn); + + fput (conn->ksnc_file); + PORTAL_FREE (conn, sizeof (*conn)); + /* One less connection keeping us hanging on */ + PORTAL_MODULE_UNUSE; +} + +void +_ktoenal_put_conn (ksock_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "connection [%p] handed the black spot\n", conn); + + /* "But what is the black spot, captain?" I asked. + * "That's a summons, mate..." */ + + LASSERT (atomic_read (&conn->ksnc_refcount) == 0); + LASSERT (!conn->ksnc_rx_scheduled); + + if (!in_interrupt()) + { + ktoenal_close_conn (conn); + return; + } + + spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags); + + list_add (&conn->ksnc_list, &ktoenal_data.ksnd_reaper_list); + wake_up (&ktoenal_data.ksnd_reaper_waitq); + + spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags); +} + +void +ktoenal_free_buffers (void) +{ + if (ktoenal_data.ksnd_fmbs != NULL) + { + ksock_fmb_t *fmb = (ksock_fmb_t *)ktoenal_data.ksnd_fmbs; + int i; + int j; + + for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++, fmb++) + for (j = 0; j < fmb->fmb_npages; j++) + if (fmb->fmb_pages[j] != NULL) + __free_page (fmb->fmb_pages[j]); + + PORTAL_FREE (ktoenal_data.ksnd_fmbs, + sizeof (ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS)); + } + + if (ktoenal_data.ksnd_ltxs != NULL) + PORTAL_FREE (ktoenal_data.ksnd_ltxs, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); +} + +int +ktoenal_cmd(struct portal_ioctl_data * data, void * private) +{ + int rc = -EINVAL; + + LASSERT (data != NULL); + + switch(data->ioc_nal_cmd) { + case NAL_CMD_REGISTER_PEER_FD: { + rc = ktoenal_add_sock(data->ioc_nid, data->ioc_fd); + break; + } + case NAL_CMD_CLOSE_CONNECTION: { + rc = ktoenal_close_sock(data->ioc_nid); + break; + } + case 
NAL_CMD_REGISTER_MYNID: { + rc = ktoenal_set_mynid (data->ioc_nid); + break; + } + } + + return rc; +} + +/* Tear down whatever ksnd_init says was set up, newest stage first; ktoenal_module_init() also calls this to unwind a failed load */ +void __exit +ktoenal_module_fini (void) +{ + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + switch (ktoenal_data.ksnd_init) + { + default: + LASSERT (0); + + case SOCKNAL_INIT_ALL: + kportal_nal_unregister(TOENAL); + PORTAL_SYMBOL_UNREGISTER (ktoenal_ni); + /* fall through */ + + case SOCKNAL_INIT_PTL: + PtlNIFini(ktoenal_ni); + lib_fini(&ktoenal_lib); + /* fall through */ + + case SOCKNAL_INIT_DATA: + /* Module refcount only gets to zero when all connections + * have been closed so all lists must be empty */ + LASSERT (list_empty (&ktoenal_data.ksnd_socklist)); + LASSERT (list_empty (&ktoenal_data.ksnd_reaper_list)); + LASSERT (list_empty (&ktoenal_data.ksnd_rx_conns)); + LASSERT (list_empty (&ktoenal_data.ksnd_tx_conns)); + LASSERT (list_empty (&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns)); + LASSERT (list_empty (&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns)); + + kpr_shutdown (&ktoenal_data.ksnd_router); /* stop router calling me */ + + /* flag threads to terminate; wake and wait for them to die */ + ktoenal_data.ksnd_shuttingdown = 1; + wake_up_all (&ktoenal_data.ksnd_reaper_waitq); + wake_up_all (&ktoenal_data.ksnd_sched_waitq); + if (ktoenal_data.ksnd_pollthread_tsk != NULL) /* zeroed at init; NULL here if load failed before the poll thread was started */ wake_up_process(ktoenal_data.ksnd_pollthread_tsk); + + while (atomic_read (&ktoenal_data.ksnd_nthreads) != 0) + { + CDEBUG (D_NET, "waiting for %d threads to terminate\n", + atomic_read (&ktoenal_data.ksnd_nthreads)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + + kpr_deregister (&ktoenal_data.ksnd_router); + + ktoenal_free_buffers(); + /* fall through */ + + case SOCKNAL_INIT_NOTHING: + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); +} + +int __init +ktoenal_module_init (void) +{ + int pkmem = 
atomic_read(&portal_kmemory); + int rc; + int i; + int j; + + /* packet descriptor must fit in a router descriptor's scratchpad */ + LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + + LASSERT (ktoenal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + + ktoenal_api.forward = ktoenal_api_forward; + ktoenal_api.shutdown = ktoenal_api_shutdown; + ktoenal_api.yield = ktoenal_api_yield; + ktoenal_api.validate = NULL; /* our api validate is a NOOP */ + ktoenal_api.lock = ktoenal_api_lock; + ktoenal_api.unlock = ktoenal_api_unlock; + ktoenal_api.nal_data = &ktoenal_data; + + ktoenal_lib.nal_data = &ktoenal_data; + + memset (&ktoenal_data, 0, sizeof (ktoenal_data)); /* zero pointers */ + + INIT_LIST_HEAD(&ktoenal_data.ksnd_socklist); + rwlock_init(&ktoenal_data.ksnd_socklist_lock); + + ktoenal_data.ksnd_nal_cb = &ktoenal_lib; + spin_lock_init (&ktoenal_data.ksnd_nal_cb_lock); + + spin_lock_init (&ktoenal_data.ksnd_sched_lock); + + init_waitqueue_head (&ktoenal_data.ksnd_sched_waitq); + + INIT_LIST_HEAD (&ktoenal_data.ksnd_rx_conns); + INIT_LIST_HEAD (&ktoenal_data.ksnd_tx_conns); + + INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ktoenal_data.ksnd_small_fmp.fmp_blocked_conns); + INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_idle_fmbs); + INIT_LIST_HEAD(&ktoenal_data.ksnd_large_fmp.fmp_blocked_conns); + + INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_nblk_ltx_list); + INIT_LIST_HEAD(&ktoenal_data.ksnd_idle_ltx_list); + init_waitqueue_head(&ktoenal_data.ksnd_idle_ltx_waitq); + + INIT_LIST_HEAD (&ktoenal_data.ksnd_reaper_list); + init_waitqueue_head(&ktoenal_data.ksnd_reaper_waitq); + spin_lock_init (&ktoenal_data.ksnd_reaper_lock); + + ktoenal_data.ksnd_init = SOCKNAL_INIT_DATA; /* flag lists/ptrs/locks initialised */ + + PORTAL_ALLOC(ktoenal_data.ksnd_fmbs, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS)); + if (ktoenal_data.ksnd_fmbs == NULL) + RETURN(-ENOMEM); + + /* NULL out buffer pointers etc */ + 
memset(ktoenal_data.ksnd_fmbs, 0, + sizeof(ksock_fmb_t) * (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS)); + + for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + SOCKNAL_LARGE_FWD_NMSGS); i++) + { + ksock_fmb_t *fmb = &((ksock_fmb_t *)ktoenal_data.ksnd_fmbs)[i]; + + if (i < SOCKNAL_SMALL_FWD_NMSGS) + { + fmb->fmb_npages = SOCKNAL_SMALL_FWD_PAGES; + fmb->fmb_pool = &ktoenal_data.ksnd_small_fmp; + } + else + { + fmb->fmb_npages = SOCKNAL_LARGE_FWD_PAGES; + fmb->fmb_pool = &ktoenal_data.ksnd_large_fmp; + } + + LASSERT (fmb->fmb_npages > 0); + for (j = 0; j < fmb->fmb_npages; j++) + { + fmb->fmb_pages[j] = alloc_page(GFP_KERNEL); + + if (fmb->fmb_pages[j] == NULL) + { + ktoenal_module_fini (); + return (-ENOMEM); + } + + LASSERT (page_address (fmb->fmb_pages[j]) != NULL); + } + + list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + } + + PORTAL_ALLOC(ktoenal_data.ksnd_ltxs, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + if (ktoenal_data.ksnd_ltxs == NULL) + { + ktoenal_module_fini (); + return (-ENOMEM); + } + + /* Deterministic bugs please */ + memset (ktoenal_data.ksnd_ltxs, 0xeb, + sizeof (ksock_ltx_t) * (SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS)); + + for (i = 0; i < SOCKNAL_NLTXS + SOCKNAL_NNBLK_LTXS; i++) + { + ksock_ltx_t *ltx = &((ksock_ltx_t *)ktoenal_data.ksnd_ltxs)[i]; + + ltx->ltx_idle = i < SOCKNAL_NLTXS ? 
+ &ktoenal_data.ksnd_idle_ltx_list : + &ktoenal_data.ksnd_idle_nblk_ltx_list; + list_add (<x->ltx_tx.tx_list, ltx->ltx_idle); + } + + rc = PtlNIInit(ktoenal_init, 32, 4, 0, &ktoenal_ni); + if (rc != 0) + { + CERROR("ktoenal: PtlNIInit failed: error %d\n", rc); + ktoenal_module_fini (); + RETURN (rc); + } + PtlNIDebug(ktoenal_ni, ~0); + + ktoenal_data.ksnd_init = SOCKNAL_INIT_PTL; /* flag PtlNIInit() called */ + + ktoenal_data.ksnd_slistchange = 1; + for (i = 0; i < TOENAL_N_SCHED; i++) + { + rc = ktoenal_thread_start (ktoenal_scheduler, NULL); + if (rc != 0) + { + CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc); + ktoenal_module_fini (); + RETURN (rc); + } + } + + rc = ktoenal_thread_start (ktoenal_reaper, NULL); + if (rc != 0) + { + CERROR("Can't spawn socknal reaper: %d\n", rc); + ktoenal_module_fini (); + RETURN (rc); + } + + rc = ktoenal_thread_start (ktoenal_pollthread, NULL); + if (rc != 0) + { + CERROR("Can't spawn socknal pollthread: %d\n", rc); + ktoenal_module_fini (); + RETURN (rc); + } + + rc = kpr_register(&ktoenal_data.ksnd_router, + &ktoenal_router_interface); + if (rc != 0) + CDEBUG (D_NET, "Can't initialise routing interface (rc = %d): not routing\n", rc); + + rc = kportal_nal_register(TOENAL, &ktoenal_cmd, NULL); + if (rc != 0) + CDEBUG(D_NET, "Can't initialise command interface (rc = %d)\n", + rc); + + PORTAL_SYMBOL_REGISTER(ktoenal_ni); + + /* flag everything initialised */ + ktoenal_data.ksnd_init = SOCKNAL_INIT_ALL; + + printk(KERN_INFO"Routing TOE NAL loaded (Routing %s, initial mem %d)\n", + kpr_routing(&ktoenal_data.ksnd_router) ? "enabled" : "disabled", + pkmem); + + return (0); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. 
<info@clusterfs.com>"); +MODULE_DESCRIPTION("Kernel TCP Socket NAL v0.01"); +MODULE_LICENSE("GPL"); + +module_init(ktoenal_module_init); +module_exit(ktoenal_module_fini); + +EXPORT_SYMBOL (ktoenal_ni); diff --git a/lustre/portals/knals/toenal/toenal.h b/lustre/portals/knals/toenal/toenal.h new file mode 100644 index 0000000..f793d3b --- /dev/null +++ b/lustre/portals/knals/toenal/toenal.h @@ -0,0 +1,236 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown <zab@zabbo.net> + * Author: Peter J. Braam <braam@clusterfs.com> + * Author: Phil Schwan <phil@clusterfs.com> + * Author: Eric Barton <eric@bartonsoftware.com> + * Author: Kedar Sovani <kedar@calsoftinc.com> + * Author: Amey Inamdar <amey@calsoftinc.com> + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#define DEBUG_PORTAL_ALLOC +#define EXPORT_SYMTAB + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/smp_lock.h> +#include <linux/unistd.h> +#include <net/tcp.h> +#include <linux/uio.h> +#include <linux/sched.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/stat.h> +#include <linux/list.h> +#include <asm/uaccess.h> +#include <asm/segment.h> + +#define DEBUG_SUBSYSTEM S_SOCKNAL + +#include <linux/kp30.h> +#include <portals/p30.h> +#include <portals/lib-p30.h> + +#define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */ + +#define SOCKNAL_NLTXS 128 /* # normal transmit messages */ +#define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */ + +#define SOCKNAL_SMALL_FWD_NMSGS 128 /* # small messages I can be forwarding at any time */ +#define SOCKNAL_LARGE_FWD_NMSGS 32 /* # large messages I can be forwarding at any time */ + +#define SOCKNAL_SMALL_FWD_PAGES 1 /* # pages in a small message fwd buffer */ + +#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN (sizeof (ptl_hdr_t) + SOCKNAL_MAX_FWD_PAYLOAD) >> PAGE_SHIFT) + /* # pages in a large message fwd buffer */ + +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ + +#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sndbuf*8)/10) + +#define TOENAL_N_SCHED 1 + +typedef struct /* pool of forwarding buffers */ +{ + struct list_head fmp_idle_fmbs; /* buffers waiting for a connection */ + struct list_head fmp_blocked_conns; /* connections waiting for a buffer */ +} ksock_fmb_pool_t; + +typedef struct { + int ksnd_init; /* initialisation state */ + + struct list_head ksnd_socklist; /* all my connections */ + rwlock_t ksnd_socklist_lock; /* stabilise add/find/remove */ + + + ptl_nid_t ksnd_mynid; + nal_cb_t *ksnd_nal_cb; + spinlock_t ksnd_nal_cb_lock; 
/* lib cli/sti lock */ + + atomic_t ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + + kpr_router_t ksnd_router; /* THE router */ + + spinlock_t ksnd_sched_lock; /* serialise packet scheduling */ + wait_queue_head_t ksnd_sched_waitq; /* where scheduler(s) wait */ + + struct list_head ksnd_rx_conns; /* conn waiting to be read */ + struct list_head ksnd_tx_conns; /* conn waiting to be written */ + + void *ksnd_fmbs; /* all the pre-allocated FMBs */ + ksock_fmb_pool_t ksnd_small_fmp; /* small message forwarding buffers */ + ksock_fmb_pool_t ksnd_large_fmp; /* large message forwarding buffers */ + + void *ksnd_ltxs; /* all the pre-allocated LTXs */ + struct list_head ksnd_idle_ltx_list; /* where to get an idle LTX */ + struct list_head ksnd_idle_nblk_ltx_list; /* where to get an idle LTX if you can't block */ + wait_queue_head_t ksnd_idle_ltx_waitq; /* where to block for an idle LTX */ + + struct list_head ksnd_reaper_list; /* conn waiting to be reaped */ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + spinlock_t ksnd_reaper_lock; /* serialise */ + + struct task_struct *ksnd_pollthread_tsk;/* task_struct for the poll thread */ + poll_table ksnd_pwait; /* poll wait table for the socket */ + int ksnd_slistchange; /* informs the pollthread that + * the socklist has changed */ +} ksock_nal_data_t; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_PTL 2 +#define SOCKNAL_INIT_ALL 3 + +typedef struct /* transmit packet */ +{ + struct list_head tx_list; /* queue on conn for transmission etc */ + char tx_isfwd; /* forwarding / sourced here */ + int tx_nob; /* # packet bytes */ + int tx_niov; /* # packet frags */ + struct iovec *tx_iov; /* packet frags */ +} ksock_tx_t; + +typedef struct /* locally transmitted packet */ +{ + ksock_tx_t ltx_tx; /* send info */ + struct list_head *ltx_idle; /* where to put when idle */ + void *ltx_private; /* lib_finalize() callback arg */ + void 
*ltx_cookie; /* lib_finalize() callback arg */ + struct iovec ltx_iov[1 + PTL_MD_MAX_IOV]; /* msg frags */ + ptl_hdr_t ltx_hdr; /* buffer for packet header */ +} ksock_ltx_t; + +#define KSOCK_TX_2_KPR_FWD_DESC(ptr) list_entry (ptr, kpr_fwd_desc_t, kprfd_scratch) +/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */ + +#define KSOCK_TX_2_KSOCK_LTX(ptr) list_entry (ptr, ksock_ltx_t, ltx_tx) +/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */ + +/* NB list_entry() is used here as a convenient macro for calculating a + * pointer to a struct from the address of a member. + */ + +typedef struct /* Kernel portals Socket Forwarding message buffer */ +{ /* (socknal->router) */ + struct list_head fmb_list; /* queue idle */ + kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ + int fmb_npages; /* # pages allocated */ + ksock_fmb_pool_t *fmb_pool; /* owning pool */ + struct page *fmb_pages[SOCKNAL_LARGE_FWD_PAGES]; /* backing pages; the first fmb_npages are allocated */ + struct iovec fmb_iov[SOCKNAL_LARGE_FWD_PAGES]; /* frags over fmb_pages, filled in per packet */ +} ksock_fmb_t; + +#define SOCKNAL_RX_HEADER 1 /* reading header */ +#define SOCKNAL_RX_BODY 2 /* reading body (to deliver here) */ +#define SOCKNAL_RX_BODY_FWD 3 /* reading body (to forward) */ +#define SOCKNAL_RX_SLOP 4 /* skipping body */ +#define SOCKNAL_RX_GET_FMB 5 /* scheduled for forwarding */ +#define SOCKNAL_RX_FMB_SLEEP 6 /* blocked waiting for a fwd desc */ + +typedef struct +{ + struct list_head ksnc_list; /* stash on global socket list */ + struct file *ksnc_file; /* socket filp */ + struct socket *ksnc_sock; /* socket */ + ptl_nid_t ksnc_peernid; /* who's on the other end */ + atomic_t ksnc_refcount; /* # users */ + + /* READER */ + struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + unsigned long ksnc_rx_ready; /* data ready to read */ + int ksnc_rx_scheduled; /* being progressed */ + int ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes 
actually wanted */ + int ksnc_rx_niov; /* # frags */ + struct iovec ksnc_rx_iov[1 + PTL_MD_MAX_IOV]; /* the frags */ + + void *ksnc_cookie; /* rx lib_finalize passthru arg */ + ptl_hdr_t ksnc_hdr; /* where I read headers into */ + + /* WRITER */ + struct list_head ksnc_tx_list; /* where I enq waiting for output space */ + struct list_head ksnc_tx_queue; /* packets waiting to be sent */ + unsigned long ksnc_tx_ready; /* write space */ + int ksnc_tx_scheduled; /* being progressed */ + +} ksock_conn_t; + +extern int ktoenal_add_sock (ptl_nid_t nid, int fd); +extern int ktoenal_close_sock(ptl_nid_t nid); +extern int ktoenal_set_mynid(ptl_nid_t nid); +extern int ktoenal_push_sock(ptl_nid_t nid); +extern ksock_conn_t *ktoenal_get_conn (ptl_nid_t nid); +extern void _ktoenal_put_conn (ksock_conn_t *conn); +extern void ktoenal_close_conn (ksock_conn_t *conn); + +static inline void +ktoenal_put_conn (ksock_conn_t *conn) +{ + CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", + conn, conn->ksnc_peernid, atomic_read (&conn->ksnc_refcount)); + + if (atomic_dec_and_test (&conn->ksnc_refcount)) + _ktoenal_put_conn (conn); +} + +extern int ktoenal_thread_start (int (*fn)(void *arg), void *arg); +extern int ktoenal_new_packet (ksock_conn_t *conn, int skip); +extern void ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); +extern int ktoenal_scheduler (void *arg); +extern int ktoenal_reaper (void *arg); +extern int ktoenal_pollthread (void *arg); +extern void ktoenal_data_ready(ksock_conn_t *conn); +extern void ktoenal_write_space(ksock_conn_t *conn); + + +extern nal_cb_t ktoenal_lib; +extern ksock_nal_data_t ktoenal_data; diff --git a/lustre/portals/knals/toenal/toenal_cb.c b/lustre/portals/knals/toenal/toenal_cb.c new file mode 100644 index 0000000..ec37f6f --- /dev/null +++ b/lustre/portals/knals/toenal/toenal_cb.c @@ -0,0 +1,1219 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 
Cluster File Systems, Inc. + * Author: Zach Brown <zab@zabbo.net> + * Author: Peter J. Braam <braam@clusterfs.com> + * Author: Phil Schwan <phil@clusterfs.com> + * Author: Eric Barton <eric@bartonsoftware.com> + * Author: Kedar Sovani <kedar@calsoftinc.com> + * Author: Amey Inamdar <amey@calsoftinc.com> + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include <linux/poll.h> +#include "toenal.h" + +atomic_t ktoenal_packets_received; +long ktoenal_packets_launched; +long ktoenal_packets_transmitted; + +/* + * LIB functions follow + * + */ +int +ktoenal_read(nal_cb_t *nal, void *private, void *dst_addr, + user_ptr src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": reading %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ktoenal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, + void *src_addr, size_t len) +{ + CDEBUG(D_NET, LPX64": writing %ld bytes from %p -> %p\n", + nal->ni.nid, (long)len, src_addr, dst_addr); + + memcpy( dst_addr, src_addr, len ); + return 0; +} + +int +ktoenal_callback (nal_cb_t * nal, void *private, lib_eq_t *eq, + ptl_event_t *ev) +{ + CDEBUG(D_NET, LPX64": callback eq %p ev %p\n", + nal->ni.nid, eq, ev); + + if (eq->event_callback != NULL) + eq->event_callback(ev); + + return 0; +} + +void * +ktoenal_malloc(nal_cb_t *nal, size_t len) +{ + void *buf; + + PORTAL_ALLOC(buf, len); + + if (buf != NULL) + memset(buf, 0, len); + + return (buf); +} + +void +ktoenal_free(nal_cb_t *nal, void *buf, size_t len) +{ + PORTAL_FREE(buf, len); +} + +void +ktoenal_printf(nal_cb_t *nal, const char *fmt, ...) 
+{ + va_list ap; + char msg[256]; + + va_start (ap, fmt); + vsnprintf (msg, sizeof (msg), fmt, ap); /* sprint safely */ + va_end (ap); + + msg[sizeof (msg) - 1] = 0; /* ensure terminated */ + + CDEBUG (D_NET, "%s", msg); +} + +void +ktoenal_cli(nal_cb_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *data = nal->nal_data; + + spin_lock(&data->ksnd_nal_cb_lock); +} + +void +ktoenal_sti(nal_cb_t *nal, unsigned long *flags) +{ + ksock_nal_data_t *data; + data = nal->nal_data; + + spin_unlock(&data->ksnd_nal_cb_lock); +} + +int +ktoenal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* I would guess that if ktoenal_get_conn(nid) == NULL, + and we're not routing, then 'nid' is very distant :) */ + if ( nal->ni.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +ksock_ltx_t * +ktoenal_get_ltx (int may_block) +{ + unsigned long flags; /* saved IRQ state for spin_lock_irqsave() */ + ksock_ltx_t *ltx = NULL; + + for (;;) + { + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + if (!list_empty (&ktoenal_data.ksnd_idle_ltx_list)) + { + ltx = list_entry (ktoenal_data.ksnd_idle_ltx_list.next, ksock_ltx_t, ltx_tx.tx_list); + list_del (&ltx->ltx_tx.tx_list); + break; + } + + if (!may_block) /* caller can't sleep: use the reserve list or return NULL */ + { + if (!list_empty (&ktoenal_data.ksnd_idle_nblk_ltx_list)) + { + ltx = list_entry (ktoenal_data.ksnd_idle_nblk_ltx_list.next, + ksock_ltx_t, ltx_tx.tx_list); + list_del (&ltx->ltx_tx.tx_list); + } + break; + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + + wait_event (ktoenal_data.ksnd_idle_ltx_waitq, + !list_empty (&ktoenal_data.ksnd_idle_ltx_list)); + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + + return (ltx); +} + +int +ktoenal_sendmsg (struct file *sock, struct iovec *iov, int niov, int nob, int flags) +{ + /* NB This procedure "consumes" iov (actually we do, tcp_sendmsg doesn't) + */ + mm_segment_t oldmm; + int rc; + + LASSERT (niov > 0); + LASSERT (nob > 0); + + oldmm = get_fs(); + set_fs (KERNEL_DS); + +#ifdef PORTAL_DEBUG + { + int 
total_nob; + int i; + + for (i = total_nob = 0; i < niov; i++) + total_nob += iov[i].iov_len; + + LASSERT (nob == total_nob); + } +#endif + LASSERT (!in_interrupt()); + + rc = sock->f_op->writev(sock, iov, niov, NULL); + + set_fs (oldmm); + + if (rc > 0) /* sent something? */ + { + nob = rc; /* consume iov */ + for (;;) + { + LASSERT (niov > 0); + + if (iov->iov_len >= nob) + { + iov->iov_len -= nob; + iov->iov_base = (void *)(((unsigned long)iov->iov_base) + nob); + break; + } + nob -= iov->iov_len; + iov->iov_len = 0; + iov++; + niov--; + } + } + + return (rc); +} + +int +ktoenal_recvmsg(struct file *sock, struct iovec *iov, int niov, int toread) +{ + /* NB This procedure "consumes" iov (actually tcp_recvmsg does) + */ + mm_segment_t oldmm; + int ret, i, len = 0, origlen = 0; + + PROF_START(our_recvmsg); + for(i = 0; i < niov; i++) { + len += iov[i].iov_len; + if(len >= toread) + break; + } + + if(len >= toread) { + origlen = iov[i].iov_len; + iov[i].iov_len -= (len - toread); + } + else { /* i == niov */ + i = niov - 1; + } + + oldmm = get_fs(); + set_fs(KERNEL_DS); + + ret = sock->f_op->readv(sock, iov, i + 1, NULL); + + set_fs(oldmm); + + if(origlen) + iov[i].iov_len = origlen; + + PROF_FINISH(our_recvmsg); + return ret; +} + +void +ktoenal_process_transmit (ksock_conn_t *conn, long *irq_flags) +{ + ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, ksock_tx_t, tx_list); + int rc; + + LASSERT (conn->ksnc_tx_scheduled); + LASSERT (conn->ksnc_tx_ready); + LASSERT (!list_empty (&conn->ksnc_tx_queue)); + + /* assume transmit will complete now, so dequeue while I've got the lock */ + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + LASSERT (tx->tx_nob > 0); + + conn->ksnc_tx_ready = 0; /* write_space may race with me and set ready */ + mb(); /* => clear BEFORE trying to write */ + + rc = ktoenal_sendmsg (conn->ksnc_file, + tx->tx_iov, tx->tx_niov, tx->tx_nob, + list_empty (&conn->ksnc_tx_queue) ? 
+ MSG_DONTWAIT : (MSG_DONTWAIT | MSG_MORE)); + + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_nob, rc); + + if (rc < 0) /* error */ + { + if (rc == -EAGAIN) /* socket full => */ + rc = 0; /* nothing sent */ + else + { +#warning FIXME: handle socket errors properly + CERROR ("Error socknal send(%d) %p: %d\n", tx->tx_nob, conn, rc); + rc = tx->tx_nob; /* kid on for now whole packet went */ + } + } + + if (rc == tx->tx_nob) /* everything went */ + { + conn->ksnc_tx_ready = 1; /* assume more can go (ASAP) */ + ktoenal_put_conn (conn); /* release packet's ref */ + + if (tx->tx_isfwd) /* was a forwarded packet? */ + { + kpr_fwd_done (&ktoenal_data.ksnd_router, + KSOCK_TX_2_KPR_FWD_DESC (tx), 0); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + } + else /* local send */ + { + ksock_ltx_t *ltx = KSOCK_TX_2_KSOCK_LTX (tx); + + lib_finalize (&ktoenal_lib, ltx->ltx_private, ltx->ltx_cookie); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + list_add (<x->ltx_tx.tx_list, ltx->ltx_idle); + + /* normal tx desc => wakeup anyone blocking for one */ + if (ltx->ltx_idle == &ktoenal_data.ksnd_idle_ltx_list && + waitqueue_active (&ktoenal_data.ksnd_idle_ltx_waitq)) + wake_up (&ktoenal_data.ksnd_idle_ltx_waitq); + } + ktoenal_packets_transmitted++; + } + else + { + tx->tx_nob -= rc; + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + /* back onto HEAD of tx_queue */ + list_add (&tx->tx_list, &conn->ksnc_tx_queue); + } + + if (!conn->ksnc_tx_ready || /* no space to write now */ + list_empty (&conn->ksnc_tx_queue)) /* nothing to write */ + { + conn->ksnc_tx_scheduled = 0; /* not being scheduled */ + ktoenal_put_conn (conn); /* release scheduler's ref */ + } + else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns); +} + +void +ktoenal_launch_packet (ksock_conn_t *conn, ksock_tx_t *tx) +{ + long flags; + int nob = tx->tx_nob; + struct iovec *iov = tx->tx_iov; + int niov = 1; + + LASSERT 
(nob >= sizeof (ptl_hdr_t)); + + /* Truncate iov to exactly match total packet length + * since socket sendmsg pays no attention to requested length. + */ + for (;;) + { + LASSERT (niov <= tx->tx_niov); + LASSERT (iov->iov_len >= 0); + + if (iov->iov_len >= nob) + { + iov->iov_len = nob; + break; + } + nob -= iov->iov_len; + iov++; + niov++; + } + tx->tx_niov = niov; + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + + if (conn->ksnc_tx_ready && /* able to send */ + !conn->ksnc_tx_scheduled) /* not scheduled to send */ + { + list_add_tail (&conn->ksnc_tx_list, &ktoenal_data.ksnd_tx_conns); + conn->ksnc_tx_scheduled = 1; + atomic_inc (&conn->ksnc_refcount); /* extra ref for scheduler */ + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + + ktoenal_packets_launched++; + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); +} + +int +ktoenal_send(nal_cb_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, struct iovec *payload_iov, size_t payload_len) +{ + ptl_nid_t gatewaynid; + ksock_conn_t *conn; + ksock_ltx_t *ltx; + int rc; + int i; + + /* By this point, as it happens, we have absolutely no idea what + * 'private' is. It might be ksock_nal_data or it might be ksock_conn. + * Ha ha, isn't that a funny joke? + * + * FIXME: this is not the right way to fix this; the right way is to + * always pass in the same kind of structure. This is hard right now. + * To revisit this issue, set a breakpoint in here and watch for when + * it's called from lib_finalize. I think this occurs when we send a + * packet as a side-effect of another packet, such as when an ACK has + * been requested. -phil */ + + CDEBUG(D_NET, "sending %d bytes from [%d](%p,%d)... to nid: " + LPX64" pid %d\n", (int)payload_len, payload_niov, + payload_niov > 0 ? 
payload_iov[0].iov_base : NULL, + (int)(payload_niov > 0 ? payload_iov[0].iov_len : 0), nid, pid); + + if ((conn = ktoenal_get_conn (nid)) == NULL) + { + /* It's not a peer; try to find a gateway */ + rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, &gatewaynid); + if (rc != 0) + { + CERROR ("Can't route to "LPX64": router error %d\n", nid, rc); + return (-1); + } + + if ((conn = ktoenal_get_conn (gatewaynid)) == NULL) + { + CERROR ("Can't route to "LPX64": gateway "LPX64" is not a peer\n", + nid, gatewaynid); + return (-1); + } + } + + /* This transmit has now got a ref on conn */ + + /* I may not block for a transmit descriptor if I might block the + * receiver, or an interrupt handler. */ + ltx = ktoenal_get_ltx (!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt ())); + if (ltx == NULL) + { + CERROR ("Can't allocate tx desc\n"); + ktoenal_put_conn (conn); + return (-1); + } + + /* Init common (to sends and forwards) packet part */ + ltx->ltx_tx.tx_isfwd = 0; + ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_len; + ltx->ltx_tx.tx_niov = 1 + payload_niov; + ltx->ltx_tx.tx_iov = ltx->ltx_iov; + + /* Init local send packet (storage for hdr, finalize() args, iov) */ + ltx->ltx_hdr = *hdr; + ltx->ltx_private = private; + ltx->ltx_cookie = cookie; + + ltx->ltx_iov[0].iov_base = <x->ltx_hdr; + ltx->ltx_iov[0].iov_len = sizeof (ltx->ltx_hdr); + + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + for (i = 0; i < payload_niov; i++) + { + ltx->ltx_iov[1 + i].iov_base = payload_iov[i].iov_base; + ltx->ltx_iov[1 + i].iov_len = payload_iov[i].iov_len; + } + + ktoenal_launch_packet (conn, <x->ltx_tx); + return (0); +} + +void +ktoenal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + ksock_conn_t *conn; + ptl_nid_t nid = fwd->kprfd_gateway_nid; + ksock_tx_t *tx = (ksock_tx_t *)&fwd->kprfd_scratch; + + CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, + fwd->kprfd_gateway_nid, fwd->kprfd_target_nid); + + if (nid == ktoenal_lib.ni.nid) /* I'm the gateway; 
must be the last hop */ + nid = fwd->kprfd_target_nid; + + conn = ktoenal_get_conn (nid); + if (conn == NULL) + { + CERROR ("[%p] fwd to "LPX64" isn't a peer\n", fwd, nid); + kpr_fwd_done (&ktoenal_data.ksnd_router, fwd, -EHOSTUNREACH); + return; + } + + /* This forward has now got a ref on conn */ + + tx->tx_isfwd = 1; /* This is a forwarding packet */ + tx->tx_nob = fwd->kprfd_nob; + tx->tx_niov = fwd->kprfd_niov; + tx->tx_iov = fwd->kprfd_iov; + + ktoenal_launch_packet (conn, tx); +} + +int +ktoenal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&ktoenal_data.ksnd_nthreads); + return (0); +} + +void +ktoenal_thread_fini (void) +{ + atomic_dec (&ktoenal_data.ksnd_nthreads); +} + +void +ktoenal_fmb_callback (void *arg, int error) +{ + ksock_fmb_t *fmb = (ksock_fmb_t *)arg; + ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(fmb->fmb_pages[0]); + ksock_conn_t *conn; + long flags; + + CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": %d\n", + hdr->src_nid, hdr->dest_nid, error); + + if (error != 0) + CERROR ("Failed to route packet from "LPX64" to "LPX64": %d\n", + hdr->src_nid, hdr->dest_nid, error); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + list_add (&fmb->fmb_list, &fmb->fmb_pool->fmp_idle_fmbs); + + if (!list_empty (&fmb->fmb_pool->fmp_blocked_conns)) + { + conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + + CDEBUG (D_NET, "Scheduling conn %p\n", conn); + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP); + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; + list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns); + + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); +} + +ksock_fmb_t * +ktoenal_get_idle_fmb 
(ksock_conn_t *conn) +{ + /* NB called with sched lock held */ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + ksock_fmb_pool_t *pool; + ksock_fmb_t *fmb; + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + + if (packet_nob <= SOCKNAL_SMALL_FWD_PAGES * PAGE_SIZE) + pool = &ktoenal_data.ksnd_small_fmp; + else + pool = &ktoenal_data.ksnd_large_fmp; + + if (!list_empty (&pool->fmp_idle_fmbs)) + { + fmb = list_entry (pool->fmp_idle_fmbs.next, ksock_fmb_t, fmb_list); + list_del (&fmb->fmb_list); + return (fmb); + } + + /* deschedule until fmb free */ + + conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP; + + list_add_tail (&conn->ksnc_rx_list, + &pool->fmp_blocked_conns); + return (NULL); +} + + +int +ktoenal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) +{ + int payload_nob = conn->ksnc_rx_nob_left; + int packet_nob = sizeof (ptl_hdr_t) + payload_nob; + int niov; /* at least the header */ + int nob; + + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); + LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left); + LASSERT (payload_nob >= 0); + LASSERT (packet_nob <= fmb->fmb_npages * PAGE_SIZE); + LASSERT (sizeof (ptl_hdr_t) < PAGE_SIZE); + + /* Got a forwarding buffer; copy the header we just read into the + * forwarding buffer. If there's payload start reading reading it + * into the buffer, otherwise the forwarding buffer can be kicked + * off immediately. + * + * NB fmb->fmb_iov spans the WHOLE packet. + * conn->ksnc_rx_iov spans just the payload. 
+ */ + + fmb->fmb_iov[0].iov_base = page_address (fmb->fmb_pages[0]); + + memcpy (fmb->fmb_iov[0].iov_base, &conn->ksnc_hdr, sizeof (ptl_hdr_t)); /* copy header */ + + if (payload_nob == 0) /* got complete packet already */ + { + atomic_inc (&ktoenal_packets_received); + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (immediate)\n", conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, packet_nob); + + fmb->fmb_iov[0].iov_len = sizeof (ptl_hdr_t); + + kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, + packet_nob, 1, fmb->fmb_iov, + ktoenal_fmb_callback, fmb); + + kpr_fwd_start (&ktoenal_data.ksnd_router, &fmb->fmb_fwd); /* forward it now */ + + ktoenal_new_packet (conn, 0); /* on to next packet */ + return (1); + } + + niov = 1; + if (packet_nob <= PAGE_SIZE) /* whole packet fits in first page */ + fmb->fmb_iov[0].iov_len = packet_nob; + else + { + fmb->fmb_iov[0].iov_len = PAGE_SIZE; + nob = packet_nob - PAGE_SIZE; + + do + { + LASSERT (niov < fmb->fmb_npages); + fmb->fmb_iov[niov].iov_base = page_address (fmb->fmb_pages[niov]); + fmb->fmb_iov[niov].iov_len = MIN (PAGE_SIZE, nob); + nob -= PAGE_SIZE; + niov++; + } while (nob > 0); + } + + kpr_fwd_init (&fmb->fmb_fwd, conn->ksnc_hdr.dest_nid, + packet_nob, niov, fmb->fmb_iov, + ktoenal_fmb_callback, fmb); + + /* stash router's descriptor ready for call to kpr_fwd_start */ + conn->ksnc_cookie = &fmb->fmb_fwd; + + conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ + + /* payload is desc's iov-ed buffer, but skipping the hdr */ + LASSERT (niov <= sizeof (conn->ksnc_rx_iov) / sizeof (conn->ksnc_rx_iov[0])); + + conn->ksnc_rx_iov[0].iov_base = (void *)(((unsigned long)fmb->fmb_iov[0].iov_base) + sizeof (ptl_hdr_t)); + conn->ksnc_rx_iov[0].iov_len = fmb->fmb_iov[0].iov_len - sizeof (ptl_hdr_t); + + if (niov > 1) + memcpy (&conn->ksnc_rx_iov[1], &fmb->fmb_iov[1], (niov - 1) * sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", 
conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, payload_nob); + return (0); +} + +void +ktoenal_fwd_parse (ksock_conn_t *conn) +{ + ksock_conn_t *conn2; + int body_len; + + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left); + + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER); + LASSERT (conn->ksnc_rx_scheduled); + + switch (conn->ksnc_hdr.type) + { + case PTL_MSG_GET: + case PTL_MSG_ACK: + body_len = 0; + break; + case PTL_MSG_PUT: + body_len = conn->ksnc_hdr.msg.put.length; + break; + case PTL_MSG_REPLY: + body_len = conn->ksnc_hdr.msg.reply.length; + break; + default: + /* Unrecognised packet type */ + CERROR ("Unrecognised packet type %d from "LPX64" for "LPX64"\n", + conn->ksnc_hdr.type, conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid); + /* Ignore this header and go back to reading a new packet. */ + ktoenal_new_packet (conn, 0); + return; + } + + if (body_len < 0) /* length corrupt */ + { + CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d illegal\n", + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len); + ktoenal_new_packet (conn, 0); /* on to new packet */ + return; + } + + if (body_len > SOCKNAL_MAX_FWD_PAYLOAD) /* too big to forward */ + { + CERROR ("dropping packet from "LPX64" for "LPX64": packet size %d too big\n", + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, body_len); + ktoenal_new_packet (conn, body_len); /* on to new packet (skip this one's body) */ + return; + } + + conn2 = ktoenal_get_conn (conn->ksnc_hdr.dest_nid); /* should have gone direct */ + if (conn2 != NULL) + { + CERROR ("dropping packet from "LPX64" for "LPX64": target is a peer\n", + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid); + ktoenal_put_conn (conn2); /* drop ref from get above */ + + ktoenal_new_packet (conn, body_len); /* on to next packet (skip this one's body) */ + return; + } + + conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; /* Getting FMB now */ 
+ conn->ksnc_rx_nob_left = body_len; /* stash packet size */ + conn->ksnc_rx_nob_wanted = body_len; /* (no slop) */ +} + +int +ktoenal_new_packet (ksock_conn_t *conn, int nob_to_skip) +{ + static char ktoenal_slop_buffer[4096]; + + int nob; + int niov; + int skipped; + + if (nob_to_skip == 0) /* right at next packet boundary now */ + { + conn->ksnc_rx_state = SOCKNAL_RX_HEADER; + conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t); + conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t); + + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr; + conn->ksnc_rx_iov[0].iov_len = sizeof (ptl_hdr_t); + conn->ksnc_rx_niov = 1; + return (1); + } + + /* set up to skip as much a possible now */ + /* if there's more left (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + skipped = 0; + niov = 0; + + do + { + nob = MIN (nob_to_skip, sizeof (ktoenal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ktoenal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof (conn->ksnc_rx_iov)/sizeof (conn->ksnc_rx_iov[0])); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_nob_wanted = skipped; + return (0); +} + +void +ktoenal_process_receive (ksock_conn_t *conn, long *irq_flags) +{ + ksock_fmb_t *fmb; + int len; + LASSERT (atomic_read (&conn->ksnc_refcount) > 0); + LASSERT (conn->ksnc_rx_scheduled); + LASSERT (conn->ksnc_rx_ready); + + /* NB: sched lock held */ + CDEBUG(D_NET, "conn %p\n", conn); + + if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB) /* doesn't need a forwarding buffer */ + { + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, *irq_flags); + goto try_read; + } + + get_fmb: + /* NB: sched lock held */ + fmb = ktoenal_get_idle_fmb (conn); + if (fmb == NULL) /* conn descheduled waiting for idle fmb */ + return; + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, 
*irq_flags); + + if (ktoenal_init_fmb (conn, fmb)) /* packet forwarded ? */ + goto out; /* come back later for next packet */ + + try_read: + /* NB: sched lock NOT held */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_BODY || + conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + + LASSERT (conn->ksnc_rx_niov > 0); + LASSERT (conn->ksnc_rx_nob_wanted > 0); + + conn->ksnc_rx_ready = 0; /* data ready may race with me and set ready */ + mb(); /* => clear BEFORE trying to read */ + + /* NB ktoenal_recvmsg "consumes" the iov passed to it */ + len = ktoenal_recvmsg(conn->ksnc_file, + conn->ksnc_rx_iov, conn->ksnc_rx_niov, + conn->ksnc_rx_nob_wanted); + CDEBUG (D_NET, "%p read(%d) %d\n", conn, conn->ksnc_rx_nob_wanted, len); + + if (len <= 0) /* nothing ready (EAGAIN) or EOF or error */ + { + if (len != -EAGAIN && /* ! nothing to read now */ + len != 0) /* ! nothing to read ever */ + { +#warning FIXME: handle socket errors properly + CERROR ("Error socknal read(%d) %p: %d\n", + conn->ksnc_rx_nob_wanted, conn, len); + } + goto out; /* come back when there's data ready */ + } + + LASSERT (len <= conn->ksnc_rx_nob_wanted); + conn->ksnc_rx_nob_wanted -= len; + conn->ksnc_rx_nob_left -= len; + + if (conn->ksnc_rx_nob_wanted != 0) /* short read */ + goto out; /* try again later */ + + conn->ksnc_rx_ready = 1; /* assume there's more to be had */ + + switch (conn->ksnc_rx_state) + { + case SOCKNAL_RX_HEADER: + if (conn->ksnc_hdr.dest_nid != ktoenal_lib.ni.nid) /* It's not for me */ + { + ktoenal_fwd_parse (conn); + switch (conn->ksnc_rx_state) + { + case SOCKNAL_RX_HEADER: /* skipped this packet (zero payload) */ + goto out; /* => come back later */ + case SOCKNAL_RX_SLOP: /* skipping this packet's body */ + goto try_read; /* => go read it */ + case SOCKNAL_RX_GET_FMB: /* forwarding */ + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + goto get_fmb; /* => go get a fwd msg buffer */ 
+ default: + } + /* Not Reached */ + LBUG (); + } + + PROF_START(lib_parse); + lib_parse(&ktoenal_lib, &conn->ksnc_hdr, conn); /* sets wanted_len, iovs etc */ + PROF_FINISH(lib_parse); + + if (conn->ksnc_rx_nob_wanted != 0) /* need to get some payload? */ + { + conn->ksnc_rx_state = SOCKNAL_RX_BODY; + goto try_read; /* go read the payload */ + } + /* Fall through (completed packet for me) */ + + case SOCKNAL_RX_BODY: + atomic_inc (&ktoenal_packets_received); + lib_finalize(&ktoenal_lib, NULL, conn->ksnc_cookie); /* packet is done now */ + /* Fall through */ + + case SOCKNAL_RX_SLOP: + if (ktoenal_new_packet (conn, conn->ksnc_rx_nob_left)) /* starting new packet? */ + goto out; /* come back later */ + goto try_read; /* try to finish reading slop now */ + + case SOCKNAL_RX_BODY_FWD: + CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", conn, + conn->ksnc_hdr.src_nid, conn->ksnc_hdr.dest_nid, conn->ksnc_rx_nob_left); + + atomic_inc (&ktoenal_packets_received); + + /* ktoenal_init_fmb() stashed router descriptor in conn->ksnc_cookie */ + kpr_fwd_start (&ktoenal_data.ksnd_router, (kpr_fwd_desc_t *)conn->ksnc_cookie); + + LASSERT (conn->ksnc_rx_nob_left == 0); /* no slop in forwarded packets */ + + ktoenal_new_packet (conn, 0); /* on to next packet */ + goto out; /* (later) */ + + default: + } + + /* Not Reached */ + LBUG (); + + out: + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, *irq_flags); + + if (!conn->ksnc_rx_ready) /* no data there to read? 
*/ + { + conn->ksnc_rx_scheduled = 0; /* let socket callback schedule again */ + ktoenal_put_conn (conn); /* release scheduler's ref */ + } + else /* let scheduler call me again */ + list_add_tail (&conn->ksnc_rx_list, &ktoenal_data.ksnd_rx_conns); +} + +int +ktoenal_recv(nal_cb_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, struct iovec *iov, size_t mlen, size_t rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + int i; + + conn->ksnc_cookie = msg; + + LASSERT (niov <= PTL_MD_MAX_IOV); + for (i = 0; i < niov; i++) + { + conn->ksnc_rx_iov[i].iov_len = iov[i].iov_len; + conn->ksnc_rx_iov[i].iov_base = iov[i].iov_base; + } + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + return (rlen); +} + +int +ktoenal_scheduler (void *arg) +{ + unsigned long flags; + ksock_conn_t *conn; + int rc; + int nloops = 0; + + kportal_daemonize ("ktoenal_sched"); + kportal_blockallsigs (); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + while (!ktoenal_data.ksnd_shuttingdown) + { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty (&ktoenal_data.ksnd_rx_conns)) + { + did_something = 1; + conn = list_entry (ktoenal_data.ksnd_rx_conns.next, + ksock_conn_t, ksnc_rx_list); + list_del (&conn->ksnc_rx_list); + + ktoenal_process_receive (conn, &flags); /* drops & regains ksnd_sched_lock */ + } + + if (!list_empty (&ktoenal_data.ksnd_tx_conns)) + { + did_something = 1; + conn = list_entry (ktoenal_data.ksnd_tx_conns.next, + ksock_conn_t, ksnc_tx_list); + + list_del (&conn->ksnc_tx_list); + ktoenal_process_transmit (conn, &flags); /* drops and regains ksnd_sched_lock */ + } + + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) /* hogging CPU? 
*/ + { + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ + rc = wait_event_interruptible (ktoenal_data.ksnd_sched_waitq, + ktoenal_data.ksnd_shuttingdown || + !list_empty (&ktoenal_data.ksnd_rx_conns) || + !list_empty (&ktoenal_data.ksnd_tx_conns)); + LASSERT (rc == 0); + } else + our_cond_resched(); + + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + } + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + ktoenal_thread_fini (); + return (0); +} + + +int +ktoenal_reaper (void *arg) +{ + unsigned long flags; + ksock_conn_t *conn; + int rc; + + kportal_daemonize ("ktoenal_reaper"); + kportal_blockallsigs (); + + while (!ktoenal_data.ksnd_shuttingdown) + { + spin_lock_irqsave (&ktoenal_data.ksnd_reaper_lock, flags); + + if (list_empty (&ktoenal_data.ksnd_reaper_list)) + conn = NULL; + else + { + conn = list_entry (ktoenal_data.ksnd_reaper_list.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_reaper_lock, flags); + + if (conn != NULL) + ktoenal_close_conn (conn); + else { + rc = wait_event_interruptible (ktoenal_data.ksnd_reaper_waitq, + ktoenal_data.ksnd_shuttingdown || + !list_empty(&ktoenal_data.ksnd_reaper_list)); + LASSERT (rc == 0); + } + } + + ktoenal_thread_fini (); + return (0); +} + +#define POLLREAD (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI) +#define POLLWRITE (POLLOUT | POLLWRNORM | POLLWRBAND) + +int +ktoenal_pollthread(void *arg) +{ + unsigned int mask; + struct list_head *tmp; + ksock_conn_t *conn; + + /* Save the task struct for waking it up */ + ktoenal_data.ksnd_pollthread_tsk = current; + + kportal_daemonize ("ktoenal_pollthread"); + kportal_blockallsigs (); + + poll_initwait(&ktoenal_data.ksnd_pwait); + + while(!ktoenal_data.ksnd_shuttingdown) { + + set_current_state(TASK_INTERRUPTIBLE); + + read_lock (&ktoenal_data.ksnd_socklist_lock); + list_for_each(tmp, 
&ktoenal_data.ksnd_socklist) { + + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + atomic_inc(&conn->ksnc_refcount); + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + mask = conn->ksnc_file->f_op->poll(conn->ksnc_file, + ktoenal_data.ksnd_slistchange ? + &ktoenal_data.ksnd_pwait : NULL); + + if(mask & POLLREAD) { + ktoenal_data_ready(conn); + + } + if (mask & POLLWRITE) { + ktoenal_write_space(conn); + + } + if (mask & (POLLERR | POLLHUP)) { + /* Do error processing */ + } + + read_lock (&ktoenal_data.ksnd_socklist_lock); + if(atomic_dec_and_test(&conn->ksnc_refcount)) + _ktoenal_put_conn(conn); + } + ktoenal_data.ksnd_slistchange = 0; + read_unlock (&ktoenal_data.ksnd_socklist_lock); + + schedule_timeout(MAX_SCHEDULE_TIMEOUT); + if(ktoenal_data.ksnd_slistchange) { + poll_freewait(&ktoenal_data.ksnd_pwait); + poll_initwait(&ktoenal_data.ksnd_pwait); + } + } + poll_freewait(&ktoenal_data.ksnd_pwait); + ktoenal_thread_fini(); + return (0); +} + +void +ktoenal_data_ready (ksock_conn_t *conn) +{ + unsigned long flags; + ENTRY; + + if (!test_and_set_bit (0, &conn->ksnc_rx_ready)) { + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail (&conn->ksnc_rx_list, + &ktoenal_data.ksnd_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + /* This is done to avoid the effects of a sequence + * of events in which the rx_ready is lost + */ + conn->ksnc_rx_ready=1; + + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + } + + EXIT; +} + +void +ktoenal_write_space (ksock_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "conn %p%s%s%s\n", + conn, + (conn == NULL) ? "" : (test_bit (0, &conn->ksnc_tx_ready) ? " ready" : " blocked"), + (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? 
" scheduled" : " idle"), + (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? " empty" : " queued")); + + + if (!test_and_set_bit (0, &conn->ksnc_tx_ready)) { + spin_lock_irqsave (&ktoenal_data.ksnd_sched_lock, flags); + + if (!list_empty (&conn->ksnc_tx_queue) && /* packets to send */ + !conn->ksnc_tx_scheduled) { /* not being progressed */ + + list_add_tail (&conn->ksnc_tx_list, + &ktoenal_data.ksnd_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + atomic_inc (&conn->ksnc_refcount); + + if (waitqueue_active (&ktoenal_data.ksnd_sched_waitq)) + wake_up (&ktoenal_data.ksnd_sched_waitq); + } + spin_unlock_irqrestore (&ktoenal_data.ksnd_sched_lock, flags); + } +} + +nal_cb_t ktoenal_lib = { + nal_data: &ktoenal_data, /* NAL private data */ + cb_send: ktoenal_send, + cb_recv: ktoenal_recv, + cb_read: ktoenal_read, + cb_write: ktoenal_write, + cb_callback: ktoenal_callback, + cb_malloc: ktoenal_malloc, + cb_free: ktoenal_free, + cb_printf: ktoenal_printf, + cb_cli: ktoenal_cli, + cb_sti: ktoenal_sti, + cb_dist: ktoenal_dist +}; diff --git a/lustre/portals/libcfs/.cvsignore b/lustre/portals/libcfs/.cvsignore new file mode 100644 index 0000000..67d1a3d --- /dev/null +++ b/lustre/portals/libcfs/.cvsignore @@ -0,0 +1,4 @@ +.deps +Makefile +Makefile.in +link-stamp diff --git a/lustre/portals/libcfs/Makefile.am b/lustre/portals/libcfs/Makefile.am new file mode 100644 index 0000000..20d7fbd --- /dev/null +++ b/lustre/portals/libcfs/Makefile.am @@ -0,0 +1,29 @@ +# Copyright (C) 2001, 2002 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + + +MODULE = portals +modulenet_DATA = portals.o +EXTRA_PROGRAMS = portals + +LIBLINKS := lib-dispatch.c lib-eq.c lib-init.c lib-md.c lib-me.c lib-move.c lib-msg.c lib-ni.c lib-pid.c +APILINKS := api-eq.c api-errno.c api-init.c api-me.c api-ni.c api-wrap.c +LINKS = $(APILINKS) $(LIBLINKS) +DISTCLEANFILES = $(LINKS) link-stamp *.orig *.rej + +$(LINKS): link-stamp +link-stamp: + -list='$(LIBLINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done + -list='$(APILINKS)'; for f in $$list; do echo $$f ; ln -sf $(srcdir)/../portals/$$f .; done + echo timestamp > link-stamp + +DEFS = +portals_SOURCES = $(LINKS) module.c proc.c debug.c + +# Don't distribute any patched files. +dist-hook: + list='$(EXT2C)'; for f in $$list; do rm -f $(distdir)/$$f; done + +include ../Rules.linux diff --git a/lustre/portals/libcfs/Makefile.mk b/lustre/portals/libcfs/Makefile.mk new file mode 100644 index 0000000..3196ea2 --- /dev/null +++ b/lustre/portals/libcfs/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include fs/lustre/portals/Kernelenv + +obj-y += libcfs.o +licfs-objs := module.o proc.o debug.o \ No newline at end of file diff --git a/lustre/portals/libcfs/debug.c b/lustre/portals/libcfs/debug.c new file mode 100644 index 0000000..8d26dbb --- /dev/null +++ b/lustre/portals/libcfs/debug.c @@ -0,0 +1,830 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. 
+ * Author: Phil Schwan <phil@clusterfs.com> + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define EXPORT_SYMTAB + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kmod.h> +#include <linux/notifier.h> +#include <linux/kernel.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/smp_lock.h> +#include <linux/unistd.h> +#include <linux/interrupt.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <linux/completion.h> + +#include <linux/fs.h> +#include <linux/stat.h> +#include <asm/uaccess.h> +#include <asm/segment.h> +#include <linux/miscdevice.h> + +# define DEBUG_SUBSYSTEM S_PORTALS + +#include <linux/kp30.h> + +#define DEBUG_OVERFLOW 1024 +static char *debug_buf = NULL; +static unsigned long debug_size = 0; +static atomic_t debug_off_a = ATOMIC_INIT(0); +static int debug_wrapped; +wait_queue_head_t debug_ctlwq; +#define DAEMON_SND_SIZE (64 << 10) + +/* + * used by the daemon to keep track the offset into debug_buffer for the next + * write to the file. 
Usually, the daemon is to write out the buffer + * from debug_daemon_next_write up to debug_off + * variable usage + * Reader - portals_debug_msg() + * Writer - portals_debug_daemon() + * portals_debug_daemon_start() during daemon init time + * portals_debug_daemon_continue() to reset to debug_off + * portals_debug_clear_buffer() reset to debug_off for clear + * Note that *_start(), *_continue() & *clear_buffer() should be serialized; + */ +static atomic_t debug_daemon_next_write; + +/* + * A debug_daemon can be in the following states + * stopped - stopped state means there is no debug_daemon running. + * accordingly, it must be in paused state + * a daemon is in !stopped && !paused state after + * "lctl debug_daemon start" creates debug_daemon successfully + * Variable Usage + * Reader - portals_debug_daemon() + * portals_debug_set_daemon() routines + * Writer - portals_debug_set_daemon() routines + * portals_debug_daemon() on IO error + * paused - a debug_daemon state is changed from !paused into paused + * when "lctl debug_daemon pause" is issued + * "lctl debug_daemon continue" gets a daemon into !paused mode + * Reader - portals_debug_set_daemon() routines + * portals_debug_msg() + * Writer - portals_debug_set_daemon() on init + * portals_debug_daemon() + * + * Daemon state diagram. + * (stopped, paused) + * | <-- debug_daemon start + * V + * (!stopped, !paused) + * | <-- debug_daemon pause + * V + * (!stopped, paused) + * | <-- debug_daemon continue + * V + * (!stopped, !paused) + * | <-- debug_daemon stop + * V + * (stopped, paused) + * Overlapped - this is a state when CDEBUG is too fast for the daemon to + * write out the debug buffer. That is, debug_off is to + * overlap debug_daemon_next_write; + * Reader - portals_debug_msg() + * Writer - portals_debug_msg() + */ + +/* + * Description on Trace Daemon Synchronization + * + * Three categories of code are synchronizing between each other + * 1.
lctl, portals_debug_set_daemon(), the user debug control code, + * as well as portals_debug_clear_buffer() + * 2. CDEBUG, portals_debug_msg(), the debug put messages routine + * 3. Daemon, portals_debug_daemon(), to write out debug log file + * + * + * Three different controls for synchronization + * + * 1. debug_daemon_semaphore + * The usage of this semaphore is to serialize multiple lctl controls + * in manipulating debug daemon state. The semaphore serves as the + * gatekeeper to allow only one user control thread, at any given time, + * to access debug daemon state and keeps the other user control requests + * in wait state until the current control request is serviced. + * + * 2. wait_queue_head_t lctl (paired with lctl_event flag) + * Lctl event is the event between portals_debug_set_daemon() and + * portals_debug_daemon(). Lctl is an indicator for portals_debug_daemon() + * to flush data out to file. portals_debug_daemon() is to use lctl event + * as a signal channel to wake up portals_debug_set_daemon() once the flush + * operation is done. + * + * Producer : + * portals_debug_daemon() uses it to wake up + * portals_debug_set_daemon(), pause and stop, routines + * Consumer : + * portals_debug_set_daemon(), stop and pause operations, + * wait and sleep on the event + * + * 3. wait_queue_head_t daemon (paired with daemon_event flag) + * This is an event channel to wake up portals_debug_daemon. Daemon + * wakes up to run whenever there is an event posted. Daemon handles + * 2 types of operations: 1. Writes data out to debug file, 2. Flushes + * file and terminates based on lctl event. + * File operation - + * Daemon is normally in a sleep state. + * Daemon is woken up through daemon event whenever CDEBUG is + * putting data over any 64K boundary. + * File flush and termination - + * On portals_debug_daemon_stop/pause() operations, lctl control + * is to wake up daemon through daemon event.
+ * + * We can't use sleep_on() and wake_up() to replace daemon event because + * portals_debug_daemon() must catch the wakeup operation posted by + * portals_debug_daemon_stop/pause(). Otherwise, stop and pause may + * stuck in lctl wait event. + * + * Producer : + * a. portals_debug_daemon_pause() and portals_debug_daemon_stop() + * uses the event to wake up portals_debug_daemon() + * b. portals_debug_msg() uses the event to wake up + * portals_debug_daemon() whenever the data output is acrossing + * a 64K bytes boundary. + * Consumer : + * portals_debug_daemon() wakes up upon daemon event. + * + * Sequence for portals_debug_daemon_stop() operation + * + * _Portals_debug_daemon_stop()_ _Daemon_ + * Wait_event(daemon) or running + * Paused = 1; + * Wakeup_event (daemon) + * Wait_event(lctl) + * Set force_flush flag if lctlevnt + * Flush data + * Wakeup_event (lctl) + * Wait_event(daemon) + * Stopped = 1; + * Wakeup_event (daemon) + * Wait_event(lctl) + * Exit daemon loop if (Stopped) + * Wakeup_event (lctl) + * Exit + * Return to user application + * + * + * _Portals_debug_msg()_ _Daemon_ + * Wait_event(daemon) or running + * If (WriteStart<64K<WriteEnd) + * Wakeup_event(daemon) + * Do file IO + * Wait_event(daemon) + */ +struct debug_daemon_state { + unsigned long overlapped; + unsigned long stopped; + atomic_t paused; + unsigned long lctl_event; /* event for lctl */ + wait_queue_head_t lctl; + unsigned long daemon_event; /* event for daemon */ + wait_queue_head_t daemon; +}; +static struct debug_daemon_state debug_daemon_state; +static DECLARE_MUTEX(debug_daemon_semaphore); + +static loff_t daemon_file_size_limit; +char debug_daemon_file_path[1024] = ""; + +spinlock_t portals_debug_lock = SPIN_LOCK_UNLOCKED; +char debug_file_path[1024] = "/tmp/lustre-log"; +char debug_file_name[1024]; +int handled_panic; /* to avoid recursive calls to notifiers */ +char portals_upcall[1024] = "/usr/lib/lustre/portals_upcall"; + + +int portals_do_debug_dumplog(void *arg) +{ + 
struct file *file; + void *journal_info; + int rc; + mm_segment_t oldfs; + unsigned long debug_off; + + kportal_daemonize(""); + + reparent_to_init(); + journal_info = current->journal_info; + current->journal_info = NULL; + /* bounded: debug_file_path alone can be as large as debug_file_name, + * so an unbounded sprintf() could overrun it */ + snprintf(debug_file_name, sizeof(debug_file_name), "%s.%ld", + debug_file_path, CURRENT_TIME); + file = filp_open(debug_file_name, O_CREAT|O_TRUNC|O_RDWR, 0644); + + if (!file || IS_ERR(file)) { + CERROR("cannot open %s for dumping: %ld\n", debug_file_name, + PTR_ERR(file)); + GOTO(out, PTR_ERR(file)); + } else { + printk(KERN_ALERT "dumping log to %s ... writing ...\n", + debug_file_name); + } + + debug_off = atomic_read(&debug_off_a); + oldfs = get_fs(); + set_fs(get_ds()); + if (debug_wrapped) { + /* oldest data first: from just after the write offset to the + * end of the buffer, then from the start up to the offset */ + rc = file->f_op->write(file, debug_buf + debug_off + 1, + debug_size-debug_off-1, &file->f_pos); + rc += file->f_op->write(file, debug_buf, debug_off + 1, + &file->f_pos); + } else { + rc = file->f_op->write(file, debug_buf, debug_off,&file->f_pos); + } + printk("wrote %d bytes\n", rc); + set_fs(oldfs); + + rc = file->f_op->fsync(file, file->f_dentry, 1); + if (rc) + CERROR("sync returns %d\n", rc); + filp_close(file, 0); +out: + current->journal_info = journal_info; + /* release portals_debug_dumplog(), which sleeps on debug_ctlwq */ + wake_up(&debug_ctlwq); + return 0; +} + +int portals_debug_daemon(void *arg) +{ + struct file *file; + void *journal_info; + mm_segment_t oldfs; + unsigned long force_flush = 0; + unsigned long size, off, flags; + int rc; + + kportal_daemonize("ldebug_daemon"); + reparent_to_init(); + journal_info = current->journal_info; + current->journal_info = NULL; + + file = filp_open(debug_daemon_file_path, + O_CREAT|O_TRUNC|O_RDWR|O_LARGEFILE, 0644); + + if (!file || IS_ERR(file)) { + CERROR("cannot open %s for logging\n", debug_daemon_file_path); + GOTO(out1, PTR_ERR(file)); + } else { + printk(KERN_ALERT "daemon dumping log to %s ... 
writing ...\n", + debug_daemon_file_path); + } + + debug_daemon_state.overlapped = 0; + debug_daemon_state.stopped = 0; + + /* start just after the newest message if the buffer has wrapped, + * otherwise from the beginning of the buffer */ + spin_lock_irqsave(&portals_debug_lock, flags); + off = atomic_read(&debug_off_a) + 1; + if (debug_wrapped) + off = (off >= debug_size)? 0 : off; + else + off = 0; + atomic_set(&debug_daemon_next_write, off); + atomic_set(&debug_daemon_state.paused, 0); + spin_unlock_irqrestore(&portals_debug_lock, flags); + + oldfs = get_fs(); + set_fs(KERNEL_DS); + while (1) { + unsigned long ending; + unsigned long start, tail; + long delta; + + debug_daemon_state.daemon_event = 0; + + ending = atomic_read(&debug_off_a); + start = atomic_read(&debug_daemon_next_write); + + /* check if paused is imposed by lctl ? */ + force_flush = !debug_daemon_state.lctl_event; + + /* delta < 0: the unwritten data wraps past the end of the + * buffer, so the tail up to debug_size is written first */ + delta = ending - start; + tail = debug_size - start; + size = (delta >= 0) ? delta : tail; + while (size && (force_flush || (delta < 0) || + (size >= DAEMON_SND_SIZE))) { + /* never write past the configured file size limit */ + if (daemon_file_size_limit) { + int ssize = daemon_file_size_limit - file->f_pos; + if (size > ssize) + size = ssize; + } + + rc = file->f_op->write(file, debug_buf+start, + size, &file->f_pos); + if (rc < 0) { + printk(KERN_ALERT + "Debug_daemon write error %d\n", rc); + goto out; + } + start += rc; + delta = ending - start; + tail = debug_size - start; + if (tail == 0) + start = 0; + if (delta >= 0) + size = delta; + else + size = (tail == 0) ? 
ending : tail; + if (daemon_file_size_limit == file->f_pos) { + // file wrapped around + file->f_pos = 0; + } + } + atomic_set(&debug_daemon_next_write, start); + if (force_flush) { + rc = file->f_op->fsync(file, file->f_dentry, 1); + if (rc < 0) { + printk(KERN_ALERT + "Debug_daemon sync error %d\n", rc); + goto out; + } + if (debug_daemon_state.stopped) + break; + debug_daemon_state.lctl_event = 1; + wake_up(&debug_daemon_state.lctl); + } + wait_event(debug_daemon_state.daemon, + debug_daemon_state.daemon_event); + } +out: + atomic_set(&debug_daemon_state.paused, 1); + debug_daemon_state.stopped = 1; + set_fs(oldfs); + filp_close(file, 0); + current->journal_info = journal_info; +out1: + debug_daemon_state.lctl_event = 1; + wake_up(&debug_daemon_state.lctl); + return 0; +} + +void portals_debug_print(void) +{ + unsigned long dumplen = 64 * 1024; + char *start1, *start2; + char *end1, *end2; + unsigned long debug_off = atomic_read(&debug_off_a); + + start1 = debug_buf + debug_off - dumplen; + if (start1 < debug_buf) { + start1 += debug_size; + end1 = debug_buf + debug_size - 1; + start2 = debug_buf; + end2 = debug_buf + debug_off; + } else { + end1 = debug_buf + debug_off; + start2 = debug_buf + debug_off; + end2 = debug_buf + debug_off; + } + + while (start1 < end1) { + int count = MIN(1024, end1 - start1); + printk("%*s", count, start1); + start1 += 1024; + } + while (start2 < end2) { + int count = MIN(1024, end2 - start2); + printk("%*s", count, start2); + start2 += 1024; + } +} + +void portals_debug_dumplog(void) +{ + int rc; + ENTRY; + + init_waitqueue_head(&debug_ctlwq); + + rc = kernel_thread(portals_do_debug_dumplog, + NULL, CLONE_VM | CLONE_FS | CLONE_FILES); + if (rc < 0) { + printk(KERN_ERR "cannot start dump thread\n"); + return; + } + sleep_on(&debug_ctlwq); +} + +int portals_debug_daemon_start(char *file, unsigned int size) +{ + int rc; + + if (!debug_daemon_state.stopped) + return -EALREADY; + + if (file != NULL) + strncpy(debug_daemon_file_path, 
file, 1024); + /* strncpy() leaves the destination unterminated when the source + * fills it; terminating unconditionally is harmless */ + debug_daemon_file_path[sizeof(debug_daemon_file_path) - 1] = '\0'; + + init_waitqueue_head(&debug_daemon_state.lctl); + init_waitqueue_head(&debug_daemon_state.daemon); + + /* size is in megabytes; widen before shifting so that values of + * 4096 and above do not overflow 32-bit arithmetic */ + daemon_file_size_limit = (loff_t)size << 20; + + debug_daemon_state.lctl_event = 0; + rc = kernel_thread(portals_debug_daemon, NULL, 0); + if (rc < 0) { + printk(KERN_ERR "cannot start debug daemon thread\n"); + debug_daemon_file_path[0] = '\0'; + return rc; + } + wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event); + return 0; +} + +/* Mark the daemon paused, wake it, and wait until it signals lctl_event. + * Returns -EALREADY if it is already paused. */ +int portals_debug_daemon_pause(void) +{ + if (atomic_read(&debug_daemon_state.paused)) + return -EALREADY; + + atomic_set(&debug_daemon_state.paused, 1); + debug_daemon_state.lctl_event = 0; + debug_daemon_state.daemon_event = 1; + wake_up(&debug_daemon_state.daemon); + wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event); + return 0; +} + +/* Resume a paused daemon from the current write offset. Returns -EINVAL + * if the daemon is not paused or is stopped. */ +int portals_debug_daemon_continue(void) +{ + if (!atomic_read(&debug_daemon_state.paused)) + return -EINVAL; + if (debug_daemon_state.stopped) + return -EINVAL; + + debug_daemon_state.overlapped = 0; + atomic_set(&debug_daemon_next_write, atomic_read(&debug_off_a)); + atomic_set(&debug_daemon_state.paused, 0); + return 0; +} + +/* Pause the daemon if needed, then ask it to exit and wait for lctl_event. + * Returns -EALREADY if it is already stopped. */ +int portals_debug_daemon_stop(void) +{ + if (debug_daemon_state.stopped) + return -EALREADY; + + if (!atomic_read(&debug_daemon_state.paused)) + portals_debug_daemon_pause(); + + debug_daemon_state.lctl_event = 0; + debug_daemon_state.stopped = 1; + + debug_daemon_state.daemon_event = 1; + wake_up(&debug_daemon_state.daemon); + wait_event(debug_daemon_state.lctl, debug_daemon_state.lctl_event); + + debug_daemon_file_path[0] = '\0'; + return 0; +} + +int portals_debug_set_daemon(unsigned int cmd, unsigned int length, + char *filename, unsigned int size) +{ + int rc = -EINVAL; + + down(&debug_daemon_semaphore); + switch (cmd) { + case DEBUG_DAEMON_START: + if (length && (filename[length -1] != '\0')) { + CERROR("Invalid filename for debug_daemon\n"); + rc = -EINVAL; + break; + } + rc = portals_debug_daemon_start(filename, size); + 
break; + case DEBUG_DAEMON_STOP: + rc = portals_debug_daemon_stop(); + break; + case DEBUG_DAEMON_PAUSE: + rc = portals_debug_daemon_pause(); + break; + case DEBUG_DAEMON_CONTINUE: + rc = portals_debug_daemon_continue(); + break; + default: + CERROR("unknown set_daemon cmd\n"); + } + up(&debug_daemon_semaphore); + return rc; +} + +/* Panic notifier: runs at most once. In interrupt context the tail of the + * buffer is printed to the console; otherwise the buffer is dumped to a file. */ +static int panic_dumplog(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + if (handled_panic) + return 0; + else + handled_panic = 1; + + if (in_interrupt()) { + portals_debug_print(); + return 0; + } + + while (current->lock_depth >= 0) + unlock_kernel(); + portals_debug_dumplog(); + return 0; +} + +static struct notifier_block lustre_panic_notifier = { + notifier_call : panic_dumplog, + next : NULL, + priority : 10000 +}; + +/* Allocate and zero the debug buffer (bufsize bytes plus DEBUG_OVERFLOW) and + * register the panic notifier. Returns -EALREADY if a buffer already exists, + * -ENOMEM if the allocation fails, 0 on success. */ +int portals_debug_init(unsigned long bufsize) +{ + unsigned long debug_off = atomic_read(&debug_off_a); + if (debug_buf != NULL) + return -EALREADY; + + atomic_set(&debug_daemon_state.paused, 1); + debug_daemon_state.stopped = 1; + + debug_buf = vmalloc(bufsize + DEBUG_OVERFLOW); + if (debug_buf == NULL) + return -ENOMEM; + /* set debug_size before it is used: the memset used to run with the + * still-unset debug_size and so cleared nothing, and the panic + * notifier could be registered before the size was valid */ + debug_size = bufsize; + memset(debug_buf, 0, bufsize + DEBUG_OVERFLOW); + debug_wrapped = 0; + + printk(KERN_INFO "Portals: allocated %lu byte debug buffer at %p.\n", + bufsize, debug_buf); + atomic_set(&debug_off_a, debug_off); + notifier_chain_register(&panic_notifier_list, &lustre_panic_notifier); + + return 0; +} + +/* Unregister the panic notifier, stop the daemon and free the debug buffer. + * Returns -EINVAL if there is no buffer. */ +int portals_debug_cleanup(void) +{ + void *buf; + + notifier_chain_unregister(&panic_notifier_list, &lustre_panic_notifier); + if (debug_buf == NULL) + return -EINVAL; + + down(&debug_daemon_semaphore); + portals_debug_daemon_stop(); + + /* clear the global before freeing so that later callers see "no + * buffer" instead of a dangling pointer */ + buf = debug_buf; + debug_buf = NULL; + vfree(buf); + atomic_set(&debug_off_a, 0); + up(&debug_daemon_semaphore); + + return 0; +} + +int portals_debug_clear_buffer(void) +{ + unsigned long flags; + unsigned long state; + + if (debug_buf == NULL) + return -EINVAL; + + down(&debug_daemon_semaphore); + state = atomic_read(&debug_daemon_state.paused); + if (!state) + 
portals_debug_daemon_pause(); + spin_lock_irqsave(&portals_debug_lock, flags); + atomic_set(&debug_off_a, 0); + debug_wrapped = 0; + atomic_set(&debug_daemon_next_write, 0); + debug_daemon_state.overlapped = 0; + spin_unlock_irqrestore(&portals_debug_lock, flags); + + if (!state) + atomic_set(&debug_daemon_state.paused, 0); + up(&debug_daemon_semaphore); + + return 0; +} + +/* Debug markers, although printed by S_PORTALS + * should not be marked as such. + */ +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_UNDEFINED +int portals_debug_mark_buffer(char *text) +{ + if (debug_buf == NULL) + return -EINVAL; + + CDEBUG(0, "*******************************************************************************\n"); + CDEBUG(0, "DEBUG MARKER: %s\n", text); + CDEBUG(0, "*******************************************************************************\n"); + + return 0; +} +#undef DEBUG_SUBSYSTEM +#define DEBUG_SUBSYSTEM S_PORTALS + +/* Copy the debug buffer, oldest data first, into the user buffer buf of len + * bytes. Returns the number of bytes copied, -EINVAL if there is no debug + * buffer, -ENOSPC if len is smaller than the debug buffer, -ENOMEM if the + * snapshot cannot be allocated, or -EFAULT if the user copy fails. */ +__s32 portals_debug_copy_to_user(char *buf, unsigned long len) +{ + int rc; + unsigned long debug_off; + unsigned long flags; + char *snap; + + if (debug_buf == NULL) + return -EINVAL; + if (len < debug_size) + return -ENOSPC; + + /* copy_to_user() may fault and sleep, so it must not run under + * portals_debug_lock with interrupts disabled: snapshot the buffer + * under the lock and copy the snapshot out afterwards */ + snap = vmalloc(debug_size); + if (snap == NULL) + return -ENOMEM; + + spin_lock_irqsave(&portals_debug_lock, flags); + debug_off = atomic_read(&debug_off_a); + if (debug_wrapped) { + /* All of this juggling with the 1s is to keep the trailing nul + * (which falls at debug_buf + debug_off) at the end of what we + * copy into user space. tail is clamped to 0 so that + * debug_off == debug_size cannot yield a negative length */ + unsigned long tail = (debug_off + 1 < debug_size) ? + debug_size - debug_off - 1 : 0; + + memcpy(snap, debug_buf + debug_off + 1, tail); + memcpy(snap + tail, debug_buf, debug_size - tail); + rc = debug_size; + } else { + memcpy(snap, debug_buf, debug_off); + rc = debug_off; + } + spin_unlock_irqrestore(&portals_debug_lock, flags); + + /* copy_to_user() returns the number of bytes it could not copy */ + if (copy_to_user(buf, snap, rc)) + rc = -EFAULT; + vfree(snap); + + return rc; +} + +/* FIXME: I'm not very smart; someone smarter should make this better. */ +void +portals_debug_msg (int subsys, int mask, char *file, char *fn, int line, + unsigned long stack, const char *format, ...) 
+{ + va_list ap; + unsigned long flags; + int max_nob; + int prefix_nob; + int msg_nob; + struct timeval tv; + unsigned long base_offset; + unsigned long debug_off; + + if (debug_buf == NULL) { + printk("portals_debug_msg: debug_buf is NULL!\n"); + return; + } + + spin_lock_irqsave(&portals_debug_lock, flags); + debug_off = atomic_read(&debug_off_a); + if (!atomic_read(&debug_daemon_state.paused)) { + unsigned long available; + long delta; + long v = atomic_read(&debug_daemon_next_write); + + delta = debug_off - v; + available = (delta>=0) ? debug_size-delta : -delta; + // Check if we still have enough debug buffer for CDEBUG + if (available < DAEMON_SND_SIZE) { + /* Drop CDEBUG packets until enough debug_buffer is + * available */ + if (debug_daemon_state.overlapped) + goto out; + /* If this is the first time, leave a marker in the + * output */ + debug_daemon_state.overlapped = 1; + ap = NULL; + format = "DEBUG MARKER: Debug buffer overlapped\n"; + } else /* More space just became available */ + debug_daemon_state.overlapped = 0; + } + + max_nob = debug_size - debug_off + DEBUG_OVERFLOW; + if (max_nob <= 0) { + spin_unlock_irqrestore(&portals_debug_lock, flags); + printk("logic error in portals_debug_msg: <0 bytes to write\n"); + return; + } + + /* NB since we pass a non-zero sized buffer (at least) on the first + * print, we can be assured that by the end of all the snprinting, + * we _do_ have a terminated buffer, even if our message got truncated. 
+ */ + + do_gettimeofday(&tv); + + /* prefix: subsystem, mask, cpu and timestamp; kept in the buffer but + * skipped when the message is echoed to the console below */ + prefix_nob = snprintf(debug_buf + debug_off, max_nob, + "%02x:%06x:%d:%lu.%06lu ", + subsys >> 24, mask, smp_processor_id(), + tv.tv_sec, tv.tv_usec); + /* NOTE(review): if snprintf() reports the untruncated length, max_nob + * could go negative here for an oversized message -- confirm that + * DEBUG_OVERFLOW is large enough to rule this out */ + max_nob -= prefix_nob; + +#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) + msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, + "(%s:%d:%s() %d | %d+%lu): ", + file, line, fn, current->pid, + current->thread.extern_pid, stack); +#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, + "(%s:%d:%s() %d | %d+%lu): ", + file, line, fn, current->pid, + current->thread.mode.tt.extern_pid, stack); +#else + msg_nob = snprintf(debug_buf + debug_off + prefix_nob, max_nob, + "(%s:%d:%s() %d+%lu): ", + file, line, fn, current->pid, stack); +#endif + max_nob -= msg_nob; + + va_start(ap, format); + msg_nob += vsnprintf(debug_buf + debug_off + prefix_nob + msg_nob, + max_nob, format, ap); + max_nob -= msg_nob; + va_end(ap); + + /* Print to console, while msg is contiguous in debug_buf */ + /* NB safely terminated see above */ + if ((mask & D_EMERG) != 0) + printk(KERN_EMERG "%s", debug_buf + debug_off + prefix_nob); + if ((mask & D_ERROR) != 0) + printk(KERN_ERR "%s", debug_buf + debug_off + prefix_nob); + else if (portal_printk) + printk("<%d>%s", portal_printk, debug_buf+debug_off+prefix_nob); + /* offset of this message within its 64K chunk of the buffer */ + base_offset = debug_off & 0xFFFF; + + debug_off += prefix_nob + msg_nob; + /* the message ran past debug_size into the overflow area: copy the + * excess (plus its NUL) back to the start of the buffer and wrap */ + if (debug_off > debug_size) { + memcpy(debug_buf, debug_buf + debug_size, + debug_off - debug_size + 1); + debug_off -= debug_size; + debug_wrapped = 1; + } + + atomic_set(&debug_off_a, debug_off); + /* wake the daemon once this message reaches DAEMON_SND_SIZE within + * its 64K chunk */ + if (!atomic_read(&debug_daemon_state.paused) && + ((base_offset+prefix_nob+msg_nob) >= DAEMON_SND_SIZE)) { + debug_daemon_state.daemon_event = 1; + wake_up(&debug_daemon_state.daemon); + } +out: + spin_unlock_irqrestore(&portals_debug_lock, flags); +} + +/* Set the global debug mask portal_debug and log the new value. */ void portals_debug_set_level(unsigned int debug_level) 
+{ + printk("Setting portals debug level to %08x\n", debug_level); + portal_debug = debug_level; +} + +/* Run the user-mode helper named by portals_upcall with the arguments + * "LBUG", file, fn and the decimal line number, then log whether the + * invocation succeeded. */ void portals_run_lbug_upcall(char * file, char *fn, int line) +{ + char *argv[6]; + char *envp[3]; + char buf[32]; /* decimal text of line */ + int rc; + + ENTRY; + snprintf (buf, sizeof buf, "%d", line); + + argv[0] = portals_upcall; + argv[1] = "LBUG"; + argv[2] = file; + argv[3] = fn; + argv[4] = buf; + argv[5] = NULL; + + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + rc = call_usermodehelper(argv[0], argv, envp); + if (rc < 0) { + CERROR("Error invoking lbug upcall %s %s %s %s %s: %d; check " + "/proc/sys/portals/upcall\n", + argv[0], argv[1], argv[2], argv[3], argv[4], rc); + + } else { + CERROR("Invoked upcall %s %s %s %s %s\n", + argv[0], argv[1], argv[2], argv[3], argv[4]); + } +} + + +EXPORT_SYMBOL(portals_debug_dumplog); +EXPORT_SYMBOL(portals_debug_msg); +EXPORT_SYMBOL(portals_debug_set_level); +EXPORT_SYMBOL(portals_run_lbug_upcall); diff --git a/lustre/portals/libcfs/module.c b/lustre/portals/libcfs/module.c new file mode 100644 index 0000000..5e3fcb5 --- /dev/null +++ b/lustre/portals/libcfs/module.c @@ -0,0 +1,574 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_PORTALS + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/smp_lock.h> +#include <linux/unistd.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include <linux/fs.h> +#include <linux/stat.h> +#include <asm/uaccess.h> +#include <asm/segment.h> +#include <linux/miscdevice.h> + +#include <portals/lib-p30.h> +#include <portals/p30.h> +#include <linux/kp30.h> +#include <linux/portals_compat25.h> + +#define PORTAL_MINOR 240 + +extern void (kping_client)(struct portal_ioctl_data *); + +struct nal_cmd_handler { + nal_cmd_handler_t nch_handler; + void * nch_private; +}; + +static struct nal_cmd_handler nal_cmd[NAL_MAX_NR + 1]; +struct semaphore nal_cmd_sem; + +#ifdef PORTAL_DEBUG +/* Log the failed assertion at D_EMERG and then LBUG at the caller's location. */ +void +kportal_assertion_failed (char *expr, char *file, char *func, int line) +{ + portals_debug_msg(0, D_EMERG, file, func, line, CDEBUG_STACK(), + "ASSERTION(%s) failed\n", expr); + LBUG_WITH_LOC(file, func, line); +} +#endif + +/* Daemonize the current thread and name it str, on kernels both before and + * after daemonize() started taking a name. */ +void +kportal_daemonize (char *str) +{ +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63)) + daemonize(str); +#else + daemonize(); + snprintf (current->comm, sizeof (current->comm), "%s", str); +#endif +} + +/* Block every signal for the current task. */ +void +kportal_blockallsigs () +{ + unsigned long flags; + + SIGNAL_MASK_LOCK(current, flags); + /* was the mis-encoded text "(currency sign)t->blocked": the + * "&curren" of "&current" had been turned into an HTML entity */ + sigfillset(&current->blocked); + RECALC_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, flags); +} + +/* called when opening /dev/device */ +static int kportal_psdev_open(struct inode * inode, struct file * file) +{ + ENTRY; + + if (!inode) + RETURN(-EINVAL); + PORTAL_MODULE_USE; + RETURN(0); +} + +/* called when 
closing /dev/device */ +static int kportal_psdev_release(struct inode * inode, struct file * file) +{ + ENTRY; + + if (!inode) + RETURN(-EINVAL); + + PORTAL_MODULE_UNUSE; + RETURN(0); +} + +static inline void freedata(void *data, int len) +{ + PORTAL_FREE(data, len); +} + +/* Add a route through the router's control interface; -ENODEV if the + * interface symbol is not available. */ +static int +kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, + ptl_nid_t hi_nid) +{ + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET (kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_add_route (gateway_nalid, gateway_nid, lo_nid, hi_nid); + + PORTAL_SYMBOL_PUT(kpr_control_interface); + return (rc); +} + +/* Delete the route to target through the router's control interface; + * -ENODEV if the interface symbol is not available. */ +static int +kportal_del_route(ptl_nid_t target) +{ + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_del_route (target); + + PORTAL_SYMBOL_PUT(kpr_control_interface); + return (rc); +} + +/* Fetch route number index through the router's control interface. The + * output parameters are written only when the lookup returns 0; -ENODEV if + * the interface symbol is not available. */ +static int +kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp, + ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp) +{ + int gateway_nalid; + ptl_nid_t gateway_nid; + ptl_nid_t lo_nid; + ptl_nid_t hi_nid; + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *) PORTAL_SYMBOL_GET(kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, &lo_nid, + &hi_nid); + + if (rc == 0) { + CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64"\n", + index, gateway_nalid, gateway_nid, lo_nid, hi_nid); + + /* the nids are ptl_nid_t at both ends (printed above with + * LPX64); the old (__u32) casts dropped their upper bits */ + *gateway_nalidp = (__u32)gateway_nalid; + *gateway_nidp = gateway_nid; + *lo_nidp = lo_nid; + *hi_nidp = hi_nid; + } + + PORTAL_SYMBOL_PUT (kpr_control_interface); + return (rc); +} + +static int +kportal_nal_cmd(int nal, struct portal_ioctl_data *data) +{ + int rc = -EINVAL; + + ENTRY; + + down(&nal_cmd_sem); + if (nal > 0 && nal <= NAL_MAX_NR && 
nal_cmd[nal].nch_handler) { + CDEBUG(D_IOCTL, "calling handler nal: %d, cmd: %d\n", nal, data->ioc_nal_cmd); + rc = nal_cmd[nal].nch_handler(data, nal_cmd[nal].nch_private); + } + up(&nal_cmd_sem); + RETURN(rc); +} + +/* Take a reference on, and return, the network interface handle exported by + * the given kernel NAL; NULL for the userspace TCPNAL and for unknown NALs. + * Release with kportal_put_ni(). */ +ptl_handle_ni_t * +kportal_get_ni (int nal) +{ + + switch (nal) + { + case QSWNAL: + return (PORTAL_SYMBOL_GET(kqswnal_ni)); + case SOCKNAL: + return (PORTAL_SYMBOL_GET(ksocknal_ni)); + case TOENAL: + return (PORTAL_SYMBOL_GET(ktoenal_ni)); + case GMNAL: + return (PORTAL_SYMBOL_GET(kgmnal_ni)); + case TCPNAL: + /* userspace NAL */ + return (NULL); + case SCIMACNAL: + return (PORTAL_SYMBOL_GET(kscimacnal_ni)); + default: + /* A warning to a naive caller */ + CERROR ("unknown nal: %d\n", nal); + return (NULL); + } +} + +/* Drop the reference taken by kportal_get_ni() for the given NAL. */ +void +kportal_put_ni (int nal) +{ + + switch (nal) + { + case QSWNAL: + PORTAL_SYMBOL_PUT(kqswnal_ni); + break; + case SOCKNAL: + PORTAL_SYMBOL_PUT(ksocknal_ni); + break; + case TOENAL: + PORTAL_SYMBOL_PUT(ktoenal_ni); + break; + case GMNAL: + PORTAL_SYMBOL_PUT(kgmnal_ni); + break; + case TCPNAL: + /* A lesson to a malicious caller */ + LBUG (); + /* never fall through and drop a reference that + * kportal_get_ni() did not take for TCPNAL */ + break; + case SCIMACNAL: + PORTAL_SYMBOL_PUT(kscimacnal_ni); + break; + default: + CERROR ("unknown nal: %d\n", nal); + } +} + +/* Install handler/private as the command handler for NAL number nal. + * Returns 0 on success, -EBUSY if a handler is already installed, or + * -EINVAL if nal has no slot in nal_cmd[]. */ +int +kportal_nal_register(int nal, nal_cmd_handler_t handler, void * private) +{ + int rc = 0; + + CDEBUG(D_IOCTL, "Register NAL %d, handler: %p\n", nal, handler); + + if (nal > 0 && nal <= NAL_MAX_NR) { + down(&nal_cmd_sem); + if (nal_cmd[nal].nch_handler != NULL) + rc = -EBUSY; + else { + nal_cmd[nal].nch_handler = handler; + nal_cmd[nal].nch_private = private; + } + up(&nal_cmd_sem); + } else { + /* nothing was registered: do not report success */ + rc = -EINVAL; + } + return rc; +} + +/* Remove the command handler for NAL number nal; out-of-range ids are + * ignored. Always returns 0. */ +int +kportal_nal_unregister(int nal) +{ + int rc = 0; + + CDEBUG(D_IOCTL, "Unregister NAL %d\n", nal); + + if (nal > 0 && nal <= NAL_MAX_NR) { + down(&nal_cmd_sem); + nal_cmd[nal].nch_handler = NULL; + nal_cmd[nal].nch_private = NULL; + up(&nal_cmd_sem); + } + return rc; +} + + +static int kportal_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned 
long arg) +{ + int err = 0; + char buf[1024]; + struct portal_ioctl_data *data; + + ENTRY; + + if ( _IOC_TYPE(cmd) != IOC_PORTAL_TYPE || + _IOC_NR(cmd) < IOC_PORTAL_MIN_NR || + _IOC_NR(cmd) > IOC_PORTAL_MAX_NR ) { + CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", + _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); + RETURN(-EINVAL); + } + + if (portal_ioctl_getdata(buf, buf + 800, (void *)arg)) { + CERROR("PORTALS ioctl: data error\n"); + RETURN(-EINVAL); + } + + data = (struct portal_ioctl_data *)buf; + + switch (cmd) { + case IOC_PORTAL_SET_DAEMON: + RETURN (portals_debug_set_daemon ( + (unsigned int) data->ioc_count, + (unsigned int) data->ioc_inllen1, + (char *) data->ioc_inlbuf1, + (unsigned int) data->ioc_misc)); + case IOC_PORTAL_GET_DEBUG: { + __s32 size = portals_debug_copy_to_user(data->ioc_pbuf1, + data->ioc_plen1); + + if (size < 0) + RETURN(size); + + data->ioc_size = size; + err = copy_to_user((char *)arg, data, sizeof(*data)); + RETURN(err); + } + case IOC_PORTAL_CLEAR_DEBUG: + portals_debug_clear_buffer(); + RETURN(0); + case IOC_PORTAL_PANIC: + if (!capable (CAP_SYS_BOOT)) + RETURN (-EPERM); + panic("debugctl-invoked panic"); + RETURN(0); + case IOC_PORTAL_MARK_DEBUG: + if (data->ioc_inlbuf1 == NULL || + data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') + RETURN(-EINVAL); + portals_debug_mark_buffer(data->ioc_inlbuf1); + RETURN(0); + case IOC_PORTAL_PING: { + void (*ping)(struct portal_ioctl_data *); + + CDEBUG(D_IOCTL, "doing %d pings to nid "LPU64"\n", + data->ioc_count, data->ioc_nid); + ping = PORTAL_SYMBOL_GET(kping_client); + if (!ping) + CERROR("PORTAL_SYMBOL_GET failed\n"); + else { + ping(data); + PORTAL_SYMBOL_PUT(kping_client); + } + RETURN(0); + } + + case IOC_PORTAL_ADD_ROUTE: + CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n", + data->ioc_nal, data->ioc_nid, data->ioc_nid2, + data->ioc_nid3); + err = kportal_add_route(data->ioc_nal, data->ioc_nid, + MIN (data->ioc_nid2, data->ioc_nid3), + MAX 
(data->ioc_nid2, data->ioc_nid3)); + break; + + case IOC_PORTAL_DEL_ROUTE: + CDEBUG (D_IOCTL, "Removing route to "LPU64"\n", data->ioc_nid); + err = kportal_del_route (data->ioc_nid); + break; + + case IOC_PORTAL_GET_ROUTE: + CDEBUG (D_IOCTL, "Getting route [%d]\n", data->ioc_count); + err = kportal_get_route(data->ioc_count, &data->ioc_nal, + &data->ioc_nid, &data->ioc_nid2, + &data->ioc_nid3); + if (err == 0) + if (copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + + case IOC_PORTAL_GET_NID: { + const ptl_handle_ni_t *nip; + ptl_process_id_t pid; + + CDEBUG (D_IOCTL, "Getting nid [%d]\n", data->ioc_nal); + + nip = kportal_get_ni (data->ioc_nal); + if (nip == NULL) + RETURN (-EINVAL); + + err = PtlGetId (*nip, &pid); + LASSERT (err == PTL_OK); + kportal_put_ni (data->ioc_nal); + + data->ioc_nid = pid.nid; + if (copy_to_user ((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + } + + case IOC_PORTAL_NAL_CMD: + CDEBUG (D_IOCTL, "nal command nal %d cmd %d\n", data->ioc_nal, + data->ioc_nal_cmd); + err = kportal_nal_cmd(data->ioc_nal, data); + if (err == 0) + if (copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + + case IOC_PORTAL_FAIL_NID: { + const ptl_handle_ni_t *nip; + + CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n", + data->ioc_nal, data->ioc_nid, data->ioc_count); + + nip = kportal_get_ni (data->ioc_nal); + if (nip == NULL) + RETURN (-EINVAL); + + err = PtlFailNid (*nip, data->ioc_nid, data->ioc_count); + /* drop the reference taken by kportal_get_ni(), as the + * IOC_PORTAL_GET_NID case does */ + kportal_put_ni (data->ioc_nal); + break; + } + + default: + err = -EINVAL; + break; + } + + RETURN(err); +} + + +static struct file_operations portalsdev_fops = { + ioctl: kportal_ioctl, + open: kportal_psdev_open, + release: kportal_psdev_release +}; + + +static struct miscdevice portal_dev = { + PORTAL_MINOR, + "portals", + &portalsdev_fops +}; + +extern int insert_proc(void); +extern void remove_proc(void); +MODULE_AUTHOR("Peter J. 
Braam <braam@clusterfs.com>"); +MODULE_DESCRIPTION("Portals v3.1"); +MODULE_LICENSE("GPL"); + +/* Module entry point: set up the debug buffer, the NAL command semaphore, + * the "portals" misc device, the Portals library and the /proc entries, in + * that order. On failure, whatever was already set up is torn down in + * reverse order and the error is returned. */ +static int init_kportals_module(void) +{ + int rc; + + rc = portals_debug_init(5 * 1024 * 1024); + if (rc < 0) { + printk(KERN_ERR "portals_debug_init: %d\n", rc); + return (rc); + } + + sema_init(&nal_cmd_sem, 1); + + rc = misc_register(&portal_dev); + if (rc) { + CERROR("misc_register: error %d\n", rc); + goto cleanup_debug; + } + + rc = PtlInit(); + if (rc) { + CERROR("PtlInit: error %d\n", rc); + goto cleanup_deregister; + } + + rc = insert_proc(); + if (rc) { + CERROR("insert_proc: error %d\n", rc); + goto cleanup_fini; + } + + CDEBUG (D_OTHER, "portals setup OK\n"); + return (0); + + cleanup_fini: + PtlFini(); + cleanup_deregister: + misc_deregister(&portal_dev); + cleanup_debug: + portals_debug_cleanup(); + return rc; +} + +/* Module exit point: undo init_kportals_module() in reverse order and report + * any memory still accounted in portal_kmemory. */ +static void exit_kportals_module(void) +{ + int rc; + + remove_proc(); + PtlFini(); + + CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", + atomic_read(&portal_kmemory)); + + + rc = misc_deregister(&portal_dev); + if (rc) + CERROR("misc_deregister error %d\n", rc); + + if (atomic_read(&portal_kmemory) != 0) + CERROR("Portals memory leaked: %d bytes\n", + atomic_read(&portal_kmemory)); + + /* last, so the CDEBUG/CERROR calls above still have a buffer */ + rc = portals_debug_cleanup(); + if (rc) + printk(KERN_ERR "portals_debug_cleanup: %d\n", rc); +} + +EXPORT_SYMBOL(lib_dispatch); +EXPORT_SYMBOL(PtlMEAttach); +EXPORT_SYMBOL(PtlMEInsert); +EXPORT_SYMBOL(PtlMEUnlink); +EXPORT_SYMBOL(PtlEQAlloc); +EXPORT_SYMBOL(PtlMDAttach); +EXPORT_SYMBOL(PtlMDUnlink); +EXPORT_SYMBOL(PtlNIInit); +EXPORT_SYMBOL(PtlNIFini); +EXPORT_SYMBOL(PtlNIDebug); +EXPORT_SYMBOL(PtlInit); +EXPORT_SYMBOL(PtlFini); +EXPORT_SYMBOL(PtlPut); +EXPORT_SYMBOL(PtlGet); +EXPORT_SYMBOL(ptl_err_str); +EXPORT_SYMBOL(portal_subsystem_debug); +EXPORT_SYMBOL(portal_debug); +EXPORT_SYMBOL(portal_stack); +EXPORT_SYMBOL(portal_printk); +EXPORT_SYMBOL(PtlEQWait); +EXPORT_SYMBOL(PtlEQFree); +EXPORT_SYMBOL(PtlEQGet); +EXPORT_SYMBOL(PtlGetId); 
+EXPORT_SYMBOL(PtlMDBind); +EXPORT_SYMBOL(lib_iov_nob); +EXPORT_SYMBOL(lib_copy_iov2buf); +EXPORT_SYMBOL(lib_copy_buf2iov); +EXPORT_SYMBOL(lib_kiov_nob); +EXPORT_SYMBOL(lib_copy_kiov2buf); +EXPORT_SYMBOL(lib_copy_buf2kiov); +EXPORT_SYMBOL(lib_finalize); +EXPORT_SYMBOL(lib_parse); +EXPORT_SYMBOL(lib_init); +EXPORT_SYMBOL(lib_fini); +EXPORT_SYMBOL(portal_kmemory); +EXPORT_SYMBOL(kportal_daemonize); +EXPORT_SYMBOL(kportal_blockallsigs); +EXPORT_SYMBOL(kportal_nal_register); +EXPORT_SYMBOL(kportal_nal_unregister); +EXPORT_SYMBOL(kportal_assertion_failed); +EXPORT_SYMBOL(dispatch_name); +EXPORT_SYMBOL(kportal_get_ni); +EXPORT_SYMBOL(kportal_put_ni); + +module_init(init_kportals_module); +module_exit (exit_kportals_module); diff --git a/lustre/portals/libcfs/proc.c b/lustre/portals/libcfs/proc.c new file mode 100644 index 0000000..2fa739a --- /dev/null +++ b/lustre/portals/libcfs/proc.c @@ -0,0 +1,290 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Zach Brown <zab@zabbo.net> + * Author: Peter J. Braam <braam@clusterfs.com> + * Author: Phil Schwan <phil@clusterfs.com> + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define EXPORT_SYMTAB + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/smp_lock.h> +#include <linux/unistd.h> +#include <net/sock.h> +#include <linux/uio.h> + +#include <asm/system.h> +#include <asm/uaccess.h> + +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/stat.h> +#include <linux/list.h> +#include <asm/uaccess.h> +#include <asm/segment.h> + +#include <linux/proc_fs.h> +#include <linux/sysctl.h> + +# define DEBUG_SUBSYSTEM S_PORTALS + +#include <linux/kp30.h> +#include <asm/div64.h> + +static struct ctl_table_header *portals_table_header = NULL; +extern char debug_file_path[1024]; +extern char debug_daemon_file_path[1024]; +extern char portals_upcall[1024]; + +#define PSDEV_PORTALS (0x100) +#define PSDEV_DEBUG 1 /* control debugging */ +#define PSDEV_SUBSYSTEM_DEBUG 2 /* control per-subsystem debugging */ +#define PSDEV_PRINTK 3 /* force all errors to console */ +#define PSDEV_DEBUG_PATH 4 /* crashdump log location */ +#define PSDEV_DEBUG_DUMP_PATH 5 /* debug daemon log file location */ +#define PSDEV_PORTALS_UPCALL 6 /* User mode upcall script */ + +#define PORTALS_PRIMARY_CTLCNT 6 +/* sysctl entries under the "portals" directory; the extra slot holds the + * terminating {0} entry */ +static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = { + {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL, + &proc_dointvec}, + {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &portal_subsystem_debug, + sizeof(int), 0644, NULL, &proc_dointvec}, + {PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL, + &proc_dointvec}, + {PSDEV_DEBUG_PATH, "debug_path", debug_file_path, + sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string}, + {PSDEV_DEBUG_DUMP_PATH, "debug_daemon_path", debug_daemon_file_path, + sizeof(debug_daemon_file_path), 0644, NULL, &proc_dostring, + &sysctl_string}, + {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall, + sizeof(portals_upcall), 0644, NULL, &proc_dostring, 
+ &sysctl_string}, + {0} +}; + +/* top-level sysctl directory "portals", containing portals_table */ +static struct ctl_table top_table[2] = { + {PSDEV_PORTALS, "portals", NULL, 0, 0555, portals_table}, + {0} +}; + + +#ifdef PORTALS_PROFILING +/* + * profiling stuff. we do this statically for now because it's simple, + * but we could do some tricks with elf sections to have this array + * automatically built. + */ +#define def_prof(FOO) [PROF__##FOO] = {#FOO, 0, } + +struct prof_ent prof_ents[] = { + def_prof(our_recvmsg), + def_prof(our_sendmsg), + def_prof(socknal_recv), + def_prof(lib_parse), + def_prof(conn_list_walk), + def_prof(memcpy), + def_prof(lib_finalize), + def_prof(pingcli_time), + def_prof(gmnal_send), + def_prof(gmnal_recv), +}; + +EXPORT_SYMBOL(prof_ents); + +/* + * this function is as crazy as the proc filling api + * requires. + * + * buffer: page allocated for us to scribble in. the + * data returned to the user will be taken from here. + * *start: address of the pointer that will tell the + * caller where in buffer the data the user wants is. + * ppos: offset in the entire /proc file that the user + * currently wants. + * wanted: the amount of data the user wants. + * + * while going, 'curpos' is the offset in the entire + * file where we currently are. We only actually + * start filling buffer when we get to a place in + * the file that the user cares about. + * + * we take care to only sprintf when the user cares because + * we're holding a lock while we do this. + * + * we're smart and know that we generate fixed size lines. + * we only start writing to the buffer when the user cares. + * This is unpredictable because we don't snapshot the + * list between calls that are filling in a file from + * the list. The list could change mid read and the + * output will look very weird indeed. oh well. 
+ */
+
+/*
+ * read_proc handler for the profiling file: emits a header line followed
+ * by one fixed-width line per prof_ents[] entry.
+ *
+ * Returns the number of bytes (at most 'wanted') available at *start and
+ * sets *eof once every entry has been walked.
+ */
+static int prof_read_proc(char *buffer, char **start, off_t ppos, int wanted,
+                          int *eof, void *data)
+{
+        int len = 0, i;
+        int curpos;
+        char *header = "Interval Cycles_per (Starts Finishes Total)\n";
+        int header_len = strlen(header);
+        char *format = "%-15s %.12Ld (%.12d %.12d %.12Ld)";
+        /* width of every data line, including its trailing '\n' */
+        int line_len = (15 + 1 + 12 + 2 + 12 + 1 + 12 + 1 + 12 + 1);
+
+        *start = buffer;
+
+        if (ppos < header_len) {
+                /* Copy only what is left of the header after ppos; using
+                 * the full header_len here read past the end of the
+                 * string whenever a read resumed inside the header. */
+                int diff = MIN(header_len - (int)ppos, wanted);
+                memcpy(buffer, header + ppos, diff);
+                len += diff;
+                ppos += diff;
+        }
+
+        if (len >= wanted)
+                goto out;
+
+        curpos = header_len;
+
+        for (i = 0; i < MAX_PROFS; i++) {
+                int copied;
+                struct prof_ent *pe = &prof_ents[i];
+                long long cycles_per;
+                /*
+                 * find the part of the array that the buffer wants
+                 */
+                if (ppos >= (curpos + line_len)) {
+                        curpos += line_len;
+                        continue;
+                }
+                /* the clever caller split a line */
+                if (ppos > curpos) {
+                        *start = buffer + (ppos - curpos);
+                }
+
+                /* average cycles per completed interval; avoid dividing by 0 */
+                if (pe->finishes == 0)
+                        cycles_per = 0;
+                else
+                {
+                        cycles_per = pe->total_cycles;
+                        do_div (cycles_per, pe->finishes);
+                }
+
+                copied = sprintf(buffer + len, format, pe->str, cycles_per,
+                                 pe->starts, pe->finishes, pe->total_cycles);
+
+                len += copied;
+
+                /* pad to line len, -1 for \n */
+                if ((copied < line_len-1)) {
+                        int diff = (line_len-1) - copied;
+                        memset(buffer + len, ' ', diff);
+                        len += diff;
+                        copied += diff;
+                }
+
+                buffer[len++]= '\n';
+
+                /* bail if we have enough */
+                if (((buffer + len) - *start) >= wanted)
+                        break;
+
+                curpos += line_len;
+        }
+
+        /* lameness */
+        if (i == MAX_PROFS)
+                *eof = 1;
+ out:
+
+        return MIN(((buffer + len) - *start), wanted);
+}
+
+/*
+ * all kids love /proc :/
+ */
+static unsigned char basedir[]="net/portals";
+#endif /* PORTALS_PROFILING */
+
+/*
+ * Register the Portals /proc profiling file (when profiling is compiled
+ * in) and the "portals" sysctl tree (when CONFIG_SYSCTL is set).
+ *
+ * Returns 0 on success, -1 if the profiling table is inconsistent or the
+ * /proc entry cannot be created.
+ */
+int insert_proc(void)
+{
+#if PORTALS_PROFILING
+        unsigned char dir[128];
+        struct proc_dir_entry *ent;
+
+        if (ARRAY_SIZE(prof_ents) != MAX_PROFS) {
+                CERROR("profiling enum and array are out of sync.\n");
+                return -1;
+        }
+
+        /* strcat() appends to an existing string, so the buffer must
+         * start out empty (remove_proc() already does this). */
+        dir[0] = '\0';
+
+        /*
+         * This is pretty lame. assuming that failure just
+         * means that they already existed.
+         */
+        strcat(dir, basedir);
+        create_proc_entry(dir, S_IFDIR, 0);
+
+        strcat(dir, "/cycles");
+        ent = create_proc_entry(dir, 0, 0);
+        if (!ent) {
+                CERROR("couldn't register %s?\n", dir);
+                return -1;
+        }
+
+        ent->data = NULL;
+        ent->read_proc = prof_read_proc;
+#endif /* PORTALS_PROFILING */
+
+#ifdef CONFIG_SYSCTL
+        /* register only once; the header doubles as the "registered" flag */
+        if (!portals_table_header)
+                portals_table_header = register_sysctl_table(top_table, 0);
+#endif
+
+        return 0;
+}
+
+/*
+ * Undo insert_proc(): remove the profiling file and its directory, then
+ * unregister the sysctl tree if it was registered.
+ */
+void remove_proc(void)
+{
+#if PORTALS_PROFILING
+        unsigned char dir[128];
+        int end;
+
+        dir[0]='\0';
+        strcat(dir, basedir);
+
+        /* remember where the directory name ends so it can be restored */
+        end = strlen(dir);
+
+        strcat(dir, "/cycles");
+        remove_proc_entry(dir,0);
+
+        dir[end] = '\0';
+        remove_proc_entry(dir,0);
+#endif /* PORTALS_PROFILING */
+
+#ifdef CONFIG_SYSCTL
+        if (portals_table_header)
+                unregister_sysctl_table(portals_table_header);
+        portals_table_header = NULL;
+#endif
+}
diff --git a/lustre/portals/packaging/.cvsignore b/lustre/portals/packaging/.cvsignore
new file mode 100644
index 0000000..fd1d56a
--- /dev/null
+++ b/lustre/portals/packaging/.cvsignore
@@ -0,0 +1,8 @@
+Makefile
+Makefile.in
+aclocal.m4
+config.log
+config.status
+config.cache
+configure
+portals.spec
diff --git a/lustre/portals/packaging/Makefile.am b/lustre/portals/packaging/Makefile.am
new file mode 100644
index 0000000..126bc69
--- /dev/null
+++ b/lustre/portals/packaging/Makefile.am
@@ -0,0 +1,6 @@
+# Copyright (C) 2002 Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution + +EXTRA_DIST = portals.spec \ No newline at end of file diff --git a/lustre/portals/packaging/portals.spec.in b/lustre/portals/packaging/portals.spec.in new file mode 100644 index 0000000..e196b3f --- /dev/null +++ b/lustre/portals/packaging/portals.spec.in @@ -0,0 +1,116 @@ +%define kversion @RELEASE@ +%define linuxdir @LINUX@ +%define version HEAD + +Summary: Sandia Portals Message Passing - utilities +Name: portals +Version: %{version} +Release: 0210101748uml +Copyright: LGPL +Group: Utilities/System +BuildRoot: /var/tmp/portals-%{version}-root +Source: http://sandiaportals.org/portals-%{version}.tar.gz + +%description +Sandia Portals message passing package. Contains kernel modules, libraries and utilities. + +%package -n portals-modules +Summary: Kernel modules and NAL's for portals +Group: Development/Kernel + +%description -n portals-modules +Object-Based Disk storage drivers for Linux %{kversion}. + +%package -n portals-source +Summary: Portals kernel source for rebuilding with other kernels +Group: Development/Kernel + +%description -n portals-source +Portals kernel source for rebuilding with other kernels + +%prep +%setup -n portals-%{version} + +%build +rm -rf $RPM_BUILD_ROOT + +# Create the pristine source directory. +srcdir=$RPM_BUILD_ROOT/usr/src/portals-%{version} +mkdir -p $srcdir +find . -name CVS -prune -o -print | cpio -ap $srcdir + +# Set an explicit path to our Linux tree, if we can. 
+conf_flag= +linuxdir=%{linuxdir} +test -d $linuxdir && conf_flag=--with-linux=$linuxdir +./configure $conf_flag +make + +%install +make install prefix=$RPM_BUILD_ROOT + +%ifarch alpha +# this hurts me + conf_flag= + linuxdir=%{linuxdir} + test -d $linuxdir && conf_flag=--with-linux=$linuxdir + make clean + ./configure --enable-rtscts-myrinet $conf_flag + make + cp linux/rtscts/rtscts.o $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/net/portals/rtscts_myrinet.o + cp user/myrinet_utils/mcpload $RPM_BUILD_ROOT/usr/sbin/mcpload +%endif + + +%files +%attr(-, root, root) %doc COPYING +%attr(-, root, root) /usr/sbin/acceptor +%attr(-, root, root) /usr/sbin/ptlctl +%attr(-, root, root) /usr/sbin/debugctl +%ifarch alpha +%attr(-, root, root) /usr/sbin/mcpload +%endif +%attr(-, root, root) /lib/libmyrnal.a +%attr(-, root, root) /lib/libptlapi.a +%attr(-, root, root) /lib/libptlctl.a +%attr(-, root, root) /lib/libprocbridge.a +%attr(-, root, root) /lib/libptllib.a +%attr(-, root, root) /lib/libtcpnal.a +%attr(-, root, root) /lib/libtcpnalutil.a +%attr(-, root, root) /usr/include/portals/*.h +%attr(-, root, root) /usr/include/portals/base/*.h +%attr(-, root, root) /usr/include/linux/*.h + +%files -n portals-modules +%attr(-, root, root) %doc COPYING +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/portals.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptlrouter.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/kptrxtx.o +%ifarch alpha +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/p3mod.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/rtscts.o +%endif +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/portals/*nal.o + +%files -n portals-source +%attr(-, root, root) /usr/src/portals-%{version} + +%post +if [ ! 
-e /dev/portals ]; then + mknod /dev/portals c 10 240 +fi +depmod -ae || exit 0 + +grep -q portals /etc/modules.conf || \ + echo 'alias char-major-10-240 portals' >> /etc/modules.conf + +grep -q '/dev/portals' /etc/modules.conf || \ + echo 'alias /dev/portals portals' >> /etc/modules.conf + +%postun +depmod -ae || exit 0 + +%clean +#rm -rf $RPM_BUILD_ROOT + +# end of file diff --git a/lustre/portals/portals/.cvsignore b/lustre/portals/portals/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lustre/portals/portals/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lustre/portals/portals/Makefile.am b/lustre/portals/portals/Makefile.am new file mode 100644 index 0000000..8c03749 --- /dev/null +++ b/lustre/portals/portals/Makefile.am @@ -0,0 +1,10 @@ +# Copyright (C) 2002 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + + +CPPFLAGS= +INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include +lib_LIBRARIES= libportals.a +libportals_a_SOURCES= api-eq.c api-init.c api-me.c api-errno.c api-ni.c api-wrap.c lib-dispatch.c lib-init.c lib-me.c lib-msg.c lib-eq.c lib-md.c lib-move.c lib-ni.c lib-pid.c diff --git a/lustre/portals/portals/Makefile.mk b/lustre/portals/portals/Makefile.mk new file mode 100644 index 0000000..5627ef7 --- /dev/null +++ b/lustre/portals/portals/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution
+
+include ../Kernelenv
+
+obj-y += portals.o
+portals-objs := lib-dispatch.o lib-eq.o lib-init.o lib-md.o lib-me.o lib-move.o lib-msg.o lib-ni.o lib-not-impl.o lib-pid.o api-eq.o api-errno.o api-init.o api-md.o api-me.o api-ni.o api-wrap.o
diff --git a/lustre/portals/portals/api-eq.c b/lustre/portals/portals/api-eq.c
new file mode 100644
index 0000000..e066619
--- /dev/null
+++ b/lustre/portals/portals/api-eq.c
@@ -0,0 +1,158 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-eq.c
+ * User-level event queue management routines
+ *
+ * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ * This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+/* Library-wide EQ setup hook; kept for symmetry with the other modules. */
+int ptl_eq_init(void)
+{
+        /* Nothing to do anymore... */
+        return PTL_OK;
+}
+
+/* Library-wide EQ teardown hook. */
+void ptl_eq_fini(void)
+{
+        /* Nothing to do anymore... */
+}
+
+/* Per-interface EQ setup hook, called from PtlNIInit(). */
+int ptl_eq_ni_init(nal_t * nal)
+{
+        /* Nothing to do anymore... */
+        return PTL_OK;
+}
+
+/* Per-interface EQ teardown hook, called from PtlNIFini(). */
+void ptl_eq_ni_fini(nal_t * nal)
+{
+        /* Nothing to do anymore... */
+}
+
+/*
+ * Non-blocking fetch of the next event from 'eventq' into *ev.
+ *
+ * Returns PTL_NOINIT / PTL_INV_EQ on bad state or handle, PTL_EQ_EMPTY
+ * when no new event is queued, PTL_EQ_DROPPED when the slot holds a newer
+ * event than expected (older ones were overwritten), PTL_OK otherwise.
+ */
+int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t * ev)
+{
+        ptl_eq_t *eq;
+        int rc, new_index;
+        unsigned long flags;
+        ptl_event_t *new_event;
+        nal_t *nal;
+        ENTRY;
+
+        if (!ptl_init)
+                RETURN(PTL_NOINIT);
+
+        nal = ptl_hndl2nal(&eventq);
+        if (!nal)
+                RETURN(PTL_INV_EQ);
+
+        eq = ptl_handle2usereq(&eventq);
+        nal->lock(nal, &flags);
+
+        /* size must be a power of 2 to handle a wrapped sequence # */
+        LASSERT (eq->size != 0 &&
+                 eq->size == LOWEST_BIT_SET (eq->size));
+
+        /* the slot the next expected sequence number maps to */
+        new_index = eq->sequence & (eq->size - 1);
+        new_event = &eq->base[new_index];
+        CDEBUG(D_INFO, "new_event: %p, sequence: %lu, eq->size: %u\n",
+               new_event, eq->sequence, eq->size);
+        if (PTL_SEQ_GT (eq->sequence, new_event->sequence)) {
+                nal->unlock(nal, &flags);
+                RETURN(PTL_EQ_EMPTY);
+        }
+
+        *ev = *new_event;
+
+        /* Set the unlinked_me interface number if there is one to pass
+         * back, since the NAL hasn't a clue what it is and therefore can't
+         * set it. */
+        if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE))
+                ev->unlinked_me.nal_idx = eventq.nal_idx;
+
+        /* ensure event is delivered correctly despite possible
+           races with lib_finalize */
+        if (eq->sequence != new_event->sequence) {
+                CERROR("DROPPING EVENT: eq seq %lu ev seq %lu\n",
+                       eq->sequence, new_event->sequence);
+                rc = PTL_EQ_DROPPED;
+        } else {
+                rc = PTL_OK;
+        }
+
+        /* resynchronise on the event actually consumed */
+        eq->sequence = new_event->sequence + 1;
+        nal->unlock(nal, &flags);
+        RETURN(rc);
+}
+
+
+/*
+ * Blocking variant of PtlEQGet(): polls until the queue is no longer
+ * empty, calling the NAL's yield hook (if it has one) between polls.
+ */
+int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out)
+{
+        int rc;
+
+        /* PtlEQGet does the handle checking */
+        while ((rc = PtlEQGet(eventq_in, event_out)) == PTL_EQ_EMPTY) {
+                nal_t *nal = ptl_hndl2nal(&eventq_in);
+
+                if (nal->yield)
+                        nal->yield(nal);
+        }
+
+        return rc;
+}
+
+#ifndef __KERNEL__
+/* Jump target for the SIGALRM handler.  sigsetjmp(.., 1)/siglongjmp are
+ * used so the signal mask in force before the handler ran is restored;
+ * with plain setjmp/longjmp SIGALRM may stay blocked after the first
+ * timeout, which would silently disable every later one. */
+static sigjmp_buf eq_jumpbuf;
+
+/* SIGALRM handler: abandon the wait in PtlEQWait_timeout(). */
+static void eq_timeout(int signo)
+{
+        siglongjmp(eq_jumpbuf, -1);
+}
+
+/* Reinstall the caller's SIGALRM handler and re-arm whatever remains of
+ * the alarm (if any) that was pending when the wait began. */
+static void eq_restore_alarm(void (*prev) (int), int left_over,
+                             time_t time_at_start)
+{
+        time_t elapsed = time(NULL) - time_at_start;
+
+        signal(SIGALRM, prev);
+        if (left_over <= 0)
+                alarm(0);       /* nothing was pending: just cancel ours */
+        else if (elapsed < (time_t)left_over)
+                alarm(left_over - (int)elapsed);
+        else
+                alarm(1);       /* already overdue: deliver it promptly */
+}
+
+/*
+ * PtlEQWait() bounded by 'timeout' seconds (user space only).
+ *
+ * Returns PTL_EQ_EMPTY if the alarm fires first, otherwise whatever
+ * PtlEQWait() returns.  State lives in statics because it must survive
+ * the siglongjmp(); consequently this is not reentrant.
+ * NOTE(review): the jump can leave PtlEQGet() while it holds nal->lock --
+ * confirm the NAL's lock tolerates that.
+ */
+int PtlEQWait_timeout(ptl_handle_eq_t eventq_in, ptl_event_t * event_out,
+                      int timeout)
+{
+        static void (*prev) (int);
+        static int left_over;
+        static time_t time_at_start;
+        int rc;
+
+        if (sigsetjmp(eq_jumpbuf, 1)) {
+                eq_restore_alarm(prev, left_over, time_at_start);
+                return PTL_EQ_EMPTY;
+        }
+
+        time_at_start = time(NULL);
+        left_over = alarm(timeout);
+        prev = signal(SIGALRM, eq_timeout);
+        /* A pre-existing alarm that is due sooner wins.  left_over == 0
+         * means no alarm was pending; re-arming with it would cancel the
+         * timeout we just set and make this call block forever. */
+        if (left_over != 0 && left_over < timeout)
+                alarm(left_over);
+
+        rc = PtlEQWait(eventq_in, event_out);
+
+        eq_restore_alarm(prev, left_over, time_at_start);
+
+        return rc;
+}
+
+#endif
+
diff --git a/lustre/portals/portals/api-errno.c b/lustre/portals/portals/api-errno.c
new file mode 100644
index 0000000..026c93b
--- /dev/null
+++ b/lustre/portals/portals/api-errno.c
@@ -0,0 +1,55 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-errno.c
+ * Instantiate the string table of errors
+ *
+ * This file is part of Lustre, http://www.sf.net/projects/lustre/
+ */
+
+/* If you change these, you must update the number table in portals/errno.h */
+const char *ptl_err_str[] = {
+        "PTL_OK",
+        "PTL_SEGV",
+
+        "PTL_NOSPACE",
+        "PTL_INUSE",
+        "PTL_VAL_FAILED",
+
+        "PTL_NAL_FAILED",
+        "PTL_NOINIT",
+        "PTL_INIT_DUP",
+        "PTL_INIT_INV",
+        "PTL_AC_INV_INDEX",
+
+        "PTL_INV_ASIZE",
+        "PTL_INV_HANDLE",
+        "PTL_INV_MD",
+        "PTL_INV_ME",
+        "PTL_INV_NI",
+/* If you change these, you must update the number table in portals/errno.h */
+        "PTL_ILL_MD",
+        "PTL_INV_PROC",
+        "PTL_INV_PSIZE",
+        "PTL_INV_PTINDEX",
+        "PTL_INV_REG",
+
+        "PTL_INV_SR_INDX",
+        "PTL_ML_TOOLONG",
+        "PTL_ADDR_UNKNOWN",
+        "PTL_INV_EQ",
+        "PTL_EQ_DROPPED",
+
+        "PTL_EQ_EMPTY",
+        "PTL_NOUPDATE",
+        "PTL_FAIL",
+        "PTL_NOT_IMPLEMENTED",
+        "PTL_NO_ACK",
+
+        "PTL_IOV_TOO_MANY",
+        "PTL_IOV_TOO_SMALL",
+
+        "PTL_EQ_INUSE",
+        "PTL_MD_INUSE"
+};
+/* If you change these, you must update the number table in portals/errno.h */
diff --git a/lustre/portals/portals/api-init.c b/lustre/portals/portals/api-init.c
new file mode 100644
index 0000000..e59c922
--- /dev/null
+++ b/lustre/portals/portals/api-init.c
@@ -0,0 +1,71 
@@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * api/api-init.c + * Initialization and global data for the p30 user side library + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */
+
+#include <portals/api-support.h>
+
+/* Non-zero between PtlInit() and PtlFini(). */
+int ptl_init;
+/* Debug masks and switches consulted by the debug macros. */
+unsigned int portal_subsystem_debug = 0xfff7e3ff;
+unsigned int portal_debug = ~0;
+unsigned int portal_printk;
+unsigned int portal_stack;
+
+#ifdef __KERNEL__
+atomic_t portal_kmemory = ATOMIC_INIT(0);
+#endif
+
+/* Per-transport state shared with the rest of the p30 library. */
+int __p30_initialized;
+int __p30_myr_initialized;
+int __p30_ip_initialized;
+ptl_handle_ni_t __myr_ni_handle;
+ptl_handle_ni_t __ip_ni_handle;
+
+int __p30_myr_timeout = 10;
+int __p30_ip_timeout;
+
+/*
+ * Bring up the API-side library state.  Safe to call repeatedly: only the
+ * first call (while ptl_init is clear) does any work.  Always returns
+ * PTL_OK.
+ */
+int PtlInit(void)
+{
+        if (!ptl_init) {
+                /* NI first, then ME, then EQ; PtlFini() unwinds in reverse */
+                ptl_ni_init();
+                ptl_me_init();
+                ptl_eq_init();
+                ptl_init = 1;
+                __p30_initialized = 1;
+        }
+
+        return PTL_OK;
+}
+
+
+/*
+ * Tear down what PtlInit() set up and mark the library uninitialised.
+ */
+void PtlFini(void)
+{
+        /* Unwind in the opposite order to PtlInit() */
+        ptl_eq_fini();
+        ptl_me_fini();
+        ptl_ni_fini();
+
+        ptl_init = 0;
+}
diff --git a/lustre/portals/portals/api-me.c b/lustre/portals/portals/api-me.c
new file mode 100644
index 0000000..e724e58
--- /dev/null
+++ b/lustre/portals/portals/api-me.c
@@ -0,0 +1,42 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-me.c
+ * Match Entry local operations.
+ *
+ * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ * This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+/* Library-wide ME setup hook; nothing to set up. */
+int ptl_me_init(void)
+{
+        return PTL_OK;
+}
+void ptl_me_fini(void)
+{                               /* Nothing to do */
+}
+/* Per-interface ME setup hook, called from PtlNIInit(). */
+int ptl_me_ni_init(nal_t * nal)
+{
+        return PTL_OK;
+}
+
+void ptl_me_ni_fini(nal_t * nal)
+{                               /* Nothing to do... */
+}
diff --git a/lustre/portals/portals/api-ni.c b/lustre/portals/portals/api-ni.c
new file mode 100644
index 0000000..b2e069e
--- /dev/null
+++ b/lustre/portals/portals/api-ni.c
@@ -0,0 +1,197 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-ni.c
+ * Network Interface code
+ *
+ * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ * This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <portals/api-support.h>
+
+/* Put some magic in the NI handle so uninitialised/zeroed handles are easy
+ * to spot */
+#define NI_HANDLE_MAGIC 0xebc0de00
+#define NI_HANDLE_MASK 0x000000ff
+#define MAX_NIS 8
+/* Slot table of live interfaces; a NULL entry is a free slot. */
+static nal_t *ptl_interfaces[MAX_NIS];
+/* Number of non-NULL entries in ptl_interfaces[]. */
+int ptl_num_interfaces = 0;
+
+/*
+ * Map any API handle to the NAL it belongs to.  Returns NULL when the
+ * handle lacks the NI magic, its index is out of range, or the slot is
+ * empty.
+ */
+nal_t *ptl_hndl2nal(ptl_handle_any_t *handle)
+{
+        unsigned int idx = handle->nal_idx;
+
+        /* XXX we really rely on the caller NOT racing with interface
+         * setup/teardown. That ensures her NI handle can't get
+         * invalidated out from under her (or worse, swapped for a
+         * completely different interface!) */
+
+        /* everything above the index bits must equal the magic */
+        if (((idx ^ NI_HANDLE_MAGIC) & ~NI_HANDLE_MASK) != 0)
+                return NULL;
+
+        idx &= NI_HANDLE_MASK;
+        if (idx < MAX_NIS)
+                return ptl_interfaces[idx];
+
+        return NULL;
+}
+
+/* Reset the interface table; called from PtlInit(). */
+int ptl_ni_init(void)
+{
+        int i;
+
+        /* every slot index must fit in the handle's index bits */
+        LASSERT (MAX_NIS <= (NI_HANDLE_MASK + 1));
+
+        for (i = 0; i < MAX_NIS; i++)
+                ptl_interfaces[i] = NULL;
+
+        return PTL_OK;
+}
+
+/*
+ * Shut down every interface still registered; called from PtlFini().
+ * Slots are cleared and the count reset so that stale handles stop
+ * resolving and a later PtlInit()/PtlNIInit() starts from an empty,
+ * consistent table.
+ */
+void ptl_ni_fini(void)
+{
+        int i;
+
+        for (i = 0; i < MAX_NIS; i++) {
+                nal_t *nal = ptl_interfaces[i];
+                if (!nal)
+                        continue;
+
+                if (nal->shutdown)
+                        nal->shutdown(nal, i);
+
+                ptl_interfaces[i] = NULL;
+        }
+
+        ptl_num_interfaces = 0;
+}
+
+#ifdef __KERNEL__
+/* Serialises interface setup/teardown in the kernel build. */
+DECLARE_MUTEX(ptl_ni_init_mutex);
+
+static void ptl_ni_init_mutex_enter (void)
+{
+        down (&ptl_ni_init_mutex);
+}
+
+static void ptl_ni_init_mutex_exit (void)
+{
+        up (&ptl_ni_init_mutex);
+}
+
+#else
+/* User-space build: no locking. */
+static void ptl_ni_init_mutex_enter (void)
+{
+}
+
+static void ptl_ni_init_mutex_exit (void)
+{
+}
+
+#endif
+
+/*
+ * Start (or take another reference on) the interface created by
+ * 'interface' and return its handle in *handle.
+ *
+ * Returns PTL_NOINIT before PtlInit(), PTL_NAL_FAILED if the NAL could
+ * not be started, PTL_NOSPACE if all MAX_NIS slots are taken, PTL_OK
+ * otherwise.
+ */
+int PtlNIInit(ptl_interface_t interface, ptl_pt_index_t ptl_size,
+              ptl_ac_index_t acl_size, ptl_pid_t requested_pid,
+              ptl_handle_ni_t * handle)
+{
+        nal_t *nal;
+        int i;
+        int slot;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        ptl_ni_init_mutex_enter ();
+
+        /* Use the first free slot rather than ptl_num_interfaces:
+         * PtlNIFini() can free a slot in the middle of the table, after
+         * which the count no longer points at an unused entry.  With no
+         * holes the two are identical; slot == MAX_NIS means "full". */
+        for (slot = 0; slot < MAX_NIS; slot++) {
+                if (ptl_interfaces[slot] == NULL)
+                        break;
+        }
+
+        nal = interface(slot, ptl_size, acl_size, requested_pid);
+
+        if (!nal) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_NAL_FAILED;
+        }
+
+        /* Already registered?  Search the whole table for the same
+         * reason as above. */
+        for (i = 0; i < MAX_NIS; i++) {
+                if (ptl_interfaces[i] == nal) {
+                        nal->refct++;
+                        handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | i;
+                        fprintf(stderr, "Returning existing NAL (%d)\n", i);
+                        ptl_ni_init_mutex_exit ();
+                        return PTL_OK;
+                }
+        }
+        nal->refct = 1;
+
+        if (slot >= MAX_NIS) {
+                if (nal->shutdown)
+                        nal->shutdown (nal, slot);
+                ptl_ni_init_mutex_exit ();
+                return PTL_NOSPACE;
+        }
+
+        handle->nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | slot;
+        ptl_interfaces[slot] = nal;
+        ptl_num_interfaces++;
+
+        ptl_eq_ni_init(nal);
+        ptl_me_ni_init(nal);
+
+        ptl_ni_init_mutex_exit ();
+        return PTL_OK;
+}
+
+
+/*
+ * Drop one reference on interface 'ni'; the last reference shuts the NAL
+ * down and frees its slot.
+ *
+ * Returns PTL_NOINIT, PTL_INV_HANDLE, PTL_OK while references remain, or
+ * the NAL's shutdown result.
+ */
+int PtlNIFini(ptl_handle_ni_t ni)
+{
+        nal_t *nal;
+        int idx;
+        int rc;
+
+        if (!ptl_init)
+                return PTL_NOINIT;
+
+        ptl_ni_init_mutex_enter ();
+
+        nal = ptl_hndl2nal (&ni);
+        if (nal == NULL) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_INV_HANDLE;
+        }
+
+        idx = ni.nal_idx & NI_HANDLE_MASK;
+
+        nal->refct--;
+        if (nal->refct > 0) {
+                ptl_ni_init_mutex_exit ();
+                return PTL_OK;
+        }
+
+        ptl_me_ni_fini(nal);
+        ptl_eq_ni_fini(nal);
+
+        rc = PTL_OK;
+        if (nal->shutdown)
+                rc = nal->shutdown(nal, idx);
+
+        ptl_interfaces[idx] = NULL;
+        ptl_num_interfaces--;
+
+        ptl_ni_init_mutex_exit ();
+        return rc;
+}
+
+/* Return the NI handle for any handle: the two share one representation,
+ * so this is a plain copy. */
+int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * ni_out)
+{
+        *ni_out = handle_in;
+
+        return PTL_OK;
+}
diff --git a/lustre/portals/portals/api-wrap.c b/lustre/portals/portals/api-wrap.c
new file mode 100644
index 0000000..e54707f
--- /dev/null
+++ b/lustre/portals/portals/api-wrap.c
@@ -0,0 +1,599 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * api/api-wrap.c
+ * User-level wrappers that dispatch across the protection boundaries
+ *
+ * Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ * Copyright (c) 2001-2002 Sandia National Laboratories
+ *
+ * This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+# define DEBUG_SUBSYSTEM S_PORTALS
+#include <portals/api-support.h>
+
+/*
+ * Common dispatch path for the wrappers below: resolve the NAL that owns
+ * 'any_h' and hand it the marshalled argument/return blocks.
+ *
+ * Returns PTL_NOINIT or PTL_INV_HANDLE when the call cannot be
+ * dispatched, PTL_OK otherwise; the operation's own status comes back in
+ * the 'rc' field of the return block.  The value returned by
+ * nal->forward() itself is not examined.
+ */
+static int do_forward(ptl_handle_any_t any_h, int cmd, void *argbuf,
+                      int argsize, void *retbuf, int retsize)
+{
+        nal_t *nal;
+
+        if (!ptl_init) {
+                /* shared by every wrapper, so identify the command
+                 * instead of blaming PtlGetId for all of them */
+                fprintf(stderr, "do_forward: Not initialized (cmd %d)\n", cmd);
+                return PTL_NOINIT;
+        }
+
+        nal = ptl_hndl2nal(&any_h);
+        if (!nal)
+                return PTL_INV_HANDLE;
+
+        nal->forward(nal, cmd, argbuf, argsize, retbuf, retsize);
+
+        return PTL_OK;
+}
+
+/* Fetch the process id of interface 'ni_handle' into *id (if non-NULL). */
+int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id)
+{
+        PtlGetId_in args;
+        PtlGetId_out ret;
+        int rc;
+
+        args.handle_in = ni_handle;
+
+        rc = do_forward(ni_handle, PTL_GETID, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                return rc;
+
+        if (id)
+                *id = ret.id_out;
+
+        return ret.rc;
+}
+
+/* Forward a PTL_FAILNID request for 'nid' with the given threshold. */
+int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold)
+{
+        PtlFailNid_in args;
+        PtlFailNid_out ret;
+        int rc;
+
+        args.interface = interface;
+        args.nid = nid;
+        args.threshold = threshold;
+
+        rc = do_forward (interface, PTL_FAILNID,
+                         &args, sizeof(args), &ret, sizeof (ret));
+
+        return ((rc != PTL_OK) ? rc : ret.rc);
+}
+
+/* Read status register 'register_in' into *status_out (if non-NULL). */
+int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
+                ptl_sr_value_t * status_out)
+{
+        PtlNIStatus_in args;
+        PtlNIStatus_out ret;
+        int rc;
+
+        args.interface_in = interface_in;
+        args.register_in = register_in;
+
+        rc = do_forward(interface_in, PTL_NISTATUS, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (status_out)
+                *status_out = ret.status_out;
+
+        return ret.rc;
+}
+
+/* Query the distance to 'process_in' into *distance_out (if non-NULL). */
+int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
+              unsigned long *distance_out)
+{
+        PtlNIDist_in args;
+        PtlNIDist_out ret;
+        int rc;
+
+        args.interface_in = interface_in;
+        args.process_in = process_in;
+
+        rc = do_forward(interface_in, PTL_NIDIST, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (distance_out)
+                *distance_out = ret.distance_out;
+
+        return ret.rc;
+}
+
+
+
+/* Forward a debug mask to the NAL.  Note the return value is either a
+ * dispatch error code or the NAL's ret.rc. */
+unsigned int PtlNIDebug(ptl_handle_ni_t ni, unsigned int mask_in)
+{
+        PtlNIDebug_in args;
+        PtlNIDebug_out ret;
+        int rc;
+
+        args.mask_in = mask_in;
+
+        rc = do_forward(ni, PTL_NIDEBUG, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        return ret.rc;
+}
+
+/* Attach a new match entry to portal 'index_in'; its handle is returned
+ * in *handle_out (if non-NULL), tagged with the caller's interface. */
+int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
+                ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
+                ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
+                ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out)
+{
+        PtlMEAttach_in args;
+        PtlMEAttach_out ret;
+        int rc;
+
+        args.interface_in = interface_in;
+        args.index_in = index_in;
+        args.match_id_in = match_id_in;
+        args.match_bits_in = match_bits_in;
+        args.ignore_bits_in = ignore_bits_in;
+        args.unlink_in = unlink_in;
+        args.position_in = pos_in;
+
+        rc = do_forward(interface_in, PTL_MEATTACH, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (handle_out) {
+                handle_out->nal_idx = interface_in.nal_idx;
+                handle_out->cookie = ret.handle_out.cookie;
+        }
+
+        return ret.rc;
+}
+
+/* Insert a new match entry relative to 'current_in'.  A bad handle is
+ * reported as PTL_INV_ME. */
+int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
+                ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in,
+                ptl_unlink_t unlink_in, ptl_ins_pos_t position_in,
+                ptl_handle_me_t * handle_out)
+{
+        PtlMEInsert_in args;
+        PtlMEInsert_out ret;
+        int rc;
+
+        args.current_in = current_in;
+        args.match_id_in = match_id_in;
+        args.match_bits_in = match_bits_in;
+        args.ignore_bits_in = ignore_bits_in;
+        args.unlink_in = unlink_in;
+        args.position_in = position_in;
+
+        rc = do_forward(current_in, PTL_MEINSERT, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
+
+        if (handle_out) {
+                handle_out->nal_idx = current_in.nal_idx;
+                handle_out->cookie = ret.handle_out.cookie;
+        }
+        return ret.rc;
+}
+
+/* Unlink match entry 'current_in'.  A bad handle is reported as
+ * PTL_INV_ME. */
+int PtlMEUnlink(ptl_handle_me_t current_in)
+{
+        PtlMEUnlink_in args;
+        PtlMEUnlink_out ret;
+        int rc;
+
+        args.current_in = current_in;
+        args.unlink_in = PTL_RETAIN;
+
+        rc = do_forward(current_in, PTL_MEUNLINK, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
+
+        return ret.rc;
+}
+
+/* Forward a PTL_TBLDUMP request for portal table index 'index_in'. */
+int PtlTblDump(ptl_handle_ni_t ni, int index_in)
+{
+        PtlTblDump_in args;
+        PtlTblDump_out ret;
+        int rc;
+
+        args.index_in = index_in;
+
+        rc = do_forward(ni, PTL_TBLDUMP, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        return ret.rc;
+}
+
+/* Forward a PTL_MEDUMP request for match entry 'current_in'.  A bad
+ * handle is reported as PTL_INV_ME. */
+int PtlMEDump(ptl_handle_me_t current_in)
+{
+        PtlMEDump_in args;
+        PtlMEDump_out ret;
+        int rc;
+
+        args.current_in = current_in;
+
+        rc = do_forward(current_in, PTL_MEDUMP, &args, sizeof(args), &ret,
+                        sizeof(ret));
+
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? 
PTL_INV_ME : rc;
+
+        return ret.rc;
+}
+
+/*
+ * Ask the NAL that owns 'current_in' to validate the memory described by
+ * 'md_in': one contiguous region, or each element of the iovec when
+ * PTL_MD_IOV is set.  NALs without a validate hook accept everything.
+ *
+ * Returns PTL_NOINIT, PTL_INV_HANDLE, PTL_SEGV if any region is rejected,
+ * or 0 on success.
+ */
+static int validate_md(ptl_handle_any_t current_in, ptl_md_t md_in)
+{
+        nal_t *nal;
+        int rc;
+        int i;
+
+        if (!ptl_init) {
+                fprintf(stderr, "PtlMDAttach/Bind/Update: Not initialized\n");
+                return PTL_NOINIT;
+        }
+
+        nal = ptl_hndl2nal(&current_in);
+        if (!nal)
+                return PTL_INV_HANDLE;
+
+        if (nal->validate != NULL)              /* nal->validate not a NOOP */
+        {
+                if ((md_in.options & PTL_MD_IOV) == 0) /* contiguous */
+                {
+                        rc = nal->validate (nal, md_in.start, md_in.length);
+                        if (rc)
+                                return (PTL_SEGV);
+                }
+                else
+                {
+                        struct iovec *iov = (struct iovec *)md_in.start;
+
+                        for (i = 0; i < md_in.niov; i++, iov++)
+                        {
+                                rc = nal->validate (nal, iov->iov_base, iov->iov_len);
+                                if (rc)
+                                        return (PTL_SEGV);
+                        }
+                }
+        }
+
+        return 0;
+}
+
+/* Translate the user-side EQ handle stored in an MD into the handle the
+ * NAL knows it by (PTL_EQ_NONE passes through unchanged). */
+static ptl_handle_eq_t md2eq (ptl_md_t *md)
+{
+        if (PtlHandleEqual (md->eventq, PTL_EQ_NONE))
+                return (PTL_EQ_NONE);
+
+        return (ptl_handle2usereq (&md->eventq)->cb_eq_handle);
+}
+
+
+/* Validate 'md_in' and attach it to match entry 'me_in'.  A bad handle is
+ * reported as PTL_INV_ME; the new MD handle goes to *handle_out (if
+ * non-NULL). */
+int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in,
+                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out)
+{
+        PtlMDAttach_in args;
+        PtlMDAttach_out ret;
+        int rc;
+
+        rc = validate_md(me_in, md_in);
+        if (rc == PTL_OK) {
+                args.eq_in = md2eq(&md_in);
+                args.me_in = me_in;
+                args.md_in = md_in;
+                args.unlink_in = unlink_in;
+
+                rc = do_forward(me_in, PTL_MDATTACH,
+                                &args, sizeof(args), &ret, sizeof(ret));
+        }
+
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_ME : rc;
+
+        if (handle_out) {
+                handle_out->nal_idx = me_in.nal_idx;
+                handle_out->cookie = ret.handle_out.cookie;
+        }
+        return ret.rc;
+}
+
+
+
+/* Validate 'md_in' and bind it directly to interface 'ni_in' (no match
+ * entry); the new MD handle goes to *handle_out (if non-NULL). */
+int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
+              ptl_handle_md_t * handle_out)
+{
+        PtlMDBind_in args;
+        PtlMDBind_out ret;
+        int rc;
+
+        rc = validate_md(ni_in, md_in);
+        if (rc != PTL_OK)
+                return rc;
+
+        args.eq_in = md2eq(&md_in);
+        args.ni_in = ni_in;
+        args.md_in = md_in;
+
+        rc = do_forward(ni_in, PTL_MDBIND,
+                        &args, sizeof(args), &ret, sizeof(ret));
+
+        if (rc != PTL_OK)
+                return rc;
+
+        if (handle_out) {
+                handle_out->nal_idx = ni_in.nal_idx;
+                handle_out->cookie = ret.handle_out.cookie;
+        }
+        return ret.rc;
+}
+
+/*
+ * Forward a PTL_MDUPDATE request for 'md_in'.  Either of old_inout /
+ * new_inout may be NULL; a non-NULL new_inout is validated first, and
+ * *old_inout is overwritten with the returned descriptor.  When
+ * 'testq_in' is a real EQ, its user-side sequence number is passed along
+ * with the request.  A bad handle is reported as PTL_INV_MD.
+ */
+int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout,
+                ptl_md_t *new_inout, ptl_handle_eq_t testq_in)
+{
+        PtlMDUpdate_internal_in args;
+        PtlMDUpdate_internal_out ret;
+        int rc;
+
+        args.md_in = md_in;
+
+        if (old_inout) {
+                args.old_inout = *old_inout;
+                args.old_inout_valid = 1;
+        } else
+                args.old_inout_valid = 0;
+
+        if (new_inout) {
+                rc = validate_md (md_in, *new_inout);
+                if (rc != PTL_OK)
+                        return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc;
+                args.new_inout = *new_inout;
+                args.new_inout_valid = 1;
+        } else
+                args.new_inout_valid = 0;
+
+        if (PtlHandleEqual (testq_in, PTL_EQ_NONE)) {
+                args.testq_in = PTL_EQ_NONE;
+                args.sequence_in = -1;
+        } else {
+                ptl_eq_t *eq = ptl_handle2usereq (&testq_in);
+
+                args.testq_in = eq->cb_eq_handle;
+                args.sequence_in = eq->sequence;
+        }
+
+        rc = do_forward(md_in, PTL_MDUPDATE, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? PTL_INV_MD : rc;
+
+        if (old_inout)
+                *old_inout = ret.old_inout;
+
+        return ret.rc;
+}
+
+/* Unlink memory descriptor 'md_in'.  A bad handle is reported as
+ * PTL_INV_MD. */
+int PtlMDUnlink(ptl_handle_md_t md_in)
+{
+        PtlMDUnlink_in args;
+        PtlMDUnlink_out ret;
+        int rc;
+
+        args.md_in = md_in;
+        rc = do_forward(md_in, PTL_MDUNLINK, &args, sizeof(args), &ret,
+                        sizeof(ret));
+        if (rc != PTL_OK)
+                return (rc == PTL_INV_HANDLE) ? 
PTL_INV_MD : rc; + + return ret.rc; +} + +int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count, + int (*callback) (ptl_event_t * event), + ptl_handle_eq_t * handle_out) +{ + ptl_eq_t *eq = NULL; + ptl_event_t *ev = NULL; + PtlEQAlloc_in args; + PtlEQAlloc_out ret; + int rc, i; + nal_t *nal; + + if (!ptl_init) + return PTL_NOINIT; + + nal = ptl_hndl2nal (&interface); + if (nal == NULL) + return PTL_INV_HANDLE; + + if (count != LOWEST_BIT_SET(count)) { /* not a power of 2 already */ + do { /* knock off all but the top bit... */ + count &= ~LOWEST_BIT_SET (count); + } while (count != LOWEST_BIT_SET(count)); + + count <<= 1; /* ...and round up */ + } + + if (count == 0) /* catch bad parameter / overflow on roundup */ + return (PTL_VAL_FAILED); + + PORTAL_ALLOC(ev, count * sizeof(ptl_event_t)); + if (!ev) + return PTL_NOSPACE; + + for (i = 0; i < count; i++) + ev[i].sequence = 0; + + if (nal->validate != NULL) { + rc = nal->validate(nal, ev, count * sizeof(ptl_event_t)); + if (rc != PTL_OK) + goto fail; + } + + args.ni_in = interface; + args.count_in = count; + args.base_in = ev; + args.len_in = count * sizeof(*ev); + args.callback_in = callback; + + rc = do_forward(interface, PTL_EQALLOC, &args, sizeof(args), &ret, + sizeof(ret)); + if (rc != PTL_OK) + goto fail; + if (ret.rc) + GOTO(fail, rc = ret.rc); + + PORTAL_ALLOC(eq, sizeof(*eq)); + if (!eq) { + rc = PTL_NOSPACE; + goto fail; + } + + eq->sequence = 1; + eq->size = count; + eq->base = ev; + + /* EQ handles are a little weird. PtlEQGet() just looks at the + * queued events in shared memory. It doesn't want to do_forward() + * at all, so the cookie in the EQ handle we pass out of here is + * simply a pointer to the event queue we just set up. We stash + * the handle returned by do_forward(), so we can pass it back via + * do_forward() when we need to. 
*/ + + eq->cb_eq_handle.nal_idx = interface.nal_idx; + eq->cb_eq_handle.cookie = ret.handle_out.cookie; + + handle_out->nal_idx = interface.nal_idx; + handle_out->cookie = (__u64)((unsigned long)eq); + return PTL_OK; + +fail: + PORTAL_FREE(ev, count * sizeof(ptl_event_t)); + return rc; +} + +int PtlEQFree(ptl_handle_eq_t eventq) +{ + PtlEQFree_in args; + PtlEQFree_out ret; + ptl_eq_t *eq; + int rc; + + eq = ptl_handle2usereq (&eventq); + args.eventq_in = eq->cb_eq_handle; + + rc = do_forward(eq->cb_eq_handle, PTL_EQFREE, &args, + sizeof(args), &ret, sizeof(ret)); + + /* XXX we're betting rc == PTL_OK here */ + PORTAL_FREE(eq->base, eq->size * sizeof(ptl_event_t)); + PORTAL_FREE(eq, sizeof(*eq)); + + return rc; +} + +int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, + ptl_process_id_t match_id_in, ptl_pt_index_t portal_in) +{ + PtlACEntry_in args; + PtlACEntry_out ret; + int rc; + + /* + * Copy arguments into the argument block to + * hand to the forwarding object + */ + args.ni_in = ni_in; + args.index_in = index_in; + args.match_id_in = match_id_in; + args.portal_in = portal_in; + + rc = do_forward(ni_in, PTL_ACENTRY, &args, sizeof(args), &ret, + sizeof(ret)); + + return (rc != PTL_OK) ? rc : ret.rc; +} + +int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, + ptl_process_id_t target_in, ptl_pt_index_t portal_in, + ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in, + ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in) +{ + PtlPut_in args; + PtlPut_out ret; + int rc; + + /* + * Copy arguments into the argument block to + * hand to the forwarding object + */ + args.md_in = md_in; + args.ack_req_in = ack_req_in; + args.target_in = target_in; + args.portal_in = portal_in; + args.cookie_in = cookie_in; + args.match_bits_in = match_bits_in; + args.offset_in = offset_in; + args.hdr_data_in = hdr_data_in; + + rc = do_forward(md_in, PTL_PUT, &args, sizeof(args), &ret, sizeof(ret)); + + return (rc != PTL_OK) ? 
rc : ret.rc; +} + +int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, + ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in, + ptl_match_bits_t match_bits_in, ptl_size_t offset_in) +{ + PtlGet_in args; + PtlGet_out ret; + int rc; + + /* + * Copy arguments into the argument block to + * hand to the forwarding object + */ + args.md_in = md_in; + args.target_in = target_in; + args.portal_in = portal_in; + args.cookie_in = cookie_in; + args.match_bits_in = match_bits_in; + args.offset_in = offset_in; + + rc = do_forward(md_in, PTL_GET, &args, sizeof(args), &ret, sizeof(ret)); + + return (rc != PTL_OK) ? rc : ret.rc; +} diff --git a/lustre/portals/portals/lib-dispatch.c b/lustre/portals/portals/lib-dispatch.c new file mode 100644 index 0000000..13036c7 --- /dev/null +++ b/lustre/portals/portals/lib-dispatch.c @@ -0,0 +1,80 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-dispatch.c + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_PORTALS +#include <portals/lib-p30.h> +#include <portals/lib-dispatch.h> + +typedef struct { + int (*fun) (nal_cb_t * nal, void *private, void *in, void *out); + char *name; +} dispatch_table_t; + +static dispatch_table_t dispatch_table[] = { + [PTL_GETID] {do_PtlGetId, "PtlGetId"}, + [PTL_NISTATUS] {do_PtlNIStatus, "PtlNIStatus"}, + [PTL_NIDIST] {do_PtlNIDist, "PtlNIDist"}, + [PTL_NIDEBUG] {do_PtlNIDebug, "PtlNIDebug"}, + [PTL_MEATTACH] {do_PtlMEAttach, "PtlMEAttach"}, + [PTL_MEINSERT] {do_PtlMEInsert, "PtlMEInsert"}, + [PTL_MEUNLINK] {do_PtlMEUnlink, "PtlMEUnlink"}, + [PTL_TBLDUMP] {do_PtlTblDump, "PtlTblDump"}, + [PTL_MEDUMP] {do_PtlMEDump, "PtlMEDump"}, + [PTL_MDATTACH] {do_PtlMDAttach, "PtlMDAttach"}, + [PTL_MDBIND] {do_PtlMDBind, "PtlMDBind"}, + [PTL_MDUPDATE] {do_PtlMDUpdate_internal, "PtlMDUpdate_internal"}, + [PTL_MDUNLINK] {do_PtlMDUnlink, "PtlMDUnlink"}, + [PTL_EQALLOC] {do_PtlEQAlloc_internal, "PtlEQAlloc_internal"}, + [PTL_EQFREE] {do_PtlEQFree_internal, "PtlEQFree_internal"}, + [PTL_PUT] {do_PtlPut, "PtlPut"}, + [PTL_GET] {do_PtlGet, "PtlGet"}, + [PTL_FAILNID] {do_PtlFailNid, "PtlFailNid"}, + /* */ {0, ""} +}; + +/* + * This really should be elsewhere, but lib-p30/dispatch.c is + * an automatically generated file. 
+ */ +void lib_dispatch(nal_cb_t * nal, void *private, int index, void *arg_block, + void *ret_block) +{ + lib_ni_t *ni = &nal->ni; + + if (index < 0 || index > LIB_MAX_DISPATCH || + !dispatch_table[index].fun) { + CDEBUG(D_NET, LPU64": Invalid API call %d\n", ni->nid, index); + return; + } + + CDEBUG(D_NET, LPU64": API call %s (%d)\n", ni->nid, + dispatch_table[index].name, index); + + dispatch_table[index].fun(nal, private, arg_block, ret_block); +} + +char *dispatch_name(int index) +{ + return dispatch_table[index].name; +} diff --git a/lustre/portals/portals/lib-eq.c b/lustre/portals/portals/lib-eq.c new file mode 100644 index 0000000..ce343c1 --- /dev/null +++ b/lustre/portals/portals/lib-eq.c @@ -0,0 +1,128 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-eq.c + * Library level Event queue management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_PORTALS +#include <portals/lib-p30.h> +#include <portals/arg-blocks.h> + +int do_PtlEQAlloc_internal(nal_cb_t * nal, void *private, void *v_args, + void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t ni_in + * ptl_size_t count_in + * void * base_in + * + * Outgoing: + * ptl_handle_eq_t * handle_out + */ + + PtlEQAlloc_in *args = v_args; + PtlEQAlloc_out *ret = v_ret; + + lib_eq_t *eq; + unsigned long flags; + + /* api should have rounded up */ + if (args->count_in != LOWEST_BIT_SET (args->count_in)) + return ret->rc = PTL_VAL_FAILED; + + eq = lib_eq_alloc (nal); + if (eq == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + if (nal->cb_map != NULL) { + struct iovec iov = { + .iov_base = args->base_in, + .iov_len = args->count_in * sizeof (ptl_event_t) }; + + ret->rc = nal->cb_map (nal, 1, &iov, &eq->eq_addrkey); + if (ret->rc != PTL_OK) { + lib_eq_free (nal, eq); + + state_unlock (nal, &flags); + return (ret->rc); + } + } + + eq->sequence = 1; + eq->base = args->base_in; + eq->size = args->count_in; + eq->eq_refcount = 0; + eq->event_callback = args->callback_in; + + lib_initialise_handle (nal, &eq->eq_lh, PTL_COOKIE_TYPE_EQ); + list_add (&eq->eq_list, &nal->ni.ni_active_eqs); + + state_unlock(nal, &flags); + + ptl_eq2handle(&ret->handle_out, eq); + return (ret->rc = PTL_OK); +} + +int do_PtlEQFree_internal(nal_cb_t * nal, void *private, void *v_args, + void *v_ret) +{ + /* + * Incoming: + * ptl_handle_eq_t eventq_in + * + * Outgoing: + */ + + PtlEQFree_in *args = v_args; + PtlEQFree_out *ret = v_ret; + lib_eq_t *eq; + long flags; + + state_lock (nal, &flags); + + eq = ptl_handle2eq(&args->eventq_in, nal); + if (eq == NULL) { + ret->rc = PTL_INV_EQ; + } else if (eq->eq_refcount != 0) { + ret->rc = PTL_EQ_INUSE; + } else { + if (nal->cb_unmap != NULL) { + struct iovec iov = { + .iov_base = eq->base, + .iov_len = eq->size * sizeof (ptl_event_t) }; + + nal->cb_unmap(nal, 1, &iov, &eq->eq_addrkey); + } + + 
lib_invalidate_handle (nal, &eq->eq_lh); + list_del (&eq->eq_list); + lib_eq_free (nal, eq); + ret->rc = PTL_OK; + } + + state_unlock (nal, &flags); + + return (ret->rc); +} diff --git a/lustre/portals/portals/lib-init.c b/lustre/portals/portals/lib-init.c new file mode 100644 index 0000000..99c4d32 --- /dev/null +++ b/lustre/portals/portals/lib-init.c @@ -0,0 +1,474 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-init.c + * Start up the internal library and clear all structures + * Called by the NAL when it initializes. Safe to call multiple times. + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +# define DEBUG_SUBSYSTEM S_PORTALS +#include <portals/lib-p30.h> + +#ifdef __KERNEL__ +# include <linux/string.h> /* for memset() */ +# include <linux/kp30.h> +# ifdef KERNEL_ADDR_CACHE +# include <compute/OS/addrCache/cache.h> +# endif +#else +# include <string.h> +# include <sys/time.h> +#endif + +#ifdef PTL_USE_SLAB_CACHE +static int ptl_slab_users; + +kmem_cache_t *ptl_md_slab; +kmem_cache_t *ptl_msg_slab; +kmem_cache_t *ptl_me_slab; +kmem_cache_t *ptl_eq_slab; + +atomic_t md_in_use_count; +atomic_t msg_in_use_count; +atomic_t me_in_use_count; +atomic_t eq_in_use_count; + +/* NB zeroing in ctor and on freeing ensures items that + * kmem_cache_validate() OK, but haven't been initialised + * as an MD/ME/EQ can't have valid handles + */ +static void +ptl_md_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags) +{ + memset (obj, 0, sizeof (lib_md_t)); +} + +static void +ptl_me_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags) +{ + memset (obj, 0, sizeof (lib_me_t)); +} + +static void +ptl_eq_slab_ctor (void *obj, kmem_cache_t *slab, unsigned long flags) +{ + memset (obj, 0, sizeof (lib_eq_t)); +} + +int +kportal_descriptor_setup (nal_cb_t *nal) +{ + /* NB on failure caller must still call kportal_descriptor_cleanup */ + /* ****** */ + + /* We'll have 1 set of slabs for ALL the nals :) */ + + if (ptl_slab_users++) + return 0; + + ptl_md_slab = kmem_cache_create("portals_MD", + sizeof(lib_md_t), 0, + SLAB_HWCACHE_ALIGN, + ptl_md_slab_ctor, NULL); + if (!ptl_md_slab) { + CERROR("couldn't allocate ptl_md_t slab"); + RETURN (PTL_NOSPACE); + } + + /* NB no ctor for msgs; they don't need handle verification */ + ptl_msg_slab = kmem_cache_create("portals_MSG", + sizeof(lib_msg_t), 0, + SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!ptl_msg_slab) { + CERROR("couldn't allocate ptl_msg_t slab"); + RETURN (PTL_NOSPACE); + } + + ptl_me_slab = kmem_cache_create("portals_ME", + sizeof(lib_me_t), 0, + SLAB_HWCACHE_ALIGN, + ptl_me_slab_ctor, NULL); + if 
(!ptl_me_slab) { + CERROR("couldn't allocate ptl_me_t slab"); + RETURN (PTL_NOSPACE); + } + + ptl_eq_slab = kmem_cache_create("portals_EQ", + sizeof(lib_eq_t), 0, + SLAB_HWCACHE_ALIGN, + ptl_eq_slab_ctor, NULL); + if (!ptl_eq_slab) { + CERROR("couldn't allocate ptl_eq_t slab"); + RETURN (PTL_NOSPACE); + } + + RETURN(PTL_OK); +} + +void +kportal_descriptor_cleanup (nal_cb_t *nal) +{ + if (--ptl_slab_users != 0) + return; + + LASSERT (atomic_read (&md_in_use_count) == 0); + LASSERT (atomic_read (&me_in_use_count) == 0); + LASSERT (atomic_read (&eq_in_use_count) == 0); + LASSERT (atomic_read (&msg_in_use_count) == 0); + + if (ptl_md_slab != NULL) + kmem_cache_destroy(ptl_md_slab); + if (ptl_msg_slab != NULL) + kmem_cache_destroy(ptl_msg_slab); + if (ptl_me_slab != NULL) + kmem_cache_destroy(ptl_me_slab); + if (ptl_eq_slab != NULL) + kmem_cache_destroy(ptl_eq_slab); +} +#else + +int +lib_freelist_init (nal_cb_t *nal, lib_freelist_t *fl, int n, int size) +{ + char *space; + + LASSERT (n > 0); + + size += offsetof (lib_freeobj_t, fo_contents); + + space = nal->cb_malloc (nal, n * size); + if (space == NULL) + return (PTL_NOSPACE); + + INIT_LIST_HEAD (&fl->fl_list); + fl->fl_objs = space; + fl->fl_nobjs = n; + fl->fl_objsize = size; + + do + { + memset (space, 0, size); + list_add ((struct list_head *)space, &fl->fl_list); + space += size; + } while (--n != 0); + + return (PTL_OK); +} + +void +lib_freelist_fini (nal_cb_t *nal, lib_freelist_t *fl) +{ + struct list_head *el; + int count; + + if (fl->fl_nobjs == 0) /* never initialised, or already torn down */ + return; + + count = 0; + for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next) + count++; + + LASSERT (count == fl->fl_nobjs); /* every object must be back on the freelist */ + + nal->cb_free (nal, fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); + memset (fl, 0, sizeof (*fl)); /* clear the whole descriptor (sizeof (fl) is only pointer-size) so fl_nobjs reads 0 on a repeated call */ +} + +int +kportal_descriptor_setup (nal_cb_t *nal) +{ + /* NB on failure caller must still call kportal_descriptor_cleanup */ + /* ****** */ + int rc; + + memset (&nal->ni.ni_free_mes, 0, sizeof (nal->ni.ni_free_mes)); + memset 
(&nal->ni.ni_free_msgs, 0, sizeof (nal->ni.ni_free_msgs)); + memset (&nal->ni.ni_free_mds, 0, sizeof (nal->ni.ni_free_mds)); + memset (&nal->ni.ni_free_eqs, 0, sizeof (nal->ni.ni_free_eqs)); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_mes, + MAX_MES, sizeof (lib_me_t)); + if (rc != PTL_OK) + return (rc); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_msgs, + MAX_MSGS, sizeof (lib_msg_t)); + if (rc != PTL_OK) + return (rc); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_mds, + MAX_MDS, sizeof (lib_md_t)); + if (rc != PTL_OK) + return (rc); + + rc = lib_freelist_init (nal, &nal->ni.ni_free_eqs, + MAX_EQS, sizeof (lib_eq_t)); + return (rc); +} + +void +kportal_descriptor_cleanup (nal_cb_t *nal) +{ + lib_freelist_fini (nal, &nal->ni.ni_free_mes); + lib_freelist_fini (nal, &nal->ni.ni_free_msgs); + lib_freelist_fini (nal, &nal->ni.ni_free_mds); + lib_freelist_fini (nal, &nal->ni.ni_free_eqs); +} + +#endif + +__u64 +lib_create_interface_cookie (nal_cb_t *nal) +{ + /* NB the interface cookie in wire handles guards against delayed + * replies and ACKs appearing valid in a new instance of the same + * interface. Initialisation time, even if it's only implemented + * to millisecond resolution is probably easily good enough. 
*/ + struct timeval tv; + __u64 cookie; +#ifndef __KERNEL__ + int rc = gettimeofday (&tv, NULL); + LASSERT (rc == 0); +#else + do_gettimeofday(&tv); +#endif + cookie = tv.tv_sec; + cookie *= 1000000; + cookie += tv.tv_usec; + return (cookie); +} + +int +lib_setup_handle_hash (nal_cb_t *nal) +{ + lib_ni_t *ni = &nal->ni; + int i; + + /* Arbitrary choice of hash table size */ +#ifdef __KERNEL__ + ni->ni_lh_hash_size = PAGE_SIZE / sizeof (struct list_head); +#else + ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4; +#endif + ni->ni_lh_hash_table = + (struct list_head *)nal->cb_malloc (nal, ni->ni_lh_hash_size + * sizeof (struct list_head)); + if (ni->ni_lh_hash_table == NULL) + return (PTL_NOSPACE); + + for (i = 0; i < ni->ni_lh_hash_size; i++) + INIT_LIST_HEAD (&ni->ni_lh_hash_table[i]); + + ni->ni_next_object_cookie = PTL_COOKIE_TYPES; + + return (PTL_OK); +} + +void +lib_cleanup_handle_hash (nal_cb_t *nal) +{ + lib_ni_t *ni = &nal->ni; + + if (ni->ni_lh_hash_table == NULL) + return; + + nal->cb_free (nal, ni->ni_lh_hash_table, + ni->ni_lh_hash_size * sizeof (struct list_head)); +} + +lib_handle_t * +lib_lookup_cookie (nal_cb_t *nal, __u64 cookie, int type) +{ + /* ALWAYS called with statelock held */ + lib_ni_t *ni = &nal->ni; + struct list_head *list; + struct list_head *el; + unsigned int hash; + + if ((cookie & (PTL_COOKIE_TYPES - 1)) != type) + return (NULL); + + hash = ((unsigned int)cookie) % ni->ni_lh_hash_size; + list = &ni->ni_lh_hash_table[hash]; + + list_for_each (el, list) { + lib_handle_t *lh = list_entry (el, lib_handle_t, lh_hash_chain); + + if (lh->lh_cookie == cookie) + return (lh); + } + + return (NULL); +} + +void +lib_initialise_handle (nal_cb_t *nal, lib_handle_t *lh, int type) +{ + /* ALWAYS called with statelock held */ + lib_ni_t *ni = &nal->ni; + unsigned int hash; + + LASSERT (type >= 0 && type < PTL_COOKIE_TYPES); + lh->lh_cookie = ni->ni_next_object_cookie | type; + ni->ni_next_object_cookie += PTL_COOKIE_TYPES; + + hash = 
((unsigned int)lh->lh_cookie) % ni->ni_lh_hash_size; + list_add (&lh->lh_hash_chain, &ni->ni_lh_hash_table[hash]); +} + +void +lib_invalidate_handle (nal_cb_t *nal, lib_handle_t *lh) +{ + list_del (&lh->lh_hash_chain); +} + +int +lib_init(nal_cb_t * nal, ptl_nid_t nid, ptl_pid_t pid, int gsize, + ptl_pt_index_t ptl_size, ptl_ac_index_t acl_size) +{ + int rc = PTL_OK; + lib_ni_t *ni = &nal->ni; + int i; + ENTRY; + + /* NB serialised in PtlNIInit() */ + + if (ni->refcnt != 0) { /* already initialised */ + ni->refcnt++; + goto out; + } + + lib_assert_wire_constants (); + + /* + * Allocate the portal table for this interface + * and all per-interface objects. + */ + memset(&ni->counters, 0, sizeof(lib_counters_t)); + + rc = kportal_descriptor_setup (nal); + if (rc != PTL_OK) + goto out; + + INIT_LIST_HEAD (&ni->ni_active_msgs); + INIT_LIST_HEAD (&ni->ni_active_mds); + INIT_LIST_HEAD (&ni->ni_active_eqs); + + INIT_LIST_HEAD (&ni->ni_test_peers); + + ni->ni_interface_cookie = lib_create_interface_cookie (nal); + ni->ni_next_object_cookie = 0; + rc = lib_setup_handle_hash (nal); + if (rc != PTL_OK) + goto out; + + ni->nid = nid; + ni->pid = pid; + + ni->num_nodes = gsize; + ni->tbl.size = ptl_size; + + ni->tbl.tbl = nal->cb_malloc(nal, sizeof(struct list_head) * ptl_size); + if (ni->tbl.tbl == NULL) { + rc = PTL_NOSPACE; + goto out; + } + + for (i = 0; i < ptl_size; i++) + INIT_LIST_HEAD(&(ni->tbl.tbl[i])); + + ni->debug = PTL_DEBUG_NONE; + ni->up = 1; + ni->refcnt++; + + out: + if (rc != PTL_OK) { + lib_cleanup_handle_hash (nal); + kportal_descriptor_cleanup (nal); + } + + RETURN (rc); +} + +int +lib_fini(nal_cb_t * nal) +{ + lib_ni_t *ni = &nal->ni; + int idx; + + ni->refcnt--; + + if (ni->refcnt != 0) + goto out; + + /* NB no stat_lock() since this is the last reference. 
The NAL + * should have shut down already, so it should be safe to unlink + * and free all descriptors, even those that appear committed to a + * network op (eg MD with non-zero pending count) + */ + + for (idx = 0; idx < ni->tbl.size; idx++) + while (!list_empty (&ni->tbl.tbl[idx])) { + lib_me_t *me = list_entry (ni->tbl.tbl[idx].next, + lib_me_t, me_list); + + CERROR ("Active me %p on exit\n", me); + list_del (&me->me_list); + lib_me_free (nal, me); + } + + while (!list_empty (&ni->ni_active_mds)) { + lib_md_t *md = list_entry (ni->ni_active_mds.next, + lib_md_t, md_list); + + CERROR ("Active md %p on exit\n", md); + list_del (&md->md_list); + lib_md_free (nal, md); + } + + while (!list_empty (&ni->ni_active_eqs)) { + lib_eq_t *eq = list_entry (ni->ni_active_eqs.next, + lib_eq_t, eq_list); + + CERROR ("Active eq %p on exit\n", eq); + list_del (&eq->eq_list); + lib_eq_free (nal, eq); + } + + while (!list_empty (&ni->ni_active_msgs)) { + lib_msg_t *msg = list_entry (ni->ni_active_msgs.next, + lib_msg_t, msg_list); + + CERROR ("Active msg %p on exit\n", msg); + list_del (&msg->msg_list); + lib_msg_free (nal, msg); + } + + nal->cb_free(nal, ni->tbl.tbl, sizeof(struct list_head) * ni->tbl.size); + ni->up = 0; + + lib_cleanup_handle_hash (nal); + kportal_descriptor_cleanup (nal); + + out: + return (PTL_OK); +} diff --git a/lustre/portals/portals/lib-md.c b/lustre/portals/portals/lib-md.c new file mode 100644 index 0000000..a79e2be --- /dev/null +++ b/lustre/portals/portals/lib-md.c @@ -0,0 +1,412 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-md.c + * Memory Descriptor management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. 
+ * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __KERNEL__ +# include <stdio.h> +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include <linux/kp30.h> +#endif + +#include <portals/lib-p30.h> +#include <portals/arg-blocks.h> + +/* + * must be called with state lock held + */ +void lib_md_unlink(nal_cb_t * nal, lib_md_t * md) +{ + lib_me_t *me = md->me; + + if (md->pending != 0) { + CDEBUG(D_NET, "Queueing unlink of md %p\n", md); + md->md_flags |= PTL_MD_FLAG_UNLINK; + return; + } + + CDEBUG(D_NET, "Unlinking md %p\n", md); + + if ((md->options & PTL_MD_KIOV) != 0) { + if (nal->cb_unmap_pages != NULL) + nal->cb_unmap_pages (nal, md->md_niov, md->md_iov.kiov, + &md->md_addrkey); + } else if (nal->cb_unmap != NULL) + nal->cb_unmap (nal, md->md_niov, md->md_iov.iov, + &md->md_addrkey); + + if (me) { + me->md = NULL; + if (me->unlink == PTL_UNLINK) + lib_me_unlink(nal, me); + } + + if (md->eq != NULL) + { + md->eq->eq_refcount--; + LASSERT (md->eq->eq_refcount >= 0); + } + + lib_invalidate_handle (nal, &md->md_lh); + list_del (&md->md_list); + lib_md_free(nal, md); +} + +/* must be called with state lock held */ +static int lib_md_build(nal_cb_t *nal, lib_md_t *new, void *private, + ptl_md_t *md, ptl_handle_eq_t *eqh, int unlink) +{ + const int 
max_size_opts = PTL_MD_AUTO_UNLINK | + PTL_MD_MAX_SIZE; + lib_eq_t *eq = NULL; + int rc; + int i; + + /* NB we are passed an allocated, but uninitialised/active md. + * if we return success, caller may lib_md_unlink() it. + * otherwise caller may only lib_md_free() it. + */ + + if (!PtlHandleEqual (*eqh, PTL_EQ_NONE)) { + eq = ptl_handle2eq(eqh, nal); + if (eq == NULL) + return PTL_INV_EQ; + } + + if ((md->options & PTL_MD_IOV) != 0 && /* discontiguous MD */ + md->niov > PTL_MD_MAX_IOV) /* too many fragments */ + return PTL_IOV_TOO_MANY; + + if ((md->options & max_size_opts) != 0 && /* max size used */ + (md->max_size < 0 || md->max_size > md->length)) // illegal max_size + return PTL_INV_MD; + + new->me = NULL; + new->start = md->start; + new->length = md->length; + new->offset = 0; + new->max_size = md->max_size; + new->unlink = unlink; + new->options = md->options; + new->user_ptr = md->user_ptr; + new->eq = eq; + new->threshold = md->threshold; + new->pending = 0; + new->md_flags = 0; + + if ((md->options & PTL_MD_IOV) != 0) { + int total_length = 0; + + if ((md->options & PTL_MD_KIOV) != 0) /* Can't specify both */ + return PTL_INV_MD; + + new->md_niov = md->niov; + + if (nal->cb_read (nal, private, new->md_iov.iov, md->start, + md->niov * sizeof (new->md_iov.iov[0]))) + return PTL_SEGV; + + for (i = 0; i < new->md_niov; i++) { + /* We take the base address on trust */ + if (new->md_iov.iov[i].iov_len <= 0) /* invalid length */ + return PTL_VAL_FAILED; + + total_length += new->md_iov.iov[i].iov_len; + } + + if (md->length > total_length) + return PTL_IOV_TOO_SMALL; + + if (nal->cb_map != NULL) { + rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, + &new->md_addrkey); + if (rc != PTL_OK) + return (rc); + } + } else if ((md->options & PTL_MD_KIOV) != 0) { +#ifndef __KERNEL__ + return PTL_INV_MD; +#else + int total_length = 0; + + /* Trap attempt to use paged I/O if unsupported early. 
*/ + if (nal->cb_send_pages == NULL || + nal->cb_recv_pages == NULL) + return PTL_INV_MD; + + new->md_niov = md->niov; + + if (nal->cb_read (nal, private, new->md_iov.kiov, md->start, + md->niov * sizeof (new->md_iov.kiov[0]))) + return PTL_SEGV; + + for (i = 0; i < new->md_niov; i++) { + /* We take the page pointer on trust */ + if (new->md_iov.kiov[i].kiov_offset + + new->md_iov.kiov[i].kiov_len > PAGE_SIZE ) + return PTL_VAL_FAILED; /* invalid length */ + + total_length += new->md_iov.kiov[i].kiov_len; + } + + if (md->length > total_length) + return PTL_IOV_TOO_SMALL; + + if (nal->cb_map_pages != NULL) { + rc = nal->cb_map_pages (nal, new->md_niov, new->md_iov.kiov, + &new->md_addrkey); + if (rc != PTL_OK) + return (rc); + } +#endif + } else { /* contiguous */ + new->md_niov = 1; + new->md_iov.iov[0].iov_base = md->start; + new->md_iov.iov[0].iov_len = md->length; + + if (nal->cb_map != NULL) { + rc = nal->cb_map (nal, new->md_niov, new->md_iov.iov, + &new->md_addrkey); + if (rc != PTL_OK) + return (rc); + } + } + + if (eq != NULL) + eq->eq_refcount++; + + /* It's good; let handle2md succeed and add to active mds */ + lib_initialise_handle (nal, &new->md_lh, PTL_COOKIE_TYPE_MD); + list_add (&new->md_list, &nal->ni.ni_active_mds); + + return PTL_OK; +} + +/* must be called with state lock held */ +void lib_md_deconstruct(nal_cb_t * nal, lib_md_t * md, ptl_md_t * new) +{ + /* NB this doesn't copy out all the iov entries so when a + * discontiguous MD is copied out, the target gets to know the + * original iov pointer (in start) and the number of entries it had + * and that's all. + */ + new->start = md->start; + new->length = md->length; + new->threshold = md->threshold; + new->max_size = md->max_size; + new->options = md->options; + new->user_ptr = md->user_ptr; + ptl_eq2handle(&new->eventq, md->eq); + new->niov = ((md->options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0) ? 
0 : md->md_niov; +} + +int do_PtlMDAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_me_t current_in + * ptl_md_t md_in + * ptl_unlink_t unlink_in + * + * Outgoing: + * ptl_handle_md_t * handle_out + */ + + PtlMDAttach_in *args = v_args; + PtlMDAttach_out *ret = v_ret; + lib_me_t *me; + lib_md_t *md; + unsigned long flags; + + md = lib_md_alloc (nal); + if (md == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + me = ptl_handle2me(&args->me_in, nal); + if (me == NULL) { + ret->rc = PTL_INV_ME; + } else if (me->md != NULL) { + ret->rc = PTL_INUSE; + } else { + ret->rc = lib_md_build(nal, md, private, &args->md_in, + &args->eq_in, args->unlink_in); + + if (ret->rc == PTL_OK) { + me->md = md; + md->me = me; + + ptl_md2handle(&ret->handle_out, md); + + state_unlock (nal, &flags); + return (PTL_OK); + } + } + + lib_md_free (nal, md); + + state_unlock (nal, &flags); + return (ret->rc); +} + +int do_PtlMDBind(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t ni_in + * ptl_md_t md_in + * + * Outgoing: + * ptl_handle_md_t * handle_out + */ + + PtlMDBind_in *args = v_args; + PtlMDBind_out *ret = v_ret; + lib_md_t *md; + unsigned long flags; + + md = lib_md_alloc (nal); + if (md == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + ret->rc = lib_md_build(nal, md, private, + &args->md_in, &args->eq_in, PTL_UNLINK); + + if (ret->rc == PTL_OK) { + ptl_md2handle(&ret->handle_out, md); + + state_unlock(nal, &flags); + return (PTL_OK); + } + + lib_md_free (nal, md); + + state_unlock(nal, &flags); + return (ret->rc); +} + +int do_PtlMDUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMDUnlink_in *args = v_args; + PtlMDUnlink_out *ret = v_ret; + + lib_md_t *md; + unsigned long flags; + + state_lock(nal, &flags); + + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL) { + ret->rc = PTL_INV_MD; + } else if 
(md->pending != 0) { /* being filled/spilled */ + ret->rc = PTL_MD_INUSE; + } else { + /* Callers attempting to unlink a busy MD which will get + * unlinked once the net op completes should see INUSE, + * before completion and INV_MD thereafter. LASSERT we've + * got that right... */ + LASSERT ((md->md_flags & PTL_MD_FLAG_UNLINK) == 0); + + lib_md_deconstruct(nal, md, &ret->status_out); + lib_md_unlink(nal, md); + ret->rc = PTL_OK; + } + + state_unlock(nal, &flags); + + return (ret->rc); +} + +int do_PtlMDUpdate_internal(nal_cb_t * nal, void *private, void *v_args, + void *v_ret) +{ + /* + * Incoming: + * ptl_handle_md_t md_in + * ptl_md_t * old_inout + * ptl_md_t * new_inout + * ptl_handle_eq_t testq_in + * ptl_seq_t sequence_in + * + * Outgoing: + * ptl_md_t * old_inout + * ptl_md_t * new_inout + */ + PtlMDUpdate_internal_in *args = v_args; + PtlMDUpdate_internal_out *ret = v_ret; + lib_md_t *md; + lib_eq_t *test_eq = NULL; + ptl_md_t *new = &args->new_inout; + unsigned long flags; + + state_lock(nal, &flags); + + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL) { + ret->rc = PTL_INV_MD; + goto out; + } + + if (args->old_inout_valid) + lib_md_deconstruct(nal, md, &ret->old_inout); + + if (!args->new_inout_valid) { + ret->rc = PTL_OK; + goto out; + } + + if (!PtlHandleEqual (args->testq_in, PTL_EQ_NONE)) { + test_eq = ptl_handle2eq(&args->testq_in, nal); + if (test_eq == NULL) { + ret->rc = PTL_INV_EQ; + goto out; + } + } + + if (md->pending != 0) { + ret->rc = PTL_NOUPDATE; + goto out; + } + + if (test_eq == NULL || + test_eq->sequence == args->sequence_in) { + lib_me_t *me = md->me; + +#warning this does not track eq refcounts properly + + ret->rc = lib_md_build(nal, md, private, + new, &new->eventq, md->unlink); + + md->me = me; + } else { + ret->rc = PTL_NOUPDATE; + } + + out: + state_unlock(nal, &flags); + return (ret->rc); +} diff --git a/lustre/portals/portals/lib-me.c b/lustre/portals/portals/lib-me.c new file mode 100644 index 0000000..bd1af5b --- 
/dev/null +++ b/lustre/portals/portals/lib-me.c @@ -0,0 +1,227 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-me.c + * Match Entry management routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef __KERNEL__ +# include <stdio.h> +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include <linux/kp30.h> +#endif + +#include <portals/lib-p30.h> +#include <portals/arg-blocks.h> + +static void lib_me_dump(nal_cb_t * nal, lib_me_t * me); + +int do_PtlMEAttach(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEAttach_in *args = v_args; + PtlMEAttach_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + lib_ptl_t *tbl = &ni->tbl; + unsigned long flags; + lib_me_t *me; + + if (args->index_in < 0 || args->index_in >= tbl->size) + return ret->rc = PTL_INV_PTINDEX; + + /* Should check for valid matchid, but not yet */ + if (0) + return ret->rc = PTL_INV_PROC; + + me = lib_me_alloc (nal); + if (me == NULL) + return (ret->rc = PTL_NOSPACE); + + state_lock(nal, &flags); + + me->match_id = args->match_id_in; + me->match_bits = args->match_bits_in; + me->ignore_bits = args->ignore_bits_in; + me->unlink = args->unlink_in; + me->md = NULL; + + lib_initialise_handle (nal, &me->me_lh, PTL_COOKIE_TYPE_ME); + + if (args->position_in == PTL_INS_AFTER) + list_add_tail(&me->me_list, &(tbl->tbl[args->index_in])); + else + list_add(&me->me_list, &(tbl->tbl[args->index_in])); + + ptl_me2handle(&ret->handle_out, me); + + state_unlock(nal, &flags); + + return ret->rc = PTL_OK; +} + +int do_PtlMEInsert(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEInsert_in *args = v_args; + PtlMEInsert_out *ret = v_ret; + unsigned long flags; + lib_me_t *me; + lib_me_t *new; + + new = lib_me_alloc (nal); + if (new == NULL) + return (ret->rc = PTL_NOSPACE); + + /* Should check for valid matchid, but not yet */ + + state_lock(nal, &flags); + + me = ptl_handle2me(&args->current_in, nal); + if (me == NULL) { + lib_me_free (nal, new); + + state_unlock (nal, &flags); + return (ret->rc = PTL_INV_ME); + } + + new->match_id = args->match_id_in; + new->match_bits = args->match_bits_in; + new->ignore_bits = args->ignore_bits_in; + new->unlink = args->unlink_in; + new->md = 
NULL; + + lib_initialise_handle (nal, &new->me_lh, PTL_COOKIE_TYPE_ME); + + if (args->position_in == PTL_INS_AFTER) + list_add_tail(&new->me_list, &me->me_list); + else + list_add(&new->me_list, &me->me_list); + + ptl_me2handle(&ret->handle_out, new); + + state_unlock(nal, &flags); + + return ret->rc = PTL_OK; +} + +int do_PtlMEUnlink(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEUnlink_in *args = v_args; + PtlMEUnlink_out *ret = v_ret; + unsigned long flags; + lib_me_t *me; + + state_lock(nal, &flags); + + me = ptl_handle2me(&args->current_in, nal); + if (me == NULL) { + ret->rc = PTL_INV_ME; + } else { + lib_me_unlink(nal, me); + ret->rc = PTL_OK; + } + + state_unlock(nal, &flags); + + return (ret->rc); +} + +/* call with state_lock please */ +void lib_me_unlink(nal_cb_t *nal, lib_me_t *me) +{ + lib_ni_t *ni = &nal->ni; + + if (ni->debug & PTL_DEBUG_UNLINK) { + ptl_handle_any_t handle; + ptl_me2handle(&handle, me); + } + + list_del (&me->me_list); + + if (me->md) { + me->md->me = NULL; + lib_md_unlink(nal, me->md); + } + + lib_invalidate_handle (nal, &me->me_lh); + lib_me_free(nal, me); +} + +int do_PtlTblDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlTblDump_in *args = v_args; + PtlTblDump_out *ret = v_ret; + lib_ptl_t *tbl = &nal->ni.tbl; + ptl_handle_any_t handle; + struct list_head *tmp; + unsigned long flags; + + if (args->index_in < 0 || args->index_in >= tbl->size) + return ret->rc = PTL_INV_PTINDEX; + + nal->cb_printf(nal, "Portal table index %d\n", args->index_in); + + state_lock(nal, &flags); + list_for_each(tmp, &(tbl->tbl[args->index_in])) { + lib_me_t *me = list_entry(tmp, lib_me_t, me_list); + ptl_me2handle(&handle, me); + lib_me_dump(nal, me); + } + state_unlock(nal, &flags); + + return ret->rc = PTL_OK; +} + +int do_PtlMEDump(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlMEDump_in *args = v_args; + PtlMEDump_out *ret = v_ret; + lib_me_t *me; + unsigned long flags; + + 
state_lock(nal, &flags); + + me = ptl_handle2me(&args->current_in, nal); + if (me == NULL) { + ret->rc = PTL_INV_ME; + } else { + lib_me_dump(nal, me); + ret->rc = PTL_OK; + } + + state_unlock(nal, &flags); + + return ret->rc; +} + +static void lib_me_dump(nal_cb_t * nal, lib_me_t * me) +{ + nal->cb_printf(nal, "Match Entry %p ("LPX64")\n", me, + me->me_lh.lh_cookie); + + nal->cb_printf(nal, "\tMatch/Ignore\t= %016lx / %016lx\n", + me->match_bits, me->ignore_bits); + + nal->cb_printf(nal, "\tMD\t= %p\n", me->md); + nal->cb_printf(nal, "\tprev\t= %p\n", + list_entry(me->me_list.prev, lib_me_t, me_list)); + nal->cb_printf(nal, "\tnext\t= %p\n", + list_entry(me->me_list.next, lib_me_t, me_list)); +} diff --git a/lustre/portals/portals/lib-move.c b/lustre/portals/portals/lib-move.c new file mode 100644 index 0000000..fde4f16 --- /dev/null +++ b/lustre/portals/portals/lib-move.c @@ -0,0 +1,1379 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-move.c + * Data movement routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef __KERNEL__ +# include <stdio.h> +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include <linux/kp30.h> +#endif +#include <portals/p30.h> +#include <portals/lib-p30.h> +#include <portals/arg-blocks.h> + +/* + * Right now it does not check access control lists. + * + * We only support one MD per ME, which is how the Portals 3.1 spec is written. + * All previous complication is removed. + */ + +static lib_me_t * +lib_find_me(nal_cb_t *nal, int index, int op_mask, ptl_nid_t src_nid, + ptl_pid_t src_pid, ptl_size_t rlength, ptl_size_t roffset, + ptl_match_bits_t match_bits, ptl_size_t *mlength_out, + ptl_size_t *offset_out, int *unlink_out) +{ + lib_ni_t *ni = &nal->ni; + struct list_head *match_list = &ni->tbl.tbl[index]; + struct list_head *tmp; + lib_me_t *me; + lib_md_t *md; + ptl_size_t mlength; + ptl_size_t offset; + + ENTRY; + + CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d " + "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits); + + if (index < 0 || index >= ni->tbl.size) { + CERROR("Invalid portal %d not in [0-%d]\n", + index, ni->tbl.size); + goto failed; + } + + list_for_each (tmp, match_list) { + me = list_entry(tmp, lib_me_t, me_list); + md = me->md; + + /* ME attached but MD not attached yet */ + if (md == NULL) + continue; + + LASSERT (me == md->me); + + /* MD deactivated */ + if (md->threshold == 0) + continue; + + /* mismatched MD op */ + if ((md->options & op_mask) == 0) + continue; + + /* mismatched ME nid/pid? */ + if (me->match_id.nid != PTL_NID_ANY && + me->match_id.nid != src_nid) + continue; + + if (me->match_id.pid != PTL_PID_ANY && + me->match_id.pid != src_pid) + continue; + + /* mismatched ME matchbits? */ + if (((me->match_bits ^ match_bits) & ~me->ignore_bits) != 0) + continue; + + /* Hurrah! This _is_ a match; check it out... 
*/ + + if ((md->options & PTL_MD_MANAGE_REMOTE) == 0) + offset = md->offset; + else + offset = roffset; + + mlength = md->length - offset; + if ((md->options & PTL_MD_MAX_SIZE) != 0 && + mlength > md->max_size) + mlength = md->max_size; + + if (rlength <= mlength) { /* fits in allowed space */ + mlength = rlength; + } else if ((md->options & PTL_MD_TRUNCATE) == 0) { + /* this packet _really_ is too big */ + CERROR("Matching packet %d too big: %d left, " + "%d allowed\n", rlength, md->length - offset, + mlength); + goto failed; + } + + md->offset = offset + mlength; + + *offset_out = offset; + *mlength_out = mlength; + *unlink_out = ((md->options & PTL_MD_AUTO_UNLINK) != 0 && + md->offset >= (md->length - md->max_size)); + RETURN (me); + } + + failed: + CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64 + " offset %d length %d: no match\n", + ni->nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT", + src_nid, src_pid, index, match_bits, roffset, rlength); + RETURN(NULL); +} + +int do_PtlFailNid (nal_cb_t *nal, void *private, void *v_args, void *v_ret) +{ + PtlFailNid_in *args = v_args; + PtlFailNid_out *ret = v_ret; + lib_test_peer_t *tp; + unsigned long flags; + struct list_head *el; + struct list_head *next; + struct list_head cull; + + if (args->threshold != 0) { + /* Adding a new entry */ + tp = (lib_test_peer_t *)nal->cb_malloc (nal, sizeof (*tp)); + if (tp == NULL) + return (ret->rc = PTL_FAIL); + + tp->tp_nid = args->nid; + tp->tp_threshold = args->threshold; + + state_lock (nal, &flags); + list_add (&tp->tp_list, &nal->ni.ni_test_peers); + state_unlock (nal, &flags); + return (ret->rc = PTL_OK); + } + + /* removing entries */ + INIT_LIST_HEAD (&cull); + + state_lock (nal, &flags); + + list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + tp = list_entry (el, lib_test_peer_t, tp_list); + + if (tp->tp_threshold == 0 || /* needs culling anyway */ + args->nid == PTL_NID_ANY || /* removing all entries */ + tp->tp_nid == args->nid) /* matched this 
one */ + { + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + } + + state_unlock (nal, &flags); + + while (!list_empty (&cull)) { + tp = list_entry (cull.next, lib_test_peer_t, tp_list); + + list_del (&tp->tp_list); + nal->cb_free (nal, tp, sizeof (*tp)); + } + return (ret->rc = PTL_OK); +} + +static int +fail_peer (nal_cb_t *nal, ptl_nid_t nid, int outgoing) +{ + lib_test_peer_t *tp; + struct list_head *el; + struct list_head *next; + unsigned long flags; + struct list_head cull; + int fail = 0; + + INIT_LIST_HEAD (&cull); + + state_lock (nal, &flags); + + list_for_each_safe (el, next, &nal->ni.ni_test_peers) { + tp = list_entry (el, lib_test_peer_t, tp_list); + + if (tp->tp_threshold == 0) { + /* zombie entry */ + if (outgoing) { + /* only cull zombies on outgoing tests, + * since we may be at interrupt priority on + * incoming messages. */ + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + continue; + } + + if (tp->tp_nid == PTL_NID_ANY || /* fail every peer */ + nid == tp->tp_nid) { /* fail this peer */ + fail = 1; + + if (tp->tp_threshold != PTL_MD_THRESH_INF) { + tp->tp_threshold--; + if (outgoing && + tp->tp_threshold == 0) { + /* see above */ + list_del (&tp->tp_list); + list_add (&tp->tp_list, &cull); + } + } + break; + } + } + + state_unlock (nal, &flags); + + while (!list_empty (&cull)) { + tp = list_entry (cull.next, lib_test_peer_t, tp_list); + list_del (&tp->tp_list); + + nal->cb_free (nal, tp, sizeof (*tp)); + } + + return (fail); +} + +ptl_size_t +lib_iov_nob (int niov, struct iovec *iov) +{ + ptl_size_t nob = 0; + + while (niov-- > 0) + nob += (iov++)->iov_len; + + return (nob); +} + +void +lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, ptl_size_t len) +{ + ptl_size_t nob; + + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (iov->iov_len, len); + memcpy (dest, iov->iov_base, nob); + + len -= nob; + dest += nob; + niov--; + iov++; + } +} + +void +lib_copy_buf2iov (int niov, struct iovec *iov, char 
*src, ptl_size_t len) +{ + ptl_size_t nob; + + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (iov->iov_len, len); + memcpy (iov->iov_base, src, nob); + + len -= nob; + src += nob; + niov--; + iov++; + } +} + +static int +lib_extract_iov (struct iovec *dst, lib_md_t *md, + ptl_size_t offset, ptl_size_t len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + int src_niov = md->md_niov; + struct iovec *src = md->md_iov.iov; + ptl_size_t frag_len; + int dst_niov; + + LASSERT (len >= 0); + LASSERT (offset >= 0); + LASSERT (offset + len <= md->length); + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->iov_len) { /* skip initial frags */ + offset -= src->iov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + dst_niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT (dst_niov <= PTL_MD_MAX_IOV); + + frag_len = src->iov_len - offset; + dst->iov_base = ((char *)src->iov_base) + offset; + + if (len <= frag_len) { + dst->iov_len = len; + return (dst_niov); + } + + dst->iov_len = frag_len; + + len -= frag_len; + dst++; + src++; + dst_niov++; + src_niov--; + offset = 0; + } +} + +#ifndef __KERNEL__ +ptl_size_t +lib_kiov_nob (int niov, ptl_kiov_t *kiov) +{ + LASSERT (0); + return (0); +} + +void +lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) +{ + LASSERT (0); +} + +void +lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *dest, ptl_size_t len) +{ + LASSERT (0); +} + +static int +lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, + ptl_size_t offset, ptl_size_t len) +{ + LASSERT (0); +} + +#else + +ptl_size_t +lib_kiov_nob (int niov, ptl_kiov_t *kiov) +{ + ptl_size_t nob = 0; + + while (niov-- > 0) + nob += (kiov++)->kiov_len; + + return (nob); +} + +void +lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, ptl_size_t len) +{ + ptl_size_t 
nob; + char *addr; + + LASSERT (!in_interrupt ()); + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (kiov->kiov_len, len); + + addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + memcpy (dest, addr, nob); + kunmap (kiov->kiov_page); + + len -= nob; + dest += nob; + niov--; + kiov++; + } +} + +void +lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, char *src, ptl_size_t len) +{ + ptl_size_t nob; + char *addr; + + LASSERT (!in_interrupt ()); + while (len > 0) + { + LASSERT (niov > 0); + nob = MIN (kiov->kiov_len, len); + + addr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; + memcpy (addr, src, nob); + kunmap (kiov->kiov_page); + + len -= nob; + src += nob; + niov--; + kiov++; + } +} + +static int +lib_extract_kiov (ptl_kiov_t *dst, lib_md_t *md, + ptl_size_t offset, ptl_size_t len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + int src_niov = md->md_niov; + ptl_kiov_t *src = md->md_iov.kiov; + ptl_size_t frag_len; + int dst_niov; + + LASSERT (len >= 0); + LASSERT (offset >= 0); + LASSERT (offset + len <= md->length); + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->kiov_len) { /* skip initial frags */ + offset -= src->kiov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + dst_niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT (dst_niov <= PTL_MD_MAX_IOV); + + frag_len = src->kiov_len - offset; + dst->kiov_page = src->kiov_page; + dst->kiov_offset = src->kiov_offset + offset; + + if (len <= frag_len) { + dst->kiov_len = len; + LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + return (dst_niov); + } + + dst->kiov_len = frag_len; + LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + + len -= frag_len; + dst++; + src++; + dst_niov++; + src_niov--; + offset = 0; + } +} +#endif + +void +lib_recv (nal_cb_t *nal, void *private, 
lib_msg_t *msg, lib_md_t *md, + ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen) +{ + int niov; + + if (mlen == 0) + nal->cb_recv (nal, private, msg, 0, NULL, 0, rlen); + else if ((md->options & PTL_MD_KIOV) == 0) { + niov = lib_extract_iov (msg->msg_iov.iov, md, offset, mlen); + nal->cb_recv (nal, private, msg, + niov, msg->msg_iov.iov, mlen, rlen); + } else { + niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, mlen); + nal->cb_recv_pages (nal, private, msg, + niov, msg->msg_iov.kiov, mlen, rlen); + } +} + +int +lib_send (nal_cb_t *nal, void *private, lib_msg_t *msg, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + lib_md_t *md, ptl_size_t offset, ptl_size_t len) +{ + int niov; + + if (len == 0) + return (nal->cb_send (nal, private, msg, + hdr, type, nid, pid, + 0, NULL, 0)); + + if ((md->options & PTL_MD_KIOV) == 0) { + niov = lib_extract_iov (msg->msg_iov.iov, md, offset, len); + return (nal->cb_send (nal, private, msg, + hdr, type, nid, pid, + niov, msg->msg_iov.iov, len)); + } + + niov = lib_extract_kiov (msg->msg_iov.kiov, md, offset, len); + return (nal->cb_send_pages (nal, private, msg, + hdr, type, nid, pid, + niov, msg->msg_iov.kiov, len)); +} + +static lib_msg_t * +get_new_msg (nal_cb_t *nal, lib_md_t *md) +{ + /* ALWAYS called holding the state_lock */ + lib_counters_t *counters = &nal->ni.counters; + lib_msg_t *msg = lib_msg_alloc (nal); + + if (msg == NULL) + return (NULL); + + memset (msg, 0, sizeof (*msg)); + + msg->send_ack = 0; + + msg->md = md; + msg->ev.arrival_time = get_cycles(); + md->pending++; + if (md->threshold != PTL_MD_THRESH_INF) { + LASSERT (md->threshold > 0); + md->threshold--; + } + + counters->msgs_alloc++; + if (counters->msgs_alloc > counters->msgs_max) + counters->msgs_max = counters->msgs_alloc; + + list_add (&msg->msg_list, &nal->ni.ni_active_msgs); + + return (msg); +} + + +/* + * Incoming messages have a ptl_msg_t object associated with them + * by the library. 
/*
 * Handle an incoming PUT.
 *
 * Converts the PUT-specific header fields to host byte order, looks for
 * a matching ME/MD with lib_find_me(), allocates a message bound to the
 * MD, records ACK and event information, and hands the payload to the
 * NAL through lib_recv().
 *
 * Returns 0 if the message was accepted.  Returns -1 if it was dropped
 * (no match, or no message could be allocated); in that case the drop
 * counters are updated and lib_recv() is still called, with no message
 * and zero length, so the NAL can discard the payload.
 */
static int parse_put(nal_cb_t * nal, ptl_hdr_t * hdr, void *private)
{
        lib_ni_t        *ni = &nal->ni;
        ptl_size_t       mlength = 0;
        ptl_size_t       offset = 0;
        int              unlink = 0;
        lib_me_t        *me;
        lib_md_t        *md;
        lib_msg_t       *msg;
        unsigned long    flags;

        /* Convert put fields to host byte order */
        hdr->msg.put.match_bits = NTOH__u64 (hdr->msg.put.match_bits);
        hdr->msg.put.ptl_index = NTOH__u32 (hdr->msg.put.ptl_index);
        hdr->msg.put.offset = NTOH__u32 (hdr->msg.put.offset);

        state_lock(nal, &flags);

        /* On a match this also advances the MD's offset and tells us
         * whether the MD must be auto-unlinked afterwards */
        me = lib_find_me(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT,
                         hdr->src_nid, hdr->src_pid,
                         PTL_HDR_LENGTH (hdr), hdr->msg.put.offset,
                         hdr->msg.put.match_bits,
                         &mlength, &offset, &unlink);
        if (me == NULL)
                goto drop;

        md = me->md;
        CDEBUG(D_NET, "Incoming put index %x from "LPU64"/%u of length %d/%d "
               "into md "LPX64" [%d] + %d\n", hdr->msg.put.ptl_index,
               hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr),
               md->md_lh.lh_cookie, md->md_niov, offset);

        msg = get_new_msg (nal, md);
        if (msg == NULL) {
                CERROR(LPU64": Dropping PUT from "LPU64": can't allocate msg\n",
                       ni->nid, hdr->src_nid);
                goto drop;
        }

        /* Remember where to send the ACK, if the sender asked for one
         * and this MD has not disabled ACKs */
        if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
            !(md->options & PTL_MD_ACK_DISABLE)) {
                msg->send_ack = 1;
                msg->ack_wmd = hdr->msg.put.ack_wmd;
                msg->nid = hdr->src_nid;
                msg->pid = hdr->src_pid;
                msg->ev.match_bits = hdr->msg.put.match_bits;
        }

        /* Pre-fill the completion event if the MD has an event queue */
        if (md->eq) {
                msg->ev.type = PTL_EVENT_PUT;
                msg->ev.initiator.nid = hdr->src_nid;
                msg->ev.initiator.pid = hdr->src_pid;
                msg->ev.portal = hdr->msg.put.ptl_index;
                msg->ev.match_bits = hdr->msg.put.match_bits;
                msg->ev.rlength = PTL_HDR_LENGTH(hdr);
                msg->ev.mlength = mlength;
                msg->ev.offset = offset;
                msg->ev.hdr_data = hdr->msg.put.hdr_data;

                /* NB if this match has exhausted the MD, we can't be sure
                 * that this event will be the last one associated with
                 * this MD in the event queue (another message already
                 * matching this ME/MD could end up being last).  So we
                 * remember the ME handle anyway and check again when we're
                 * allocating our slot in the event queue.
                 */
                ptl_me2handle (&msg->ev.unlinked_me, me);

                lib_md_deconstruct(nal, md, &msg->ev.mem_desc);
        }

        ni->counters.recv_count++;
        ni->counters.recv_length += mlength;

        /* only unlink after MD's pending count has been bumped
         * in get_new_msg() otherwise lib_me_unlink() will nuke it */
        if (unlink) {
                md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED;
                lib_me_unlink (nal, me);
        }

        state_unlock(nal, &flags);

        /* Receive the payload with the state lock dropped */
        lib_recv (nal, private, msg, md, offset, mlength, PTL_HDR_LENGTH (hdr));
        return 0;

 drop:
        /* Reached with the state lock held */
        nal->ni.counters.drop_count++;
        nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr);
        state_unlock (nal, &flags);
        lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
        return -1;
}
&unlink); + if (me == NULL) + goto drop; + + md = me->md; + CDEBUG(D_NET, "Incoming get index %d from "LPU64".%u of length %d/%d " + "from md "LPX64" [%d] + %d\n", hdr->msg.get.ptl_index, + hdr->src_nid, hdr->src_pid, mlength, PTL_HDR_LENGTH(hdr), + md->md_lh.lh_cookie, md->md_niov, offset); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping GET from "LPU64": can't allocate msg\n", + ni->nid, hdr->src_nid); + goto drop; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_GET; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.portal = hdr->msg.get.ptl_index; + msg->ev.match_bits = hdr->msg.get.match_bits; + msg->ev.rlength = PTL_HDR_LENGTH(hdr); + msg->ev.mlength = mlength; + msg->ev.offset = offset; + msg->ev.hdr_data = 0; + + /* NB if this match has exhausted the MD, we can't be sure + * that this event will the the last one associated with + * this MD in the event queue (another message already + * matching this ME/MD could end up being last). So we + * remember the ME handle anyway and check again when we're + * allocating our slot in the event queue. 
+ */ + ptl_me2handle (&msg->ev.unlinked_me, me); + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.send_count++; + ni->counters.send_length += mlength; + + /* only unlink after MD's refcount has been bumped + * in get_new_msg() otherwise lib_me_unlink() will nuke it */ + if (unlink) { + md->md_flags |= PTL_MD_FLAG_AUTO_UNLINKED; + lib_me_unlink (nal, me); + } + + state_unlock(nal, &flags); + + memset (&reply, 0, sizeof (reply)); + reply.type = HTON__u32 (PTL_MSG_REPLY); + reply.dest_nid = HTON__u64 (hdr->src_nid); + reply.src_nid = HTON__u64 (ni->nid); + reply.dest_pid = HTON__u32 (hdr->src_pid); + reply.src_pid = HTON__u32 (ni->pid); + PTL_HDR_LENGTH(&reply) = HTON__u32 (mlength); + + reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd; + + rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, + hdr->src_nid, hdr->src_pid, md, offset, mlength); + if (rc != 0) { + CERROR(LPU64": Dropping GET from "LPU64": send REPLY failed\n", + ni->nid, hdr->src_nid); + state_lock (nal, &flags); + goto drop; + } + + /* Complete the incoming message */ + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (rc); + drop: + ni->counters.drop_count++; + ni->counters.drop_length += hdr->msg.get.sink_length; + state_unlock(nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static int parse_reply(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + int rlength; + int length; + lib_msg_t *msg; + unsigned long flags; + + /* compatibility check until field is deleted */ + if (hdr->msg.reply.dst_offset != 0) + CERROR("Unexpected non-zero reply.dst_offset %x from "LPU64"\n", + hdr->msg.reply.dst_offset, hdr->src_nid); + + state_lock(nal, &flags); + + /* NB handles only looked up by creator (no flips) */ + md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal); + if (md == NULL || md->threshold == 0) { + CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD 
"LPX64"."LPX64"\n", + ni->nid, hdr->src_nid, + md == NULL ? "invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + goto drop; + } + + LASSERT (md->offset == 0); + + length = rlength = PTL_HDR_LENGTH(hdr); + + if (length > md->length) { + if ((md->options & PTL_MD_TRUNCATE) == 0) { + CERROR (LPU64": Dropping REPLY from "LPU64 + " length %d for MD "LPX64" would overflow (%d)\n", + ni->nid, hdr->src_nid, length, + hdr->msg.reply.dst_wmd.wh_object_cookie, + md->length); + goto drop; + } + length = md->length; + } + + CDEBUG(D_NET, "Reply from "LPU64" of length %d/%d into md "LPX64"\n", + hdr->src_nid, length, rlength, + hdr->msg.reply.dst_wmd.wh_object_cookie); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping REPLY from "LPU64": can't " + "allocate msg\n", ni->nid, hdr->src_nid); + goto drop; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_REPLY; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.rlength = rlength; + msg->ev.mlength = length; + msg->ev.offset = 0; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.recv_count++; + ni->counters.recv_length += length; + + state_unlock(nal, &flags); + + lib_recv (nal, private, msg, md, 0, length, rlength); + return 0; + + drop: + nal->ni.counters.drop_count++; + nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr); + state_unlock (nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static int parse_ack(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + lib_msg_t *msg = NULL; + unsigned long flags; + + /* Convert ack fields to host byte order */ + hdr->msg.ack.match_bits = NTOH__u64 (hdr->msg.ack.match_bits); + hdr->msg.ack.mlength = NTOH__u32 (hdr->msg.ack.mlength); + + state_lock(nal, &flags); + + /* NB handles only looked up by creator (no flips) */ + md = 
ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal); + if (md == NULL || md->threshold == 0) { + CDEBUG(D_INFO, LPU64": Dropping ACK from "LPU64" to %s MD " + LPX64"."LPX64"\n", ni->nid, hdr->src_nid, + (md == NULL) ? "invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); + goto drop; + } + + CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n", + ni->nid, hdr->src_nid, + hdr->msg.ack.dst_wmd.wh_object_cookie); + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR(LPU64": Dropping ACK from "LPU64": can't allocate msg\n", + ni->nid, hdr->src_nid); + goto drop; + } + + if (md->eq) { + msg->ev.type = PTL_EVENT_ACK; + msg->ev.initiator.nid = hdr->src_nid; + msg->ev.initiator.pid = hdr->src_pid; + msg->ev.mlength = hdr->msg.ack.mlength; + msg->ev.match_bits = hdr->msg.ack.match_bits; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + ni->counters.recv_count++; + state_unlock(nal, &flags); + lib_recv (nal, private, msg, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return 0; + + drop: + nal->ni.counters.drop_count++; + state_unlock (nal, &flags); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return -1; +} + +static char * +hdr_type_string (ptl_hdr_t *hdr) +{ + switch (hdr->type) { + case PTL_MSG_ACK: + return ("ACK"); + case PTL_MSG_PUT: + return ("PUT"); + case PTL_MSG_GET: + return ("GET"); + case PTL_MSG_REPLY: + return ("REPLY"); + case PTL_MSG_HELLO: + return ("HELLO"); + default: + return ("<UNKNOWN>"); + } +} + +void print_hdr(nal_cb_t * nal, ptl_hdr_t * hdr) +{ + char *type_str = hdr_type_string (hdr); + + nal->cb_printf(nal, "P3 Header at %p of type %s\n", hdr, type_str); + nal->cb_printf(nal, " From nid/pid %Lu/%Lu", hdr->src_nid, + hdr->src_pid); + nal->cb_printf(nal, " To nid/pid %Lu/%Lu\n", hdr->dest_nid, + hdr->dest_pid); + + switch (hdr->type) { + default: + break; + + case PTL_MSG_PUT: + nal->cb_printf(nal, + " Ptl index %d, ack md "LPX64"."LPX64", " + "match bits 
"LPX64"\n", + hdr->msg.put.ptl_index, + hdr->msg.put.ack_wmd.wh_interface_cookie, + hdr->msg.put.ack_wmd.wh_object_cookie, + hdr->msg.put.match_bits); + nal->cb_printf(nal, + " Length %d, offset %d, hdr data "LPX64"\n", + PTL_HDR_LENGTH(hdr), hdr->msg.put.offset, + hdr->msg.put.hdr_data); + break; + + case PTL_MSG_GET: + nal->cb_printf(nal, + " Ptl index %d, return md "LPX64"."LPX64", " + "match bits "LPX64"\n", hdr->msg.get.ptl_index, + hdr->msg.get.return_wmd.wh_interface_cookie, + hdr->msg.get.return_wmd.wh_object_cookie, + hdr->msg.get.match_bits); + nal->cb_printf(nal, + " Length %d, src offset %d\n", + hdr->msg.get.sink_length, + hdr->msg.get.src_offset); + break; + + case PTL_MSG_ACK: + nal->cb_printf(nal, " dst md "LPX64"."LPX64", " + "manipulated length %d\n", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie, + hdr->msg.ack.mlength); + break; + + case PTL_MSG_REPLY: + nal->cb_printf(nal, " dst md "LPX64"."LPX64", " + "length %d\n", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie, + PTL_HDR_LENGTH(hdr)); + } + +} /* end of print_hdr() */ + + +int lib_parse(nal_cb_t * nal, ptl_hdr_t * hdr, void *private) +{ + unsigned long flags; + + /* NB static check; optimizer will elide this if it's right */ + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == + offsetof (ptl_hdr_t, msg.put.length)); + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == + offsetof (ptl_hdr_t, msg.get.length)); + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == + offsetof (ptl_hdr_t, msg.reply.length)); + + /* convert common fields to host byte order */ + hdr->dest_nid = NTOH__u64 (hdr->dest_nid); + hdr->src_nid = NTOH__u64 (hdr->src_nid); + hdr->dest_pid = NTOH__u32 (hdr->dest_pid); + hdr->src_pid = NTOH__u32 (hdr->src_pid); + hdr->type = NTOH__u32 (hdr->type); + PTL_HDR_LENGTH(hdr) = NTOH__u32 (PTL_HDR_LENGTH(hdr)); +#if 0 + nal->cb_printf(nal, "%d: lib_parse: nal=%p hdr=%p type=%d\n", + nal->ni.nid, nal, hdr, 
hdr->type); + print_hdr(nal, hdr); +#endif + if (hdr->type == PTL_MSG_HELLO) { + /* dest_nid is really ptl_magicversion_t */ + ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid; + + CERROR (LPU64": Dropping unexpected HELLO message: " + "magic %d, version %d.%d from "LPD64"\n", + nal->ni.nid, mv->magic, + mv->version_major, mv->version_minor, + hdr->src_nid); + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (-1); + } + + if (hdr->dest_nid != nal->ni.nid) { + CERROR(LPU64": Dropping %s message from "LPU64" to "LPU64 + " (not me)\n", nal->ni.nid, hdr_type_string (hdr), + hdr->src_nid, hdr->dest_nid); + + state_lock (nal, &flags); + nal->ni.counters.drop_count++; + nal->ni.counters.drop_length += PTL_HDR_LENGTH(hdr); + state_unlock (nal, &flags); + + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (-1); + } + + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + fail_peer (nal, hdr->src_nid, 0)) /* shall we now? 
*/ + { + CERROR(LPU64": Dropping incoming %s from "LPU64 + ": simulated failure\n", + nal->ni.nid, hdr_type_string (hdr), + hdr->src_nid); + return (-1); + } + + switch (hdr->type) { + case PTL_MSG_ACK: + return (parse_ack(nal, hdr, private)); + case PTL_MSG_PUT: + return (parse_put(nal, hdr, private)); + break; + case PTL_MSG_GET: + return (parse_get(nal, hdr, private)); + break; + case PTL_MSG_REPLY: + return (parse_reply(nal, hdr, private)); + break; + default: + CERROR(LPU64": Dropping <unknown> message from "LPU64 + ": Bad type=0x%x\n", nal->ni.nid, hdr->src_nid, + hdr->type); + + lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr)); + return (-1); + } +} + + +int do_PtlPut(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_md_t md_in + * ptl_ack_req_t ack_req_in + * ptl_process_id_t target_in + * ptl_pt_index_t portal_in + * ptl_ac_index_t cookie_in + * ptl_match_bits_t match_bits_in + * ptl_size_t offset_in + * + * Outgoing: + */ + + PtlPut_in *args = v_args; + PtlPut_out *ret = v_ret; + ptl_hdr_t hdr; + + lib_ni_t *ni = &nal->ni; + lib_md_t *md; + lib_msg_t *msg = NULL; + ptl_process_id_t *id = &args->target_in; + unsigned long flags; + + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + fail_peer (nal, id->nid, 1)) /* shall we now? 
*/ + { + CERROR(LPU64": Dropping PUT to "LPU64": simulated failure\n", + nal->ni.nid, id->nid); + return (ret->rc = PTL_INV_PROC); + } + + ret->rc = PTL_OK; + state_lock(nal, &flags); + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL || !md->threshold) { + state_unlock(nal, &flags); + return ret->rc = PTL_INV_MD; + } + + CDEBUG(D_NET, "PtlPut -> %Lu: %lu\n", (unsigned long long)id->nid, + (unsigned long)id->pid); + + memset (&hdr, 0, sizeof (hdr)); + hdr.type = HTON__u32 (PTL_MSG_PUT); + hdr.dest_nid = HTON__u64 (id->nid); + hdr.src_nid = HTON__u64 (ni->nid); + hdr.dest_pid = HTON__u32 (id->pid); + hdr.src_pid = HTON__u32 (ni->pid); + PTL_HDR_LENGTH(&hdr) = HTON__u32 (md->length); + + /* NB handles only looked up by creator (no flips) */ + if (args->ack_req_in == PTL_ACK_REQ) { + hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie; + hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie; + } else { + hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE; + } + + hdr.msg.put.match_bits = HTON__u64 (args->match_bits_in); + hdr.msg.put.ptl_index = HTON__u32 (args->portal_in); + hdr.msg.put.offset = HTON__u32 (args->offset_in); + hdr.msg.put.hdr_data = args->hdr_data_in; + + ni->counters.send_count++; + ni->counters.send_length += md->length; + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR("BAD: could not allocate msg!\n"); + state_unlock(nal, &flags); + return ret->rc = PTL_NOSPACE; + } + + /* + * If this memory descriptor has an event queue associated with + * it we need to allocate a message state object and record the + * information about this operation that will be recorded into + * event queue once the message has been completed. + * + * NB. We're now committed to the PUT, since we just marked the MD + * busy. Callers who observe this (by getting PTL_MD_INUSE from + * PtlMDUnlink()) expect a completion event to tell them when the + * MD becomes idle. 
+ */ + if (md->eq) { + msg->ev.type = PTL_EVENT_SENT; + msg->ev.initiator.nid = ni->nid; + msg->ev.initiator.pid = ni->pid; + msg->ev.portal = args->portal_in; + msg->ev.match_bits = args->match_bits_in; + msg->ev.rlength = md->length; + msg->ev.mlength = md->length; + msg->ev.offset = args->offset_in; + msg->ev.hdr_data = args->hdr_data_in; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + state_unlock(nal, &flags); + + lib_send (nal, private, msg, &hdr, PTL_MSG_PUT, + id->nid, id->pid, md, 0, md->length); + + return ret->rc = PTL_OK; +} + + +int do_PtlGet(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_md_t md_in + * ptl_process_id_t target_in + * ptl_pt_index_t portal_in + * ptl_ac_index_t cookie_in + * ptl_match_bits_t match_bits_in + * ptl_size_t offset_in + * + * Outgoing: + */ + + PtlGet_in *args = v_args; + PtlGet_out *ret = v_ret; + ptl_hdr_t hdr; + lib_msg_t *msg = NULL; + lib_ni_t *ni = &nal->ni; + ptl_process_id_t *id = &args->target_in; + lib_md_t *md; + unsigned long flags; + + if (!list_empty (&nal->ni.ni_test_peers) && /* normally we don't */ + fail_peer (nal, id->nid, 1)) /* shall we now? 
*/ + { + CERROR(LPU64": Dropping GET to "LPU64": simulated failure\n", + nal->ni.nid, id->nid); + return (ret->rc = PTL_INV_PROC); + } + + state_lock(nal, &flags); + md = ptl_handle2md(&args->md_in, nal); + if (md == NULL || !md->threshold) { + state_unlock(nal, &flags); + return ret->rc = PTL_INV_MD; + } + + LASSERT (md->offset == 0); + + CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid, + (unsigned long)id->pid); + + memset (&hdr, 0, sizeof (hdr)); + hdr.type = HTON__u32 (PTL_MSG_GET); + hdr.dest_nid = HTON__u64 (id->nid); + hdr.src_nid = HTON__u64 (ni->nid); + hdr.dest_pid = HTON__u32 (id->pid); + hdr.src_pid = HTON__u32 (ni->pid); + PTL_HDR_LENGTH(&hdr) = 0; + + /* NB handles only looked up by creator (no flips) */ + hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie; + hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie; + + hdr.msg.get.match_bits = HTON__u64 (args->match_bits_in); + hdr.msg.get.ptl_index = HTON__u32 (args->portal_in); + hdr.msg.get.src_offset = HTON__u32 (args->offset_in); + hdr.msg.get.sink_length = HTON__u32 (md->length); + + ni->counters.send_count++; + + msg = get_new_msg (nal, md); + if (msg == NULL) { + CERROR("do_PtlGet: BAD - could not allocate cookie!\n"); + state_unlock(nal, &flags); + return ret->rc = PTL_NOSPACE; + } + + /* + * If this memory descriptor has an event queue associated with + * it we must allocate a message state object that will record + * the information to be filled in once the message has been + * completed. More information is in the do_PtlPut() comments. + * + * NB. We're now committed to the GET, since we just marked the MD + * busy. Callers who observe this (by getting PTL_MD_INUSE from + * PtlMDUnlink()) expect a completion event to tell them when the + * MD becomes idle. 
+ */ + if (md->eq) { + msg->ev.type = PTL_EVENT_SENT; + msg->ev.initiator.nid = ni->nid; + msg->ev.initiator.pid = ni->pid; + msg->ev.portal = args->portal_in; + msg->ev.match_bits = args->match_bits_in; + msg->ev.rlength = md->length; + msg->ev.mlength = md->length; + msg->ev.offset = args->offset_in; + msg->ev.hdr_data = 0; + + lib_md_deconstruct(nal, md, &msg->ev.mem_desc); + } + + state_unlock(nal, &flags); + + lib_send (nal, private, msg, &hdr, PTL_MSG_GET, + id->nid, id->pid, NULL, 0, 0); + + return ret->rc = PTL_OK; +} + +void lib_assert_wire_constants (void) +{ + /* Wire protocol assertions generated by 'wirecheck' */ + + /* Constants... */ + LASSERT (PORTALS_PROTO_MAGIC == 0xeebc0ded); + LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0); + LASSERT (PORTALS_PROTO_VERSION_MINOR == 1); + LASSERT (PTL_MSG_ACK == 0); + LASSERT (PTL_MSG_PUT == 1); + LASSERT (PTL_MSG_GET == 2); + LASSERT (PTL_MSG_REPLY == 3); + LASSERT (PTL_MSG_HELLO == 4); + + /* Checks for struct ptl_handle_wire_t */ + LASSERT (sizeof (ptl_handle_wire_t) == 16); + LASSERT (offsetof (ptl_handle_wire_t, wh_interface_cookie) == 0); + LASSERT (sizeof (((ptl_handle_wire_t *)0)->wh_interface_cookie) == 8); + LASSERT (offsetof (ptl_handle_wire_t, wh_object_cookie) == 8); + LASSERT (sizeof (((ptl_handle_wire_t *)0)->wh_object_cookie) == 8); + + /* Checks for struct ptl_magicversion_t */ + LASSERT (sizeof (ptl_magicversion_t) == 8); + LASSERT (offsetof (ptl_magicversion_t, magic) == 0); + LASSERT (sizeof (((ptl_magicversion_t *)0)->magic) == 4); + LASSERT (offsetof (ptl_magicversion_t, version_major) == 4); + LASSERT (sizeof (((ptl_magicversion_t *)0)->version_major) == 2); + LASSERT (offsetof (ptl_magicversion_t, version_minor) == 6); + LASSERT (sizeof (((ptl_magicversion_t *)0)->version_minor) == 2); + + /* Checks for struct ptl_hdr_t */ + LASSERT (sizeof (ptl_hdr_t) == 72); + LASSERT (offsetof (ptl_hdr_t, dest_nid) == 0); + LASSERT (sizeof (((ptl_hdr_t *)0)->dest_nid) == 8); + LASSERT (offsetof (ptl_hdr_t, 
src_nid) == 8); + LASSERT (sizeof (((ptl_hdr_t *)0)->src_nid) == 8); + LASSERT (offsetof (ptl_hdr_t, dest_pid) == 16); + LASSERT (sizeof (((ptl_hdr_t *)0)->dest_pid) == 4); + LASSERT (offsetof (ptl_hdr_t, src_pid) == 20); + LASSERT (sizeof (((ptl_hdr_t *)0)->src_pid) == 4); + LASSERT (offsetof (ptl_hdr_t, type) == 24); + LASSERT (sizeof (((ptl_hdr_t *)0)->type) == 4); + + /* Ack */ + LASSERT (offsetof (ptl_hdr_t, msg.ack.mlength) == 28); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.mlength) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.ack.dst_wmd) == 32); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.dst_wmd) == 16); + LASSERT (offsetof (ptl_hdr_t, msg.ack.match_bits) == 48); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.match_bits) == 8); + LASSERT (offsetof (ptl_hdr_t, msg.ack.length) == 56); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.ack.length) == 4); + + /* Put */ + LASSERT (offsetof (ptl_hdr_t, msg.put.ptl_index) == 28); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.ptl_index) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.put.ack_wmd) == 32); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.ack_wmd) == 16); + LASSERT (offsetof (ptl_hdr_t, msg.put.match_bits) == 48); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.match_bits) == 8); + LASSERT (offsetof (ptl_hdr_t, msg.put.length) == 56); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.length) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.put.offset) == 60); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.offset) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.put.hdr_data) == 64); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.put.hdr_data) == 8); + + /* Get */ + LASSERT (offsetof (ptl_hdr_t, msg.get.ptl_index) == 28); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.ptl_index) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.get.return_wmd) == 32); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.return_wmd) == 16); + LASSERT (offsetof (ptl_hdr_t, msg.get.match_bits) == 48); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.match_bits) == 8); + LASSERT (offsetof 
(ptl_hdr_t, msg.get.length) == 56); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.length) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.get.src_offset) == 60); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.src_offset) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.get.return_offset) == 64); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.return_offset) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.get.sink_length) == 68); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.get.sink_length) == 4); + + /* Reply */ + LASSERT (offsetof (ptl_hdr_t, msg.reply.dst_wmd) == 32); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.reply.dst_wmd) == 16); + LASSERT (offsetof (ptl_hdr_t, msg.reply.dst_offset) == 48); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.reply.dst_offset) == 4); + LASSERT (offsetof (ptl_hdr_t, msg.reply.length) == 56); + LASSERT (sizeof (((ptl_hdr_t *)0)->msg.reply.length) == 4); +} diff --git a/lustre/portals/portals/lib-msg.c b/lustre/portals/portals/lib-msg.c new file mode 100644 index 0000000..f10892c --- /dev/null +++ b/lustre/portals/portals/lib-msg.c @@ -0,0 +1,163 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-msg.c + * Message decoding, parsing and finalizing routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __KERNEL__ +# include <stdio.h> +#else +# define DEBUG_SUBSYSTEM S_PORTALS +# include <linux/kp30.h> +#endif + +#include <portals/lib-p30.h> + +int lib_finalize(nal_cb_t * nal, void *private, lib_msg_t *msg) +{ + lib_md_t *md; + lib_eq_t *eq; + int rc; + unsigned long flags; + + /* ni went down while processing this message */ + if (nal->ni.up == 0) { + return -1; + } + + if (msg == NULL) + return 0; + + rc = 0; + if (msg->send_ack) { + ptl_hdr_t ack; + + LASSERT (!ptl_is_wire_handle_none (&msg->ack_wmd)); + + memset (&ack, 0, sizeof (ack)); + ack.type = HTON__u32 (PTL_MSG_ACK); + ack.dest_nid = HTON__u64 (msg->nid); + ack.src_nid = HTON__u64 (nal->ni.nid); + ack.dest_pid = HTON__u32 (msg->pid); + ack.src_pid = HTON__u32 (nal->ni.pid); + PTL_HDR_LENGTH(&ack) = 0; + + ack.msg.ack.dst_wmd = msg->ack_wmd; + ack.msg.ack.match_bits = msg->ev.match_bits; + ack.msg.ack.mlength = HTON__u32 (msg->ev.mlength); + + rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK, + msg->nid, msg->pid, NULL, 0, 0); + } + + md = msg->md; + LASSERT (md->pending > 0); /* I've not dropped my ref yet */ + eq = md->eq; + + state_lock(nal, &flags); + + if (eq != NULL) { + ptl_event_t *ev = &msg->ev; + ptl_event_t *eq_slot; + + /* I have to hold the lock while I bump the sequence number + * and copy the event into the queue. If not, and I was + * interrupted after bumping the sequence number, other + * events could fill the queue, including the slot I just + * allocated to this event. On resuming, I would overwrite + * a more 'recent' event with old event state, and + * processes taking events off the queue would not detect + * overflow correctly. 
+ */ + + ev->sequence = eq->sequence++;/* Allocate the next queue slot */ + + /* size must be a power of 2 to handle a wrapped sequence # */ + LASSERT (eq->size != 0 && + eq->size == LOWEST_BIT_SET (eq->size)); + eq_slot = eq->base + (ev->sequence & (eq->size - 1)); + + /* Invalidate unlinked_me unless this is the last + * event for an auto-unlinked MD. Note that if md was + * auto-unlinked, md->pending can only decrease + */ + if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || /* not auto-unlinked */ + md->pending != 1) /* not last ref */ + ev->unlinked_me = PTL_HANDLE_NONE; + + /* Copy the event into the allocated slot, ensuring all the + * rest of the event's contents have been copied _before_ + * the sequence number gets updated. A process 'getting' + * an event waits on the next queue slot's sequence to be + * 'new'. When it is, _all_ other event fields had better + * be consistent. I assert 'sequence' is the last member, + * so I only need a 2 stage copy. + */ + LASSERT(sizeof (ptl_event_t) == + offsetof(ptl_event_t, sequence) + sizeof(ev->sequence)); + + rc = nal->cb_write (nal, private, (user_ptr)eq_slot, ev, + offsetof (ptl_event_t, sequence)); + LASSERT (rc == 0); + +#ifdef __KERNEL__ + barrier(); +#endif + /* Updating the sequence number is what makes the event 'new' */ + + /* cb_write is not necessarily atomic, so this could + cause a race with PtlEQGet */ + rc = nal->cb_write(nal, private, (user_ptr)&eq_slot->sequence, + (void *)&ev->sequence,sizeof (ev->sequence)); + LASSERT (rc == 0); + +#ifdef __KERNEL__ + barrier(); +#endif + + /* I must also ensure that (a) callbacks are made in the + * same order as the events land in the queue, and (b) the + * callback occurs before the event can be removed from the + * queue, so I can't drop the lock during the callback. 
*/ + if (nal->cb_callback != NULL) + nal->cb_callback(nal, private, eq, ev); + else if (eq->event_callback != NULL) + (void)((eq->event_callback) (ev)); + } + + LASSERT ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINKED) == 0 || + (md->md_flags & PTL_MD_FLAG_UNLINK) != 0); + + md->pending--; + if (md->pending == 0 && /* no more outstanding operations on this md */ + (md->threshold == 0 || /* done its business */ + (md->md_flags & PTL_MD_FLAG_UNLINK) != 0)) /* marked for death */ + lib_md_unlink(nal, md); + + list_del (&msg->msg_list); + nal->ni.counters.msgs_alloc--; + lib_msg_free(nal, msg); + + state_unlock(nal, &flags); + + return rc; +} diff --git a/lustre/portals/portals/lib-ni.c b/lustre/portals/portals/lib-ni.c new file mode 100644 index 0000000..aa30329 --- /dev/null +++ b/lustre/portals/portals/lib-ni.c @@ -0,0 +1,128 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-ni.c + * Network status registers and distance functions. + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * Copyright (c) 2001-2002 Sandia National Laboratories + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_PORTALS +#include <portals/lib-p30.h> +#include <portals/arg-blocks.h> + +#define MAX_DIST 18446744073709551615UL + +int do_PtlNIDebug(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + PtlNIDebug_in *args = v_args; + PtlNIDebug_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + + ret->rc = ni->debug; + ni->debug = args->mask_in; + + return 0; +} + +int do_PtlNIStatus(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t interface_in + * ptl_sr_index_t register_in + * + * Outgoing: + * ptl_sr_value_t * status_out + */ + + PtlNIStatus_in *args = v_args; + PtlNIStatus_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + lib_counters_t *count = &ni->counters; + + if (!args) + return ret->rc = PTL_SEGV; + + ret->rc = PTL_OK; + ret->status_out = 0; + + /* + * I hate this sort of code.... Hash tables, offset lists? + * Treat the counters as an array of ints? + */ + if (args->register_in == PTL_SR_DROP_COUNT) + ret->status_out = count->drop_count; + + else if (args->register_in == PTL_SR_DROP_LENGTH) + ret->status_out = count->drop_length; + + else if (args->register_in == PTL_SR_RECV_COUNT) + ret->status_out = count->recv_count; + + else if (args->register_in == PTL_SR_RECV_LENGTH) + ret->status_out = count->recv_length; + + else if (args->register_in == PTL_SR_SEND_COUNT) + ret->status_out = count->send_count; + + else if (args->register_in == PTL_SR_SEND_LENGTH) + ret->status_out = count->send_length; + + else if (args->register_in == PTL_SR_MSGS_MAX) + ret->status_out = count->msgs_max; + else + ret->rc = PTL_INV_SR_INDX; + + return ret->rc; +} + + +int do_PtlNIDist(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t interface_in + * ptl_process_id_t process_in + + * + * Outgoing: + * unsigned long * distance_out + + */ + + PtlNIDist_in *args = v_args; + PtlNIDist_out *ret = v_ret; + + unsigned long dist; + ptl_process_id_t id_in = 
args->process_in; + ptl_nid_t nid; + int rc; + + nid = id_in.nid; + + if ((rc = nal->cb_dist(nal, nid, &dist)) != 0) { + ret->distance_out = (unsigned long) MAX_DIST; + return ret->rc = PTL_INV_PROC; + } + + ret->distance_out = dist; + + return ret->rc = PTL_OK; +} diff --git a/lustre/portals/portals/lib-pid.c b/lustre/portals/portals/lib-pid.c new file mode 100644 index 0000000..12eebb5 --- /dev/null +++ b/lustre/portals/portals/lib-pid.c @@ -0,0 +1,58 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-pid.c + * + * Process identification routines + * Copyright (C) 2001-2003 Cluster File Systems, Inc. + * Copyright (C) 2001-2003 Cluster File Systems, Inc. + * + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* This should be removed. 
The NAL should have the PID information */ +#define DEBUG_SUBSYSTEM S_PORTALS + +#if defined (__KERNEL__) +# include <linux/kernel.h> +extern int getpid(void); +#else +# include <stdio.h> +# include <unistd.h> +#endif +#include <portals/lib-p30.h> +#include <portals/arg-blocks.h> + +int do_PtlGetId(nal_cb_t * nal, void *private, void *v_args, void *v_ret) +{ + /* + * Incoming: + * ptl_handle_ni_t handle_in + * + * Outgoing: + * ptl_process_id_t * id_out + * ptl_id_t * gsize_out + */ + + PtlGetId_out *ret = v_ret; + lib_ni_t *ni = &nal->ni; + + ret->id_out.nid = ni->nid; + ret->id_out.pid = ni->pid; + + return ret->rc = PTL_OK; +} diff --git a/lustre/portals/router/.cvsignore b/lustre/portals/router/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lustre/portals/router/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lustre/portals/router/Makefile.am b/lustre/portals/router/Makefile.am new file mode 100644 index 0000000..1c8087b --- /dev/null +++ b/lustre/portals/router/Makefile.am @@ -0,0 +1,16 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include ../Rules.linux + +MODULE = kptlrouter +modulenet_DATA = kptlrouter.o +EXTRA_PROGRAMS = kptlrouter + + +#CFLAGS:= @KCFLAGS@ +#CPPFLAGS:=@KCPPFLAGS@ +DEFS = +kptlrouter_SOURCES = router.c proc.c router.h diff --git a/lustre/portals/router/Makefile.mk b/lustre/portals/router/Makefile.mk new file mode 100644 index 0000000..64bd09b --- /dev/null +++ b/lustre/portals/router/Makefile.mk @@ -0,0 +1,9 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + +include ../Kernelenv + +obj-y += kptlrouter.o +kptlrouter-objs := router.o proc.o diff --git a/lustre/portals/router/proc.c b/lustre/portals/router/proc.c new file mode 100644 index 0000000..dd65b34 --- /dev/null +++ b/lustre/portals/router/proc.c @@ -0,0 +1,78 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "router.h" + +#define KPR_PROC_ROUTER "sys/portals/router" + +int +kpr_proc_read (char *page, char **start, off_t off, int count, int *eof, void *data) +{ + unsigned long long bytes = kpr_fwd_bytes; + unsigned long packets = kpr_fwd_packets; + unsigned long errors = kpr_fwd_errors; + unsigned int qdepth = atomic_read (&kpr_queue_depth); + int len; + + *eof = 1; + if (off != 0) + return (0); + + len = sprintf (page, "%Ld %ld %ld %d\n", bytes, packets, errors, qdepth); + + *start = page; + return (len); +} + +int +kpr_proc_write (struct file *file, const char *ubuffer, unsigned long count, void *data) +{ + /* Ignore what we've been asked to write, and just zero the stats counters */ + kpr_fwd_bytes = 0; + kpr_fwd_packets = 0; + kpr_fwd_errors = 0; + + return (count); +} + +void +kpr_proc_init(void) +{ + struct proc_dir_entry *entry = create_proc_entry (KPR_PROC_ROUTER, S_IFREG | S_IRUGO | S_IWUSR, NULL); + + if (entry == NULL) + { + CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTER); + return; + } + + entry->data = NULL; + entry->read_proc = kpr_proc_read; + entry->write_proc = kpr_proc_write; +} + +void +kpr_proc_fini(void) +{ + remove_proc_entry(KPR_PROC_ROUTER, 0); +} diff --git a/lustre/portals/router/router.c b/lustre/portals/router/router.c new file mode 100644 index 0000000..6074c3c --- /dev/null +++ b/lustre/portals/router/router.c @@ -0,0 +1,449 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "router.h" + +struct list_head kpr_routes; +struct list_head kpr_nals; + +unsigned long long kpr_fwd_bytes; +unsigned long kpr_fwd_packets; +unsigned long kpr_fwd_errors; +atomic_t kpr_queue_depth; + +/* Mostly the tables are read-only (thread and interrupt context) + * + * Once in a blue moon we register/deregister NALs and add/remove routing + * entries (thread context only)... */ +rwlock_t kpr_rwlock; + +kpr_router_interface_t kpr_router_interface = { + kprri_register: kpr_register_nal, + kprri_lookup: kpr_lookup_target, + kprri_fwd_start: kpr_forward_packet, + kprri_fwd_done: kpr_complete_packet, + kprri_shutdown: kpr_shutdown_nal, + kprri_deregister: kpr_deregister_nal, +}; + +kpr_control_interface_t kpr_control_interface = { + kprci_add_route: kpr_add_route, + kprci_del_route: kpr_del_route, + kprci_get_route: kpr_get_route, +}; + +int +kpr_register_nal (kpr_nal_interface_t *nalif, void **argp) +{ + long flags; + struct list_head *e; + kpr_nal_entry_t *ne; + + CDEBUG (D_OTHER, "Registering NAL %d\n", nalif->kprni_nalid); + + PORTAL_ALLOC (ne, sizeof (*ne)); + if (ne == NULL) + return (-ENOMEM); + + memset (ne, 0, sizeof (*ne)); + memcpy ((void *)&ne->kpne_interface, (void *)nalif, sizeof (*nalif)); + + LASSERT (!in_interrupt()); + write_lock_irqsave (&kpr_rwlock, flags); + + for (e = kpr_nals.next; e != &kpr_nals; e = e->next) + { + kpr_nal_entry_t *ne2 = list_entry (e, kpr_nal_entry_t, kpne_list); + + if (ne2->kpne_interface.kprni_nalid == ne->kpne_interface.kprni_nalid) + { + write_unlock_irqrestore 
(&kpr_rwlock, flags); + + CERROR ("Attempt to register same NAL %d twice\n", ne->kpne_interface.kprni_nalid); + + PORTAL_FREE (ne, sizeof (*ne)); + return (-EEXIST); + } + } + + list_add (&ne->kpne_list, &kpr_nals); + + write_unlock_irqrestore (&kpr_rwlock, flags); + + *argp = ne; + PORTAL_MODULE_USE; + return (0); +} + +void +kpr_shutdown_nal (void *arg) +{ + long flags; + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + + CDEBUG (D_OTHER, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid); + + LASSERT (!ne->kpne_shutdown); + LASSERT (!in_interrupt()); + + write_lock_irqsave (&kpr_rwlock, flags); /* locking a bit spurious... */ + ne->kpne_shutdown = 1; + write_unlock_irqrestore (&kpr_rwlock, flags); /* except it's a memory barrier */ + + while (atomic_read (&ne->kpne_refcount) != 0) + { + CDEBUG (D_NET, "Waiting for refcount on NAL %d to reach zero (%d)\n", + ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount)); + + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } +} + +void +kpr_deregister_nal (void *arg) +{ + long flags; + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + + CDEBUG (D_OTHER, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid); + + LASSERT (ne->kpne_shutdown); /* caller must have issued shutdown already */ + LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */ + LASSERT (!in_interrupt()); + + write_lock_irqsave (&kpr_rwlock, flags); + + list_del (&ne->kpne_list); + + write_unlock_irqrestore (&kpr_rwlock, flags); + + PORTAL_FREE (ne, sizeof (*ne)); + PORTAL_MODULE_UNUSE; +} + + +int +kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp) +{ + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + struct list_head *e; + int rc = -ENOENT; + + CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d\n", target_nid, ne->kpne_interface.kprni_nalid); + + if (ne->kpne_shutdown) /* caller is shutting down */ + return (-ENOENT); + + read_lock (&kpr_rwlock); + + /* Search routes for one that has a 
gateway to target_nid on the callers network */ + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) + { + kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + + if (re->kpre_lo_nid > target_nid || + re->kpre_hi_nid < target_nid) + continue; + + /* found table entry */ + + if (re->kpre_gateway_nalid != ne->kpne_interface.kprni_nalid) /* different NAL */ + rc = -EHOSTUNREACH; + else + { + rc = 0; + *gateway_nidp = re->kpre_gateway_nid; + } + break; + } + + read_unlock (&kpr_rwlock); + + CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d: %d ("LPX64")\n", + target_nid, ne->kpne_interface.kprni_nalid, rc, + (rc == 0) ? *gateway_nidp : (ptl_nid_t)0); + return (rc); +} +/* Forward 'fwd' for the NAL entry passed as 'arg': find the route whose NID range holds the target and hand the descriptor to the gateway NAL's kprni_fwd(); if no usable route exists, complete it through kprfd_callback with -EHOSTUNREACH. */ +void +kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) +{ + kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg; + ptl_nid_t target_nid = fwd->kprfd_target_nid; + int nob = fwd->kprfd_nob; + struct list_head *e; + + CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid); + + LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */ + LASSERT (nob == lib_iov_nob (fwd->kprfd_niov, fwd->kprfd_iov)); + + atomic_inc (&kpr_queue_depth); + atomic_inc (&src_ne->kpne_refcount); /* source nal is busy until fwd completes */ + + kpr_fwd_packets++; /* (loose) stats accounting */ + kpr_fwd_bytes += nob; + + if (src_ne->kpne_shutdown) /* caller is shutting down */ + goto out; + + fwd->kprfd_router_arg = src_ne; /* stash caller's nal entry */ + + read_lock (&kpr_rwlock); + + /* Search routes for one that has a gateway to target_nid NOT on the caller's network */ + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) + { + kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + + if (re->kpre_lo_nid > target_nid || /* no match */ + re->kpre_hi_nid < target_nid) + continue; + + CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: match "LPX64" on NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid,
+ re->kpre_gateway_nid, re->kpre_gateway_nalid); + + if (re->kpre_gateway_nalid == src_ne->kpne_interface.kprni_nalid) + break; /* don't route to same NAL */ + + /* Search for gateway's NAL's entry ('e' is reused here; that is safe because the outer loop always breaks after this search) */ + + for (e = kpr_nals.next; e != &kpr_nals; e = e->next) + { + kpr_nal_entry_t *dst_ne = list_entry (e, kpr_nal_entry_t, kpne_list); + + if (re->kpre_gateway_nalid != dst_ne->kpne_interface.kprni_nalid) /* no match */ + continue; + + if (dst_ne->kpne_shutdown) /* don't route if NAL is shutting down */ + break; + + fwd->kprfd_gateway_nid = re->kpre_gateway_nid; + atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */ + + read_unlock (&kpr_rwlock); + + CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: "LPX64" on NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid, + fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid); + + dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd); + return; + } + break; + } + + read_unlock (&kpr_rwlock); + out: + kpr_fwd_errors++; + + CDEBUG (D_OTHER, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd, + target_nid, src_ne->kpne_interface.kprni_nalid); + + /* Can't find anywhere to forward to */ + (fwd->kprfd_callback)(fwd->kprfd_callback_arg, -EHOSTUNREACH); + + atomic_dec (&kpr_queue_depth); + atomic_dec (&src_ne->kpne_refcount); +} +/* Completion of a forwarded packet; 'arg' is the destination NAL entry. Drops the destination ref, runs kprfd_callback with 'error', then drops the queue depth and the source NAL ref. */ +void +kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error) +{ + kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg; + kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg; + + CDEBUG (D_OTHER, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd, + src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error); + + atomic_dec (&dst_ne->kpne_refcount); /* CAVEAT EMPTOR dst_ne can disappear now!!!
*/ + + (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error); + + CDEBUG (D_OTHER, "complete(2) [%p] from NAL %d: %d\n", fwd, + src_ne->kpne_interface.kprni_nalid, error); + + atomic_dec (&kpr_queue_depth); + atomic_dec (&src_ne->kpne_refcount); /* CAVEAT EMPTOR src_ne can disappear now!!! */ +} +/* Add a route sending NIDs lo_nid..hi_nid (inclusive) to gateway_nid on NAL gateway_nalid. Returns 0, -ENOMEM if the entry cannot be allocated, or -EINVAL if the range overlaps an existing route. */ +int +kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, + ptl_nid_t hi_nid) +{ + unsigned long flags; /* the irqsave/irqrestore lock macros take an unsigned long */ + struct list_head *e; + kpr_route_entry_t *re; + + CDEBUG(D_OTHER, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n", + gateway_nalid, gateway_nid, lo_nid, hi_nid); + + LASSERT(lo_nid <= hi_nid); + + PORTAL_ALLOC (re, sizeof (*re)); + if (re == NULL) + return (-ENOMEM); + + re->kpre_gateway_nalid = gateway_nalid; + re->kpre_gateway_nid = gateway_nid; + re->kpre_lo_nid = lo_nid; + re->kpre_hi_nid = hi_nid; + + LASSERT(!in_interrupt()); + write_lock_irqsave (&kpr_rwlock, flags); + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { + kpr_route_entry_t *re2 = list_entry(e, kpr_route_entry_t, + kpre_list); + + if (re->kpre_lo_nid > re2->kpre_hi_nid || + re->kpre_hi_nid < re2->kpre_lo_nid) + continue; + + CERROR ("Attempt to add duplicate routes ["LPX64" - "LPX64"] " + "to ["LPX64" - "LPX64"]\n", + re->kpre_lo_nid, re->kpre_hi_nid, + re2->kpre_lo_nid, re2->kpre_hi_nid); + + write_unlock_irqrestore (&kpr_rwlock, flags); + + PORTAL_FREE (re, sizeof (*re)); + return (-EINVAL); + } + + list_add (&re->kpre_list, &kpr_routes); + + write_unlock_irqrestore (&kpr_rwlock, flags); + return (0); +} +/* Remove the first route whose NID range contains 'nid'. Returns 0, or -ENOENT if no route matches. */ +int +kpr_del_route (ptl_nid_t nid) +{ + unsigned long flags; /* the irqsave/irqrestore lock macros take an unsigned long */ + struct list_head *e; + + CDEBUG(D_OTHER, "Del route "LPX64"\n", nid); + + LASSERT(!in_interrupt()); + write_lock_irqsave(&kpr_rwlock, flags); + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + kpre_list); + + if (re->kpre_lo_nid > nid || re->kpre_hi_nid < nid) + continue; + + list_del (&re->kpre_list); + write_unlock_irqrestore(&kpr_rwlock,
flags); + + PORTAL_FREE(re, sizeof (*re)); + return (0); + } + + write_unlock_irqrestore(&kpr_rwlock, flags); + return (-ENOENT); +} +/* Copy out the idx'th route (0-based, in list order). Returns 0, or -ENOENT if the list has no such entry. */ +int +kpr_get_route(int idx, int *gateway_nalid, ptl_nid_t *gateway_nid, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid) +{ + struct list_head *e; + + read_lock(&kpr_rwlock); + + for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + kpre_list); + + if (idx-- == 0) { + *gateway_nalid = re->kpre_gateway_nalid; + *gateway_nid = re->kpre_gateway_nid; + *lo_nid = re->kpre_lo_nid; + *hi_nid = re->kpre_hi_nid; + + read_unlock(&kpr_rwlock); + return (0); + } + } + + read_unlock (&kpr_rwlock); + return (-ENOENT); +} +/* Module exit: frees every remaining route, calls kpr_proc_fini() and unregisters the two interface symbols. Asserts that no NAL is still registered. */ +static void __exit +kpr_finalise (void) +{ + LASSERT (list_empty (&kpr_nals)); + + while (!list_empty (&kpr_routes)) { + kpr_route_entry_t *re = list_entry(kpr_routes.next, + kpr_route_entry_t, + kpre_list); + + list_del(&re->kpre_list); + PORTAL_FREE(re, sizeof (*re)); + } + + kpr_proc_fini(); + + PORTAL_SYMBOL_UNREGISTER(kpr_router_interface); + PORTAL_SYMBOL_UNREGISTER(kpr_control_interface); + + CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n", + atomic_read(&portal_kmemory)); +} +/* Module init: initialises the lock and the (empty) route and NAL lists, calls kpr_proc_init() and registers the two interface symbols. Always returns 0. */ +static int __init +kpr_initialise (void) +{ + CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n", + atomic_read(&portal_kmemory)); + + rwlock_init(&kpr_rwlock); + INIT_LIST_HEAD(&kpr_routes); + INIT_LIST_HEAD(&kpr_nals); + + kpr_proc_init(); + + PORTAL_SYMBOL_REGISTER(kpr_router_interface); + PORTAL_SYMBOL_REGISTER(kpr_control_interface); + return (0); +} + +MODULE_AUTHOR("Eric Barton"); +MODULE_DESCRIPTION("Kernel Portals Router v0.01"); +MODULE_LICENSE("GPL"); + +module_init (kpr_initialise); +module_exit (kpr_finalise); + +EXPORT_SYMBOL (kpr_control_interface); +EXPORT_SYMBOL (kpr_router_interface); diff --git a/lustre/portals/router/router.h b/lustre/portals/router/router.h new file mode 100644 index 0000000..b8c3bec --- /dev/null +++ b/lustre/portals/router/router.h @@ -0,0 +1,81 @@ +/*
-*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef _KPTLROUTER_H +#define _KPTLROUTER_H +#define EXPORT_SYMTAB + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/proc_fs.h> +#include <linux/init.h> + +#define DEBUG_SUBSYSTEM S_PTLROUTER + +#include <linux/kp30.h> +#include <portals/p30.h> +#include <portals/lib-p30.h> +/* One NAL known to the router; entries are linked on the kpr_nals list. */ +typedef struct +{ + struct list_head kpne_list; + kpr_nal_interface_t kpne_interface; /* the NAL's id, forwarding function and its argument */ + atomic_t kpne_refcount; /* raised while a forward involving this NAL is in flight */ + int kpne_shutdown; /* non-zero: packets are no longer routed from or to this NAL */ +} kpr_nal_entry_t; +/* One routing table entry; entries are linked on the kpr_routes list. */ +typedef struct +{ + struct list_head kpre_list; + int kpre_gateway_nalid; /* NAL the gateway is reached through */ + ptl_nid_t kpre_gateway_nid; /* NID of the gateway */ + ptl_nid_t kpre_lo_nid; /* lowest target NID covered (inclusive) */ + ptl_nid_t kpre_hi_nid; /* highest target NID covered (inclusive) */ +} kpr_route_entry_t; + +extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp); +extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp); +extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd); +extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error); +extern void kpr_shutdown_nal (void *arg); +extern void
kpr_deregister_nal (void *arg); + +extern void kpr_proc_init (void); +extern void kpr_proc_fini (void); + +extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid); +extern int kpr_del_route (ptl_nid_t nid); +extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid); + +extern unsigned long long kpr_fwd_bytes; +extern unsigned long kpr_fwd_packets; +extern unsigned long kpr_fwd_errors; +extern atomic_t kpr_queue_depth; + +#endif /* _KPTLROUTER_H */ diff --git a/lustre/portals/tests/.cvsignore b/lustre/portals/tests/.cvsignore new file mode 100644 index 0000000..051d1bd --- /dev/null +++ b/lustre/portals/tests/.cvsignore @@ -0,0 +1,3 @@ +Makefile +Makefile.in +.deps diff --git a/lustre/portals/tests/Makefile.am b/lustre/portals/tests/Makefile.am new file mode 100644 index 0000000..7b47ae0 --- /dev/null +++ b/lustre/portals/tests/Makefile.am @@ -0,0 +1,23 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution + +include ../Rules.linux + +LDFLAGS = -m "`$(LD) --help | awk '/supported emulations/ {print $$4}'`" -r +LINK = $(LD) $(LDFLAGS) -o $@ +DEFS = +LIBS = +MODULE = $(basename) +EXTRA_DIST = startserver.sh startclient.sh stopserver.sh stopclient.sh + +noinst_PROGRAMS = pingsrv.o pingcli.o spingsrv.o spingcli.o + +pingsrv_o_SOURCES = ping_srv.c ping.h + +pingcli_o_SOURCES = ping_cli.c ping.h + +spingsrv_o_SOURCES = sping_srv.c ping.h + +spingcli_o_SOURCES = sping_cli.c ping.h diff --git a/lustre/portals/tests/ping.h b/lustre/portals/tests/ping.h new file mode 100644 index 0000000..f07444b --- /dev/null +++ b/lustre/portals/tests/ping.h @@ -0,0 +1,80 @@ +#ifndef _KPING_INCLUDED +#define _KPING_INCLUDED + +#include <portals/p30.h> + + +#define PTL_PING_IN_SIZE 256 // n packets per buffer +#define PTL_PING_IN_BUFFERS 2 // n fallback buffers + +#define PTL_PING_CLIENT 4 +#define PTL_PING_SERVER 5 + +#define PING_HEADER_MAGIC 0xDEADBEEF +#define PING_BULK_MAGIC 0xCAFEBABE + +#define PING_HEAD_BITS 0x00000001 +#define PING_BULK_BITS 0x00000002 +#define PING_IGNORE_BITS 0xFFFFFFFC + +#define PTL_PING_ACK 0x01 +#define PTL_PING_VERBOSE 0x02 +#define PTL_PING_VERIFY 0x04 +#define PTL_PING_PREALLOC 0x08 + +/* Index of the buffer after 'index', wrapping to 0 at PTL_PING_IN_BUFFERS; the argument is parenthesised so expressions can be passed safely. */ +#define NEXT_PRIMARY_BUFFER(index) \ + ((((index) + 1) >= PTL_PING_IN_BUFFERS) ?
0 : ((index) + 1)) + +#define PDEBUG(str, err) \ + CERROR ("%s: error=%s (%d)\n", str, ptl_err_str[err], err) + + +/* Ping data to be passed via the ioctl to kernel space */ + +#ifdef __KERNEL__ + + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#include <linux/workqueue.h> +#else +#include <linux/tqueue.h> +#endif +struct pingsrv_data { + /* state of the kernel ping server */ + ptl_handle_ni_t ni; + ptl_handle_me_t me; + ptl_handle_eq_t eq; + void *in_buf; /* receive buffer; the server also sends its reply from it */ + ptl_process_id_t my_id; + ptl_process_id_t id_local; + ptl_md_t mdin; + ptl_md_t mdout; + ptl_handle_md_t mdin_h; + ptl_handle_md_t mdout_h; + ptl_event_t evnt; /* copy of the last event seen by the EQ callback */ + struct task_struct *tsk; /* server thread, woken by the EQ callback */ +}; /* struct pingsrv_data */ + +struct pingcli_data { + /* state of one kernel ping client run */ + struct portal_ioctl_data *args; /* ioctl arguments of the current run */ + ptl_handle_me_t me; + ptl_handle_eq_t eq; + char *inbuf; + char *outbuf; + ptl_process_id_t myid; + ptl_process_id_t id_local; + ptl_process_id_t id_remote; + ptl_md_t md_in_head; + ptl_md_t md_out_head; + ptl_handle_md_t md_in_head_h; + ptl_handle_md_t md_out_head_h; + ptl_event_t ev; + struct task_struct *tsk; /* task sleeping in pingcli_start(), woken by the EQ callback */ +}; /* struct pingcli_data */ + + +#endif /* __KERNEL__ */ + +#endif /* _KPING_INCLUDED */ diff --git a/lustre/portals/tests/ping_cli.c b/lustre/portals/tests/ping_cli.c new file mode 100644 index 0000000..389ffbb --- /dev/null +++ b/lustre/portals/tests/ping_cli.c @@ -0,0 +1,300 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf <behlendorf1@llnl.gov> + * Kedar Sovani (kedar@calsoftinc.com) + * Amey Inamdar (amey@calsoftinc.com) + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation.
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_PINGER + +#include <linux/kp30.h> +#include <portals/p30.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/poll.h> +#include "ping.h" +/* int portal_debug = D_PING_CLI; */ + + +#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval)) + +#define MAX_TIME 100000 + +/* This should be enclosed in a structure */ + +static struct pingcli_data *client = NULL; + +static int count = 0; + +static void +pingcli_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (client->md_out_head_h))) + PDEBUG ("PtlMDUnlink", rc); + case 2: + if ((rc = PtlMDUnlink (client->md_in_head_h))) + PDEBUG ("PtlMDUnlink", rc); + + /* Free the event queue */ + if ((rc = PtlEQFree (client->eq))) + PDEBUG ("PtlEQFree", rc); + + if ((rc = PtlMEUnlink (client->me))) + PDEBUG ("PtlMEUnlink", rc); + case 3: + kportal_put_ni (client->args->ioc_nal); + + case 4: + /* Free our buffers */ + + if (client != NULL) + PORTAL_FREE (client, + sizeof(struct pingcli_data)); + } + + + CDEBUG (D_OTHER, "ping client released resources\n"); +} /* pingcli_shutdown() */ + +static int pingcli_callback(ptl_event_t *ev) +{ + int i, magic; + i = *(int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned)); + magic = *(int *)(ev->mem_desc.start + ev->offset); + + if(magic != 0xcafebabe) { + printk ("Unexpected response \n"); + return 1; + } + + if((i == count) || !count) + wake_up_process (client->tsk); + else + printk ("Received response after timeout for %d\n",i); + return 1; +} + + +static struct pingcli_data * +pingcli_start(struct portal_ioctl_data *args) +{ + ptl_handle_ni_t *nip; + unsigned ping_head_magic = PING_HEADER_MAGIC; + unsigned ping_bulk_magic = PING_BULK_MAGIC; + int rc; + struct timeval tv1, tv2; + client->tsk = current; + client->args = args; + CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64", \ + nal %d, size %u, count: %u, timeout: %u\n", + args->ioc_nid, args->ioc_nal, args->ioc_size, + args->ioc_count, args->ioc_timeout); + + + PORTAL_ALLOC (client->outbuf, STDSIZE + args->ioc_size) ; + if (client->outbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return (NULL); + } + + PORTAL_ALLOC (client->inbuf, + (args->ioc_size + STDSIZE) * args->ioc_count); + if (client->inbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return 
(NULL); + } + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (args->ioc_nal)) == NULL) + { + CERROR ("NAL %d not loaded\n", args->ioc_nal); + pingcli_shutdown (4); + return (NULL); + } + + /* Based on the initialization aquire our unique portal ID. */ + if ((rc = PtlGetId (*nip, &client->myid))) + { + CERROR ("PtlGetId error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Setup the local match entries */ + client->id_local.nid = PTL_NID_ANY; + client->id_local.pid = PTL_PID_ANY; + + /* Setup the remote match entries */ + client->id_remote.nid = args->ioc_nid; + client->id_remote.pid = 0; + + if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT, + client->id_local, 0, ~0, PTL_RETAIN, + PTL_INS_AFTER, &client->me))) + { + CERROR ("PtlMEAttach error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Allocate the event queue for this network interface */ + if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq))) + { + CERROR ("PtlEQAlloc error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + count = args->ioc_count; + + client->md_in_head.start = client->inbuf; + client->md_in_head.length = (args->ioc_size + STDSIZE) + * count; + client->md_in_head.threshold = PTL_MD_THRESH_INF; + client->md_in_head.options = PTL_MD_OP_PUT; + client->md_in_head.user_ptr = NULL; + client->md_in_head.eventq = client->eq; + memset (client->inbuf, 0, (args->ioc_size + STDSIZE) * count); + + /* Attach the incoming buffer */ + if ((rc = PtlMDAttach (client->me, client->md_in_head, + PTL_UNLINK, &client->md_in_head_h))) { + CERROR ("PtlMDAttach error %d\n", rc); + pingcli_shutdown (1); + return (NULL); + } + /* Setup the outgoing ping header */ + client->md_out_head.start = client->outbuf; + client->md_out_head.length = STDSIZE + args->ioc_size; + client->md_out_head.threshold = args->ioc_count; + client->md_out_head.options = PTL_MD_OP_PUT; + client->md_out_head.user_ptr = NULL; + client->md_out_head.eventq = 
PTL_EQ_NONE; + + memcpy (client->outbuf, &ping_head_magic, sizeof(ping_bulk_magic)); + + count = 0; + + /* Bind the outgoing ping header */ + if ((rc=PtlMDBind (*nip, client->md_out_head, + &client->md_out_head_h))) { + CERROR ("PtlMDBind error %d\n", rc); + pingcli_shutdown (1); + return NULL; + } + while ((args->ioc_count - count)) { + memcpy (client->outbuf + sizeof(unsigned), + &(count), sizeof(unsigned)); + /* Put the ping packet */ + do_gettimeofday (&tv1); + + memcpy(client->outbuf+sizeof(unsigned)+sizeof(unsigned),&tv1, + sizeof(struct timeval)); + + if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ, + client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) { + PDEBUG ("PtlPut (header)", rc); + pingcli_shutdown (1); + return NULL; + } + printk ("sent msg no %d", count); + + set_current_state (TASK_INTERRUPTIBLE); + rc = schedule_timeout (20 * args->ioc_timeout); + if (rc == 0) { + printk (" :: timeout .....\n"); + } else { + do_gettimeofday (&tv2); + printk(" :: Reply in %u usec\n", + (unsigned)((tv2.tv_sec - tv1.tv_sec) + * 1000000 + (tv2.tv_usec - tv1.tv_usec))); + } + count++; + } + + if (client->outbuf != NULL) + PORTAL_FREE (client->outbuf, STDSIZE + args->ioc_size); + + if (client->inbuf != NULL) + PORTAL_FREE (client->inbuf, + (args->ioc_size + STDSIZE) * args->ioc_count); + + pingcli_shutdown (2); + + /* Success! 
*/ + return NULL; +} /* pingcli_setup() */ + + + +/* called by the portals_ioctl for ping requests */ +static int kping_client(struct portal_ioctl_data *args) +{ + PORTAL_ALLOC (client, sizeof(struct pingcli_data)); + if (client == NULL) + { + CERROR ("Unable to allocate client structure\n"); + return (0); + } + memset (client, 0, sizeof(struct pingcli_data)); + pingcli_start (args); + + return 0; +} /* kping_client() */ + + +static int __init pingcli_init(void) +{ + PORTAL_SYMBOL_REGISTER(kping_client); + return 0; +} /* pingcli_init() */ + + +static void __exit pingcli_cleanup(void) +{ + PORTAL_SYMBOL_UNREGISTER (kping_client); +} /* pingcli_cleanup() */ + + +MODULE_AUTHOR("Brian Behlendorf (LLNL)"); +MODULE_DESCRIPTION("A simple kernel space ping client for portals testing"); +MODULE_LICENSE("GPL"); + +module_init(pingcli_init); +module_exit(pingcli_cleanup); + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +EXPORT_SYMBOL (kping_client); +#endif diff --git a/lustre/portals/tests/ping_srv.c b/lustre/portals/tests/ping_srv.c new file mode 100644 index 0000000..1037d09 --- /dev/null +++ b/lustre/portals/tests/ping_srv.c @@ -0,0 +1,308 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf <behlendorf1@llnl.gov> + * Amey Inamdar <amey@calsoftinc.com> + * Kedar Sovani <kedar@calsoftinc.com> + * + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_PINGER + +#include <linux/kp30.h> +#include <portals/p30.h> +#include "ping.h" + +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/version.h> +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#include <linux/workqueue.h> +#else +#include <linux/tqueue.h> +#endif +#include <linux/wait.h> +#include <linux/smp_lock.h> + +#include <asm/unistd.h> +#include <asm/semaphore.h> + +#define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval)) +#define MAXSIZE (16*1024*1024) + +static unsigned ping_head_magic; +static unsigned ping_bulk_magic; +static int nal = 0; // Your NAL, +static unsigned long packets_valid = 0; // Valid packets +static int running = 1; +atomic_t pkt; + +static struct pingsrv_data *server=NULL; // Our ping server + +static void *pingsrv_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (server->mdin_h))) + PDEBUG ("PtlMDUnlink (out head buffer)", rc); + case 2: + /* Free the event queue */ + if ((rc = PtlEQFree (server->eq))) + PDEBUG ("PtlEQFree", rc); + + /* Unlink the client portal from the ME list */ + if ((rc = PtlMEUnlink (server->me))) + PDEBUG ("PtlMEUnlink", rc); + + case 3: + kportal_put_ni (nal); + + case 4: + + case 5: + if (server->in_buf != NULL) + PORTAL_FREE (server->in_buf, MAXSIZE); + + if (server != NULL) + PORTAL_FREE (server, + sizeof (struct pingsrv_data)); + + } + + CDEBUG (D_OTHER, "ping sever resources released\n"); + return NULL; +} /* pingsrv_shutdown() */ + + +int pingsrv_thread(void *arg) +{ + int rc; + unsigned long magic; + unsigned long ping_bulk_magic = 0xcafebabe; + + kportal_daemonize ("pingsrv"); + server->tsk = current; + + while (running) { + set_current_state (TASK_INTERRUPTIBLE); + if (atomic_read (&pkt) == 0) { + schedule_timeout (MAX_SCHEDULE_TIMEOUT); + continue; + } + + magic = *((int *)(server->evnt.mem_desc.start + + server->evnt.offset)); + + + if(magic != 0xdeadbeef) { + printk("Unexpected Packet to the server\n"); + + } + memcpy (server->in_buf, &ping_bulk_magic, sizeof(ping_bulk_magic)); + + server->mdout.length = server->evnt.rlength; + server->mdout.start = server->in_buf; + server->mdout.threshold = 1; + server->mdout.options = PTL_MD_OP_PUT; + server->mdout.user_ptr = NULL; + server->mdout.eventq = PTL_EQ_NONE; + + /* Bind the outgoing buffer */ + if ((rc = PtlMDBind (server->ni, server->mdout, + &server->mdout_h))) { + PDEBUG ("PtlMDBind", rc); + pingsrv_shutdown (1); + return 1; + } + + + server->mdin.start = server->in_buf; + server->mdin.length = MAXSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + + if ((rc = PtlMDAttach (server->me, server->mdin, + PTL_UNLINK, &server->mdin_h))) { + 
PDEBUG ("PtlMDAttach (bulk)", rc); + CDEBUG (D_OTHER, "ping server resources allocated\n"); + } + + if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ, + server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0))) + PDEBUG ("PtlPut", rc); + + atomic_dec (&pkt); + + } + pingsrv_shutdown (1); + running = 1; + return 0; +} + +static int pingsrv_packet(ptl_event_t *ev) +{ + atomic_inc (&pkt); + wake_up_process (server->tsk); + return 1; +} /* pingsrv_head() */ + +static int pingsrv_callback(ptl_event_t *ev) +{ + + if (ev == NULL) { + CERROR ("null in callback, ev=%p\n", ev); + return 0; + } + server->evnt = *ev; + + printk ("received ping from nid "LPX64" " + "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n", + ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, + *((int *)(ev->mem_desc.start + ev->offset)), + *((int *)(ev->mem_desc.start + ev->offset + sizeof(unsigned))), + *((int *)(ev->mem_desc.start + ev->offset + 2 * + sizeof(unsigned)))); + + packets_valid++; + + return pingsrv_packet(ev); + +} /* pingsrv_callback() */ + + +static struct pingsrv_data *pingsrv_setup(void) +{ + ptl_handle_ni_t *nip; + int rc; + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (nal)) == NULL) { + CDEBUG (D_OTHER, "NAL %d not loaded\n", nal); + return pingsrv_shutdown (4); + } + + server->ni= *nip; + + /* Based on the initialization aquire our unique portal ID. 
*/ + if ((rc = PtlGetId (server->ni, &server->my_id))) { + PDEBUG ("PtlGetId", rc); + return pingsrv_shutdown (2); + } + + server->id_local.nid = PTL_NID_ANY; + server->id_local.pid = PTL_PID_ANY; + + /* Attach a match entries for header packets */ + if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER, + server->id_local,0, ~0, + PTL_RETAIN, PTL_INS_AFTER, &server->me))) { + PDEBUG ("PtlMEAttach", rc); + return pingsrv_shutdown (2); + } + + + if ((rc = PtlEQAlloc (server->ni, 1024, pingsrv_callback, + &server->eq))) { + PDEBUG ("PtlEQAlloc (callback)", rc); + return pingsrv_shutdown (2); + } + + PORTAL_ALLOC (server->in_buf, MAXSIZE); + if(!server->in_buf){ + CDEBUG (D_OTHER,"Allocation error\n"); + return pingsrv_shutdown(2); + } + + /* Setup the incoming buffer */ + server->mdin.start = server->in_buf; + server->mdin.length = MAXSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + memset (server->in_buf, 0, STDSIZE); + + if ((rc = PtlMDAttach (server->me, server->mdin, + PTL_UNLINK, &server->mdin_h))) { + PDEBUG ("PtlMDAttach (bulk)", rc); + CDEBUG (D_OTHER, "ping server resources allocated\n"); + } + + /* Success! 
*/ + return server; +} /* pingsrv_setup() */ + +static int pingsrv_start(void) +{ + /* Setup our server */ + if (!pingsrv_setup()) { + CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n"); + return -ENOMEM; + } + kernel_thread (pingsrv_thread,NULL,0); + return 0; +} /* pingsrv_start() */ + + + +static int __init pingsrv_init(void) +{ + ping_head_magic = PING_HEADER_MAGIC; + ping_bulk_magic = PING_BULK_MAGIC; + PORTAL_ALLOC (server, sizeof(struct pingsrv_data)); + return pingsrv_start (); +} /* pingsrv_init() */ + + +static void __exit pingsrv_cleanup(void) +{ + remove_proc_entry ("net/pingsrv", NULL); + + running = 0; + wake_up_process (server->tsk); + while (running != 1) { + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + +} /* pingsrv_cleanup() */ + + +MODULE_PARM(nal, "i"); +MODULE_PARM_DESC(nal, "Use the specified NAL " + "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)"); + +MODULE_AUTHOR("Brian Behlendorf (LLNL)"); +MODULE_DESCRIPTION("A kernel space ping server for portals testing"); +MODULE_LICENSE("GPL"); + +module_init(pingsrv_init); +module_exit(pingsrv_cleanup); diff --git a/lustre/portals/tests/sping_cli.c b/lustre/portals/tests/sping_cli.c new file mode 100644 index 0000000..4cef08b --- /dev/null +++ b/lustre/portals/tests/sping_cli.c @@ -0,0 +1,276 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf <behlendorf1@llnl.gov> + * Kedar Sovani (kedar@calsoftinc.com) + * Amey Inamdar (amey@calsoftinc.com) + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +/* This is a striped down version of pinger. It follows a single + * request-response protocol. Doesn't do Bulk data pinging. Also doesn't + * send multiple packets in a single ioctl. + */ + + +#define DEBUG_SUBSYSTEM S_PINGER + +#include <linux/kp30.h> +#include <portals/p30.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/poll.h> +#include "ping.h" +/* int portal_debug = D_PING_CLI; */ + + +#define STDSIZE (sizeof(int) + sizeof(int) + 4) /* The data is 4 bytes + assumed */ + +/* This should be enclosed in a structure */ + +static struct pingcli_data *client = NULL; + +static int count = 0; + +static void +pingcli_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (client->md_out_head_h))) + PDEBUG ("PtlMDUnlink", rc); + case 2: + /* Free the event queue */ + if ((rc = PtlEQFree (client->eq))) + PDEBUG ("PtlEQFree", rc); + + if ((rc = PtlMEUnlink (client->me))) + PDEBUG ("PtlMEUnlink", rc); + case 3: + kportal_put_ni (client->args->ioc_nal); + + case 4: + /* Free our buffers */ + if (client->outbuf != NULL) + PORTAL_FREE (client->outbuf, STDSIZE); + + if (client->inbuf != NULL) + PORTAL_FREE (client->inbuf, STDSIZE); + + + if (client != NULL) + PORTAL_FREE (client, + sizeof(struct pingcli_data)); + } + + + CDEBUG (D_OTHER, "ping client released resources\n"); +} /* pingcli_shutdown() */ + +static int pingcli_callback(ptl_event_t *ev) +{ + wake_up_process (client->tsk); + return 1; +} + + +static struct pingcli_data * +pingcli_start(struct portal_ioctl_data *args) +{ + const ptl_handle_ni_t *nip; + unsigned ping_head_magic = PING_HEADER_MAGIC; + int rc; + + client->tsk = current; + client->args = args; + + CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64", \ + nal %d, size %u, count: %u, timeout: %u\n", + args->ioc_nid, args->ioc_nal, args->ioc_size, + args->ioc_count, args->ioc_timeout); + + + PORTAL_ALLOC (client->outbuf, STDSIZE) ; + if (client->outbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return (NULL); + } + + PORTAL_ALLOC (client->inbuf, STDSIZE); + + if (client->inbuf == NULL) + { + CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); + pingcli_shutdown (4); + return (NULL); + } + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (args->ioc_nal)) == NULL) + { + CERROR ("NAL %d not loaded.\n", args->ioc_nal); + pingcli_shutdown (4); + return (NULL); + } + + /* Based on the initialization aquire our unique portal ID. 
*/ + if ((rc = PtlGetId (*nip, &client->myid))) + { + CERROR ("PtlGetId error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Setup the local match entries */ + client->id_local.nid = PTL_NID_ANY; + client->id_local.pid = PTL_PID_ANY; + + /* Setup the remote match entries */ + client->id_remote.nid = args->ioc_nid; + client->id_remote.pid = 0; + + if ((rc = PtlMEAttach (*nip, PTL_PING_CLIENT, + client->id_local, 0, ~0, PTL_RETAIN, + PTL_INS_AFTER, &client->me))) + { + CERROR ("PtlMEAttach error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + /* Allocate the event queue for this network interface */ + if ((rc = PtlEQAlloc (*nip, 64, pingcli_callback, &client->eq))) + { + CERROR ("PtlEQAlloc error %d\n", rc); + pingcli_shutdown (2); + return (NULL); + } + + + client->md_in_head.start = client->inbuf; + client->md_in_head.length = STDSIZE; + client->md_in_head.threshold = 1; + client->md_in_head.options = PTL_MD_OP_PUT; + client->md_in_head.user_ptr = NULL; + client->md_in_head.eventq = client->eq; + memset (client->inbuf, 0, STDSIZE); + + /* Attach the incoming buffer */ + if ((rc = PtlMDAttach (client->me, client->md_in_head, + PTL_UNLINK, &client->md_in_head_h))) { + CERROR ("PtlMDAttach error %d\n", rc); + pingcli_shutdown (1); + return (NULL); + } + + /* Setup the outgoing ping header */ + client->md_out_head.start = client->outbuf; + client->md_out_head.length = STDSIZE; + client->md_out_head.threshold = 1; + client->md_out_head.options = PTL_MD_OP_PUT; + client->md_out_head.user_ptr = NULL; + client->md_out_head.eventq = PTL_EQ_NONE; + + memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic)); + + /* Bind the outgoing ping header */ + if ((rc=PtlMDBind (*nip, client->md_out_head, + &client->md_out_head_h))) { + CERROR ("PtlMDBind error %d\n", rc); + pingcli_shutdown (1); + return (NULL); + } + /* Put the ping packet */ + if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ, + client->id_remote, PTL_PING_SERVER, 0, 0, 0, 
0))) { + PDEBUG ("PtlPut (header)", rc); + pingcli_shutdown (1); + return NULL; + } + + count = 0; + set_current_state (TASK_INTERRUPTIBLE); + rc = schedule_timeout (20 * args->ioc_timeout); + if (rc == 0) { + printk (" Time out on the server\n"); + pingcli_shutdown (2); + return NULL; + } else + printk("Received respose from the server \n"); + + + pingcli_shutdown (2); + + /* Success! */ + return NULL; +} /* pingcli_setup() */ + + + +/* called by the portals_ioctl for ping requests */ +static int kping_client(struct portal_ioctl_data *args) +{ + + PORTAL_ALLOC (client, sizeof(struct pingcli_data)); + memset (client, 0, sizeof(struct pingcli_data)); + if (client == NULL) + { + CERROR ("Unable to allocate client structure\n"); + return (0); + } + pingcli_start (args); + + return 0; +} /* kping_client() */ + + +static int __init pingcli_init(void) +{ + PORTAL_SYMBOL_REGISTER(kping_client); + return 0; +} /* pingcli_init() */ + + +static void __exit pingcli_cleanup(void) +{ + PORTAL_SYMBOL_UNREGISTER (kping_client); +} /* pingcli_cleanup() */ + + +MODULE_AUTHOR("Brian Behlendorf (LLNL)"); +MODULE_DESCRIPTION("A simple kernel space ping client for portals testing"); +MODULE_LICENSE("GPL"); + +module_init(pingcli_init); +module_exit(pingcli_cleanup); + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +EXPORT_SYMBOL (kping_client); +#endif diff --git a/lustre/portals/tests/sping_srv.c b/lustre/portals/tests/sping_srv.c new file mode 100644 index 0000000..a18ea35 --- /dev/null +++ b/lustre/portals/tests/sping_srv.c @@ -0,0 +1,295 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) + * Author: Brian Behlendorf <behlendorf1@llnl.gov> + * Amey Inamdar <amey@calsoftinc.com> + * Kedar Sovani <kedar@calsoftinc.com> + * + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or 
+ * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* This is a striped down version of pinger. It follows a single + * request-response protocol. Doesn't do Bulk data pinging. Also doesn't + * send multiple packets in a single ioctl. + */ + +#define DEBUG_SUBSYSTEM S_PINGER + +#include <linux/kp30.h> +#include <portals/p30.h> +#include "ping.h" + +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/version.h> +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#include <linux/workqueue.h> +#else +#include <linux/tqueue.h> +#endif +#include <linux/wait.h> +#include <linux/smp_lock.h> + +#include <asm/unistd.h> +#include <asm/semaphore.h> + +#define STDSIZE (sizeof(int) + sizeof(int) + 4) + +static int nal = 0; // Your NAL, +static unsigned long packets_valid = 0; // Valid packets +static int running = 1; +atomic_t pkt; + +static struct pingsrv_data *server=NULL; // Our ping server + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +#endif + +static void *pingsrv_shutdown(int err) +{ + int rc; + + /* Yes, we are intentionally allowing us to fall through each + * case in to the next. This allows us to pass an error + * code to just clean up the right stuff. 
+ */ + switch (err) { + case 1: + /* Unlink any memory descriptors we may have used */ + if ((rc = PtlMDUnlink (server->mdin_h))) + PDEBUG ("PtlMDUnlink (out head buffer)", rc); + case 2: + /* Free the event queue */ + if ((rc = PtlEQFree (server->eq))) + PDEBUG ("PtlEQFree", rc); + + /* Unlink the client portal from the ME list */ + if ((rc = PtlMEUnlink (server->me))) + PDEBUG ("PtlMEUnlink", rc); + + case 3: + kportal_put_ni (nal); + + case 4: + + if (server->in_buf != NULL) + PORTAL_FREE (server->in_buf, STDSIZE); + + if (server != NULL) + PORTAL_FREE (server, + sizeof (struct pingsrv_data)); + + } + + CDEBUG (D_OTHER, "ping sever resources released\n"); + return NULL; +} /* pingsrv_shutdown() */ + + +int pingsrv_thread(void *arg) +{ + int rc; + + kportal_daemonize ("pingsrv"); + server->tsk = current; + + while (running) { + set_current_state (TASK_INTERRUPTIBLE); + if (atomic_read (&pkt) == 0) { + schedule_timeout (MAX_SCHEDULE_TIMEOUT); + continue; + } + + server->mdout.start = server->in_buf; + server->mdout.length = STDSIZE; + server->mdout.threshold = 1; + server->mdout.options = PTL_MD_OP_PUT; + server->mdout.user_ptr = NULL; + server->mdout.eventq = PTL_EQ_NONE; + + /* Bind the outgoing buffer */ + if ((rc = PtlMDBind (server->ni, server->mdout, + &server->mdout_h))) { + PDEBUG ("PtlMDBind", rc); + pingsrv_shutdown (1); + return 1; + } + + + server->mdin.start = server->in_buf; + server->mdin.length = STDSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + + if ((rc = PtlMDAttach (server->me, server->mdin, + PTL_UNLINK, &server->mdin_h))) { + PDEBUG ("PtlMDAttach (bulk)", rc); + CDEBUG (D_OTHER, "ping server resources allocated\n"); + } + + if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ, + server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0))) + PDEBUG ("PtlPut", rc); + + atomic_dec (&pkt); + + } + pingsrv_shutdown (1); + running = 1; + return 0; +} + +static 
int pingsrv_packet(ptl_event_t *ev) +{ + atomic_inc (&pkt); + wake_up_process (server->tsk); + return 1; +} /* pingsrv_head() */ + +static int pingsrv_callback(ptl_event_t *ev) +{ + + if (ev == NULL) { + CERROR ("null in callback, ev=%p\n", ev); + return 0; + } + server->evnt = *ev; + + printk ("received ping from nid "LPX64" " + "(off=%u rlen=%u mlen=%u head=%x)\n", + ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, + *((int *)(ev->mem_desc.start + ev->offset))); + + packets_valid++; + + return pingsrv_packet(ev); + +} /* pingsrv_callback() */ + + +static struct pingsrv_data *pingsrv_setup(void) +{ + ptl_handle_ni_t *nip; + int rc; + + /* Aquire and initialize the proper nal for portals. */ + if ((nip = kportal_get_ni (nal)) == NULL) { + CDEBUG (D_OTHER, "Nal %d not loaded.\n", nal); + return pingsrv_shutdown (4); + } + + server->ni= *nip; + + /* Based on the initialization aquire our unique portal ID. */ + if ((rc = PtlGetId (server->ni, &server->my_id))) { + PDEBUG ("PtlGetId", rc); + return pingsrv_shutdown (2); + } + + server->id_local.nid = PTL_NID_ANY; + server->id_local.pid = PTL_PID_ANY; + + /* Attach a match entries for header packets */ + if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER, + server->id_local,0, ~0, + PTL_RETAIN, PTL_INS_AFTER, &server->me))) { + PDEBUG ("PtlMEAttach", rc); + return pingsrv_shutdown (2); + } + + + if ((rc = PtlEQAlloc (server->ni, 64, pingsrv_callback, + &server->eq))) { + PDEBUG ("PtlEQAlloc (callback)", rc); + return pingsrv_shutdown (2); + } + + PORTAL_ALLOC (server->in_buf, STDSIZE); + if(!server->in_buf){ + CDEBUG (D_OTHER,"Allocation error\n"); + return pingsrv_shutdown(2); + } + + /* Setup the incoming buffer */ + server->mdin.start = server->in_buf; + server->mdin.length = STDSIZE; + server->mdin.threshold = 1; + server->mdin.options = PTL_MD_OP_PUT; + server->mdin.user_ptr = NULL; + server->mdin.eventq = server->eq; + memset (server->in_buf, 0, STDSIZE); + + if ((rc = PtlMDAttach (server->me, 
server->mdin, + PTL_UNLINK, &server->mdin_h))) { + PDEBUG ("PtlMDAttach (bulk)", rc); + CDEBUG (D_OTHER, "ping server resources allocated\n"); + } + + /* Success! */ + return server; +} /* pingsrv_setup() */ + +static int pingsrv_start(void) +{ + /* Setup our server */ + if (!pingsrv_setup()) { + CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n"); + return -ENOMEM; + } + kernel_thread (pingsrv_thread,NULL,0); + return 0; +} /* pingsrv_start() */ + + + +static int __init pingsrv_init(void) +{ + PORTAL_ALLOC (server, sizeof(struct pingsrv_data)); + return pingsrv_start (); +} /* pingsrv_init() */ + + +static void __exit pingsrv_cleanup(void) +{ + remove_proc_entry ("net/pingsrv", NULL); + + running = 0; + wake_up_process (server->tsk); + while (running != 1) { + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + +} /* pingsrv_cleanup() */ + + +MODULE_PARM(nal, "i"); +MODULE_PARM_DESC(nal, "Use the specified NAL " + "(6-kscimacnal, 4-toenal, 2-ksocknal, 1-kqswnal)"); + +MODULE_AUTHOR("Brian Behlendorf (LLNL)"); +MODULE_DESCRIPTION("A kernel space ping server for portals testing"); +MODULE_LICENSE("GPL"); + +module_init(pingsrv_init); +module_exit(pingsrv_cleanup); diff --git a/lustre/portals/tests/startclient.sh b/lustre/portals/tests/startclient.sh new file mode 100755 index 0000000..c9b7c16 --- /dev/null +++ b/lustre/portals/tests/startclient.sh @@ -0,0 +1,37 @@ +#!/bin/sh + +SIMPLE=${SIMPLE:-0} + +if [ $SIMPLE -eq 0 ]; then + PING=pingcli.o +else + PING=spingcli.o +fi + +case "$1" in + toe) + /sbin/insmod ../oslib/portals.o + /sbin/insmod ../toenal/ktoenal.o + /sbin/insmod ./$PING + echo ktoenal > /tmp/nal + ;; + + tcp) + /sbin/insmod ../oslib/portals.o + /sbin/insmod ../socknal/ksocknal.o + /sbin/insmod ./$PING + echo ksocknal > /tmp/nal + ;; + + elan) + /sbin/insmod ../oslib/portals.o + /sbin/insmod ../qswnal/kqswnal.o + /sbin/insmod ./$PING + echo kqswnal > /tmp/nal + ;; + + *) + echo "Usage : ${0} < tcp | toe | elan >" + exit 1; 
+esac
+exit 0;
diff --git a/lustre/portals/tests/startserver.sh b/lustre/portals/tests/startserver.sh
new file mode 100755
index 0000000..942300e
--- /dev/null
+++ b/lustre/portals/tests/startserver.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}
+
+if [ $SIMPLE -eq 0 ]; then
+        PING=pingsrv.o
+else
+        PING=spingsrv.o
+fi
+
+case "$1" in
+        toe)
+                /sbin/insmod ../oslib/portals.o
+                /sbin/insmod ../toenal/ktoenal.o
+                /sbin/insmod ./$PING nal=4
+                echo ktoenal > /tmp/nal
+        ;;
+
+        tcp)
+                /sbin/insmod ../oslib/portals.o
+                /sbin/insmod ../socknal/ksocknal.o
+                /sbin/insmod ./$PING nal=2
+                echo ksocknal > /tmp/nal
+        ;;
+
+        elan)
+                /sbin/insmod ../oslib/portals.o
+                /sbin/insmod ../qswnal/kqswnal.o
+                /sbin/insmod ./$PING nal=1    # 1 = kqswnal (4 is toenal)
+                echo kqswnal > /tmp/nal
+        ;;
+
+        *)
+                echo "Usage : ${0} < tcp | toe | elan >"
+                exit 1;
+esac
+../utils/acceptor 9999&
+exit 0;
diff --git a/lustre/portals/tests/stopclient.sh b/lustre/portals/tests/stopclient.sh
new file mode 100755
index 0000000..f7e3aa1
--- /dev/null
+++ b/lustre/portals/tests/stopclient.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}    # same default and meaning as startclient.sh
+
+if [ $SIMPLE -eq 0 ]; then
+        PING=pingcli
+else
+        PING=spingcli
+fi
+
+rmmod $PING
+NAL=`cat /tmp/nal`;
+rmmod $NAL
+rmmod portals
diff --git a/lustre/portals/tests/stopserver.sh b/lustre/portals/tests/stopserver.sh
new file mode 100644
index 0000000..3e81831
--- /dev/null
+++ b/lustre/portals/tests/stopserver.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+SIMPLE=${SIMPLE:-0}    # same default and meaning as startserver.sh
+
+if [ $SIMPLE -eq 0 ]; then
+        PING=pingsrv
+else
+        PING=spingsrv
+fi
+
+rmmod $PING
+NAL=`cat /tmp/nal`;
+rmmod $NAL
+killall -9 acceptor
+rm -f /var/run/acceptor-9999.pid
+rmmod portals
diff --git a/lustre/portals/unals/.cvsignore b/lustre/portals/unals/.cvsignore
new file mode 100644
index 0000000..e995588
--- /dev/null
+++ b/lustre/portals/unals/.cvsignore
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lustre/portals/unals/Makefile.am b/lustre/portals/unals/Makefile.am
new file mode 100644
index 0000000..dc427b0
--- /dev/null
+++ b/lustre/portals/unals/Makefile.am
@@ -0,0 +1,5 @@
+CPPFLAGS=
+INCLUDES=-I$(top_srcdir)/portals/include -I$(top_srcdir)/include -I$(srcdir)
+lib_LIBRARIES = libtcpnal.a
+pkginclude_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
+libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
diff --git a/lustre/portals/unals/README b/lustre/portals/unals/README
new file mode 100644
index 0000000..6cb93d9
--- /dev/null
+++ b/lustre/portals/unals/README
@@ -0,0 +1,53 @@
+This library implements two NAL interfaces, both running over IP.
+The first, tcpnal, creates TCP connections between participating
+processes in order to transport the portals requests. The second,
+ernal, provides a simple transport protocol which runs over
+UDP datagrams.
+
+The interface functions return both of these values in host order for
+convenience and readability. However this means that addresses
+exchanged in messages between hosts of different orderings will not
+function properly.
+
+Both NALs use the same support functions in order to schedule events
+and communicate with the generic portals implementation.
+
+            -------------------------
+            |         api           |
+            |_______________________|
+            |         lib           |
+            |_______________________|
+            | ernal  |  |tcpnal     |
+            |--------|  |----------|
+            | udpsock|  |connection|
+            |-----------------------|
+            |     timer/select      |
+            -------------------------
+
+
+  These NALs use the framework from fdnal of a pipe between the api
+and library sides. This is wrapped up in the select on the library
+side, and blocks on the api side. Performance could be greatly
+enhanced by collapsing this artificial barrier, by using shared
+memory queues, or by wiring the api layer directly to the library.
+ + +nid is defined as the low order 24-bits of the IP address of the +physical node left shifted by 8 plus a virtual node number of 0 +through 255 (really only 239). The virtual node number of a tcpnal +application should be specified using the environment variable +PTL_VIRTNODE. pid is now a completely arbitrary number in the +range of 0 to 255. The IP interface used can be overridden by +specifying the appropriate hostid by setting the PTL_HOSTID +environment variable. The value can be either dotted decimal +(n.n.n.n) or hex starting with "0x". +TCPNAL: + As the NAL needs to try to send to a particular nid/pid pair, it + will open up connections on demand. Because the port associated with + the connecting socket is different from the bound port, two + connections will normally be established between a pair of peers, with + data flowing from the anonymous connect (active) port to the advertised + or well-known bound (passive) port of each peer. + + Should the connection fail to open, an error is reported to the + library component, which causes the api request to fail. diff --git a/lustre/portals/unals/address.c b/lustre/portals/unals/address.c new file mode 100644 index 0000000..b422c3f --- /dev/null +++ b/lustre/portals/unals/address.c @@ -0,0 +1,146 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* address.c:
+ * this file provides functions to acquire the IP address of the node
+ * and translate them into a NID/PID pair which supports a static
+ * mapping of virtual nodes into the port range of an IP socket.
+*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <netdb.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <portals/p30.h>
+#include <bridge.h>
+#include <ipmap.h>
+
+
+/* Function: get_node_id
+ * Returns: a 32 bit id for this node: its IPv4 address as a host byte
+ *          order integer, or 0 if no address could be determined
+ *
+ * If the PTL_HOSTID environment variable is set it is used directly,
+ * either as dotted decimal (n.n.n.n) or, when its second character is
+ * 'x', as a number parsed by strtoll() (e.g. "0x7f000001").
+ *
+ * Otherwise get_node_id() determines the host name and uses the resolver
+ * to find out its ip address. This is fairly fragile and inflexible, but
+ * explicitly asking about interfaces and their addresses is very
+ * complicated and nonportable.
+ */
+static unsigned int get_node_id(void)
+{
+        char buffer[255];
+        unsigned int x = 0;
+        struct hostent *he;
+        char *host_envp = getenv("PTL_HOSTID");
+
+        if (!host_envp) {
+                if (gethostname(buffer, sizeof(buffer)) != 0)
+                        return(0);
+                /* a truncated name is not guaranteed to be terminated */
+                buffer[sizeof(buffer) - 1] = '\0';
+
+                he = gethostbyname(buffer);
+                if (he && he->h_addr_list[0])
+                        memcpy(&x, he->h_addr_list[0], sizeof(x));
+                return(ntohl(x));
+        }
+
+        /* check [0] first so an empty value never reads past its NUL */
+        if (host_envp[0] != '\0' && host_envp[1] == 'x') {
+                long long hostid = strtoll(host_envp, 0, 0);
+                return((unsigned int) hostid);
+        } else {
+                /* unsigned so that shifting a first octet >= 128 is defined */
+                unsigned int a, b, c, d;
+
+                if (sscanf(host_envp, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
+                        return(0);
+                return ((a<<24) | (b<<16) | (c<<8) | d);
+        }
+}
+
+
+/* Function: set_address
+ * Arguments: t: a procnal structure to populate with the request
+ *
+ * set_address performs the bit manipulations to set the nid, pid, and
+ * iptop8 fields of the procnal structures.
+ * + * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY + */ + +#ifdef DIRECT_IP_MODE +void set_address(bridge t,ptl_pid_t pidrequest) +{ + int port; + if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0; + else port=pidrequest; + t->nal_cb->ni.nid=get_node_id(); + t->nal_cb->ni.pid=port; +} +#else + +void set_address(bridge t,ptl_pid_t pidrequest) +{ + int virtnode, in_addr, port; + ptl_pid_t pid; + + /* get and remember my node id*/ + if (!getenv("PTL_VIRTNODE")) + virtnode = 0; + else + { + int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT + >> PNAL_VNODE_SHIFT); + virtnode = atoi(getenv("PTL_VIRTNODE")); + if (virtnode > maxvnode) + { + fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n", + virtnode, maxvnode); + return; + } + } + + in_addr = get_node_id(); + + t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */ + t->nal_cb->ni.nid = ((in_addr & PNAL_HOSTID_MASK) + << PNAL_VNODE_SHIFT) + + virtnode; + + pid=pidrequest; + /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */ +#ifdef notyet + if (pid==(unsigned short)PTL_PID_ANY) port = 0; +#endif + if (pid==(unsigned short)PTL_PID_ANY) + { + fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n"); + return; + } + else if (pid > PNAL_PID_MASK) + { + fprintf(stderr, "portal pid of %d is too large - max %d\n", + pid, PNAL_PID_MASK); + return; + } + else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT; + t->nal_cb->ni.pid=pid; +} +#endif diff --git a/lustre/portals/unals/bridge.h b/lustre/portals/unals/bridge.h new file mode 100644 index 0000000..0b4940f --- /dev/null +++ b/lustre/portals/unals/bridge.h @@ -0,0 +1,29 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#include <portals/lib-p30.h> + +typedef struct bridge { + int alive; + nal_cb_t *nal_cb; + void *lower; + void *local; + void (*shutdown)(struct bridge *); + /* this doesn't really belong here */ + unsigned char iptop8; +} *bridge; + + +nal_t *bridge_init(ptl_interface_t nal, + ptl_pid_t pid_request, + ptl_ni_limits_t *desired, + ptl_ni_limits_t *actual, + int *rc); + +typedef int (*nal_initialize)(bridge); +extern nal_initialize nal_table[PTL_IFACE_MAX]; diff --git a/lustre/portals/unals/connection.c b/lustre/portals/unals/connection.c new file mode 100644 index 0000000..310e899 --- /dev/null +++ b/lustre/portals/unals/connection.c @@ -0,0 +1,294 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* connection.c: + This file provides a simple stateful connection manager which + builds tcp connections on demand and leaves them open for + future use. 
It also provides the machinery to allow peers + to connect to it +*/ + +#include <stdlib.h> +#include <pqtimer.h> +#include <dispatch.h> +#include <table.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <unistd.h> +#include <syscall.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <connection.h> +#include <errno.h> + + +/* global variable: acceptor port */ +unsigned short tcpnal_acceptor_port = 988; + + +/* Function: compare_connection + * Arguments: connection c: a connection in the hash table + * ptl_process_id_t: an id to verify agains + * Returns: 1 if the connection is the one requested, 0 otherwise + * + * compare_connection() tests for collisions in the hash table + */ +static int compare_connection(void *arg1, void *arg2) +{ + connection c = arg1; + unsigned int * id = arg2; + return((c->ip==id[0]) && (c->port==id[1])); +} + + +/* Function: connection_key + * Arguments: ptl_process_id_t id: an id to hash + * Returns: a not-particularily-well-distributed hash + * of the id + */ +static unsigned int connection_key(unsigned int *id) +{ + return(id[0]^id[1]); +} + + +/* Function: remove_connection + * Arguments: c: the connection to remove + */ +void remove_connection(void *arg) +{ + connection c = arg; + unsigned int id[2]; + + id[0]=c->ip; + id[1]=c->port; + hash_table_remove(c->m->connections,id); + close(c->fd); + free(c); +} + + +/* Function: read_connection: + * Arguments: c: the connection to read from + * dest: the buffer to read into + * len: the number of bytes to read + * Returns: success as 1, or failure as 0 + * + * read_connection() reads data from the connection, continuing + * to read partial results until the request is satisfied or + * it errors. TODO: this read should be covered by signal protection. 
+ */
+int read_connection(connection c,
+                unsigned char *dest,
+                int len)
+{
+        int offset=0,rc;
+
+        if (len){
+                do {
+                        rc=syscall(SYS_read, c->fd, dest+offset, len-offset);
+                        /* errno is only meaningful when the read failed */
+                        if (rc<0 && errno==EINTR)
+                                continue;
+                        if (rc<=0) {
+                                /* hard error, or end of file (rc==0): the
+                                 * request can never be satisfied */
+                                remove_connection(c);
+                                return(0);
+                        }
+                        offset+=rc;
+                } while (offset<len);
+        }
+        return(1);
+}
+
+/* io handler trampoline: hands readable connection d to the manager's
+ * handler together with the argument given to init_connections() */
+static int connection_input(void *d)
+{
+        connection c = d;
+        return((*c->m->handler)(c->m->handler_arg,c));
+}
+
+
+/* Function: allocate_connection
+ * Arguments: m: manager the allocation is occurring in the context of
+ *            ip, port: peer address, also used as the hash table key
+ *            fd: open file descriptor for the socket
+ * Returns: an allocated connection structure, or zero if out of memory
+ *          (fd is closed in that case)
+ *
+ * just encompasses the action common to active and passive
+ * connections of allocation and placement in the global table
+ */
+static connection allocate_connection(manager m,
+                                      unsigned int ip,
+                                      unsigned short port,
+                                      int fd)
+{
+        connection c=malloc(sizeof(*c));
+        unsigned int id[2];
+
+        if (!c) {
+                /* fd is ours from here on, so release it on failure */
+                close(fd);
+                return(0);
+        }
+        c->m=m;
+        c->fd=fd;
+        c->ip=ip;
+        c->port=port;
+        id[0]=ip;
+        id[1]=port;
+        register_io_handler(fd,READ_HANDLER,connection_input,c);
+        hash_table_insert(m->connections,c,id);
+        return(c);
+}
+
+
+/* Function: new_connection
+ * Arguments: z: opaque argument holding the manager
+ * Returns: 1 in order to reregister for new connection requests
+ *
+ * called when the bound service socket receives
+ * a new connection request, it always accepts and
+ * installs a new connection
+ */
+static int new_connection(void *z)
+{
+        manager m=z;
+        struct sockaddr_in s;
+        socklen_t len=sizeof(struct sockaddr_in);
+        int fd=accept(m->bound,(struct sockaddr *)&s,&len);
+
+        if (fd<0) {
+                /* nothing to install; stay registered for the next one */
+                perror("tcpnal accept");
+                return(1);
+        }
+        /* cfs specific hack */
+        //unsigned short pid=s.sin_port;
+        /* sin_addr is in network order; connections are keyed by host order */
+        allocate_connection(m,ntohl(s.sin_addr.s_addr),0/*pid*/,fd);
+        return(1);
+}
+
+
+/* Function: force_tcp_connection
+ * Arguments: t: tcpnal
+ *            dest: portals endpoint for the connection
+ * Returns: an allocated connection structure,
either
+ *          a pre-existing one, or a new connection; zero if the
+ *          connect() to the peer fails
+ *
+ * The port argument is ignored: it is overwritten with the global
+ * tcpnal_acceptor_port before the lookup and the connect. Failure to
+ * create the socket terminates the process.
+ */
+connection force_tcp_connection(manager m,
+                                unsigned int ip,
+                                unsigned short port)
+{
+        connection c;
+        struct sockaddr_in addr;
+        unsigned int id[2];
+        int fd;
+
+        port = tcpnal_acceptor_port;
+
+        id[0]=ip;
+        id[1]=port;
+
+        /* reuse an already established connection when there is one */
+        if ((c=hash_table_find(m->connections,id)))
+                return(c);
+
+        bzero((char *) &addr, sizeof(addr));
+        addr.sin_family = AF_INET;
+        addr.sin_addr.s_addr = htonl(ip);
+        addr.sin_port = htons(port);
+
+        if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+                perror("tcpnal socket failed");
+                exit(-1);
+        }
+        if (connect(fd,
+                    (struct sockaddr *)&addr,
+                    sizeof(struct sockaddr_in)))
+        {
+                /* report first: close() may overwrite errno */
+                perror("tcpnal connect");
+                close(fd);
+                return(0);
+        }
+        return(allocate_connection(m,ip,port,fd));
+}
+
+
+/* Function: bind_socket
+ * Arguments: t: the nal state for this interface
+ *            port: the port to attempt to bind to
+ * Returns: 1 on success, or 0 on error
+ *
+ * bind_socket() attempts to allocate and bind a socket to the requested
+ * port, or dynamically assign one from the kernel should the port be
+ * zero. Sets the bound and bound_handler elements of m.
+ *
+ * TODO: The port should be an explicitly sized type.
+ */
+static int bind_socket(manager m,unsigned short port)
+{
+        struct sockaddr_in addr;
+        socklen_t alen=sizeof(struct sockaddr_in);
+
+        if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0)
+                return(0);
+
+        bzero((char *) &addr, sizeof(addr));
+        addr.sin_family = AF_INET;
+        addr.sin_addr.s_addr = 0;
+        /* NOTE(review): stored without htons(), so port is presumably
+         * expected in network byte order already - confirm with callers */
+        addr.sin_port = port;
+
+        if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){
+                perror ("tcpnal bind");
+                goto fail;
+        }
+
+        /* pick up the port the kernel assigned when port was zero */
+        getsockname(m->bound,(struct sockaddr *)&addr, &alen);
+
+        if (listen(m->bound,5)<0){
+                perror ("tcpnal listen");
+                goto fail;
+        }
+        m->bound_handler=register_io_handler(m->bound,READ_HANDLER,
+                                             new_connection,m);
+        m->port=addr.sin_port;
+        return(1);
+
+fail:
+        /* do not leak the socket when setup fails part way through */
+        close(m->bound);
+        m->bound = -1;
+        return(0);
+}
+
+
+/* Function: shutdown_connections
+ * Arguments: m: the manager structure
+ *
+ * close all connections and reclaim resources
+ */
+void shutdown_connections(manager m)
+{
+        close(m->bound);
+        remove_io_handler(m->bound_handler);
+        hash_destroy_table(m->connections,remove_connection);
+        free(m);
+}
+
+
+/* Function: init_connections
+ * Arguments: pid: the port to attempt to bind to (passed to bind_socket)
+ *            input: handler stored in the manager for connection input
+ *            a: first argument handed to input
+ * Returns: a newly allocated manager structure, or
+ *          zero if out of memory or the port could not be bound
+ */
+manager init_connections(unsigned short pid,
+                         int (*input)(void *, void *),
+                         void *a)
+{
+        manager m=malloc(sizeof(*m));
+
+        if (!m) return(0);
+        m->connections=hash_create_table(compare_connection,connection_key);
+        m->handler=input;
+        m->handler_arg=a;
+        if (bind_socket(m,pid)) return(m);
+        /* NOTE(review): m->connections is not released on this path;
+         * hash_destroy_table() looks like the matching call - confirm */
+        free(m);
+        return(0);
+}
diff --git a/lustre/portals/unals/connection.h b/lustre/portals/unals/connection.h
new file mode 100644
index 0000000..6f57287
--- /dev/null
+++ b/lustre/portals/unals/connection.h
@@ -0,0 +1,32 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2002 Cray Inc.
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#include <table.h> + +typedef struct manager { + table connections; + int bound; + io_handler bound_handler; + int (*handler)(void *, void *); + void *handler_arg; + unsigned short port; +} *manager; + + +typedef struct connection { + unsigned int ip; + unsigned short port; + int fd; + manager m; +} *connection; + +connection force_tcp_connection(manager m, unsigned int ip, unsigned int short); +manager init_connections(unsigned short, int (*f)(void *, void *), void *); +void remove_connection(void *arg); +void shutdown_connections(manager m); +int read_connection(connection c, unsigned char *dest, int len); diff --git a/lustre/portals/unals/debug.c b/lustre/portals/unals/debug.c new file mode 100644 index 0000000..529bb2d --- /dev/null +++ b/lustre/portals/unals/debug.c @@ -0,0 +1,119 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Phil Schwan <phil@clusterfs.com> + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */
+
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <sys/time.h>
+
+int smp_processor_id = 1;
+char debug_file_path[1024] = "/tmp/lustre-log";
+char debug_file_name[1024];
+FILE *debug_file_fd;
+
+int portals_do_debug_dumplog(void *arg)
+{
+        printf("Look in %s\n", debug_file_name);
+        return 0;
+}
+
+
+void portals_debug_print(void)
+{
+        return;
+}
+
+
+void portals_debug_dumplog(void)
+{
+        printf("Look in %s\n", debug_file_name);
+        return;
+}
+
+
+/* bufsize is ignored: messages are written straight to stdout */
+int portals_debug_init(unsigned long bufsize)
+{
+        debug_file_fd = stdout;
+        return 0;
+}
+
+int portals_debug_cleanup(void)
+{
+        return 0; //close(portals_debug_fd);
+}
+
+int portals_debug_clear_buffer(void)
+{
+        return 0;
+}
+
+int portals_debug_mark_buffer(char *text)
+{
+
+        fprintf(debug_file_fd, "*******************************************************************************\n");
+        fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text);
+        fprintf(debug_file_fd, "*******************************************************************************\n");
+
+        return 0;
+}
+
+int portals_debug_copy_to_user(char *buf, unsigned long len)
+{
+        return 0;
+}
+
+/* FIXME: I'm not very smart; someone smarter should make this better. */
+/* Writes a "subsys:mask:cpu:sec.usec" prefix, the call site
+ * "(file:line:fn() ...)" and then the caller's printf-style message to
+ * debug_file_fd, falling back to stdout when portals_debug_init() has
+ * not been called yet. */
+void
+portals_debug_msg (int subsys, int mask, char *file, char *fn, int line,
+                   const char *format, ...)
+{
+        va_list ap;
+        unsigned long flags;
+        struct timeval tv;
+        FILE *out = debug_file_fd ? debug_file_fd : stdout;
+
+        gettimeofday(&tv, NULL);
+
+        fprintf(out,
+                "%02x:%06x:%d:%lu.%06lu ",
+                subsys >> 24, mask, smp_processor_id,
+                (unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec);
+
+        /* flags is never assigned; only its address is used, to print an
+         * offset within an 8192-byte window (presumably a stack-usage
+         * estimate carried over from the kernel version - TODO confirm) */
+        fprintf(out,
+                "(%s:%d:%s() %d+%ld): ",
+                file, line, fn, 0,
+                (long)(8192 - ((unsigned long)&flags & 8191UL)));
+
+        /* a va_list has to be consumed by vfprintf(); handing it to
+         * fprintf() as an ordinary argument is undefined behaviour */
+        va_start (ap, format);
+        vfprintf(out, format, ap);
+        va_end (ap);
+}
+
diff --git a/lustre/portals/unals/dispatch.h b/lustre/portals/unals/dispatch.h
new file mode 100644
index 0000000..34dd070
--- /dev/null
+++ b/lustre/portals/unals/dispatch.h
@@ -0,0 +1,39 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2002 Cray Inc.
+ * Copyright (c) 2002 Eric Hoffman
+ *
+ * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
+ */
+
+/* this file is only called dispatch.h to prevent it
+   from colliding with /usr/include/sys/select.h */
+
+typedef struct io_handler *io_handler;
+
+struct io_handler{
+        io_handler *last;
+        io_handler next;
+        int fd;
+        int type;
+        int (*function)(void *);
+        void *argument;
+        int disabled;
+};
+
+
+#define READ_HANDLER 1
+#define WRITE_HANDLER 2
+#define EXCEPTION_HANDLER 4
+#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER)
+
+io_handler register_io_handler(int fd,
+                               int type,
+                               int (*function)(void *),
+                               void *arg);
+
+void remove_io_handler (io_handler i);
+void init_unix_timer(void);
+void select_timer_block(when until);
+when now(void);
diff --git a/lustre/portals/unals/ipmap.h b/lustre/portals/unals/ipmap.h
new file mode 100644
index 0000000..85b1e18
--- /dev/null
+++ b/lustre/portals/unals/ipmap.h
@@ -0,0 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2002 Cray Inc.
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#define DIRECT_IP_MODE +#ifdef DIRECT_IP_MODE +#define PNAL_NID(in_addr, port) (in_addr) +#define PNAL_PID(pid) (pid) +#define PNAL_IP(in_addr, port) (in_addr) +#define PNAL_PORT(nid, pid) (pid) +#else + +#define PNAL_BASE_PORT 4096 +#define PNAL_HOSTID_SHIFT 24 +#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1) +#define PNAL_VNODE_SHIFT 8 +#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1) +#define PNAL_PID_SHIFT 8 +#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1) + +#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \ + << PNAL_VNODE_SHIFT) \ + | (((ntohs(port)-PNAL_BASE_PORT) >>\ + PNAL_PID_SHIFT))) +#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT) & PNAL_PID_MASK) + +#define PNAL_IP(nid,t) (htonl((((unsigned)(nid))\ + >> PNAL_VNODE_SHIFT)\ + | (t->iptop8 << PNAL_HOSTID_SHIFT))) +#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \ + << PNAL_VNODE_SHIFT) \ + | ((pid) & PNAL_PID_MASK)) \ + + PNAL_BASE_PORT)) +#endif diff --git a/lustre/portals/unals/pqtimer.c b/lustre/portals/unals/pqtimer.c new file mode 100644 index 0000000..fa2fb4f --- /dev/null +++ b/lustre/portals/unals/pqtimer.c @@ -0,0 +1,226 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* timer.c:
+ *   this file implements a simple priority-queue based timer system. when
+ * combined with a file which implements now() and block(), it can
+ * be used to provide coarse-grained time-based callbacks.
+ */
+
+#include <pqtimer.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* a pending callback: 'function' is invoked with 'arg' when the timer is
+ * fired, unless 'disable' has been set; 'w' is the absolute due time the
+ * priority queue is ordered on */
+struct timer {
+        void (*function)(void *);
+        void *arg;
+        when w;
+        int interval;
+        int disable;
+};
+
+/* node of the singly linked list of background functions ('thunks') */
+typedef struct thunk *thunk;
+struct thunk {
+        void (*f)(void *);
+        void *a;
+        thunk next;
+};
+
+extern when now(void);
+
+static thunk thunks;
+static int internal;
+static void (*block_function)(when);
+static int number_of_timers;
+static int size_of_pqueue;
+/* binary min-heap keyed on ->w. slot 0 is never used: the root lives in
+ * timers[1] and the children of slot i are slots 2i and 2i+1 */
+static timer *timers;
+
+
+/* Function: heal
+ * Arguments: where: heap index whose subtree may violate heap order
+ *
+ * sifts the entry at 'where' down: it is swapped with its earliest-due
+ * child and the procedure repeats from that child until neither child
+ * is due earlier.
+ */
+static void heal(int where)
+{
+        int left=(where<<1);
+        int right=(where<<1)+1;
+        int min=where;
+        timer temp;
+
+        if (left <= number_of_timers)
+                if (timers[left]->w < timers[min]->w) min=left;
+        if (right <= number_of_timers)
+                if (timers[right]->w < timers[min]->w) min=right;
+        if (min != where){
+                temp=timers[where];
+                timers[where]=timers[min];
+                timers[min]=temp;
+                heal(min);
+        }
+}
+
+/* Function: add_pqueue
+ * Arguments: i: heap index of a freshly appended entry
+ *
+ * sifts the entry at 'i' up toward the root for as long as it is due
+ * earlier than its parent.
+ */
+static void add_pqueue(int i)
+{
+        timer temp;
+        int parent=(i>>1);
+
+        if ((i>1) && (timers[i]->w< timers[parent]->w)){
+                temp=timers[i];
+                timers[i]=timers[parent];
+                timers[parent]=temp;
+                add_pqueue(parent);
+        }
+}
+
+/* Function: add_timer
+ * Arguments: t: the timer to place in the priority queue
+ *
+ * appends 't' to the heap and restores heap order. the backing array is
+ * grown ten slots at a time; since it holds timer pointers it is sized
+ * by the pointer type, and realloc() hands the old storage back to the
+ * allocator instead of leaking it on every growth. there is no way to
+ * report failure to the caller from here, so running out of memory
+ * aborts rather than writing through a null pointer.
+ */
+static void add_timer(timer t)
+{
+        /* slot 0 is unused, so the new entry lands in slot
+           number_of_timers+1 and the array needs number_of_timers+2 slots */
+        if (size_of_pqueue<(number_of_timers+2)){
+                int newsize=size_of_pqueue+10;
+                timer *grown=realloc(timers,sizeof(*timers)*newsize);
+
+                if (!grown) abort();
+                timers=grown;
+                size_of_pqueue=newsize;
+        }
+        timers[++number_of_timers]=t;
+        add_pqueue(number_of_timers);
+}
+
+/* Function: register_timer
+ * Arguments: interval: the time interval from the current time when
+ *                      the timer function should be called
+ *            function: the function to call when
the time has expired
+ *            argument: the argument to call it with.
+ * Returns: a pointer to a timer structure, or a null pointer if the
+ *          structure could not be allocated
+ */
+timer register_timer(when interval,
+                     void (*function)(void *),
+                     void *argument)
+{
+        timer t=(timer)malloc(sizeof(struct timer));
+
+        /* nothing has been queued yet, so failing here has no side effects */
+        if (!t) return(0);
+
+        t->arg=argument;
+        t->function=function;
+        t->interval=interval;
+        t->disable=0;
+        t->w=now()+interval;
+        add_timer(t);
+        /* when this is the only timer in the queue, hand its due time
+           to the blocking function right away */
+        if (!internal && (number_of_timers==1))
+                block_function(t->w);
+        return(t);
+}
+
+/* Function: remove_timer
+ * Arguments: t: the timer to cancel
+ * Returns: nothing
+ *
+ * remove_timer removes a timer from the system, ensuring
+ * that it will never be called. It does not actually
+ * free the timer due to reentrancy issues; the structure is
+ * released by timer_fire() when the entry reaches the head of
+ * the queue.
+ */
+
+void remove_timer(timer t)
+{
+        t->disable=1;
+}
+
+
+
+/* Function: timer_fire
+ * Arguments: none
+ * Returns: nothing
+ *
+ * pops the earliest timer off the queue, calls its function unless
+ * the timer has been disabled, and frees the timer structure. calling
+ * it with an empty queue is a no-op; previously that case read the
+ * unused slots of the array and drove number_of_timers negative.
+ */
+void timer_fire()
+{
+        timer current;
+
+        if (number_of_timers < 1) return;
+
+        current=timers[1];
+        /* move the last entry to the root and sift it back down */
+        timers[1]=timers[number_of_timers--];
+        heal(1);
+        if (!current->disable) {
+                (*current->function)(current->arg);
+        }
+        free(current);
+}
+
+/* Function: next_timer
+ * Arguments: none
+ * Returns: the due time of the earliest timer still pending, or 0
+ *          when the queue is empty
+ *
+ * fires every timer whose due time is not later than the time sampled
+ * on entry.
+ */
+when next_timer(void)
+{
+        when here=now();
+
+        while (number_of_timers && (timers[1]->w <= here)) timer_fire();
+        if (number_of_timers) return(timers[1]->w);
+        return(0);
+}
+
+/* Function: timer_loop
+ * Arguments: none
+ * Returns: never
+ *
+ * timer_loop() is the blocking dispatch function for the timer.
+ * It calls the block() function registered with initialize_timer,
+ * and fires the timers that have been registered. every
+ * registered thunk is run at the top of each iteration.
+ */
+void timer_loop()
+{
+        when here;
+
+        while (1){
+                thunk z;
+                here=now();
+
+                for (z=thunks;z;z=z->next) (*z->f)(z->a);
+
+                if (number_of_timers){
+                        if (timers[1]->w > here){
+                                (*block_function)(timers[1]->w);
+                        } else {
+                                timer_fire();
+                        }
+                } else {
+                        /* no timers pending: the thunks are run once
+                           more and then we block with no deadline */
+                        for (z=thunks;z;z=z->next) (*z->f)(z->a);
+                        (*block_function)(0);
+                }
+        }
+}
+
+
+/* Function: register_thunk
+ * Arguments: f: the function to call
+ *            a: the single argument to call it with
+ *
+ * Thunk functions get called at irregular intervals, they
+ * should not assume when, or take a particularly long
+ * amount of time.
Thunks are for background cleanup tasks. + */ +void register_thunk(void (*f)(void *),void *a) +{ + thunk t=(void *)malloc(sizeof(struct thunk)); + t->f=f; + t->a=a; + t->next=thunks; + thunks=t; +} + +/* Function: initialize_timer + * Arguments: block: the function to call to block for the specified interval + * + * initialize_timer() must be called before any other timer function, + * including timer_loop. + */ +void initialize_timer(void (*block)(when)) +{ + block_function=block; + number_of_timers=0; + size_of_pqueue=10; + timers=(timer *)malloc(sizeof(timer)*size_of_pqueue); + thunks=0; +} diff --git a/lustre/portals/unals/pqtimer.h b/lustre/portals/unals/pqtimer.h new file mode 100644 index 0000000..11efb0e --- /dev/null +++ b/lustre/portals/unals/pqtimer.h @@ -0,0 +1,25 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +typedef unsigned long long when; +when now(void); +typedef struct timer *timer; +timer register_timer(when interval, + void (*function)(void *), + void *argument); +timer register_timer_wait(void); +void remove_timer(timer); +void timer_loop(void); +void initialize_timer(void (*block)(when)); +void timer_fire(void); + + +#define HZ 0x100000000ull + + diff --git a/lustre/portals/unals/procapi.c b/lustre/portals/unals/procapi.c new file mode 100644 index 0000000..6da3210 --- /dev/null +++ b/lustre/portals/unals/procapi.c @@ -0,0 +1,283 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* api.c: + * This file provides the 'api' side for the process-based nals. + * it is responsible for creating the 'library' side thread, + * and passing wrapped portals transactions to it. + * + * Along with initialization, shutdown, and transport to the library + * side, this file contains some stubs to satisfy the nal definition. 
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syscall.h>
+#include <procbridge.h>
+#include <pqtimer.h>
+#include <dispatch.h>
+#include <errno.h>
+
+
+/* Function: forward
+ * Arguments: nal_t *nal: pointer to my top-side nal structure
+ *            id: the command to pass to the lower layer
+ *            args, args_len:pointer to and length of the request
+ *            ret, ret_len:  pointer to and size of the result
+ * Returns: a portals status code
+ *
+ * forwards a packaged api call from the 'api' side to the 'library'
+ * side, and collects the result
+ *
+ * the request is written to the to_lib pipe as id, args_len, ret_len
+ * and then the argument block itself; the reply is read back from the
+ * from_lib pipe. a write that does not transfer the full length tears
+ * down the library side and yields PTL_SEGV. a reply read that is
+ * interrupted by a signal is retried; any other failure or a short
+ * read yields PTL_SEGV.
+ */
+#define forward_failure(operand,fd,buffer,length)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          lib_fini(b->nal_cb);\
+          return(PTL_SEGV);\
+       }
+static int procbridge_forward(nal_t *n, int id, void *args, ptl_size_t args_len,
+                              void *ret, ptl_size_t ret_len)
+{
+        bridge b=(bridge)n->nal_data;
+        procbridge p=(procbridge)b->local;
+        int lib=p->to_lib[1];
+        int k;
+
+        forward_failure(write,lib, &id, sizeof(id));
+        forward_failure(write,lib,&args_len, sizeof(args_len));
+        forward_failure(write,lib,&ret_len, sizeof(ret_len));
+        forward_failure(write,lib,args, args_len);
+
+        /* retry only when the read itself was interrupted. the old
+           condition, (errno += EINTR), modified errno instead of
+           testing it and so kept looping on any failed or short read */
+        do {
+                k=syscall(SYS_read, p->from_lib[0], ret, ret_len);
+        } while ((k<0) && (errno == EINTR));
+
+        if(k!=ret_len){
+                perror("nal: read return block");
+                return PTL_SEGV;
+        }
+        return (PTL_OK);
+}
+#undef forward_failure
+
+
+/* Function: shutdown
+ * Arguments: nal: a pointer to my top side nal structure
+ *            ni: my network interface index
+ * Returns: 0
+ *
+ * cleanup nal state, reclaim the lower side thread and
+ * its state using PTL_FINI codepoint
+ *
+ * the PTL_FINI code is written to the library side and one int is
+ * read back before both pipes are closed and the procbridge
+ * structure is freed. the results of these system calls are not
+ * checked.
+ */
+static int procbridge_shutdown(nal_t *n, int ni)
+{
+        bridge b=(bridge)n->nal_data;
+        procbridge p=(procbridge)b->local;
+        int code=PTL_FINI;
+
+        syscall(SYS_write, p->to_lib[1],&code,sizeof(code));
+        syscall(SYS_read, p->from_lib[0],&code,sizeof(code));
+
+        syscall(SYS_close, p->to_lib[0]);
+        syscall(SYS_close, p->to_lib[1]);
+        syscall(SYS_close, p->from_lib[0]);
+        syscall(SYS_close, p->from_lib[1]);
+
+        free(p);
+        return(0);
+}
+
+
+/* Function: validate
+ *    useless stub: always returns 0 without looking at its arguments
+ */
+static int procbridge_validate(nal_t *nal, void *base, ptl_size_t extent)
+{
+        return(0);
+}
+
+
+/* Function: yield
+ * Arguments:  n: a pointer to my top side nal structure
+ *
+ *  this function was originally intended to allow the
+ *   lower half thread to be scheduled to allow progress. we
+ *   overload it to explicitly block until signalled by the
+ *   lower half.
+ *
+ *  NOTE(review): the wait is not guarded by a predicate, so a
+ *   wakeup issued before this call is not remembered and a
+ *   spurious wakeup returns early - confirm callers tolerate both.
+ */
+static void procbridge_yield(nal_t *n)
+{
+        bridge b=(bridge)n->nal_data;
+        procbridge p=(procbridge)b->local;
+
+        pthread_mutex_lock(&p->mutex);
+        pthread_cond_wait(&p->cond,&p->mutex);
+        pthread_mutex_unlock(&p->mutex);
+}
+
+
+/* lock/unlock: empty stubs, present only to fill the interface vector */
+static void procbridge_lock(nal_t * nal, unsigned long *flags){}
+static void procbridge_unlock(nal_t * nal, unsigned long *flags){}
+/* api_nal
+ *  the interface vector to allow the generic code to access
+ *  this nal. this is separate from the library side nal_cb.
+ *  TODO: should be dynamically allocated
+ */
+static nal_t api_nal = {
+        ni:       {0},
+        nal_data: NULL,
+        forward:  procbridge_forward,
+        shutdown: procbridge_shutdown,
+        validate: procbridge_validate,
+        yield:    procbridge_yield,
+        lock:     procbridge_lock,
+        unlock:   procbridge_unlock
+};
+
+/* Function: bridge_init
+ *
+ * Arguments:  pid: requested process id (port offset)
+ *                  PTL_ID_ANY not supported.
+ *             desired: limits passed from the application
+ *                      and effectively ignored
+ *             actual:  limits actually allocated and returned
+ *
+ * Returns: a pointer to my statically allocated top side NAL
+ *          structure
+ *
+ * initializes the tcp nal. we define unix_failure as an
+ * error wrapper to cut down clutter.
+ */ +#define unix_failure(operand,fd,buffer,length,text)\ + if(syscall(SYS_##operand,fd,buffer,length)!=length){\ + perror(text);\ + return(NULL);\ + } +#if 0 +static nal_t *bridge_init(ptl_interface_t nal, + ptl_pid_t pid_request, + ptl_ni_limits_t *desired, + ptl_ni_limits_t *actual, + int *rc) +{ + procbridge p; + bridge b; + static int initialized=0; + ptl_ni_limits_t limits = {-1,-1,-1,-1,-1}; + + if(initialized) return (&api_nal); + + init_unix_timer(); + + b=(bridge)malloc(sizeof(struct bridge)); + p=(procbridge)malloc(sizeof(struct procbridge)); + api_nal.nal_data=b; + b->local=p; + + if(pipe(p->to_lib) || pipe(p->from_lib)) { + perror("nal_init: pipe"); + return(NULL); + } + + if (desired) limits = *desired; + unix_failure(write,p->to_lib[1], &pid_request, sizeof(pid_request), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &nal, sizeof(ptl_interface_t), + "nal_init: write"); + + if(pthread_create(&p->t, NULL, nal_thread, b)) { + perror("nal_init: pthread_create"); + return(NULL); + } + + unix_failure(read,p->from_lib[0], actual, sizeof(ptl_ni_limits_t), + "tcp_init: read"); + unix_failure(read,p->from_lib[0], rc, sizeof(rc), + "nal_init: read"); + + if(*rc) return(NULL); + + initialized = 1; + pthread_mutex_init(&p->mutex,0); + pthread_cond_init(&p->cond, 0); + + return (&api_nal); +} +#endif + +ptl_nid_t tcpnal_mynid; + +nal_t *procbridge_interface(int num_interface, + ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, + ptl_pid_t requested_pid) +{ + procbridge p; + bridge b; + static int initialized=0; + ptl_ni_limits_t limits = {-1,-1,-1,-1,-1}; + int rc, nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */ + + if(initialized) return (&api_nal); + + init_unix_timer(); + + b=(bridge)malloc(sizeof(struct bridge)); + p=(procbridge)malloc(sizeof(struct procbridge)); + api_nal.nal_data=b; + b->local=p; + + if(pipe(p->to_lib) || pipe(p->from_lib)) { + 
perror("nal_init: pipe"); + return(NULL); + } + + if (ptl_size) + limits.max_ptable_index = ptl_size; + if (acl_size) + limits.max_atable_index = acl_size; + + unix_failure(write,p->to_lib[1], &requested_pid, sizeof(requested_pid), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &limits, sizeof(ptl_ni_limits_t), + "nal_init: write"); + unix_failure(write,p->to_lib[1], &nal_type, sizeof(nal_type), + "nal_init: write"); + + if(pthread_create(&p->t, NULL, nal_thread, b)) { + perror("nal_init: pthread_create"); + return(NULL); + } + + unix_failure(read,p->from_lib[0], &rc, sizeof(rc), + "nal_init: read"); + + if(rc) return(NULL); + + b->nal_cb->ni.nid = tcpnal_mynid; + initialized = 1; + pthread_mutex_init(&p->mutex,0); + pthread_cond_init(&p->cond, 0); + + return (&api_nal); +} +#undef unix_failure diff --git a/lustre/portals/unals/procbridge.h b/lustre/portals/unals/procbridge.h new file mode 100644 index 0000000..060ae7b --- /dev/null +++ b/lustre/portals/unals/procbridge.h @@ -0,0 +1,40 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#ifndef _PROCBRIDGE_H_ +#define _PROCBRIDGE_H_ + +#include <pthread.h> +#include <bridge.h> +#include <ipmap.h> + + +typedef struct procbridge { + pthread_t t; + pthread_cond_t cond; + pthread_mutex_t mutex; + int to_lib[2]; + int from_lib[2]; +} *procbridge; + +extern void *nal_thread(void *); + + +#define PTL_INIT (LIB_MAX_DISPATCH+1) +#define PTL_FINI (LIB_MAX_DISPATCH+2) + +#define MAX_ACLS 1 +#define MAX_PTLS 128 + +extern void set_address(bridge t,ptl_pid_t pidrequest); +extern nal_t *procbridge_interface(int num_interface, + ptl_pt_index_t ptl_size, + ptl_ac_index_t acl_size, + ptl_pid_t requested_pid); + +#endif diff --git a/lustre/portals/unals/proclib.c b/lustre/portals/unals/proclib.c new file mode 100644 index 0000000..c3ee103 --- /dev/null +++ b/lustre/portals/unals/proclib.c @@ -0,0 +1,270 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* lib.c: + * This file provides the 'library' side for the process-based nals. 
+ * it is responsible for communication with the 'api' side and + * providing service to the generic portals 'library' + * implementation. 'library' might be better termed 'communication' + * or 'kernel'. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <unistd.h> +#include <syscall.h> +#include <procbridge.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netdb.h> +#include <errno.h> +#include <timer.h> +//#include <util/pqtimer.h> +#include <dispatch.h> + +/* the following functions are stubs to satisfy the nal definition + without doing anything particularily useful*/ + +static int nal_write(nal_cb_t *nal, + void *private, + user_ptr dst_addr, + void *src_addr, + ptl_size_t len) +{ + memcpy(dst_addr, src_addr, len); + return 0; +} + +static int nal_read(nal_cb_t * nal, + void *private, + void *dst_addr, + user_ptr src_addr, + size_t len) +{ + memcpy(dst_addr, src_addr, len); + return 0; +} + +static void *nal_malloc(nal_cb_t *nal, + ptl_size_t len) +{ + void *buf = malloc(len); + return buf; +} + +static void nal_free(nal_cb_t *nal, + void *buf, + ptl_size_t len) +{ + free(buf); +} + +static void nal_printf(nal_cb_t *nal, + const char *fmt, + ...) +{ + va_list ap; + + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +} + + +static void nal_cli(nal_cb_t *nal, + unsigned long *flags) +{ +} + + +static void nal_sti(nal_cb_t *nal, + unsigned long *flags) +{ +} + + +static int nal_dist(nal_cb_t *nal, + ptl_nid_t nid, + unsigned long *dist) +{ + return 0; +} + + + +/* Function: data_from_api + * Arguments: t: the nal state for this interface + * Returns: whether to continue reading from the pipe + * + * data_from_api() reads data from the api side in response + * to a select. + * + * We define data_failure() for syntactic convenience + * of unix error reporting. 
+ */
+
+#define data_failure(operand,fd,buffer,length)\
+       if(syscall(SYS_##operand,fd,buffer,length)!=length){\
+          lib_fini(b->nal_cb);\
+          return(0);\
+       }
+static int data_from_api(void *arg)
+{
+        bridge b = arg;
+        procbridge p=(procbridge)b->local;
+        /* where are these two sizes derived from ??*/
+        char arg_block[ 256 ];
+        char ret_block[ 128 ];
+        ptl_size_t arg_len,ret_len;
+        int fd=p->to_lib[0];
+        int index;
+
+        /* every request starts with the dispatch index */
+        data_failure(read,fd, &index, sizeof(index));
+
+        if (index==PTL_FINI) {
+                lib_fini(b->nal_cb);
+                if (b->shutdown) (*b->shutdown)(b);
+                /* the api side waits for this write before it closes
+                   the pipes */
+                syscall(SYS_write, p->from_lib[1],&b->alive,sizeof(b->alive));
+
+                /* a heavy-handed but convenient way of shutting down
+                   the lower side thread */
+                pthread_exit(0);
+        }
+
+        data_failure(read,fd, &arg_len, sizeof(arg_len));
+        data_failure(read,fd, &ret_len, sizeof(ret_len));
+
+        /* both lengths arrive over the pipe while the blocks are fixed
+           size stack buffers: refuse anything that does not fit rather
+           than reading past arg_block or writing out memory beyond
+           ret_block. handled like any other transfer failure. */
+        if (((size_t)arg_len > sizeof(arg_block)) ||
+            ((size_t)ret_len > sizeof(ret_block))) {
+                lib_fini(b->nal_cb);
+                return(0);
+        }
+
+        data_failure(read,fd, arg_block, arg_len);
+
+        lib_dispatch(b->nal_cb, NULL, index, arg_block, ret_block);
+
+        data_failure(write,p->from_lib[1],ret_block, ret_len);
+        return(1);
+}
+#undef data_failure
+
+
+
+/* wakeup_topside: registered as a thunk by nal_thread; broadcasts on the
+ * procbridge condition variable under its mutex, which releases an api
+ * side thread blocked in the yield call */
+static void wakeup_topside(void *z)
+{
+        bridge b=z;
+        procbridge p=b->local;
+
+        pthread_mutex_lock(&p->mutex);
+        pthread_cond_broadcast(&p->cond);
+        pthread_mutex_unlock(&p->mutex);
+}
+
+
+/* Function: nal_thread
+ * Arguments: z: an opaque reference to a nal control structure
+ *               allocated and partially populated by the api level code
+ * Returns: nothing, and only on error or explicit shutdown
+ *
+ *  This function is the entry point of the pthread initiated on
+ *  the api side of the interface. This thread is used to handle
+ *  asynchronous delivery to the application.
+ * + * We define a limit macro to place a ceiling on limits + * for syntactic convenience + */ +#define LIMIT(x,y,max)\ + if ((unsigned int)x > max) y = max; + +extern int tcpnal_init(bridge); + +nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0}; + +void *nal_thread(void *z) +{ + bridge b=z; + procbridge p=b->local; + int rc; + ptl_pid_t pid_request; + int nal_type; + ptl_ni_limits_t desired; + ptl_ni_limits_t actual; + + b->nal_cb=(nal_cb_t *)malloc(sizeof(nal_cb_t)); + b->nal_cb->nal_data=b; + b->nal_cb->cb_read=nal_read; + b->nal_cb->cb_write=nal_write; + b->nal_cb->cb_malloc=nal_malloc; + b->nal_cb->cb_free=nal_free; + b->nal_cb->cb_map=NULL; + b->nal_cb->cb_unmap=NULL; + b->nal_cb->cb_printf=nal_printf; + b->nal_cb->cb_cli=nal_cli; + b->nal_cb->cb_sti=nal_sti; + b->nal_cb->cb_dist=nal_dist; + + + register_io_handler(p->to_lib[0],READ_HANDLER,data_from_api,(void *)b); + + if(!(rc = syscall(SYS_read, p->to_lib[0], &pid_request, sizeof(pid_request)))) + perror("procbridge read from api"); + if(!(rc = syscall(SYS_read, p->to_lib[0], &desired, sizeof(ptl_ni_limits_t)))) + perror("procbridge read from api"); + if(!(rc = syscall(SYS_read, p->to_lib[0], &nal_type, sizeof(nal_type)))) + perror("procbridge read from api"); + + actual = desired; + LIMIT(desired.max_match_entries,actual.max_match_entries,MAX_MES); + LIMIT(desired.max_mem_descriptors,actual.max_mem_descriptors,MAX_MDS); + LIMIT(desired.max_event_queues,actual.max_event_queues,MAX_EQS); + LIMIT(desired.max_atable_index,actual.max_atable_index,MAX_ACLS); + LIMIT(desired.max_ptable_index,actual.max_ptable_index,MAX_PTLS); + + set_address(b,pid_request); + + if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b); + /* initialize the generic 'library' level code */ + + rc = lib_init(b->nal_cb, + b->nal_cb->ni.nid, + b->nal_cb->ni.pid, + 10, + actual.max_ptable_index, + actual.max_atable_index); + + /* + * Whatever the initialization returned is passed back to the + * user level code for further 
interpretation. We just exit if + * it is non-zero since something went wrong. + */ + /* this should perform error checking */ +#if 0 + write(p->from_lib[1], &actual, sizeof(ptl_ni_limits_t)); +#endif + syscall(SYS_write, p->from_lib[1], &rc, sizeof(rc)); + + if(!rc) { + /* the thunk function is called each time the timer loop + performs an operation and returns to blocking mode. we + overload this function to inform the api side that + it may be interested in looking at the event queue */ + register_thunk(wakeup_topside,b); + timer_loop(); + } + return(0); +} +#undef LIMIT + diff --git a/lustre/portals/unals/select.c b/lustre/portals/unals/select.c new file mode 100644 index 0000000..c4f84f4 --- /dev/null +++ b/lustre/portals/unals/select.c @@ -0,0 +1,165 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* select.c: + * Provides a general mechanism for registering and dispatching + * io events through the select system call. 
+ */ + +#ifdef sun +#include <sys/filio.h> +#else +#include <sys/ioctl.h> +#endif + +#include <sys/time.h> +#include <sys/types.h> +#include <stdlib.h> +#include <pqtimer.h> +#include <dispatch.h> + + +static struct timeval beginning_of_epoch; +static io_handler io_handlers; + +/* Function: now + * + * Return: the current time in canonical units: a 64 bit number + * where the most significant 32 bits contains the number + * of seconds, and the least signficant a count of (1/(2^32))ths + * of a second. + */ +when now() +{ + struct timeval result; + + gettimeofday(&result,0); + return((((unsigned long long)result.tv_sec)<<32)| + (((unsigned long long)result.tv_usec)<<32)/1000000); +} + + +/* Function: register_io_handler + * Arguments: fd: the file descriptor of interest + * type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER + * function: a function to call when io is available on fd + * arg: an opaque correlator to return to the handler + * Returns: a pointer to the io_handler structure + */ +io_handler register_io_handler(int fd, + int type, + int (*function)(void *), + void *arg) +{ + io_handler i=(io_handler)malloc(sizeof(struct io_handler)); + if ((i->fd=fd)>=0){ + i->type=type; + i->function=function; + i->argument=arg; + i->disabled=0; + i->last=&io_handlers; + if ((i->next=io_handlers)) i->next->last=&i->next; + io_handlers=i; + } + return(i); +} + +/* Function: remove_io_handler + * Arguments: i: a pointer to the handler to stop servicing + * + * remove_io_handler() doesn't actually free the handler, due + * to reentrancy problems. it just marks the handler for + * later cleanup by the blocking function. 
+ */
+void remove_io_handler (io_handler i)
+{
+        i->disabled=1;
+}
+
+/* set_flag: fds points at three consecutive sets (read, write, exception);
+ * the handler's descriptor is added to each set its type mask selects */
+static void set_flag(io_handler n,fd_set *fds)
+{
+        if (n->type & READ_HANDLER) FD_SET(n->fd,fds);
+        if (n->type & WRITE_HANDLER) FD_SET(n->fd,fds+1);
+        if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd,fds+2);
+}
+
+
+/* Function: select_timer_block
+ * Arguments: until: an absolute time when the select should return,
+ *                   or 0 to block until io is available
+ *
+ * This function dispatches the various file descriptors' handler
+ * functions, if the kernel indicates there is io available.
+ *
+ * handlers marked disabled are unlinked and freed here before the
+ * descriptor sets are built. a handler function that returns 0 is
+ * marked disabled and is reclaimed on a later call.
+ */
+void select_timer_block(when until)
+{
+        fd_set fds[3];
+        struct timeval timeout;
+        struct timeval *timeout_pointer;
+        int result;
+        io_handler j;
+        io_handler *k;
+
+        /* TODO: loop until the entire interval is expired*/
+        if (until){
+                when here=now();
+                /* a deadline that has already passed means "poll once";
+                   the unsigned subtraction would otherwise wrap around
+                   to an enormous timeout */
+                when interval=(until>here)?(until-here):0;
+
+                /* 'when' keeps whole seconds in the upper 32 bits and
+                   1/2^32ths of a second in the lower 32 bits, so this is
+                   the inverse of the conversion performed by now() */
+                timeout.tv_sec=(interval>>32);
+                timeout.tv_usec=((interval&0xffffffffULL)*1000000ULL)>>32;
+                timeout_pointer=&timeout;
+        } else timeout_pointer=0;
+
+        FD_ZERO(fds);
+        FD_ZERO(fds+1);
+        FD_ZERO(fds+2);
+        for (k=&io_handlers;*k;){
+                if ((*k)->disabled){
+                        j=*k;
+                        *k=j->next;
+                        /* keep the successor's back pointer on the link
+                           that now refers to it */
+                        if (*k) (*k)->last=k;
+                        free(j);
+                        /* re-examine the same link: the successor may be
+                           disabled as well and must not be armed */
+                        continue;
+                }
+                set_flag(*k,fds);
+                k=&(*k)->next;
+        }
+        result=select(FD_SETSIZE,fds,fds+1,fds+2,timeout_pointer);
+
+        if (result > 0)
+                for (j=io_handlers;j;j=j->next){
+                        if (!(j->disabled) &&
+                            ((FD_ISSET(j->fd,fds) && (j->type & READ_HANDLER)) ||
+                             (FD_ISSET(j->fd,fds+1) && (j->type & WRITE_HANDLER)) ||
+                             (FD_ISSET(j->fd,fds+2) && (j->type & EXCEPTION_HANDLER)))){
+                                if (!(*j->function)(j->argument))
+                                        j->disabled=1;
+                        }
+                }
+}
+
+/* Function: init_unix_timer()
+ *   is called to initialize the library: empties the handler list,
+ *   records the current time and installs select_timer_block as the
+ *   blocking function of the timer system
+ */
+void init_unix_timer()
+{
+        io_handlers=0;
+        gettimeofday(&beginning_of_epoch, 0);
+        initialize_timer(select_timer_block);
+}
diff --git a/lustre/portals/unals/table.c b/lustre/portals/unals/table.c
new file mode 100644
index 0000000..bef13c5
--- /dev/null
+++ b/lustre/portals/unals/table.c
@@ -0,0 +1,264 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * 
vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <table.h> +#include <stdlib.h> +#include <string.h> + + +/* table.c: + * a very simple hash table implementation with paramerterizable + * comparison and key generation functions. 
it does resize + * in order to accomidate more entries, but never collapses + * the table + */ + +static table_entry *table_lookup (table t,void *comparator, + unsigned int k, + int (*compare_function)(void *, void *), + int *success) +{ + unsigned int key=k%t->size; + table_entry *i; + + for (i=&(t->entries[key]);*i;i=&((*i)->next)){ + if (compare_function && ((*i)->key==k)) + if ((*t->compare_function)((*i)->value,comparator)){ + *success=1; + return(i); + } + } + *success=0; + return(&(t->entries[key])); +} + + +static void resize_table(table t, int size) +{ + int old_size=t->size; + table_entry *old_entries=t->entries; + int i; + table_entry j,n; + table_entry *position; + int success; + + t->size=size; + t->entries=(table_entry *)malloc(sizeof(table_entry)*t->size); + memset(t->entries,0,sizeof(table_entry)*t->size); + + for (i=0;i<old_size;i++) + for (j=old_entries[i];j;j=n){ + n=j->next; + position=table_lookup(t,0,j->key,0,&success); + j->next= *position; + *position=j; + } + free(old_entries); +} + + +/* Function: key_from_int + * Arguments: int i: value to compute the key of + * Returns: the key + */ +unsigned int key_from_int(int i) +{ + return(i); +} + + +/* Function: key_from_string + * Arguments: char *s: the null terminated string + * to compute the key of + * Returns: the key + */ +unsigned int key_from_string(char *s) +{ + unsigned int result=0; + unsigned char *n; + int i; + if (!s) return(1); + for (n=s,i=0;*n;n++,i++) result^=(*n*57)^*n*i; + return(result); +} + + +/* Function: hash_create_table + * Arguments: compare_function: a function to compare + * a table instance with a correlator + * key_function: a function to generate a 32 bit + * hash key from a correlator + * Returns: a pointer to the new table + */ +table hash_create_table (int (*compare_function)(void *, void *), + unsigned int (*key_function)(unsigned int *)) +{ + table new=(table)malloc(sizeof(struct table)); + memset(new, 0, sizeof(struct table)); + + 
new->compare_function=compare_function; + new->key_function=key_function; + new->number_of_entries=0; + new->size=4; + new->entries=(table_entry *)malloc(sizeof(table_entry)*new->size); + memset(new->entries,0,sizeof(table_entry)*new->size); + return(new); +} + + +/* Function: hash_table_find + * Arguments: t: a table to look in + * comparator: a value to access the table entry + * Returns: the element references to by comparator, or null + */ +void *hash_table_find (table t, void *comparator) +{ + int success; + table_entry* entry=table_lookup(t,comparator, + (*t->key_function)(comparator), + t->compare_function, + &success); + if (success) return((*entry)->value); + return(0); +} + + +/* Function: hash_table_insert + * Arguments: t: a table to insert the object + * value: the object to put in the table + * comparator: the value by which the object + * will be addressed + * Returns: nothing + */ +void hash_table_insert (table t, void *value, void *comparator) +{ + int success; + unsigned int k=(*t->key_function)(comparator); + table_entry *position=table_lookup(t,comparator,k, + t->compare_function,&success); + table_entry entry; + + if (success) { + entry = *position; + } else { + entry = (table_entry)malloc(sizeof(struct table_entry)); + memset(entry, 0, sizeof(struct table_entry)); + entry->next= *position; + *position=entry; + t->number_of_entries++; + } + entry->value=value; + entry->key=k; + if (t->number_of_entries > t->size) resize_table(t,t->size*2); +} + +/* Function: hash_table_remove + * Arguments: t: the table to remove the object from + * comparator: the index value of the object to remove + * Returns: + */ +void hash_table_remove (table t, void *comparator) +{ + int success; + table_entry temp; + table_entry *position=table_lookup(t,comparator, + (*t->key_function)(comparator), + t->compare_function,&success); + if(success) { + temp=*position; + *position=(*position)->next; + free(temp); /* the value? 
*/ + t->number_of_entries--; + } +} + +/* Function: hash_iterate_table_entries + * Arguments: t: the table to iterate over + * handler: a function to call with each element + * of the table, along with arg + * arg: the opaque object to pass to handler + * Returns: nothing + */ +void hash_iterate_table_entries(table t, + void (*handler)(void *,void *), + void *arg) +{ + int i; + table_entry *j,*next; + + for (i=0;i<t->size;i++) + for (j=t->entries+i;*j;j=next){ + next=&((*j)->next); + (*handler)(arg,(*j)->value); + } +} + +/* Function: hash_filter_table_entries + * Arguments: t: the table to iterate over + * handler: a function to call with each element + * of the table, along with arg + * arg: the opaque object to pass to handler + * Returns: nothing + * Notes: operations on the table inside handler are not safe + * + * filter_table_entires() calls the handler function for each + * item in the table, passing it and arg. The handler function + * returns 1 if it is to be retained in the table, and 0 + * if it is to be removed. 
+ */ +void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg) +{ + int i; + table_entry *j,*next,v; + + for (i=0;i<t->size;i++) + for (j=t->entries+i;*j;j=next){ + next=&((*j)->next); + if (!(*handler)(arg,(*j)->value)){ + next=j; + v=*j; + *j=(*j)->next; + free(v); + t->number_of_entries--; + } + } +} + +/* Function: destroy_table + * Arguments: t: the table to free + * thunk: a function to call with each element, + * most likely free() + * Returns: nothing + */ +void hash_destroy_table(table t,void (*thunk)(void *)) +{ + table_entry j,next; + int i; + for (i=0;i<t->size;i++) + for (j=t->entries[i];j;j=next){ + next=j->next; + if (thunk) (*thunk)(j->value); + free(j); + } + free(t->entries); + free(t); +} diff --git a/lustre/portals/unals/table.h b/lustre/portals/unals/table.h new file mode 100644 index 0000000..7fab586 --- /dev/null +++ b/lustre/portals/unals/table.h @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +#ifndef E_TABLE +#define E_TABLE + +typedef struct table_entry { + unsigned int key; + void *value; + struct table_entry *next; +} *table_entry; + + +typedef struct table { + unsigned int size; + int number_of_entries; + table_entry *entries; + int (*compare_function)(void *, void *); + unsigned int (*key_function)(unsigned int *); +} *table; + +/* table.c */ +unsigned int key_from_int(int i); +unsigned int key_from_string(char *s); +table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *)); +void *hash_table_find(table t, void *comparator); +void hash_table_insert(table t, void *value, void *comparator); +void hash_table_remove(table t, void *comparator); +void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg); +void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg); +void hash_destroy_table(table t, void (*thunk)(void *)); + +#endif diff --git a/lustre/portals/unals/tcpnal.c b/lustre/portals/unals/tcpnal.c new file mode 100644 index 0000000..534fc17 --- /dev/null +++ b/lustre/portals/unals/tcpnal.c @@ -0,0 +1,198 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2.1 of the GNU Lesser General + * Public License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* tcpnal.c: + This file implements the TCP-based nal by providing glue + between the connection service and the generic NAL implementation */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdarg.h> +#include <unistd.h> +#include <syscall.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <pqtimer.h> +#include <dispatch.h> +#include <bridge.h> +#include <ipmap.h> +#include <connection.h> + +/* Function: tcpnal_send + * Arguments: nal: pointer to my nal control block + * private: unused + * cookie: passed back to the portals library + * hdr: pointer to the portals header + * nid: destination node + * pid: destination process + * data: body of the message + * len: length of the body + * Returns: zero on success + * + * sends a packet to the peer, after insuring that a connection exists + */ +#warning FIXME: "param 'type' is newly added, make use of it!!" +int tcpnal_send(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int niov, + struct iovec *iov, + size_t len) +{ + connection c; + bridge b=(bridge)n->nal_data; + struct iovec tiov[2]; + int count = 1; + + if (!(c=force_tcp_connection((manager)b->lower, + PNAL_IP(nid,b), + PNAL_PORT(nid,pid)))) + return(1); + +#if 0 + /* TODO: these results should be checked. furthermore, provision + must be made for the SIGPIPE which is delivered when + writing on a tcp socket which has closed underneath + the application. 
there is a linux flag in the sendmsg + call which turns off the signally behaviour, but its + nonstandard */ + syscall(SYS_write, c->fd,hdr,sizeof(ptl_hdr_t)); + LASSERT (niov <= 1); + if (len) syscall(SYS_write, c->fd,iov[0].iov_base,len); +#else + LASSERT (niov <= 1); + + tiov[0].iov_base = hdr; + tiov[0].iov_len = sizeof(ptl_hdr_t); + + if (len) { + tiov[1].iov_base = iov[0].iov_base; + tiov[1].iov_len = len; + count++; + } + + syscall(SYS_writev, c->fd, tiov, count); +#endif + lib_finalize(n, private, cookie); + + return(0); +} + + +/* Function: tcpnal_recv + * Arguments: nal_cb_t *nal: pointer to my nal control block + * void *private: connection pointer passed through + * lib_parse() + * lib_msg_t *cookie: passed back to portals library + * user_ptr data: pointer to the destination buffer + * size_t mlen: length of the body + * size_t rlen: length of data in the network + * Returns: zero on success + * + * blocking read of the requested data. must drain out the + * difference of mainpulated and requested lengths from the network + */ +int tcpnal_recv(nal_cb_t *n, + void *private, + lib_msg_t *cookie, + unsigned int niov, + struct iovec *iov, + ptl_size_t mlen, + ptl_size_t rlen) + +{ + if (mlen) { + LASSERT (niov <= 1); + read_connection(private,iov[0].iov_base,mlen); + lib_finalize(n, private, cookie); + } + + if (mlen!=rlen){ + char *trash=malloc(rlen-mlen); + + /*TODO: check error status*/ + read_connection(private,trash,rlen-mlen); + free(trash); + } + + return(rlen); +} + + +/* Function: from_connection: + * Arguments: c: the connection to read from + * Returns: whether or not to continue reading from this connection, + * expressed as a 1 to continue, and a 0 to not + * + * from_connection() is called from the select loop when i/o is + * available. It attempts to read the portals header and + * pass it to the generic library for processing. 
+ */ +static int from_connection(void *a, void *d) +{ + connection c = d; + bridge b=a; + ptl_hdr_t hdr; + + if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){ + lib_parse(b->nal_cb, &hdr, c); + return(1); + } + return(0); +} + + +static void tcpnal_shutdown(bridge b) +{ + shutdown_connections(b->lower); +} + +/* Function: PTL_IFACE_TCP + * Arguments: pid_request: desired port number to bind to + * desired: passed NAL limits structure + * actual: returned NAL limits structure + * Returns: a nal structure on success, or null on failure + */ +int tcpnal_init(bridge b) +{ + manager m; + + b->nal_cb->cb_send=tcpnal_send; + b->nal_cb->cb_recv=tcpnal_recv; + b->shutdown=tcpnal_shutdown; + + if (!(m=init_connections(PNAL_PORT(b->nal_cb->ni.nid, + b->nal_cb->ni.pid), + from_connection,b))){ + /* TODO: this needs to shut down the + newly created junk */ + return(PTL_NAL_FAILED); + } + /* XXX cfs hack */ + b->nal_cb->ni.pid=0; + b->lower=m; + return(PTL_OK); +} diff --git a/lustre/portals/unals/timer.h b/lustre/portals/unals/timer.h new file mode 100644 index 0000000..aaf39d2 --- /dev/null +++ b/lustre/portals/unals/timer.h @@ -0,0 +1,30 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. 
+ * Copyright (c) 2002 Eric Hoffman + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +/* TODO: make this an explicit type when they become available */ +typedef unsigned long long when; + +typedef struct timer { + void (*function)(void *); + void *arg; + when w; + int interval; + int disable; +} *timer; + +timer register_timer(when, void (*f)(void *), void *a); +void remove_timer(timer t); +void timer_loop(void); +void initialize_timer(void); +void register_thunk(void (*f)(void *),void *a); + + +#define HZ 0x100000000ull + + diff --git a/lustre/portals/unals/utypes.h b/lustre/portals/unals/utypes.h new file mode 100644 index 0000000..7eca959 --- /dev/null +++ b/lustre/portals/unals/utypes.h @@ -0,0 +1,12 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cray Inc. + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + */ + +typedef unsigned short uint16; +typedef unsigned long uint32; +typedef unsigned long long uint64; +typedef unsigned char uint8; diff --git a/lustre/portals/utils/.cvsignore b/lustre/portals/utils/.cvsignore new file mode 100644 index 0000000..148310a --- /dev/null +++ b/lustre/portals/utils/.cvsignore @@ -0,0 +1,8 @@ +Makefile +Makefile.in +acceptor +debugctl +ptlctl +.deps +routerstat +wirecheck \ No newline at end of file diff --git a/lustre/portals/utils/Makefile.am b/lustre/portals/utils/Makefile.am new file mode 100644 index 0000000..05af598 --- /dev/null +++ b/lustre/portals/utils/Makefile.am @@ -0,0 +1,27 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. 
+# See the file COPYING in this distribution + + +COMPILE = gcc -Wall -g -I$(srcdir)/../include +LINK = gcc -o $@ + +sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck +lib_LIBRARIES = libptlctl.a + +acceptor_SOURCES = acceptor.c # -lefence + +wirecheck_SOURCES = wirecheck.c + +libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h + +ptlctl_SOURCES = ptlctl.c +ptlctl_LDADD = -L. -lptlctl -lncurses # -lefence +ptlctl_DEPENDENCIES = libptlctl.a + +debugctl_SOURCES = debugctl.c +debugctl_LDADD = -L. -lptlctl -lncurses # -lefence +debugctl_DEPENDENCIES = libptlctl.a + +routerstat_SOURCES = routerstat.c diff --git a/lustre/portals/utils/acceptor.c b/lustre/portals/utils/acceptor.c new file mode 100644 index 0000000..c6590db --- /dev/null +++ b/lustre/portals/utils/acceptor.c @@ -0,0 +1,466 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#include <stdio.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/tcp.h> +#include <netdb.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <unistd.h> +#include <asm/byteorder.h> +#include <syslog.h> + +#include <errno.h> + +#include <portals/api-support.h> +#include <portals/list.h> +#include <portals/lib-types.h> + +/* should get this from autoconf somehow */ +#ifndef PIDFILE_DIR +#define PIDFILE_DIR "/var/run" +#endif + +#define PROGNAME "acceptor" + +void create_pidfile(char *name, int port) +{ + char pidfile[1024]; + FILE *fp; + + snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", + PIDFILE_DIR, name, port); + + if ((fp = fopen(pidfile, "w"))) { + fprintf(fp, "%d\n", getpid()); + fclose(fp); + } else { + syslog(LOG_ERR, "%s: %s\n", pidfile, + strerror(errno)); + } +} + +int pidfile_exists(char *name, int port) +{ + char pidfile[1024]; + + snprintf(pidfile, sizeof(pidfile), "%s/%s-%d.pid", + PIDFILE_DIR, name, port); + + if (!access(pidfile, F_OK)) { + fprintf(stderr, "%s: 
exists, acceptor already running.\n", + pidfile); + return (1); + } + return (0); +} + +int +parse_size (int *sizep, char *str) +{ + int size; + char mod[32]; + + switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) + { + default: + return (-1); + + case 1: + *sizep = size; + return (0); + + case 2: + switch (*mod) + { + case 'g': + case 'G': + *sizep = size << 30; + return (0); + + case 'm': + case 'M': + *sizep = size << 20; + return (0); + + case 'k': + case 'K': + *sizep = size << 10; + return (0); + + default: + *sizep = size; + return (0); + } + } +} + +void +show_connection (int fd, __u32 net_ip, ptl_nid_t nid) +{ + struct hostent *h = gethostbyaddr ((char *)&net_ip, sizeof net_ip, AF_INET); + __u32 host_ip = ntohl (net_ip); + int rxmem = 0; + int txmem = 0; + int nonagle = 0; + int len; + char host[1024]; + + len = sizeof (txmem); + if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &len) != 0) + perror ("Cannot get write buffer size"); + + len = sizeof (rxmem); + if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &len) != 0) + perror ("Cannot get read buffer size"); + + len = sizeof (nonagle); + if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &len) != 0) + perror ("Cannot get nagle"); + + if (h == NULL) + snprintf (host, sizeof(host), "%d.%d.%d.%d", (host_ip >> 24) & 0xff, + (host_ip >> 16) & 0xff, (host_ip >> 8) & 0xff, host_ip & 0xff); + else + snprintf (host, sizeof(host), "%s", h->h_name); + + syslog (LOG_INFO, "Accepted host: %s NID: "LPX64" snd: %d rcv %d nagle: %s\n", + host, nid, txmem, rxmem, nonagle ? 
"disabled" : "enabled"); +} + +int +sock_write (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = write (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) + { + fprintf (stderr, "Unexpected zero sock_write\n"); + abort(); + } + + nob -= rc; + buffer = (char *)buffer + rc; /* step past only the bytes just written */ + } + + return (0); +} + +int +sock_read (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = read (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) /* EOF */ + { + errno = ECONNABORTED; + return (-1); + } + + nob -= rc; + buffer = (char *)buffer + rc; /* step past only the bytes just read */ + } + + return (0); +} + +int +exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid) +{ + int rc; + ptl_hdr_t hdr; + ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; + + LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); + + memset (&hdr, 0, sizeof (hdr)); + + hmv->magic = __cpu_to_le32 (PORTALS_PROTO_MAGIC); + hmv->version_major = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); + hmv->version_minor = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); + + hdr.src_nid = __cpu_to_le64 (my_nid); + hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); + + /* Assume there's sufficient socket buffering for a portals HELLO header */ + rc = sock_write (cfd, &hdr, sizeof (hdr)); + if (rc != 0) { + perror ("Can't send initial HELLO"); + return (-1); + } + + /* First few bytes down the wire are the portals protocol magic and + * version, no matter what protocol version we're running. 
*/ + + rc = sock_read (cfd, hmv, sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read from peer"); + return (-1); + } + + if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) { + fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", + __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC); + return (-1); + } + + if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR || + __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) { + fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n", + __cpu_to_le16 (hmv->version_major), + __cpu_to_le16 (hmv->version_minor), + PORTALS_PROTO_VERSION_MAJOR, + PORTALS_PROTO_VERSION_MINOR); + } + + /* version 0 sends magic/version as the dest_nid of a 'hello' header, + * so read the rest of it in now... */ + LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0); + rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read rest of HELLO hdr"); + return (-1); + } + + /* ...and check we got what we expected */ + if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO || + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) { + fprintf (stderr, "Expecting a HELLO hdr with 0 payload," + " but got type %d with %d payload\n", + __cpu_to_le32 (hdr.type), + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr))); + return (-1); + } + + *peer_nid = __le64_to_cpu (hdr.src_nid); + return (0); +} + +void +usage (char *myname) +{ + fprintf (stderr, "Usage: %s [-r recv_mem] [-s send_mem] [-n] [-N nal_id] port\n", myname); + exit (1); +} + +int main(int argc, char **argv) +{ + int o, fd, rc, port, pfd; + struct sockaddr_in srvaddr; + int c; + int rxmem = 0; + int txmem = 0; + int noclose = 0; + int nonagle = 1; + int nal = SOCKNAL; + int xchg_nids = 0; + int bind_irq = 0; + + while ((c = getopt (argc, argv, "N:r:s:nlxi")) != -1) + switch (c) + { + case 'r': + if (parse_size (&rxmem, optarg) != 0 || rxmem < 0) + usage (argv[0]); + break; + + case 's': + if (parse_size (&txmem, optarg) != 0 || txmem < 0) + usage 
(argv[0]); + break; + + case 'n': + nonagle = 0; + break; + + case 'l': + noclose = 1; + break; + + case 'x': + xchg_nids = 1; + break; + + case 'i': + bind_irq = 1; + break; + + case 'N': + if (parse_size(&nal, optarg) != 0 || + nal < 0 || nal > NAL_MAX_NR) + usage(argv[0]); + break; + + default: + usage (argv[0]); + break; + } + + if (optind >= argc) + usage (argv[0]); + + port = atol(argv[optind++]); + + if (pidfile_exists(PROGNAME, port)) + exit(1); + + memset(&srvaddr, 0, sizeof(srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(port); + srvaddr.sin_addr.s_addr = INADDR_ANY; + + fd = socket(PF_INET, SOCK_STREAM, 0); + if (fd < 0) { + perror("opening socket"); + exit(1); + } + + o = 1; + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &o, sizeof(o))) { + perror("Cannot set REUSEADDR socket opt"); + exit(1); + } + + if (nonagle) + { + o = 1; + rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &o, sizeof (o)); + if (rc != 0) + { + perror ("Cannot disable nagle"); + exit (1); + } + } + + if (txmem != 0) + { + rc = setsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, sizeof (txmem)); + if (rc != 0) + { + perror ("Cannot set write buffer size"); + exit (1); + } + } + + if (rxmem != 0) + { + rc = setsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, sizeof (rxmem)); + if (rc != 0) + { + perror ("Cannot set read buffer size"); + exit (1); + } + } + + rc = bind(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); + if ( rc == -1 ) { + perror("bind: "); + exit(1); + } + + if (listen(fd, 127)) { + perror("listen: "); + exit(1); + } + fprintf(stderr, "listening on port %d\n", port); + + pfd = open("/dev/portals", O_RDWR); + if ( pfd < 0 ) { + perror("opening portals device"); + exit(1); + } + + rc = daemon(1, noclose); + if (rc < 0) { + perror("daemon(): "); + exit(1); + } + + openlog(PROGNAME, LOG_PID, LOG_DAEMON); + syslog(LOG_INFO, "started, listening on port %d\n", port); + create_pidfile(PROGNAME, port); + + while (1) { + struct sockaddr_in clntaddr; + int len = 
sizeof(clntaddr); + int cfd; + struct portal_ioctl_data data; + ptl_nid_t peer_nid; + + cfd = accept(fd, (struct sockaddr *)&clntaddr, &len); + if ( cfd < 0 ) { + perror("accept"); + exit(0); + continue; + } + + if (!xchg_nids) + peer_nid = ntohl (clntaddr.sin_addr.s_addr); /* HOST byte order */ + else + { + PORTAL_IOC_INIT (data); + data.ioc_nal = nal; + rc = ioctl (pfd, IOC_PORTAL_GET_NID, &data); + if (rc < 0) + { + perror ("Can't get my NID"); + close (cfd); + continue; + } + + rc = exchange_nids (cfd, data.ioc_nid, &peer_nid); + if (rc != 0) + { + close (cfd); + continue; + } + } + + show_connection (cfd, clntaddr.sin_addr.s_addr, peer_nid); + + PORTAL_IOC_INIT(data); + data.ioc_fd = cfd; + data.ioc_nal = nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD; + data.ioc_nid = peer_nid; + data.ioc_flags = bind_irq; + + if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) { + perror("ioctl failed"); + + } else { + printf("client registered\n"); + } + rc = close(cfd); + if (rc) + perror ("close failed"); + } + + closelog(); + exit(0); + +} diff --git a/lustre/portals/utils/debug.c b/lustre/portals/utils/debug.c new file mode 100644 index 0000000..9ab1c73d --- /dev/null +++ b/lustre/portals/utils/debug.c @@ -0,0 +1,618 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Some day I'll split all of this functionality into a cfs_debug module + * of its own. That day is not today. + * + */ + +#include <stdio.h> +#include <netdb.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <errno.h> +#include <unistd.h> +#include <time.h> +#include <syscall.h> + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <sys/stat.h> +#include <sys/mman.h> +#define BUG() /* workaround for module.h includes */ +#include <linux/version.h> + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#include <linux/module.h> +#endif + +#include <portals/api-support.h> +#include <portals/ptlctl.h> +#include "parser.h" + +static char rawbuf[8192]; +static char *buf = rawbuf; +static int max = 8192; +//static int g_pfd = -1; +static int subsystem_array[1 << 8]; +static int debug_mask = ~0; + +static const char *portal_debug_subsystems[] = + {"undefined", "mdc", "mds", "osc", "ost", "class", "obdfs", "llite", + "rpc", "ext2obd", "portals", "socknal", "qswnal", "pinger", "filter", + "obdtrace", "echo", "ldlm", "lov", "gmnal", "router", "ptldb", NULL}; +static const char *portal_debug_masks[] = + {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl", + "blocks", "net", "warning", "buffs", "other", "dentry", "portals", + "page", "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", NULL}; + +struct debug_daemon_cmd { + char *cmd; + unsigned int cmdv; +}; + +static const struct debug_daemon_cmd portal_debug_daemon_cmd[] = { + {"start", DEBUG_DAEMON_START}, + {"stop", DEBUG_DAEMON_STOP}, + {"pause", DEBUG_DAEMON_PAUSE}, + {"continue", DEBUG_DAEMON_CONTINUE}, + {0, 0} +}; + +static int do_debug_mask(char *name, int enable) +{ + int found = 0, i; + + for (i = 0; portal_debug_subsystems[i] != NULL; 
i++) { + if (strcasecmp(name, portal_debug_subsystems[i]) == 0 || + strcasecmp(name, "all_subs") == 0) { + printf("%s output from subsystem \"%s\"\n", + enable ? "Enabling" : "Disabling", + portal_debug_subsystems[i]); + subsystem_array[i] = enable; + found = 1; + } + } + for (i = 0; portal_debug_masks[i] != NULL; i++) { + if (strcasecmp(name, portal_debug_masks[i]) == 0 || + strcasecmp(name, "all_types") == 0) { + printf("%s output of type \"%s\"\n", + enable ? "Enabling" : "Disabling", + portal_debug_masks[i]); + if (enable) + debug_mask |= (1 << i); + else + debug_mask &= ~(1 << i); + found = 1; + } + } + + return found; +} + +int dbg_initialize(int argc, char **argv) +{ + memset(subsystem_array, 1, sizeof(subsystem_array)); + return 0; +} + +int jt_dbg_filter(int argc, char **argv) +{ + int i; + + if (argc < 2) { + fprintf(stderr, "usage: %s <subsystem ID or debug mask>\n", + argv[0]); + return 0; + } + + for (i = 1; i < argc; i++) + if (!do_debug_mask(argv[i], 0)) + fprintf(stderr, "Unknown subsystem or debug type: %s\n", + argv[i]); + return 0; +} + +int jt_dbg_show(int argc, char **argv) +{ + int i; + + if (argc < 2) { + fprintf(stderr, "usage: %s <subsystem ID or debug mask>\n", + argv[0]); + return 0; + } + + for (i = 1; i < argc; i++) + if (!do_debug_mask(argv[i], 1)) + fprintf(stderr, "Unknown subsystem or debug type: %s\n", + argv[i]); + + return 0; +} + +static int applymask(char* procpath, int value) +{ + int rc; + char buf[64]; + int len = snprintf(buf, 64, "%d", value); + + int fd = open(procpath, O_WRONLY); + if (fd == -1) { + fprintf(stderr, "Unable to open %s: %s\n", + procpath, strerror(errno)); + return fd; + } + rc = write(fd, buf, len+1); + if (rc<0) { + fprintf(stderr, "Write to %s failed: %s\n", + procpath, strerror(errno)); + /* fall through: the descriptor must be closed on failure too */ + } + close(fd); + return rc < 0 ? rc : 0; +} + +extern char *dump_filename; +extern int dump(int dev_id, int opc, void *buf); + +static void applymask_all(unsigned int subs_mask, unsigned int debug_mask) +{ + if 
(!dump_filename) { + applymask("/proc/sys/portals/subsystem_debug", subs_mask); + applymask("/proc/sys/portals/debug", debug_mask); + } else { + struct portals_debug_ioctl_data data; + + data.hdr.ioc_len = sizeof(data); + data.hdr.ioc_version = 0; + data.subs = subs_mask; + data.debug = debug_mask; + + dump(OBD_DEV_ID, PTL_IOC_DEBUG_MASK, &data); + } + printf("Applied subsystem_debug=%d, debug=%d to /proc/sys/portals\n", + subs_mask, debug_mask); +} + +int jt_dbg_list(int argc, char **argv) +{ + int i; + + if (argc != 2) { + fprintf(stderr, "usage: %s <subs || types>\n", argv[0]); + return 0; + } + + if (strcasecmp(argv[1], "subs") == 0) { + printf("Subsystems: all_subs"); + for (i = 0; portal_debug_subsystems[i] != NULL; i++) + printf(", %s", portal_debug_subsystems[i]); + printf("\n"); + } else if (strcasecmp(argv[1], "types") == 0) { + printf("Types: all_types"); + for (i = 0; portal_debug_masks[i] != NULL; i++) + printf(", %s", portal_debug_masks[i]); + printf("\n"); + } + else if (strcasecmp(argv[1], "applymasks") == 0) { + unsigned int subsystem_mask = 0; + for (i = 0; portal_debug_subsystems[i] != NULL; i++) { + if (subsystem_array[i]) subsystem_mask |= (1 << i); + } + applymask_all(subsystem_mask, debug_mask); + } + return 0; +} + +/* if 'raw' is true, don't strip the debug information from the front of the + * lines */ +static void dump_buffer(FILE *fd, char *buf, int size, int raw) +{ + char *p, *z; + unsigned long subsystem, debug, dropped = 0, kept = 0; + int max_sub, max_type; + + for (max_sub = 0; portal_debug_subsystems[max_sub] != NULL; max_sub++) + ; + for (max_type = 0; portal_debug_masks[max_type] != NULL; max_type++) + ; + + while (size) { + p = memchr(buf, '\n', size); + if (!p) + break; + subsystem = strtoul(buf, &z, 16); + debug = strtoul(z + 1, &z, 16); + + z++; + /* for some reason %*s isn't working. 
*/ + *p = '\0'; + if (subsystem < max_sub && + subsystem_array[subsystem] && + (!debug || (debug_mask & debug))) { + if (raw) + fprintf(fd, "%s\n", buf); + else + fprintf(fd, "%s\n", z); + //printf("%s\n", buf); + kept++; + } else { + //fprintf(stderr, "dropping line (%lx:%lx): %s\n", subsystem, debug, buf); + dropped++; + } + *p = '\n'; + p++; + size -= (p - buf); + buf = p; + } + + printf("Debug log: %lu lines, %lu kept, %lu dropped.\n", + dropped + kept, kept, dropped); +} + +int jt_dbg_debug_kernel(int argc, char **argv) +{ + int rc, raw = 1; + FILE *fd = stdout; + const int databuf_size = (6 << 20); + struct portal_ioctl_data data, *newdata; + char *databuf = NULL; + + if (argc > 3) { + fprintf(stderr, "usage: %s [file] [raw]\n", argv[0]); + return 0; + } + + if (argc > 1) { + fd = fopen(argv[1], "w"); + if (fd == NULL) { + fprintf(stderr, "fopen(%s) failed: %s\n", argv[1], + strerror(errno)); + return -1; + } + } + if (argc > 2) + raw = atoi(argv[2]); + + databuf = malloc(databuf_size); + if (!databuf) { + fprintf(stderr, "No memory for buffer.\n"); + goto out; + } + + memset(&data, 0, sizeof(data)); + data.ioc_plen1 = databuf_size; + data.ioc_pbuf1 = databuf; + + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + goto out; + } + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_DEBUG, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_GET_DEBUG failed: %s\n", + strerror(errno)); + goto out; + } + + newdata = (struct portal_ioctl_data *)buf; + if (newdata->ioc_size > 0) + dump_buffer(fd, databuf, newdata->ioc_size, raw); + + out: + if (databuf) + free(databuf); + if (fd != stdout) + fclose(fd); + return 0; +} + +int jt_dbg_debug_daemon(int argc, char **argv) +{ + int i, rc; + unsigned int cmd = 0; + FILE *fd = stdout; + struct portal_ioctl_data data; + + if (argc <= 1) { + fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|" + "continue]\n", argv[0]); + return 0; + } + for (i = 0; portal_debug_daemon_cmd[i].cmd 
!= NULL; i++) { + if (strcasecmp(argv[1], portal_debug_daemon_cmd[i].cmd) == 0) { + cmd = portal_debug_daemon_cmd[i].cmdv; + break; + } + } + if (portal_debug_daemon_cmd[i].cmd == NULL) { + fprintf(stderr, "usage: %s [start file <#MB>|stop|pause|" + "continue]\n", argv[0]); + return 0; + } + memset(&data, 0, sizeof(data)); + if (cmd == DEBUG_DAEMON_START) { + if (argc < 3) { + fprintf(stderr, "usage: %s [start file <#MB>|stop|" + "pause|continue]\n", argv[0]); + return 0; + } + if (access(argv[2], F_OK) != 0) { + fd = fopen(argv[2], "w"); + if (fd != NULL) { + fclose(fd); + remove(argv[2]); + goto ok; + } + } + if (access(argv[2], W_OK) == 0) + goto ok; + fprintf(stderr, "fopen(%s) failed: %s\n", argv[2], + strerror(errno)); + return -1; +ok: + data.ioc_inllen1 = strlen(argv[2]) + 1; + data.ioc_inlbuf1 = argv[2]; + data.ioc_misc = 0; + if (argc == 4) { + unsigned long size; + errno = 0; + size = strtoul(argv[3], NULL, 0); + if (errno) { + fprintf(stderr, "file size(%s): error %s\n", + argv[3], strerror(errno)); + return -1; + } + data.ioc_misc = size; + } + } + data.ioc_count = cmd; + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + return -1; + } + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_SET_DAEMON, buf); + if (rc < 0) { + fprintf(stderr, "IOC_PORTAL_SET_DEMON failed: %s\n", + strerror(errno)); + return rc; + } + return 0; +} + +int jt_dbg_debug_file(int argc, char **argv) +{ + int rc, fd = -1, raw = 1; + FILE *output = stdout; + char *databuf = NULL; + struct stat statbuf; + + if (argc > 4 || argc < 2) { + fprintf(stderr, "usage: %s <input> [output] [raw]\n", argv[0]); + return 0; + } + + fd = open(argv[1], O_RDONLY); + if (fd < 0) { + fprintf(stderr, "fopen(%s) failed: %s\n", argv[1], + strerror(errno)); + return -1; + } +#warning FIXME: cleanup fstat issue here +#ifndef SYS_fstat64 +#define __SYS_fstat__ SYS_fstat +#else +#define __SYS_fstat__ SYS_fstat64 +#endif + rc = syscall(__SYS_fstat__, fd, &statbuf); + 
if (rc < 0) { + fprintf(stderr, "fstat failed: %s\n", strerror(errno)); + goto out; + } + + if (argc >= 3) { + output = fopen(argv[2], "w"); + if (output == NULL) { + fprintf(stderr, "fopen(%s) failed: %s\n", argv[2], + strerror(errno)); + goto out; + } + } + + if (argc == 4) + raw = atoi(argv[3]); + + databuf = mmap(NULL, statbuf.st_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (databuf == MAP_FAILED) { /* mmap signals failure with MAP_FAILED, not NULL */ + fprintf(stderr, "mmap failed: %s\n", strerror(errno)); + goto out; + } + + dump_buffer(output, databuf, statbuf.st_size, raw); + + out: + if (databuf != NULL && databuf != MAP_FAILED) + munmap(databuf, statbuf.st_size); + if (output != NULL && output != stdout) /* output is NULL when its fopen failed */ + fclose(output); + if (fd >= 0) + close(fd); + return 0; +} + +int jt_dbg_clear_debug_buf(int argc, char **argv) +{ + int rc; + struct portal_ioctl_data data; + + if (argc != 1) { + fprintf(stderr, "usage: %s\n", argv[0]); + return 0; + } + + memset(&data, 0, sizeof(data)); + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + return -1; + } + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_CLEAR_DEBUG, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_CLEAR_DEBUG failed: %s\n", + strerror(errno)); + return -1; + } + return 0; +} + +int jt_dbg_mark_debug_buf(int argc, char **argv) +{ + int rc; + struct portal_ioctl_data data; + char *text; + time_t now = time(NULL); + + if (argc > 2) { + fprintf(stderr, "usage: %s [marker text]\n", argv[0]); + return 0; + } + + if (argc == 2) { + text = argv[1]; + } else { + text = ctime(&now); + text[strlen(text) - 1] = '\0'; /* stupid \n */ + } + + memset(&data, 0, sizeof(data)); + data.ioc_inllen1 = strlen(text) + 1; + data.ioc_inlbuf1 = text; + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + return -1; + } + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_MARK_DEBUG, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_MARK_DEBUG failed: %s\n", + strerror(errno)); + return -1; + } + return 0; +} + + +int jt_dbg_modules(int argc, char 
**argv) +{ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + struct mod_paths { + char *name, *path; + } *mp, mod_paths[] = { + {"portals", "lustre/portals/libcfs"}, + {"ksocknal", "lustre/portals/knals/socknal"}, + {"obdclass", "lustre/obdclass"}, + {"ptlrpc", "lustre/ptlrpc"}, + {"obdext2", "lustre/obdext2"}, + {"ost", "lustre/ost"}, + {"osc", "lustre/osc"}, + {"mds", "lustre/mds"}, + {"mdc", "lustre/mdc"}, + {"llite", "lustre/llite"}, + {"obdecho", "lustre/obdecho"}, + {"ldlm", "lustre/ldlm"}, + {"obdfilter", "lustre/obdfilter"}, + {"extN", "lustre/extN"}, + {"lov", "lustre/lov"}, + {"fsfilt_ext3", "lustre/obdclass"}, + {"fsfilt_extN", "lustre/obdclass"}, + {"mds_ext2", "lustre/mds"}, + {"mds_ext3", "lustre/mds"}, + {"mds_extN", "lustre/mds"}, + {"ptlbd", "lustre/ptlbd"}, + {NULL, NULL} + }; + char *path = ".."; + char *kernel = "linux"; + + if (argc >= 2) + path = argv[1]; + if (argc == 3) + kernel = argv[2]; + if (argc > 3) { + printf("%s [path] [kernel]\n", argv[0]); + return 0; + } + + for (mp = mod_paths; mp->name != NULL; mp++) { + struct module_info info; + int rc; + size_t crap; + int query_module(const char *name, int which, void *buf, + size_t bufsize, size_t *ret); + + rc = query_module(mp->name, QM_INFO, &info, sizeof(info), + &crap); + if (rc < 0) { + if (errno != ENOENT) + printf("query_module(%s) failed: %s\n", + mp->name, strerror(errno)); + } else { + printf("add-symbol-file %s/%s/%s.o 0x%0lx\n", path, + mp->path, mp->name, + info.addr + sizeof(struct module)); + } + } + + return 0; +#else + printf("jt_dbg_module is not yet implemented for Linux 2.5\n"); + return 0; +#endif /* linux 2.5 */ +} + +int jt_dbg_panic(int argc, char **argv) +{ + int rc; + struct portal_ioctl_data data; + + if (argc != 1) { + fprintf(stderr, "usage: %s\n", argv[0]); + return 0; + } + + memset(&data, 0, sizeof(data)); + if (portal_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "portal_ioctl_pack failed.\n"); + return -1; + } + + rc = l_ioctl(PORTALS_DEV_ID, 
IOC_PORTAL_PANIC, buf); + if (rc) { + fprintf(stderr, "IOC_PORTAL_PANIC failed: %s\n", + strerror(errno)); + return -1; + } + return 0; +} diff --git a/lustre/portals/utils/debugctl.c b/lustre/portals/utils/debugctl.c new file mode 100644 index 0000000..02cb9b4 --- /dev/null +++ b/lustre/portals/utils/debugctl.c @@ -0,0 +1,66 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Some day I'll split all of this functionality into a cfs_debug module + * of its own. That day is not today. 
+ * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <portals/api-support.h> +#include <portals/ptlctl.h> +#include "parser.h" + + +command_t list[] = { + {"debug_kernel", jt_dbg_debug_kernel, 0, "usage: debug_kernel [file] [raw], get debug buffer and print it [to a file]"}, + {"debug_daemon", jt_dbg_debug_daemon, 0, "usage: debug_daemon [start file [#MB]|stop|pause|continue], control debug daemon to dump debug buffer to a file"}, + {"debug_file", jt_dbg_debug_file, 0, "usage: debug_file <input> [output] [raw], read debug buffer from input and print it [to output]"}, + {"clear", jt_dbg_clear_debug_buf, 0, "clear kernel debug buffer"}, + {"mark", jt_dbg_mark_debug_buf, 0, "insert a marker into the kernel debug buffer (args: [marker text])"}, + {"filter", jt_dbg_filter, 0, "filter certain messages (args: subsystem/debug ID)\n"}, + {"show", jt_dbg_show, 0, "enable certain messages (args: subsystem/debug ID)\n"}, + {"list", jt_dbg_list, 0, "list subsystem and debug types (args: subs or types)\n"}, + {"modules", jt_dbg_modules, 0, "provide gdb-friendly module info (arg: <path>)"}, + {"panic", jt_dbg_panic, 0, "cause the kernel to panic"}, + {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"}, + {"help", Parser_help, 0, "help"}, + {"exit", Parser_quit, 0, "quit"}, + {"quit", Parser_quit, 0, "quit"}, + { 0, 0, 0, NULL } +}; + +int main(int argc, char **argv) +{ + if (dbg_initialize(argc, argv) < 0) + exit(2); + + register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); + + Parser_init("debugctl > ", list); + if (argc > 1) + return Parser_execarg(argc - 1, &argv[1], list); + + Parser_commands(); + + unregister_ioc_dev(PORTALS_DEV_ID); + return 0; +} diff --git a/lustre/portals/utils/l_ioctl.c b/lustre/portals/utils/l_ioctl.c new file mode 100644 index 0000000..722bb57 --- /dev/null +++ b/lustre/portals/utils/l_ioctl.c @@ -0,0 +1,281 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * 
Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syscall.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <errno.h> +#include <unistd.h> + +#include <portals/api-support.h> +#include <portals/ptlctl.h> + +struct ioc_dev { + const char * dev_name; + int dev_fd; +}; + +static struct ioc_dev ioc_dev_list[10]; + +struct dump_hdr { + int magic; + int dev_id; + int opc; +}; + +char * dump_filename; + +/* Return the open fd for a registered device id, opening the device on first + * use. Returns -EINVAL for an out-of-range or unregistered id, or the negative + * result of open() on failure. */ +static int +open_ioc_dev(int dev_id) +{ + const char * dev_name; + + /* Bound dev_id by the number of table slots, not the table's size in bytes. */ + if (dev_id < 0 || + dev_id >= (int)(sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]))) + return -EINVAL; + + dev_name = ioc_dev_list[dev_id].dev_name; + if (dev_name == NULL) { + fprintf(stderr, "unknown device id: %d\n", dev_id); + return -EINVAL; + } + + if (ioc_dev_list[dev_id].dev_fd < 0) { + int fd = open(dev_name, O_RDWR); + + if (fd < 0) { + fprintf(stderr, "opening %s failed: %s\n" + "hint: the kernel modules may not be loaded\n", + dev_name, strerror(errno)); + return fd; + } + ioc_dev_list[dev_id].dev_fd = fd; + } + + return ioc_dev_list[dev_id].dev_fd; +} + + +static int +do_ioctl(int dev_id, int opc, void *buf) +{ + int fd, rc; + + fd = open_ioc_dev(dev_id); + if (fd < 0) +
return fd; + + rc = ioctl(fd, opc, buf); + return rc; + +} + +static FILE * +get_dump_file() +{ + FILE *fp = NULL; + + if (!dump_filename) { + fprintf(stderr, "no dump filename\n"); + } else + fp = fopen(dump_filename, "a"); + return fp; +} + +/* + * The dump file should start with a description of which devices are + * used, but for now it is assumed that whatever app reads the file will + * know what to do. */ +int +dump(int dev_id, int opc, void *buf) +{ + FILE *fp; + struct dump_hdr dump_hdr; + struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf; + int rc; + + printf("dumping opc %x to %s\n", opc, dump_filename); + + + dump_hdr.magic = 0xdeadbeef; + dump_hdr.dev_id = dev_id; + dump_hdr.opc = opc; + + fp = get_dump_file(); + if (fp == NULL) { + fprintf(stderr, "%s: %s\n", dump_filename, + strerror(errno)); + return -EINVAL; + } + + rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp); + if (rc == 1) + rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp); + fclose(fp); + if (rc != 1) { + fprintf(stderr, "%s: %s\n", dump_filename, + strerror(errno)); + return -EINVAL; + } + + return 0; +} + +/* register a device to send ioctls to; returns dev_id, or -EINVAL when + * dev_id does not name a slot in ioc_dev_list. */ +int +register_ioc_dev(int dev_id, const char * dev_name) +{ + + /* Bound dev_id by the number of table slots, not the table's size in bytes. */ + if (dev_id < 0 || + dev_id >= (int)(sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]))) + return -EINVAL; + + unregister_ioc_dev(dev_id); + + ioc_dev_list[dev_id].dev_name = dev_name; + ioc_dev_list[dev_id].dev_fd = -1; + + return dev_id; +} + +/* forget a registered device, closing its fd if one was opened; out-of-range + * ids are ignored. */ +void +unregister_ioc_dev(int dev_id) +{ + + /* Same slot-count bound as register_ioc_dev(). */ + if (dev_id < 0 || + dev_id >= (int)(sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]))) + return; + if (ioc_dev_list[dev_id].dev_name != NULL && + ioc_dev_list[dev_id].dev_fd >= 0) + close(ioc_dev_list[dev_id].dev_fd); + + ioc_dev_list[dev_id].dev_name = NULL; + ioc_dev_list[dev_id].dev_fd = -1; +} + +/* If this file is set, then all ioctl buffers will be + appended to the file.
*/ +int +set_ioctl_dump(char * file) +{ + if (dump_filename) + free(dump_filename); + + dump_filename = strdup(file); + return 0; +} + +int +l_ioctl(int dev_id, int opc, void *buf) +{ + if (dump_filename) + return dump(dev_id, opc, buf); + else + return do_ioctl(dev_id, opc, buf); +} + +/* Read an ioctl dump file, and call the ioc_func for each ioctl buffer + * in the file. For example: + * + * parse_dump("lctl.dump", l_ioctl); + * + * Note: if using l_ioctl, then you also need to register_ioc_dev() for + * each device used in the dump. + * + * Each record is a struct dump_hdr followed by ioc_len bytes of ioctl data. + * Returns 0 on success and -1 on a truncated or oversized record; exits if + * the file cannot be stat'ed or mapped, is empty, or if ioc_func fails. + */ +int +parse_dump(char * dump_file, int (*ioc_func)(int dev_id, int opc, void *)) +{ + int fd, line =0; + struct stat st; + char *buf, *end; + + fd = syscall(SYS_open, dump_file, O_RDONLY); + +#warning FIXME: cleanup fstat issue here +#ifndef SYS_fstat64 +#define __SYS_fstat__ SYS_fstat +#else +#define __SYS_fstat__ SYS_fstat64 +#endif + if (syscall(__SYS_fstat__, fd, &st)) { + perror("stat fails"); + exit(1); + } + + if (st.st_size < 1) { + fprintf(stderr, "KML is empty\n"); + exit(1); + } + + buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0); + /* mmap() reports failure with MAP_FAILED, which must not be walked. */ + if (buf == (char *)MAP_FAILED) { + perror("mmap fails"); + exit(1); + } + end = buf + st.st_size; + close(fd); + while (buf < end) { + struct dump_hdr *dump_hdr = (struct dump_hdr *) buf; + struct portal_ioctl_hdr * data; + size_t left = end - buf; /* bytes remaining in the mapping */ + char tmp[8096]; + int rc; + + line++; + + /* Both headers must be present before ioc_len can be trusted. */ + if (left < sizeof(*dump_hdr) + sizeof(*data)) { + fprintf(stderr, "dump file truncated at record %d\n", line); + return -1; + } + + data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr)); + /* The ioctl data starts after dump_hdr, so bound it by what is left + * past that header. */ + if (data->ioc_len > left - sizeof(*dump_hdr)) { + fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf, + data->ioc_len, end); + return -1; + } + /* tmp is a fixed-size bounce buffer; never copy more than it holds. */ + if (data->ioc_len > sizeof(tmp)) { + fprintf(stderr, "dump record %d too large for ioctl buffer\n", line); + return -1; + } +#if 0 + printf ("dump_hdr: %lx data: %lx\n", + (unsigned long)dump_hdr - (unsigned long)buf, (unsigned long)data - (unsigned long)buf); + + printf("%d: opcode %x len: %d ver: %x ", line, dump_hdr->opc, + data->ioc_len, data->ioc_version); +#endif + + memcpy(tmp, data, data->ioc_len); + + rc = ioc_func(dump_hdr->dev_id, dump_hdr->opc, tmp); + if (rc) { + printf("failed: %d\n", rc); + exit(1); + } + + buf += data->ioc_len + sizeof(*dump_hdr); + } + return 0; +} + +int +jt_ioc_dump(int argc,
char **argv) +{ + /* Exactly one argument (the dump file) is required: with none, argv[1] + * is NULL and would be handed to strdup() by set_ioctl_dump(). */ + if (argc != 2) { + fprintf(stderr, "usage: %s <file>\n", argv[0]); + return 0; + } + printf("setting dumpfile to: %s\n", argv[1]); + + set_ioctl_dump(argv[1]); + return 0; +} diff --git a/lustre/portals/utils/parser.c b/lustre/portals/utils/parser.c new file mode 100644 index 0000000..4d93645 --- /dev/null +++ b/lustre/portals/utils/parser.c @@ -0,0 +1,703 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * + */ +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <string.h> +#include <stddef.h> +#include <unistd.h> +#include <sys/param.h> +#include <assert.h> + +#include <config.h> +#ifdef HAVE_LIBREADLINE +#define READLINE_LIBRARY +#include <readline/readline.h> +#endif +//extern char **completion_matches __P((char *, rl_compentry_func_t *)); +extern void using_history(void); +extern void stifle_history(int); +extern void add_history(char *); + +#include "parser.h" + +static command_t * top_level; /* Top level of commands, initialized by + * Parser_init */ +static char * parser_prompt = NULL;/* Parser prompt, set by Parser_init */ +static int done; /* Set to 1 if user types exit or quit */ + + +/* static functions */ +static char *skipwhitespace(char *s); +static char *skiptowhitespace(char *s); +static command_t *find_cmd(char *name, command_t cmds[], char **next); +static int process(char *s, char **next, command_t *lookup, command_t **result, + char **prev); +static void print_commands(char *str, command_t *table); + +/* return a pointer to the first non-whitespace character of s (possibly its + terminating NUL) */ +static char * skipwhitespace(char * s) +{ + char * t; + int len; + + len = (int)strlen(s); + /* isspace() is only defined for unsigned char values and EOF */ + for (t = s; t <= s + len && isspace((unsigned char)*t); t++); + return(t); +} + + +/* return a pointer to the first whitespace character of s, or to its + terminating NUL if there is none */ +static char * skiptowhitespace(char * s) +{ + char * t; + + for (t = s; *t && !isspace((unsigned char)*t); t++); + return(t); +} + +/* split line in place on spaces and tabs, storing at most maxargs token + pointers in argv; returns the number of tokens stored */ +static int line2args(char *line, char **argv, int maxargs) +{ + char *arg; + int i = 0; + + arg = strtok(line, " \t"); + if ( arg ) { + argv[i] = arg; + i++; + } else + return 0; + + /* Test the bound before calling strtok() so argv[maxargs] is never + written and no token is consumed only to be dropped. */ + while( (i < maxargs) && (arg = strtok(NULL, " \t")) ) { + argv[i] = arg; + i++; + } + return i; +} + +/* find a command by exact name -- returns NULL if there is no such command */ +static command_t *Parser_findargcmd(char *name, command_t cmds[]) +{ + command_t *cmd; + + for (cmd = cmds; cmd->pc_name; cmd++) { + if (strcmp(name, cmd->pc_name) == 0) + return cmd; + } + return NULL; +} + +int Parser_execarg(int argc, char **argv, command_t cmds[]) +{ + command_t *cmd; + + cmd =
Parser_findargcmd(argv[0], cmds); + if ( cmd ) { + return (cmd->pc_func)(argc, argv); + } else { + printf("Try interactive use without arguments or use one of:\n"); + for (cmd = cmds; cmd->pc_name; cmd++) + printf("\"%s\" ", cmd->pc_name); + printf("\nas argument.\n"); + } + return -1; +} + +/* returns the command_t * (NULL if not found) corresponding to a + _partial_ match with the first token in name. It sets *next to + point to the following token. Does not modify *name. */ +static command_t * find_cmd(char * name, command_t cmds[], char ** next) +{ + int i, len; + + if (!cmds || !name ) + return NULL; + + /* This sets name to point to the first non-white space character, + and next to the first whitespace after name, len to the length: do + this with strtok*/ + name = skipwhitespace(name); + *next = skiptowhitespace(name); + len = *next - name; + if (len == 0) + return NULL; + + for (i = 0; cmds[i].pc_name; i++) { + if (strncasecmp(name, cmds[i].pc_name, len) == 0) { + *next = skipwhitespace(*next); + return(&cmds[i]); + } + } + return NULL; +} + +/* Recursively process a command line string s and find the command + corresponding to it. This can be ambiguous, full, incomplete, + non-existent. */ +static int process(char *s, char ** next, command_t *lookup, + command_t **result, char **prev) +{ + *result = find_cmd(s, lookup, next); + *prev = s; + + /* non existent */ + if ( ! *result ) + return CMD_NONE; + + /* found entry: is it ambigous, i.e. not exact command name and + more than one command in the list matches. Note that find_cmd + points to the first ambiguous entry */ + if ( strncasecmp(s, (*result)->pc_name, strlen((*result)->pc_name)) && + find_cmd(s, (*result) + 1, next)) + return CMD_AMBIG; + + /* found a unique command: component or full? 
*/ + if ( (*result)->pc_func ) { + return CMD_COMPLETE; + } else { + /* next is a char **: test the character it refers to, not the + pointer, to detect that nothing follows the matched component */ + if ( **next == '\0' ) { + return CMD_INCOMPLETE; + } else { + return process(*next, next, (*result)->pc_sub_cmd, result, prev); + } + } +} + +#ifdef HAVE_LIBREADLINE +static command_t * match_tbl; /* Command completion against this table */ +static char * command_generator(const char * text, int state) +{ + static int index, + len; + char *name; + + /* Do we have a match table? */ + if (!match_tbl) + return NULL; + + /* If this is the first time called on this word, state is 0 */ + if (!state) { + index = 0; + len = (int)strlen(text); + } + + /* Return next name in the command list that partially matches text */ + while ( (name = (match_tbl + index)->pc_name) ) { + index++; + + if (strncasecmp(name, text, len) == 0) { + return(strdup(name)); + } + } + + /* No more matches */ + return NULL; +} + +/* probably called by readline */ +static char **command_completion(char * text, int start, int end) +{ + command_t * table; + char * pos; + + match_tbl = top_level; + for (table = find_cmd(rl_line_buffer, match_tbl, &pos); + table; + table = find_cmd(pos, match_tbl, &pos)) { + + if (*(pos - 1) == ' ') match_tbl = table->pc_sub_cmd; + } + + return(completion_matches(text, command_generator)); +} +#endif + +/* take a string and execute the function or print help */ +int execute_line(char * line) +{ + command_t *cmd, *ambig; + char *prev; + char *next, *tmp; + char *argv[MAXARGS]; + int i; + int rc = 0; + + switch( process(line, &next, top_level, &cmd, &prev) ) { + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while( (ambig = find_cmd(prev, cmd, &tmp)) ) { + fprintf(stderr, "%s ", ambig->pc_name); + cmd = ambig + 1; + } + fprintf(stderr, "\n"); + break; + case CMD_NONE: + fprintf(stderr, "No such command, type help\n"); + break; + case CMD_INCOMPLETE: + fprintf(stderr, + "'%s' incomplete command.
Use '%s x' where x is one of:\n", + line, line); + fprintf(stderr, "\t"); + for (i = 0; cmd->pc_sub_cmd[i].pc_name; i++) { + fprintf(stderr, "%s ", cmd->pc_sub_cmd[i].pc_name); + } + fprintf(stderr, "\n"); + break; + case CMD_COMPLETE: + i = line2args(line, argv, MAXARGS); + rc = (cmd->pc_func)(i, argv); + + if (rc == CMD_HELP) + fprintf(stderr, "%s\n", cmd->pc_help); + + break; + } + + return rc; +} + +int +noop_fn () +{ + return (0); +} + +/* just in case you're ever in an airplane and discover you + forgot to install readline-dev. :) */ +int init_input() +{ + int interactive = isatty (fileno (stdin)); + +#ifdef HAVE_LIBREADLINE + using_history(); + stifle_history(HISTORY); + + if (!interactive) + { + rl_prep_term_function = (rl_vintfunc_t *)noop_fn; + rl_deprep_term_function = (rl_voidfunc_t *)noop_fn; + } + + rl_attempted_completion_function = (CPPFunction *)command_completion; + rl_completion_entry_function = (void *)command_generator; +#endif + return interactive; +} + +#ifndef HAVE_LIBREADLINE +#define add_history(s) +char * readline(char * prompt) +{ + char line[2048]; + int n = 0; + if (prompt) + printf ("%s", prompt); + if (fgets(line, sizeof(line), stdin) == NULL) + return (NULL); + n = strlen(line); + if (n && line[n-1] == '\n') + line[n-1] = '\0'; + return strdup(line); +} +#endif + +/* this is the command execution machine */ +int Parser_commands(void) +{ + char *line, *s; + int rc = 0; + int interactive; + + interactive = init_input(); + + while(!done) { + line = readline(interactive ? 
parser_prompt : NULL); + + if (!line) break; + + s = skipwhitespace(line); + + if (*s) { + add_history(s); + rc = execute_line(s); + } + + free(line); + } + return rc; +} + + +/* sets the parser prompt */ +void Parser_init(char * prompt, command_t * cmds) +{ + done = 0; + top_level = cmds; + if (parser_prompt) free(parser_prompt); + parser_prompt = strdup(prompt); +} + +/* frees the parser prompt */ +void Parser_exit(int argc, char *argv[]) +{ + done = 1; + free(parser_prompt); + parser_prompt = NULL; +} + +/* convert a string to an integer */ +int Parser_int(char *s, int *val) +{ + int ret; + + if (*s != '0') + ret = sscanf(s, "%d", val); + else if (*(s+1) != 'x') + ret = sscanf(s, "%o", val); + else { + s++; + ret = sscanf(++s, "%x", val); + } + + return(ret); +} + + +void Parser_qhelp(int argc, char *argv[]) { + + printf("Available commands are:\n"); + + print_commands(NULL, top_level); + printf("For more help type: help command-name\n"); +} + +int Parser_help(int argc, char **argv) +{ + char line[1024]; + char *next, *prev, *tmp; + command_t *result, *ambig; + int i; + + if ( argc == 1 ) { + Parser_qhelp(argc, argv); + return 0; + } + + line[0]='\0'; + for ( i = 1 ; i < argc ; i++ ) { + strcat(line, argv[i]); + } + + switch ( process(line, &next, top_level, &result, &prev) ) { + case CMD_COMPLETE: + fprintf(stderr, "%s: %s\n",line, result->pc_help); + break; + case CMD_NONE: + fprintf(stderr, "%s: Unknown command.\n", line); + break; + case CMD_INCOMPLETE: + fprintf(stderr, + "'%s' incomplete command. 
Use '%s x' where x is one of:\n", + line, line); + fprintf(stderr, "\t"); + for (i = 0; result->pc_sub_cmd[i].pc_name; i++) { + fprintf(stderr, "%s ", result->pc_sub_cmd[i].pc_name); + } + fprintf(stderr, "\n"); + break; + case CMD_AMBIG: + fprintf(stderr, "Ambiguous command \'%s\'\nOptions: ", line); + while( (ambig = find_cmd(prev, result, &tmp)) ) { + fprintf(stderr, "%s ", ambig->pc_name); + result = ambig + 1; + } + fprintf(stderr, "\n"); + break; + } + return 0; +} + + +void Parser_printhelp(char *cmd) +{ + char *argv[] = { "help", cmd }; + Parser_help(2, argv); +} + +/************************************************************************* + * COMMANDS * + *************************************************************************/ + + +static void print_commands(char * str, command_t * table) { + command_t * cmds; + char buf[80]; + + for (cmds = table; cmds->pc_name; cmds++) { + if (cmds->pc_func) { + if (str) printf("\t%s %s\n", str, cmds->pc_name); + else printf("\t%s\n", cmds->pc_name); + } + if (cmds->pc_sub_cmd) { + if (str) { + sprintf(buf, "%s %s", str, cmds->pc_name); + print_commands(buf, cmds->pc_sub_cmd); + } else { + print_commands(cmds->pc_name, cmds->pc_sub_cmd); + } + } + } +} + +char *Parser_getstr(const char *prompt, const char *deft, char *res, + size_t len) +{ + char *line = NULL; + int size = strlen(prompt) + strlen(deft) + 8; + char *theprompt; + theprompt = malloc(size); + assert(theprompt); + + sprintf(theprompt, "%s [%s]: ", prompt, deft); + + line = readline(theprompt); + free(theprompt); + + if ( line == NULL || *line == '\0' ) { + strncpy(res, deft, len); + } else { + strncpy(res, line, len); + } + + if ( line ) { + free(line); + return res; + } else { + return NULL; + } +} + +/* get integer from prompt, loop forever to get it */ +int Parser_getint(const char *prompt, long min, long max, long deft, int base) +{ + int rc; + long result; + char *line; + int size = strlen(prompt) + 40; + char *theprompt = malloc(size); + 
assert(theprompt); + sprintf(theprompt,"%s [%ld, (0x%lx)]: ", prompt, deft, deft); + + fflush(stdout); + + do { + line = NULL; + line = readline(theprompt); + if ( !line ) { + fprintf(stdout, "Please enter an integer.\n"); + fflush(stdout); + continue; + } + if ( *line == '\0' ) { + free(line); + result = deft; + break; + } + rc = Parser_arg2int(line, &result, base); + free(line); + if ( rc != 0 ) { + fprintf(stdout, "Invalid string.\n"); + fflush(stdout); + } else if ( result > max || result < min ) { + fprintf(stdout, "Error: response must lie between %ld and %ld.\n", + min, max); + fflush(stdout); + } else { + break; + } + } while ( 1 ) ; + + if (theprompt) + free(theprompt); + return result; + +} + +/* get boolean (starting with YyNn; loop forever */ +int Parser_getbool(const char *prompt, int deft) +{ + int result = 0; + char *line; + int size = strlen(prompt) + 8; + char *theprompt = malloc(size); + assert(theprompt); + + fflush(stdout); + + if ( deft != 0 && deft != 1 ) { + fprintf(stderr, "Error: Parser_getbool given bad default (%d).\n", + deft); + assert ( 0 ); + } + sprintf(theprompt, "%s [%s]: ", prompt, (deft==0)? "N" : "Y"); + + do { + line = NULL; + line = readline(theprompt); + if ( line == NULL ) { + result = deft; + break; + } + if ( *line == '\0' ) { + result = deft; + break; + } + if ( *line == 'y' || *line == 'Y' ) { + result = 1; + break; + } + if ( *line == 'n' || *line == 'N' ) { + result = 0; + break; + } + if ( line ) + free(line); + fprintf(stdout, "Invalid string. 
Must start with yY or nN\n"); + fflush(stdout); + } while ( 1 ); + + if ( line ) + free(line); + if ( theprompt ) + free(theprompt); + return result; +} + +/* parse int out of a string or prompt for it */ +long Parser_intarg(const char *inp, const char *prompt, int deft, + int min, int max, int base) +{ + long result; + int rc; + + /* a missing argument is treated like an unparsable one: prompt for it */ + rc = (inp != NULL) ? Parser_arg2int(inp, &result, base) : 1; + + if ( rc == 0 ) { + return result; + } else { + /* Parser_getint() takes (prompt, min, max, deft, base) */ + return Parser_getint(prompt, min, max, deft, base); + } +} + +/* return inp if it is a non-empty string, otherwise prompt for a string */ +char *Parser_strarg(char *inp, const char *prompt, const char *deft, + char *answer, int len) +{ + if ( inp == NULL || *inp == '\0' ) { + return Parser_getstr(prompt, deft, answer, len); + } else + return inp; +} + +/* change a string into a number: return 0 on success. No invalid characters + allowed. The processing of base and validity follows strtol(3)*/ +int Parser_arg2int(const char *inp, long *result, int base) +{ + char *endptr; + + if ( (base !=0) && (base < 2 || base > 36) ) + return 1; + + *result = strtol(inp, &endptr, base); + + if ( *inp != '\0' && *endptr == '\0' ) + return 0; + else + return 1; +} + +/* Convert human readable size string to an int; "1k" -> 1024 */ +int Parser_size (int *sizep, char *str) { + int size; + char mod[32]; + + switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) { + default: + return (-1); + + case 1: + *sizep = size; + return (0); + + case 2: + switch (*mod) { + case 'g': + case 'G': + *sizep = size << 30; + return (0); + + case 'm': + case 'M': + *sizep = size << 20; + return (0); + + case 'k': + case 'K': + *sizep = size << 10; + return (0); + + default: + *sizep = size; + return (0); + } + } +} + +/* Convert a string boolean to an int; "enable" -> 1 */ +int Parser_bool (int *b, char *str) { + if (!strcasecmp (str, "no") || + !strcasecmp (str, "n") || + !strcasecmp (str, "off") || + !strcasecmp (str, "disable")) + { + *b = 0; + return (0); + } + + if (!strcasecmp (str, "yes") || + !strcasecmp
(str, "y") || + !strcasecmp (str, "on") || + !strcasecmp (str, "enable")) + { + *b = 1; + return (0); + } + + return (-1); +} + +int Parser_quit(int argc, char **argv) +{ + argc = argc; + argv = argv; + done = 1; + return 0; +} diff --git a/lustre/portals/utils/parser.h b/lustre/portals/utils/parser.h new file mode 100644 index 0000000..dead9f5 --- /dev/null +++ b/lustre/portals/utils/parser.h @@ -0,0 +1,73 @@ +#ifndef _PARSER_H_ +#define _PARSER_H_ + +#define HISTORY 100 /* Don't let history grow unbounded */ +#define MAXARGS 100 + +#define CMD_COMPLETE 0 +#define CMD_INCOMPLETE 1 +#define CMD_NONE 2 +#define CMD_AMBIG 3 +#define CMD_HELP 4 + +typedef struct parser_cmd { + char *pc_name; + int (* pc_func)(int, char **); + struct parser_cmd * pc_sub_cmd; + char *pc_help; +} command_t; + +typedef struct argcmd { + char *ac_name; + int (*ac_func)(int, char **); + char *ac_help; +} argcmd_t; + +typedef struct network { + char *type; + char *server; + int port; +} network_t; + +int Parser_quit(int argc, char **argv); +void Parser_init(char *, command_t *); /* Set prompt and load command list */ +int Parser_commands(void); /* Start the command parser */ +void Parser_qhelp(int, char **); /* Quick help routine */ +int Parser_help(int, char **); /* Detailed help routine */ +void Parser_printhelp(char *); /* Detailed help routine */ +void Parser_exit(int, char **); /* Shuts down command parser */ +int Parser_execarg(int argc, char **argv, command_t cmds[]); +int execute_line(char * line); + +/* Converts a string to an integer */ +int Parser_int(char *, int *); + +/* Prompts for a string, with default values and a maximum length */ +char *Parser_getstr(const char *prompt, const char *deft, char *res, + size_t len); + +/* Prompts for an integer, with minimum, maximum and default values and base */ +int Parser_getint(const char *prompt, long min, long max, long deft, + int base); + +/* Prompts for a yes/no, with default */ +int Parser_getbool(const char *prompt, int deft); + 
+/* Extracts an integer from a string, or prompts if it cannot get one */ +long Parser_intarg(const char *inp, const char *prompt, int deft, + int min, int max, int base); + +/* Extracts a word from the input, or propmts if it cannot get one */ +char *Parser_strarg(char *inp, const char *prompt, const char *deft, + char *answer, int len); + +/* Extracts an integer from a string with a base */ +int Parser_arg2int(const char *inp, long *result, int base); + +/* Convert human readable size string to and int; "1k" -> 1000 */ +int Parser_size(int *sizep, char *str); + +/* Convert a string boolean to an int; "enable" -> 1 */ +int Parser_bool(int *b, char *str); + +#endif diff --git a/lustre/portals/utils/portals.c b/lustre/portals/utils/portals.c new file mode 100644 index 0000000..90d66f5 --- /dev/null +++ b/lustre/portals/utils/portals.c @@ -0,0 +1,985 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include <stdio.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/tcp.h> +#include <netdb.h> +#include <stdlib.h> +#include <string.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <errno.h> +#include <unistd.h> +#include <time.h> +#include <asm/byteorder.h> + +#include <portals/api-support.h> +#include <portals/ptlctl.h> +#include <portals/list.h> +#include <portals/lib-types.h> +#include "parser.h" + +unsigned int portal_debug; +unsigned int portal_printk; +unsigned int portal_stack; + + +static ptl_nid_t g_nid = 0; +static unsigned int g_nal = 0; +static unsigned short g_port = 0; + +static int g_socket_txmem = 0; +static int g_socket_rxmem = 0; +static int g_socket_nonagle = 1; + +typedef struct +{ + char *name; + int num; +} name2num_t; + +static name2num_t nalnames[] = { + {"tcp", SOCKNAL}, + {"toe", TOENAL}, + {"elan", QSWNAL}, + {"gm", GMNAL}, + {"scimac", SCIMACNAL}, + {NULL, -1} +}; + +static name2num_t * +name2num_lookup_name (name2num_t *table, char *str) +{ + while (table->name != NULL) + if (!strcmp (str, table->name)) + return (table); + else + table++; + return (NULL); +} + +static name2num_t * +name2num_lookup_num (name2num_t *table, int num) +{ + while (table->name != NULL) + if (num == table->num) + return (table); + else + table++; + return (NULL); +} + +int +ptl_name2nal (char *str) +{ + name2num_t *e = name2num_lookup_name (nalnames, str); + + return ((e == NULL) ? 0 : e->num); +} + +static char * +nal2name (int nal) +{ + name2num_t *e = name2num_lookup_num (nalnames, nal); + + return ((e == NULL) ? "???" 
: e->name); +} + +int +ptl_parse_nid (ptl_nid_t *nidp, char *str) +{ + struct hostent *he; + int a; + int b; + int c; + int d; + + if (sscanf (str, "%d.%d.%d.%d", &a, &b, &c, &d) == 4 && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) + { + __u32 addr = (a<<24)|(b<<16)|(c<<8)|d; + + *nidp = (ptl_nid_t)addr; + return (0); + } + + if ((('a' <= str[0] && str[0] <= 'z') || + ('A' <= str[0] && str[0] <= 'Z')) && + (he = gethostbyname (str)) != NULL) + { + __u32 addr = *(__u32 *)he->h_addr; + + *nidp = (ptl_nid_t)ntohl(addr); /* HOST byte order */ + return (0); + } + + if (sscanf (str, "%i", &a) == 1) + { + *nidp = (ptl_nid_t)a; + return (0); + } + + if (sscanf (str, "%x", &a) == 1) + { + *nidp = (ptl_nid_t) a; + return (0); + } + + return (-1); +} + +char * +ptl_nid2str (char *buffer, ptl_nid_t nid) +{ + __u32 addr = htonl((__u32)nid); /* back to NETWORK byte order */ + struct hostent *he = gethostbyaddr ((const char *)&addr, sizeof (addr), AF_INET); + + if (he != NULL) + strcpy (buffer, he->h_name); + else + sprintf (buffer, "0x"LPX64, nid); + + return (buffer); +} + +/* Write exactly nob bytes from buffer to cfd, retrying on EINTR and on short + * writes. Returns 0 on success or the negative write() result on error. */ +int +sock_write (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = write (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) + { + fprintf (stderr, "Unexpected zero sock_write\n"); + abort(); + } + + /* advance past the rc bytes just written (not by the bytes left) */ + nob -= rc; + buffer = (char *)buffer + rc; + } + + return (0); +} + +/* Read exactly nob bytes from cfd into buffer, retrying on EINTR and on short + * reads. Returns 0 on success, the negative read() result on error, or -1 + * with errno set to ECONNABORTED on EOF. */ +int +sock_read (int cfd, void *buffer, int nob) +{ + while (nob > 0) + { + int rc = read (cfd, buffer, nob); + + if (rc < 0) + { + if (errno == EINTR) + continue; + + return (rc); + } + + if (rc == 0) /* EOF */ + { + errno = ECONNABORTED; + return (-1); + } + + /* advance past the rc bytes just read (not by the bytes left) */ + nob -= rc; + buffer = (char *)buffer + rc; + } + + return (0); +} + +int ptl_initialize(int argc, char **argv) +{ + register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); + return 0; +} + + +int jt_ptl_network(int argc, char **argv) +{ + int nal; + + if (argc != 2 || + (nal =
ptl_name2nal (argv[1])) == 0) + { + name2num_t *entry; + + fprintf(stderr, "usage: %s \n", argv[0]); + for (entry = nalnames; entry->name != NULL; entry++) + fprintf (stderr, "%s%s", entry == nalnames ? "<" : "|", entry->name); + fprintf(stderr, ">\n"); + } + else + g_nal = nal; + + return (0); +} + +int +exchange_nids (int cfd, ptl_nid_t my_nid, ptl_nid_t *peer_nid) +{ + int rc; + ptl_hdr_t hdr; + ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; + + LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); + + memset (&hdr, 0, sizeof (hdr)); + + hmv->magic = __cpu_to_le32 (PORTALS_PROTO_MAGIC); + hmv->version_major = __cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); + hmv->version_minor = __cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); + + hdr.src_nid = __cpu_to_le64 (my_nid); + hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); + + /* Assume there's sufficient socket buffering for a portals HELLO header */ + rc = sock_write (cfd, &hdr, sizeof (hdr)); + if (rc != 0) { + perror ("Can't send initial HELLO"); + return (-1); + } + + /* First few bytes down the wire are the portals protocol magic and + * version, no matter what protocol version we're running. */ + + rc = sock_read (cfd, hmv, sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read from peer"); + return (-1); + } + + if (__cpu_to_le32 (hmv->magic) != PORTALS_PROTO_MAGIC) { + fprintf (stderr, "Bad magic %#08x (%#08x expected)\n", + __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC); + return (-1); + } + + if (__cpu_to_le16 (hmv->version_major) != PORTALS_PROTO_VERSION_MAJOR || + __cpu_to_le16 (hmv->version_minor) != PORTALS_PROTO_VERSION_MINOR) { + fprintf (stderr, "Incompatible protocol version %d.%d (%d.%d expected)\n", + __cpu_to_le16 (hmv->version_major), + __cpu_to_le16 (hmv->version_minor), + PORTALS_PROTO_VERSION_MAJOR, + PORTALS_PROTO_VERSION_MINOR); + } + + /* version 0 sends magic/version as the dest_nid of a 'hello' header, + * so read the rest of it in now... 
*/ + LASSERT (PORTALS_PROTO_VERSION_MAJOR == 0); + rc = sock_read (cfd, hmv + 1, sizeof (hdr) - sizeof (*hmv)); + if (rc != 0) { + perror ("Can't read rest of HELLO hdr"); + return (-1); + } + + /* ...and check we got what we expected */ + if (__cpu_to_le32 (hdr.type) != PTL_MSG_HELLO || + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr)) != 0) { + fprintf (stderr, "Expecting a HELLO hdr with 0 payload," + " but got type %d with %d payload\n", + __cpu_to_le32 (hdr.type), + __cpu_to_le32 (PTL_HDR_LENGTH (&hdr))); + return (-1); + } + + *peer_nid = __le64_to_cpu (hdr.src_nid); + return (0); +} + +int jt_ptl_connect(int argc, char **argv) +{ + if (argc < 2) { + usage: + fprintf(stderr, "usage: %s <hostname port [xi]> or <elan ID>\n", + argv[0]); + return 0; + } + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + if (g_nal == SOCKNAL || g_nal == TOENAL) { + ptl_nid_t peer_nid; + struct hostent *he; + struct portal_ioctl_data data; + struct sockaddr_in srvaddr; + char *flag; + int fd, rc; + int nonagle = 0; + int rxmem = 0; + int txmem = 0; + int bind_irq = 0; + int xchange_nids = 0; + int o; + int olen; + + if (argc < 3) { + goto usage; + } + + he = gethostbyname(argv[1]); + if (!he) { + fprintf(stderr, "gethostbyname error: %s\n", + strerror(errno)); + return -1; + } + + g_port = atol(argv[2]); + + if (argc > 3) + for (flag = argv[3]; *flag != 0; flag++) + switch (*flag) + { + case 'i': + bind_irq = 1; + break; + + case 'x': + xchange_nids = 1; + break; + + default: + fprintf (stderr, "unrecognised flag '%c'\n", + *flag); + return (-1); + } + + memset(&srvaddr, 0, sizeof(srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(g_port); + srvaddr.sin_addr.s_addr = *(__u32 *)he->h_addr; + + fd = socket(PF_INET, SOCK_STREAM, 0); + if ( fd < 0 ) { + fprintf(stderr, "socket() failed: %s\n", + strerror(errno)); + return -1; + } + + if (g_socket_nonagle) + { + o = 1; + if (setsockopt(fd, IPPROTO_TCP, 
TCP_NODELAY, &o, sizeof (o)) != 0) + { + fprintf(stderr, "cannot disable nagle: %s\n", strerror(errno)); + return (-1); + } + } + + if (g_socket_rxmem != 0) + { + o = g_socket_rxmem; + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &o, sizeof (o)) != 0) + { + fprintf(stderr, "cannot set receive buffer size: %s\n", strerror(errno)); + return (-1); + } + } + + if (g_socket_txmem != 0) + { + o = g_socket_txmem; + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &o, sizeof (o)) != 0) + { + fprintf(stderr, "cannot set send buffer size: %s\n", strerror(errno)); + return (-1); + } + } + + rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); + if ( rc == -1 ) { + fprintf(stderr, "connect() failed: %s\n", + strerror(errno)); + return -1; + } + + olen = sizeof (txmem); + if (getsockopt (fd, SOL_SOCKET, SO_SNDBUF, &txmem, &olen) != 0) + fprintf (stderr, "Can't get send buffer size: %s\n", strerror (errno)); + olen = sizeof (rxmem); + if (getsockopt (fd, SOL_SOCKET, SO_RCVBUF, &rxmem, &olen) != 0) + fprintf (stderr, "Can't get receive buffer size: %s\n", strerror (errno)); + olen = sizeof (nonagle); + if (getsockopt (fd, IPPROTO_TCP, TCP_NODELAY, &nonagle, &olen) != 0) + fprintf (stderr, "Can't get nagle: %s\n", strerror (errno)); + + if (xchange_nids) { + + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data); + if (rc != 0) + { + fprintf (stderr, "failed to get my nid: %s\n", + strerror (errno)); + close (fd); + return (-1); + } + + rc = exchange_nids (fd, data.ioc_nid, &peer_nid); + if (rc != 0) + { + close (fd); + return (-1); + } + } + else + peer_nid = ntohl (srvaddr.sin_addr.s_addr); /* HOST byte order */ + + printf("Connected host: %s NID "LPX64" snd: %d rcv: %d nagle: %s\n", argv[1], + peer_nid, txmem, rxmem, nonagle ? 
"Disabled" : "Enabled"); + + PORTAL_IOC_INIT(data); + data.ioc_fd = fd; + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_PEER_FD; + data.ioc_nid = peer_nid; + data.ioc_flags = bind_irq; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to register fd with portals: " + "%s\n", strerror(errno)); + close (fd); + return -1; + } + + g_nid = peer_nid; + printf("Connection to "LPX64" registered with socknal\n", g_nid); + + rc = close(fd); + if (rc) { + fprintf(stderr, "close failed: %d\n", rc); + } + } else if (g_nal == QSWNAL) { + g_nid = atoi(argv[1]); + } else if (g_nal == GMNAL) { + g_nid = atoi(argv[1]); + } else if (g_nal == SCIMACNAL) { + unsigned int tmpnid; + if(sscanf(argv[1], "%x", &tmpnid) == 1) { + g_nid=tmpnid; + } + else { + fprintf(stderr, "nid %s invalid for SCI nal\n", argv[1]); + } + + + } else { + fprintf(stderr, "This should never happen. Also it is very " + "bad.\n"); + } + + return 0; +} + +int jt_ptl_disconnect(int argc, char **argv) +{ + if (argc > 2) { + fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + return 0; + } + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + if (g_nal == SOCKNAL || g_nal == TOENAL) { + struct hostent *he; + struct portal_ioctl_data data; + int rc; + + PORTAL_IOC_INIT(data); + if (argc == 2) { + he = gethostbyname(argv[1]); + if (!he) { + fprintf(stderr, "gethostbyname error: %s\n", + strerror(errno)); + return -1; + } + + data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */ + + } else { + printf("Disconnecting ALL connections.\n"); + /* leave ioc_nid zeroed == disconnect all */ + } + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_CLOSE_CONNECTION; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to remove connection: %s\n", + strerror(errno)); + return -1; + } + } else if (g_nal == QSWNAL) { + printf("'disconnect' doesn't make 
any sense for " + "elan.\n"); + } else if (g_nal == GMNAL) { + printf("'disconnect' doesn't make any sense for " + "GM.\n"); + } else if (g_nal == SCIMACNAL) { + printf("'disconnect' doesn't make any sense for " + "SCI.\n"); + } else { + fprintf(stderr, "This should never happen. Also it is very " + "bad.\n"); + return -1; + } + + return 0; +} + +int jt_ptl_push_connection (int argc, char **argv) +{ + if (argc > 2) { + fprintf(stderr, "usage: %s [hostname]\n", argv[0]); + return 0; + } + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + if (g_nal == SOCKNAL || g_nal == TOENAL) { + struct hostent *he; + struct portal_ioctl_data data; + int rc; + + PORTAL_IOC_INIT(data); + if (argc == 2) { + he = gethostbyname(argv[1]); + if (!he) { + fprintf(stderr, "gethostbyname error: %s\n", + strerror(errno)); + return -1; + } + + data.ioc_nid = ntohl (*(__u32 *)he->h_addr); /* HOST byte order */ + + } else { + printf("Pushing ALL connections.\n"); + /* leave ioc_nid zeroed == disconnect all */ + } + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_PUSH_CONNECTION; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc) { + fprintf(stderr, "failed to push connection: %s\n", + strerror(errno)); + return -1; + } + } else if (g_nal == QSWNAL) { + printf("'push' doesn't make any sense for elan.\n"); + } else if (g_nal == GMNAL) { + printf("'push' doesn't make any sense for GM.\n"); + } else if (g_nal == SCIMACNAL) { + printf("'push' doesn't make any sense for SCI.\n"); + } else { + fprintf(stderr, "This should never happen. 
Also it is very " + "bad.\n"); + return -1; + } + + return 0; +} + +int jt_ptl_ping(int argc, char **argv) +{ + int rc; + ptl_nid_t nid; + long count = 1; + long size = 4; + long timeout = 1; + struct portal_ioctl_data data; + + if (argc < 2) { + fprintf(stderr, "usage: %s nid [count] [size] [timeout (secs)]\n", argv[0]); + return 0; + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + + if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); + return (-1); + } + + if (argc > 2) + { + count = atol(argv[2]); + + if (count < 0 || count > 20000) + { + fprintf(stderr, "are you insane? %ld is a crazy count.\n", count); + return -1; + } + } + + if (argc > 3) + size= atol(argv[3]); + + if (argc > 4) + timeout = atol (argv[4]); + + PORTAL_IOC_INIT (data); + data.ioc_count = count; + data.ioc_size = size; + data.ioc_nid = nid; + data.ioc_nal = g_nal; + data.ioc_timeout = timeout; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PING, &data); + if (rc) { + fprintf(stderr, "failed to start pinger: %s\n", + strerror(errno)); + return -1; + } + return 0; +} + +int jt_ptl_shownid(int argc, char **argv) +{ + struct portal_ioctl_data data; + int rc; + + if (argc > 1) { + fprintf(stderr, "usage: %s\n", argv[0]); + return 0; + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command first\n"); + return -1; + } + + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data); + if (rc < 0) + fprintf(stderr, "getting my NID failed: %s\n", + strerror (errno)); + else + printf(LPX64"\n", data.ioc_nid); + return 0; +} + +int jt_ptl_mynid(int argc, char **argv) +{ + int rc; + char hostname[1024]; + char *nidstr; + struct portal_ioctl_data data; + ptl_nid_t mynid; + + if (argc > 2) { + fprintf(stderr, "usage: %s [NID]\n", argv[0]); + fprintf(stderr, "NID defaults to the primary IP address of the machine.\n"); 
+ return 0; + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return -1; + } + + if (argc >= 2) + nidstr = argv[1]; + else if (gethostname(hostname, sizeof(hostname)) != 0) { + fprintf(stderr, "gethostname failed: %s\n", + strerror(errno)); + return -1; + } + else + nidstr = hostname; + + rc = ptl_parse_nid (&mynid, nidstr); + if (rc != 0) { + fprintf (stderr, "Can't convert '%s' into a NID\n", nidstr); + return -1; + } + + PORTAL_IOC_INIT(data); + data.ioc_nid = mynid; + data.ioc_nal = g_nal; + data.ioc_nal_cmd = NAL_CMD_REGISTER_MYNID; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); + if (rc < 0) + fprintf(stderr, "setting my NID failed: %s\n", + strerror(errno)); + else + printf("registered my nid "LPX64" (%s)\n", mynid, hostname); + return 0; +} + +int +jt_ptl_fail_nid (int argc, char **argv) +{ + int rc; + ptl_nid_t nid; + unsigned int threshold; + struct portal_ioctl_data data; + + if (argc < 2 || argc > 3) + { + fprintf (stderr, "usage: %s nid|\"_all_\" [count (0 == mend)]\n", argv[0]); + return (0); + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return (-1); + } + + if (!strcmp (argv[1], "_all_")) + nid = PTL_NID_ANY; + else if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); + return (-1); + } + + if (argc < 3) + threshold = PTL_MD_THRESH_INF; + else if (sscanf (argv[2], "%i", &threshold) != 1) { + fprintf (stderr, "Can't parse count \"%s\"\n", argv[2]); + return (-1); + } + + PORTAL_IOC_INIT (data); + data.ioc_nal = g_nal; + data.ioc_nid = nid; + data.ioc_count = threshold; + + rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_FAIL_NID, &data); + if (rc < 0) + fprintf (stderr, "IOC_PORTAL_FAIL_NID failed: %s\n", + strerror (errno)); + else + printf ("%s %s\n", threshold == 0 ? 
"Unfailing" : "Failing", argv[1]); + + return (0); +} + +int +jt_ptl_rxmem (int argc, char **argv) +{ + int size; + + if (argc > 1) + { + if (Parser_size (&size, argv[1]) != 0 || size < 0) + { + fprintf (stderr, "Can't parse size %s\n", argv[1]); + return (0); + } + + g_socket_rxmem = size; + } + printf ("Socket rmem = %d\n", g_socket_rxmem); + return (0); +} + +int +jt_ptl_txmem (int argc, char **argv) +{ + int size; + + if (argc > 1) + { + if (Parser_size (&size, argv[1]) != 0 || size < 0) + { + fprintf (stderr, "Can't parse size %s\n", argv[1]); + return (0); + } + g_socket_txmem = size; + } + printf ("Socket txmem = %d\n", g_socket_txmem); + return (0); +} + +int +jt_ptl_nagle (int argc, char **argv) +{ + int enable; + + if (argc > 1) + { + if (Parser_bool (&enable, argv[1]) != 0) + { + fprintf (stderr, "Can't parse boolean %s\n", argv[1]); + return (0); + } + g_socket_nonagle = !enable; + } + printf ("Nagle %s\n", g_socket_nonagle ? "disabled" : "enabled"); + return (0); +} + +int +jt_ptl_add_route (int argc, char **argv) +{ + struct portal_ioctl_data data; + ptl_nid_t nid1; + ptl_nid_t nid2; + ptl_nid_t gateway_nid; + int rc; + + if (argc < 3) + { + fprintf (stderr, "usage: %s gateway target [target]\n", argv[0]); + return (0); + } + + if (g_nal == 0) { + fprintf(stderr, "Error: you must run the 'network' command " + "first.\n"); + return (-1); + } + + if (ptl_parse_nid (&gateway_nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]); + return (-1); + } + + if (ptl_parse_nid (&nid1, argv[2]) != 0) + { + fprintf (stderr, "Can't parse first target NID \"%s\"\n", argv[2]); + return (-1); + } + + if (argc < 4) + nid2 = nid1; + else if (ptl_parse_nid (&nid2, argv[3]) != 0) + { + fprintf (stderr, "Can't parse second target NID \"%s\"\n", argv[4]); + return (-1); + } + + PORTAL_IOC_INIT(data); + data.ioc_nid = gateway_nid; + data.ioc_nal = g_nal; + data.ioc_nid2 = MIN (nid1, nid2); + data.ioc_nid3 = MAX (nid1, nid2); + + rc = 
l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_ADD_ROUTE, &data); + if (rc != 0) + { + fprintf (stderr, "IOC_PORTAL_ADD_ROUTE failed: %s\n", strerror (errno)); + return (-1); + } + + return (0); +} + +int +jt_ptl_del_route (int argc, char **argv) +{ + struct portal_ioctl_data data; + ptl_nid_t nid; + int rc; + + if (argc < 2) + { + fprintf (stderr, "usage: %s targetNID\n", argv[0]); + return (0); + } + + if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]); + return (-1); + } + + PORTAL_IOC_INIT(data); + data.ioc_nid = nid; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_DEL_ROUTE, &data); + if (rc != 0) + { + fprintf (stderr, "IOC_PORTAL_DEL_ROUTE ("LPX64") failed: %s\n", nid, strerror (errno)); + return (-1); + } + + return (0); +} + +int +jt_ptl_print_routes (int argc, char **argv) +{ + char buffer[3][128]; + struct portal_ioctl_data data; + int rc; + int index; + int gateway_nal; + ptl_nid_t gateway_nid; + ptl_nid_t nid1; + ptl_nid_t nid2; + + + for (index = 0;;index++) + { + PORTAL_IOC_INIT(data); + data.ioc_count = index; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_ROUTE, &data); + if (rc != 0) + break; + + gateway_nal = data.ioc_nal; + gateway_nid = data.ioc_nid; + nid1 = data.ioc_nid2; + nid2 = data.ioc_nid3; + + printf ("%8s %18s : %s - %s\n", + nal2name (gateway_nal), + ptl_nid2str (buffer[0], gateway_nid), + ptl_nid2str (buffer[1], nid1), + ptl_nid2str (buffer[2], nid2)); + } + return (0); +} + diff --git a/lustre/portals/utils/ptlctl.c b/lustre/portals/utils/ptlctl.c new file mode 100644 index 0000000..8c56d93 --- /dev/null +++ b/lustre/portals/utils/ptlctl.c @@ -0,0 +1,65 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. 
+ * + * This file is part of Portals, http://www.sf.net/projects/lustre/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <portals/api-support.h> +#include <portals/ptlctl.h> + +#include "parser.h" + + +command_t list[] = { + {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"}, + {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: <hostname port> | <id> for tcp/elan respectively)"}, + {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [hostname]"}, + {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [hostname]"}, + {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"}, + {"shownid", jt_ptl_shownid, 0, "print the local NID"}, + {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"}, + {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"}, + {"del_route", jt_ptl_del_route, 0, "delete an entry from the routing table (args: targetNID"}, + {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"}, + {"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"}, + {"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"}, + {"nagle", jt_ptl_nagle, 0, "Enable/Disable Nagle (args: [on/off])"}, + 
{"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"}, + {"fail", jt_ptl_fail_nid, 0, "usage: fail nid|_all_ [count]"}, + {"help", Parser_help, 0, "help"}, + {"exit", Parser_quit, 0, "quit"}, + {"quit", Parser_quit, 0, "quit"}, + { 0, 0, 0, NULL } +}; + +int main(int argc, char **argv) +{ + if (ptl_initialize(argc, argv) < 0) + exit(1); + + Parser_init("ptlctl > ", list); + if (argc > 1) + return Parser_execarg(argc - 1, &argv[1], list); + + Parser_commands(); + + return 0; +} diff --git a/lustre/portals/utils/routerstat.c b/lustre/portals/utils/routerstat.c new file mode 100644 index 0000000..37da12c --- /dev/null +++ b/lustre/portals/utils/routerstat.c @@ -0,0 +1,99 @@ +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/time.h> + +double +timenow () +{ + struct timeval tv; + + gettimeofday (&tv, NULL); + return (tv.tv_sec + tv.tv_usec / 1000000.0); +} + +void +do_stat (int fd) +{ + static char buffer[1024]; + static double last = 0.0; + double now; + double t; + long long bytes; + long packets; + long errors; + long depth; + int n; + + lseek (fd, 0, SEEK_SET); + now = timenow(); + n = read (fd, buffer, sizeof (buffer)); + if (n < 0) + { + fprintf (stderr, "Can't read statfile\n"); + exit (1); + } + buffer[n] = 0; + + n = sscanf (buffer, "%Ld %ld %ld %ld", &bytes, &packets, &errors, &depth); + + if (n < 3) + { + fprintf (stderr, "Can't parse statfile\n"); + exit (1); + } + + if (last == 0.0) + printf ("%Ld bytes, %ld packets (sz %Ld) %ld errors", + bytes, packets, (long long)((packets == 0) ? 0LL : bytes/packets), errors); + else + { + t = now - last; + + printf ("%9Ld (%7.2fMb/s), %7ld packets (sz %5Ld, %5ld/s) %ld errors (%ld/s)", + bytes, ((double)bytes)/((1<<20) * t), + packets, (long long)((packets == 0) ? 
0LL : bytes/packets), (long)(packets/t), + errors, (long)(errors/t)); + } + + if (n == 4) + printf (" (%ld)\n", depth); + else + printf ("\n"); + + fflush (stdout); + + lseek (fd, 0, SEEK_SET); + write (fd, "\n", 1); + last = timenow(); +} + +int main (int argc, char **argv) +{ + int interval = 0; + int fd; + + if (argc > 1) + interval = atoi (argv[1]); + + fd = open ("/proc/sys/portals/router", O_RDWR); + if (fd < 0) + { + fprintf (stderr, "Can't open stat: %s\n", strerror (errno)); + return (1); + } + + do_stat (fd); + if (interval == 0) + return (0); + + for (;;) + { + sleep (interval); + do_stat (fd); + } +} diff --git a/lustre/portals/utils/wirecheck.c b/lustre/portals/utils/wirecheck.c new file mode 100644 index 0000000..6a4377b --- /dev/null +++ b/lustre/portals/utils/wirecheck.c @@ -0,0 +1,141 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#include <stdio.h> +#include <portals/api-support.h> +#include <portals/list.h> +#include <portals/lib-types.h> + +#define BLANK_LINE() \ +do { \ + printf ("\n"); \ +} while (0) + +#define COMMENT(c) \ +do { \ + printf (" /* "c" */\n"); \ +} while (0) + +#define STRINGIFY(a) #a + +#define CHECK_DEFINE(a) \ +do { \ + printf (" LASSERT ("#a" == "STRINGIFY(a)");\n"); \ +} while (0) + +#define CHECK_VALUE(a) \ +do { \ + printf (" LASSERT ("#a" == %d);\n", a); \ +} while (0) + +#define CHECK_MEMBER_OFFSET(s,m) \ +do { \ + CHECK_VALUE(offsetof(s, m)); \ +} while (0) + +#define CHECK_MEMBER_SIZEOF(s,m) \ +do { \ + CHECK_VALUE((int)sizeof(((s *)0)->m)); \ +} while (0) + +#define CHECK_MEMBER(s,m) \ +do { \ + CHECK_MEMBER_OFFSET(s, m); \ + CHECK_MEMBER_SIZEOF(s, m); \ +} while (0) + +#define CHECK_STRUCT(s) \ +do { \ + BLANK_LINE (); \ + COMMENT ("Checks for struct "#s); \ + CHECK_VALUE((int)sizeof(s)); \ +} while (0) + +void +check_ptl_handle_wire (void) +{ + CHECK_STRUCT (ptl_handle_wire_t); + CHECK_MEMBER (ptl_handle_wire_t, wh_interface_cookie); + CHECK_MEMBER 
(ptl_handle_wire_t, wh_object_cookie); +} + +void +check_ptl_magicversion (void) +{ + CHECK_STRUCT (ptl_magicversion_t); + CHECK_MEMBER (ptl_magicversion_t, magic); + CHECK_MEMBER (ptl_magicversion_t, version_major); + CHECK_MEMBER (ptl_magicversion_t, version_minor); +} + +void +check_ptl_hdr (void) +{ + CHECK_STRUCT (ptl_hdr_t); + CHECK_MEMBER (ptl_hdr_t, dest_nid); + CHECK_MEMBER (ptl_hdr_t, src_nid); + CHECK_MEMBER (ptl_hdr_t, dest_pid); + CHECK_MEMBER (ptl_hdr_t, src_pid); + CHECK_MEMBER (ptl_hdr_t, type); + + BLANK_LINE (); + COMMENT ("Ack"); + CHECK_MEMBER (ptl_hdr_t, msg.ack.mlength); + CHECK_MEMBER (ptl_hdr_t, msg.ack.dst_wmd); + CHECK_MEMBER (ptl_hdr_t, msg.ack.match_bits); + CHECK_MEMBER (ptl_hdr_t, msg.ack.length); + + BLANK_LINE (); + COMMENT ("Put"); + CHECK_MEMBER (ptl_hdr_t, msg.put.ptl_index); + CHECK_MEMBER (ptl_hdr_t, msg.put.ack_wmd); + CHECK_MEMBER (ptl_hdr_t, msg.put.match_bits); + CHECK_MEMBER (ptl_hdr_t, msg.put.length); + CHECK_MEMBER (ptl_hdr_t, msg.put.offset); + CHECK_MEMBER (ptl_hdr_t, msg.put.hdr_data); + + BLANK_LINE (); + COMMENT ("Get"); + CHECK_MEMBER (ptl_hdr_t, msg.get.ptl_index); + CHECK_MEMBER (ptl_hdr_t, msg.get.return_wmd); + CHECK_MEMBER (ptl_hdr_t, msg.get.match_bits); + CHECK_MEMBER (ptl_hdr_t, msg.get.length); + CHECK_MEMBER (ptl_hdr_t, msg.get.src_offset); + CHECK_MEMBER (ptl_hdr_t, msg.get.return_offset); + CHECK_MEMBER (ptl_hdr_t, msg.get.sink_length); + + BLANK_LINE (); + COMMENT ("Reply"); + CHECK_MEMBER (ptl_hdr_t, msg.reply.dst_wmd); + CHECK_MEMBER (ptl_hdr_t, msg.reply.dst_offset); + CHECK_MEMBER (ptl_hdr_t, msg.reply.length); +} + +int +main (int argc, char **argv) +{ + printf ("void lib_assert_wire_constants (void)\n" + "{\n"); + + COMMENT ("Wire protocol assertions generated by 'wirecheck'"); + BLANK_LINE (); + + COMMENT ("Constants..."); + CHECK_DEFINE (PORTALS_PROTO_MAGIC); + CHECK_DEFINE (PORTALS_PROTO_VERSION_MAJOR); + CHECK_DEFINE (PORTALS_PROTO_VERSION_MINOR); + + CHECK_VALUE (PTL_MSG_ACK); + CHECK_VALUE 
(PTL_MSG_PUT); + CHECK_VALUE (PTL_MSG_GET); + CHECK_VALUE (PTL_MSG_REPLY); + CHECK_VALUE (PTL_MSG_HELLO); + + check_ptl_handle_wire (); + check_ptl_magicversion (); + check_ptl_hdr (); + + printf ("}\n\n"); + + return (0); +} diff --git a/lustre/ptlbd/blk.c b/lustre/ptlbd/blk.c index 28ca368..a367903 100644 --- a/lustre/ptlbd/blk.c +++ b/lustre/ptlbd/blk.c @@ -22,6 +22,7 @@ #include <linux/module.h> #include <linux/major.h> #include <linux/smp.h> +#include <linux/hdreg.h> #define DEBUG_SUBSYSTEM S_PTLBD @@ -95,20 +96,26 @@ static int ptlbd_open(struct inode *inode, struct file *file) struct ptlbd_obd *ptlbd = ptlbd_get_inode(inode); ENTRY; + if ( IS_ERR(ptlbd) ) RETURN(PTR_ERR(ptlbd)); - if ( ptlbd->bd_import.imp_connection == NULL ) - RETURN(-ENODEV); + + if (! ptlbd->bd_import->imp_remote_handle.cookie) + if (ptlbd_do_connect(ptlbd)) + RETURN(-ENOTCONN); ptlbd->refcount++; RETURN(0); } + static int ptlbd_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { struct ptlbd_obd *ptlbd; int ret; + __u16 major, minor, dev; + struct hd_geometry geo; if ( ! 
capable(CAP_SYS_ADMIN) ) RETURN(-EPERM); @@ -117,11 +124,50 @@ static int ptlbd_ioctl(struct inode *inode, struct file *file, if ( IS_ERR(ptlbd) ) RETURN( PTR_ERR(ptlbd) ); + major = MAJOR(inode->i_rdev); + minor = MINOR(inode->i_rdev); + dev = inode->i_rdev; + switch(cmd) { + case HDIO_GETGEO: + geo.heads = 64; + geo.sectors = 32; + geo.start = 4; + geo.cylinders = blk_size[major][minor]/ + (geo.heads * geo.sectors); + if (copy_to_user((void *) arg, &geo, sizeof(geo))) + ret = -EFAULT; + else + ret = 0; + break; + + case BLKSECTGET: + ret = copy_to_user((void *) arg, + & max_sectors[major][minor], sizeof(arg)); + break; + case BLKFLSBUF: - ret = blk_ioctl(inode->i_rdev, cmd, arg); + ret = blk_ioctl(dev, cmd, arg); + ptlbd_send_flush_req(ptlbd, PTLBD_FLUSH); break; + + case BLKGETSIZE: + case BLKGETSIZE64: + case BLKROSET: + case BLKROGET: + case BLKRASET: + case BLKRAGET: + case BLKSSZGET: + case BLKELVGET: + case BLKELVSET: default: + ret = blk_ioctl(dev, cmd, arg); + break; + + case BLKSECTSET: /* don't allow setting of max_sectors */ + + case BLKRRPART: /* not a partitionable device */ + case BLKPG: /* "" */ ret = -EINVAL; break; } @@ -137,7 +183,9 @@ static int ptlbd_release(struct inode *inode, struct file *file) if ( IS_ERR(ptlbd) ) RETURN( PTR_ERR(ptlbd) ); - ptlbd->refcount--; + if (--ptlbd->refcount == 0) + ptlbd_do_disconnect(ptlbd); + RETURN(0); } @@ -174,6 +222,7 @@ static void ptlbd_request(request_queue_t *q) struct ptlbd_obd *ptlbd; struct request *req; ptlbd_cmd_t cmd; + int errors = 0; ENTRY; while ( !QUEUE_EMPTY ) { @@ -190,19 +239,18 @@ static void ptlbd_request(request_queue_t *q) spin_unlock_irq(&io_request_lock); - /* XXX dunno if we're supposed to get this or not.. 
*/ - /* __make_request() changes READA to READ - Kris */ - LASSERT(req->cmd != READA); - if ( req->cmd == READ ) cmd = PTLBD_READ; else cmd = PTLBD_WRITE; - ptlbd_send_req(ptlbd, cmd, req); + errors = ptlbd_send_rw_req(ptlbd, cmd, req->bh); spin_lock_irq(&io_request_lock); + if (errors) + req->errors += errors; + ptlbd_end_request_havelock(req); } } @@ -228,7 +276,6 @@ int ptlbd_blk_init(void) blksize_size[PTLBD_MAJOR] = ptlbd_size_size; hardsect_size[PTLBD_MAJOR] = ptlbd_hardsect_size; max_sectors[PTLBD_MAJOR] = ptlbd_max_sectors; - //RHism blkdev_varyio[PTLBD_MAJOR] = ptlbd_dev_varyio; blk_init_queue(BLK_DEFAULT_QUEUE(PTLBD_MAJOR), ptlbd_request); blk_queue_headactive(BLK_DEFAULT_QUEUE(MAJOR_NR), 0); @@ -238,9 +285,7 @@ int ptlbd_blk_init(void) /* avoid integer overflow */ ptlbd_size[i] = (16*1024*((1024*1024) >> BLOCK_SIZE_BITS)); ptlbd_hardsect_size[i] = 4096; - ptlbd_max_sectors[i] = 2; - //RHism ptlbd_dev_varyio[i] = 0; - /* XXX register_disk? */ + ptlbd_max_sectors[i] = PTL_MD_MAX_IOV * (4096/512); } return 0; diff --git a/lustre/ptlbd/client.c b/lustre/ptlbd/client.c index 8d957db..f36a3c7 100644 --- a/lustre/ptlbd/client.c +++ b/lustre/ptlbd/client.c @@ -32,15 +32,14 @@ #include <linux/lprocfs_status.h> #include <linux/obd_ptlbd.h> -static int ptlbd_cl_setup(struct obd_device *obddev, obd_count len, void *buf) +static int ptlbd_cl_setup(struct obd_device *obd, obd_count len, void *buf) { - struct ptlbd_obd *ptlbd = &obddev->u.ptlbd; - struct obd_import *imp = &ptlbd->bd_import; + struct ptlbd_obd *ptlbd = &obd->u.ptlbd; + struct obd_import *imp; struct obd_ioctl_data* data = buf; - struct obd_uuid server_uuid; ENTRY; - if ( ptlbd->bd_import.imp_connection != NULL ) + if (ptlbd->bd_import != NULL) RETURN(-EALREADY); if (data->ioc_inllen1 < 1) { @@ -53,82 +52,144 @@ static int ptlbd_cl_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(-EINVAL); } - obd_str2uuid(&server_uuid, data->ioc_inlbuf1); - - imp->imp_connection = 
ptlrpc_uuid_to_connection(&server_uuid); - if (!imp->imp_connection) - RETURN(-ENOENT); + obd_str2uuid(&ptlbd->bd_server_uuid, data->ioc_inlbuf1); - INIT_LIST_HEAD(&imp->imp_replay_list); - INIT_LIST_HEAD(&imp->imp_sending_list); - INIT_LIST_HEAD(&imp->imp_delayed_list); - spin_lock_init(&imp->imp_lock); /* * from client_obd_connect.. *shrug* */ - INIT_LIST_HEAD(&imp->imp_chain); - imp->imp_max_transno = 0; - imp->imp_peer_committed_transno = 0; + imp = ptlbd->bd_import = class_new_import(); + imp->imp_connection = ptlrpc_uuid_to_connection(&ptlbd->bd_server_uuid); + if (!imp->imp_connection) { + class_destroy_import(imp); + class_import_put(imp); + RETURN(-ENOENT); + } imp->imp_level = LUSTRE_CONN_FULL; ptlrpc_init_client(PTLBD_REQUEST_PORTAL, PTLBD_REPLY_PORTAL, "ptlbd", &ptlbd->bd_client); imp->imp_client = &ptlbd->bd_client; - imp->imp_obd = obddev; - + imp->imp_obd = obd; + memcpy(imp->imp_target_uuid.uuid, data->ioc_inlbuf1, data->ioc_inllen1); ptlbd_blk_register(ptlbd); RETURN(0); } -static int ptlbd_cl_cleanup(struct obd_device *obddev) +static int ptlbd_cl_cleanup(struct obd_device *obd, int force, int failover) { - struct ptlbd_obd *ptlbd = &obddev->u.ptlbd; + struct ptlbd_obd *ptlbd = &obd->u.ptlbd; + struct obd_import *imp; ENTRY; - if (!ptlbd) + if ((!ptlbd) || (!(imp = ptlbd->bd_import))) RETURN(-ENOENT); - if (!ptlbd->bd_import.imp_connection) + if (!imp->imp_connection) RETURN(-ENOENT); - ptlrpc_cleanup_client(&ptlbd->bd_import); - ptlrpc_put_connection(ptlbd->bd_import.imp_connection); + ptlrpc_cleanup_client(imp); + ptlrpc_put_connection(imp->imp_connection); + + class_destroy_import(imp); + class_import_put(imp); RETURN(0); } -#if 0 -static int ptlbd_cl_connect(struct lustre_handle *conn, struct obd_device *obd, - struct obd_uuid cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) + +/* modelled after ptlrpc_import_connect() */ +int ptlbd_cl_connect(struct lustre_handle *conn, + struct obd_device *obd, + struct obd_uuid 
*target_uuid) { struct ptlbd_obd *ptlbd = &obd->u.ptlbd; - struct obd_import *imp = &ptlbd->bd_import; - int rc; + struct obd_import *imp = ptlbd->bd_import; + struct obd_export *exp; + struct ptlrpc_request *request; + int rc, size[] = {sizeof(imp->imp_target_uuid), + sizeof(obd->obd_uuid), + sizeof(*conn)}; + char *tmp[] = {imp->imp_target_uuid.uuid, + obd->obd_uuid.uuid, + (char*)conn}; ENTRY; - rc = class_connect(conn, obd, cluuid); + if (!conn || !obd || !target_uuid) + RETURN(-EINVAL); + + rc = class_connect(conn, obd, target_uuid); if (rc) RETURN(rc); - INIT_LIST_HEAD(&imp->imp_chain); - imp->imp_max_transno = 0; - imp->imp_peer_committed_transno = 0; + request = ptlrpc_prep_req(imp, PTLBD_CONNECT, 3, size, tmp); + if (!request) + GOTO(out_disco, rc = -ENOMEM); + request->rq_level = LUSTRE_CONN_NEW; + request->rq_replen = lustre_msg_size(0, NULL); + + imp->imp_dlm_handle = *conn; + + imp->imp_level = LUSTRE_CONN_CON; + rc = ptlrpc_queue_wait(request); + if (rc) + GOTO(out_req, rc); + + exp = class_conn2export(conn); + exp->exp_connection = ptlrpc_connection_addref(request->rq_connection); + class_export_put(exp); + imp->imp_level = LUSTRE_CONN_FULL; + imp->imp_remote_handle = request->rq_repmsg->handle; + +out_req: + ptlrpc_req_finished(request); +out_disco: + if (rc) + class_disconnect(conn, 0); + RETURN(rc); +} - RETURN(0); + +/* modelled after ptlrpc_import_disconnect() */ +int ptlbd_cl_disconnect(struct lustre_handle *conn, int failover) +{ + struct obd_device *obd = class_conn2obd(conn); + struct ptlbd_obd *ptlbd = &obd->u.ptlbd; + struct obd_import *imp = ptlbd->bd_import; + struct ptlrpc_request *request; + int rc, err; + ENTRY; + + if (!obd) + RETURN(-EINVAL); + + request = ptlrpc_prep_req(imp, PTLBD_DISCONNECT, 0, NULL, NULL); + if (!request) + GOTO(out_req, rc = -ENOMEM); + + request->rq_replen = lustre_msg_size(0, NULL); + request->rq_level = LUSTRE_CONN_RECOVD; + + rc = ptlrpc_queue_wait(request); + +out_req: + if (request) + 
ptlrpc_req_finished(request); + err = class_disconnect(conn, 0); + memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); + if (!rc && err) + rc = err; + RETURN(rc); } -#endif + static struct obd_ops ptlbd_cl_obd_ops = { o_owner: THIS_MODULE, o_setup: ptlbd_cl_setup, o_cleanup: ptlbd_cl_cleanup, -#if 0 o_connect: ptlbd_cl_connect, - o_disconnect: class_disconnect -#endif + o_disconnect: ptlbd_cl_disconnect, }; int ptlbd_cl_init(void) @@ -144,3 +205,28 @@ void ptlbd_cl_exit(void) { class_unregister_type(OBD_PTLBD_CL_DEVICENAME); } + + + +int ptlbd_do_connect(struct ptlbd_obd *ptlbd) +{ + int rc; + struct obd_device *obd = ptlbd->bd_import->imp_obd; + ENTRY; + + memset(&ptlbd->bd_connect_handle, 0, sizeof(ptlbd->bd_connect_handle)); + rc = obd_connect(&ptlbd->bd_connect_handle, obd, + &ptlbd->bd_server_uuid); + RETURN(rc); +} + + +int ptlbd_do_disconnect(struct ptlbd_obd *ptlbd) +{ + int rc; + ENTRY; + + rc = obd_disconnect(&ptlbd->bd_connect_handle, 0); + RETURN(rc); +} + diff --git a/lustre/ptlbd/rpc.c b/lustre/ptlbd/rpc.c index d3e5083..9829900 100644 --- a/lustre/ptlbd/rpc.c +++ b/lustre/ptlbd/rpc.c @@ -32,15 +32,10 @@ #include <linux/lprocfs_status.h> #include <linux/obd_ptlbd.h> -#define RSP_OK 0 -#define RSP_NOTOK -1 -#define RQ_OK 0 - -int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, - struct request *blkreq) +int ptlbd_send_rw_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, + struct buffer_head *first_bh) { - struct buffer_head *first_bh = blkreq->bh; - struct obd_import *imp = &ptlbd->bd_import; + struct obd_import *imp = ptlbd->bd_import; struct ptlbd_op *op; struct ptlbd_niob *niob, *niobs; struct ptlbd_rsp *rsp; @@ -49,12 +44,11 @@ int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, struct buffer_head *bh; unsigned int page_count; int rc, rep_size, size[2]; - __u32 xid; ENTRY; LASSERT(cmd == PTLBD_READ || cmd == PTLBD_WRITE); - for ( page_count = 0, bh = first_bh ; bh ; bh = bh->b_next ) + for ( page_count = 0, bh = 
first_bh ; bh ; bh = bh->b_reqnext ) page_count++; size[0] = sizeof(struct ptlbd_op); @@ -62,10 +56,10 @@ int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, req = ptlrpc_prep_req(imp, cmd, 2, size, NULL); if (!req) - RETURN(-ENOMEM); + RETURN(rc = 1); /* need to return error cnt */ - op = lustre_msg_buf(req->rq_reqmsg, 0); - niobs = lustre_msg_buf(req->rq_reqmsg, 1); + op = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*op)); + niobs = lustre_msg_buf(req->rq_reqmsg, 1, size[1]); /* XXX pack */ op->op_cmd = cmd; @@ -74,38 +68,26 @@ int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, op->op__padding = 0; op->op_block_cnt = page_count; - desc = ptlrpc_prep_bulk(imp->imp_connection); + if (cmd == PTLBD_READ) + desc = ptlrpc_prep_bulk_imp (req, BULK_PUT_SINK, PTLBD_BULK_PORTAL); + else + desc = ptlrpc_prep_bulk_imp (req, BULK_GET_SOURCE, PTLBD_BULK_PORTAL); if ( desc == NULL ) - GOTO(out_req, rc = -ENOMEM); - desc->bd_portal = PTLBD_BULK_PORTAL; - desc->bd_ptl_ev_hdlr = NULL; - - xid = ptlrpc_next_xid(); - - for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_next, niob++ ) { - struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); - if (bulk == NULL) - GOTO(out_req, rc = -ENOMEM); + GOTO(out, rc = 1); /* need to return error cnt */ + /* NB req now owns desc, and frees it when she frees herself */ + + for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_reqnext, niob++ ) { + rc = ptlrpc_prep_bulk_page(desc, bh->b_page, + bh_offset (bh) & (PAGE_SIZE - 1), + bh->b_size); + if (rc != 0) + GOTO(out, rc = 1); /* need to return error cnt */ - niob->n_xid = xid; niob->n_block_nr = bh->b_blocknr; niob->n_offset = bh_offset(bh); niob->n_length = bh->b_size; - - bulk->bp_xid = xid; - bulk->bp_buf = bh->b_data; - bulk->bp_page = bh->b_page; - bulk->bp_buflen = bh->b_size; } - if ( cmd == PTLBD_READ ) - rc = ptlrpc_register_bulk_put(desc); - else - rc = ptlrpc_register_bulk_get(desc); - - if (rc) - GOTO(out_desc, rc); - rep_size = sizeof(struct ptlbd_rsp); 
req->rq_replen = lustre_msg_size(1, &rep_size); @@ -113,38 +95,77 @@ int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, req->rq_level = imp->imp_level; rc = ptlrpc_queue_wait(req); - if ( rc != 0 ) { - blkreq->errors++; - GOTO(out_desc, rc); + if ( rc != 0 ) + GOTO(out, rc = 1); /* need to return error count */ + + rsp = lustre_swab_repbuf(req, 0, sizeof (*rsp), + lustre_swab_ptlbd_rsp); + if (rsp == NULL) { + CERROR ("can't unpack response\n"); + GOTO (out, rc = 1); /* need to return error count */ } - rsp = lustre_msg_buf(req->rq_repmsg, 0); - if (rsp->r_status != RSP_OK) { - blkreq->errors += rsp->r_error_cnt; + else if (rsp->r_status != 0) { + rc = rsp->r_error_cnt; } -out_desc: - ptlrpc_bulk_decref(desc); -out_req: +out: ptlrpc_req_finished(req); RETURN(rc); } -static int ptlbd_bulk_timeout(void *data) + +int ptlbd_send_flush_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd) { -/* struct ptlrpc_bulk_desc *desc = data;*/ + struct obd_import *imp = ptlbd->bd_import; + struct ptlbd_op *op; + struct ptlbd_rsp *rsp; + struct ptlrpc_request *req; + int rc, rep_size, size[1]; ENTRY; - CERROR("ugh, timed out\n"); + LASSERT(cmd == PTLBD_FLUSH); + + size[0] = sizeof(struct ptlbd_op); + + req = ptlrpc_prep_req(imp, cmd, 1, size, NULL); + if (!req) + RETURN(-ENOMEM); + + op = lustre_msg_buf(req->rq_reqmsg, 0, sizeof (*op)); + + /* XXX pack */ + op->op_cmd = cmd; + op->op_lun = 0; + op->op_niob_cnt = 0; + op->op__padding = 0; + op->op_block_cnt = 0; + + rep_size = sizeof(struct ptlbd_rsp); + req->rq_replen = lustre_msg_size(1, &rep_size); + + /* XXX find out how we're really supposed to manage levels */ + req->rq_level = imp->imp_level; - RETURN(1); + rc = ptlrpc_queue_wait(req); + if ( rc != 0 ) + GOTO(out_req, rc = 1); + rsp = lustre_swab_repbuf(req, 0, sizeof (*rsp), + lustre_swab_ptlbd_rsp); + if (rsp->r_status != 0) + rc = rsp->r_status; + +out_req: + ptlrpc_req_finished(req); + RETURN(rc); } + int ptlbd_do_filp(struct file *filp, int op, struct ptlbd_niob 
*niobs, int page_count, struct list_head *page_list) { mm_segment_t old_fs; struct list_head *pos; - int status = RSP_OK; + int status = 0; ENTRY; old_fs = get_fs(); @@ -155,118 +176,210 @@ int ptlbd_do_filp(struct file *filp, int op, struct ptlbd_niob *niobs, struct page *page = list_entry(pos, struct page, list); loff_t offset = (niobs->n_block_nr << PAGE_SHIFT) + niobs->n_offset; - if ( op == PTLBD_READ ) { - if ((ret = filp->f_op->read(filp, page_address(page), - niobs->n_length, &offset)) != niobs->n_length) - status = ret; - goto out; - } else { - if ((ret = filp->f_op->write(filp, page_address(page), - niobs->n_length, &offset)) != niobs->n_length) - status = ret; - goto out; - } - + if ( op == PTLBD_READ ) + ret = filp->f_op->read(filp, page_address(page), + niobs->n_length, &offset); + else + ret = filp->f_op->write(filp, page_address(page), + niobs->n_length, &offset); + if (ret != niobs->n_length) { + status = ret; + break; + } niobs++; } -out: set_fs(old_fs); RETURN(status); } -int ptlbd_parse_req(struct ptlrpc_request *req) + +int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index, + struct ptlrpc_request *req, int swab) { - struct ptlbd_op *op; struct ptlbd_niob *niob, *niobs; struct ptlbd_rsp *rsp; - struct ptlrpc_bulk_desc *desc; + struct ptlrpc_bulk_desc *desc = NULL; struct file *filp = req->rq_obd->u.ptlbd.filp; struct l_wait_info lwi; - int size[1], wait_flag, i, page_count, rc, error_cnt = 0, - status = RSP_OK; + int size[1], i, page_count, rc = 0, error_cnt = 0; struct list_head *pos, *n; + struct page *page; LIST_HEAD(tmp_pages); ENTRY; - rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); + niobs = lustre_swab_reqbuf (req, 1, sizeof (*niobs), + lustre_swab_ptlbd_niob); + if (niobs == NULL) + GOTO (out, rc = -EFAULT); + + size[0] = sizeof(struct ptlbd_rsp); + rc = lustre_pack_msg(1, size, NULL, &req->rq_replen, &req->rq_repmsg); if ( rc ) - RETURN(rc); + GOTO(out, rc); - op = lustre_msg_buf(req->rq_reqmsg, 0); - LASSERT(op->op_cmd == 
PTLBD_READ || op->op_cmd == PTLBD_WRITE); + rsp = lustre_msg_buf(req->rq_repmsg, 0, sizeof (*rsp)); + if ( rsp == NULL ) + GOTO (out, rc = -EFAULT); - niobs = lustre_msg_buf(req->rq_reqmsg, 1); page_count = req->rq_reqmsg->buflens[1] / sizeof(struct ptlbd_niob); - - desc = ptlrpc_prep_bulk(req->rq_connection); - if (desc == NULL) - GOTO(out, rc = -ENOMEM); - desc->bd_ptl_ev_hdlr = NULL; + if (swab) { /* swab remaining niobs */ + for (i = 1; i < page_count; i++) + lustre_swab_ptlbd_niob(&niobs[i]); + } + if (req->rq_export == NULL) { + error_cnt++; + GOTO(out_reply, rc = -EFAULT); + } + + if (cmd == PTLBD_READ) + desc = ptlrpc_prep_bulk_exp (req, BULK_PUT_SOURCE, PTLBD_BULK_PORTAL); + else + desc = ptlrpc_prep_bulk_exp (req, BULK_GET_SINK, PTLBD_BULK_PORTAL); + if (desc == NULL) { + error_cnt++; + GOTO(out_reply, rc = -ENOMEM); + } desc->bd_portal = PTLBD_BULK_PORTAL; + LASSERT (page_count > 0); for ( i = 0, niob = niobs ; i < page_count; niob++, i++) { - struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); - if (bulk == NULL) - GOTO(out_bulk, rc = -ENOMEM); - - bulk->bp_page = alloc_page(GFP_KERNEL); - if (bulk->bp_page == NULL) - GOTO(out_bulk, rc = -ENOMEM); - list_add(&bulk->bp_page->list, &tmp_pages); - - bulk->bp_xid = niob->n_xid; - bulk->bp_buf = page_address(bulk->bp_page); - bulk->bp_buflen = niob->n_length; + page = alloc_page(GFP_KERNEL); + if (page == NULL) { + error_cnt++; + GOTO(out_reply, rc = -ENOMEM); + } + list_add_tail(&page->list, &tmp_pages); + + rc = ptlrpc_prep_bulk_page(desc, page, + niob->n_offset & (PAGE_SIZE - 1), + niob->n_length); + if (rc != 0) { + error_cnt++; + GOTO(out_reply, rc); + } } - if ( op->op_cmd == PTLBD_READ ) { - if ((status = ptlbd_do_filp(filp, PTLBD_READ, niobs, - page_count, &tmp_pages)) < 0) { + if ( cmd == PTLBD_READ ) { + if ((rc = ptlbd_do_filp(filp, PTLBD_READ, niobs, + page_count, &tmp_pages)) < 0) { error_cnt++; + GOTO(out_reply, rc); } rc = ptlrpc_bulk_put(desc); - wait_flag = PTL_BULK_FL_SENT; } else 
{ rc = ptlrpc_bulk_get(desc); - wait_flag = PTL_BULK_FL_RCVD; } - if ( rc ) - GOTO(out_bulk, rc); - - /* this synchronization probably isn't good enough */ - lwi = LWI_TIMEOUT(obd_timeout * HZ, ptlbd_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, desc->bd_flags & wait_flag, &lwi); - - size[0] = sizeof(struct ptlbd_rsp); - rc = lustre_pack_msg(1, size, NULL, &req->rq_replen, &req->rq_repmsg); - if ( rc ) - GOTO(out, rc); + if ( rc ) { + error_cnt++; + GOTO(out_reply, rc); + } - rsp = lustre_msg_buf(req->rq_repmsg, 0); - if ( rsp == NULL ) - GOTO(out, rc = -EINVAL); + lwi = LWI_TIMEOUT(obd_timeout * HZ, NULL, desc); + rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi); + if (rc != 0) { + LASSERT(rc == -ETIMEDOUT); + ptlrpc_abort_bulk(desc); + error_cnt++; + GOTO(out_reply, rc); + } - if ( op->op_cmd == PTLBD_WRITE ) { - if ((status = ptlbd_do_filp(filp, PTLBD_WRITE, niobs, + if ( cmd == PTLBD_WRITE ) { + if ((rc = ptlbd_do_filp(filp, PTLBD_WRITE, niobs, page_count, &tmp_pages)) < 0) { error_cnt++; } } +out_reply: rsp->r_error_cnt = error_cnt; - rsp->r_status = status; /* I/O status */ - req->rq_status = RQ_OK ; /* XXX */ /* ptlbd req status */ + rsp->r_status = rc; + req->rq_status = rc; - ptlrpc_reply(req->rq_svc, req); + ptlrpc_reply(req); -out_bulk: list_for_each_safe(pos, n, &tmp_pages) { struct page *page = list_entry(pos, struct page, list); list_del(&page->list); __free_page(page); } - ptlrpc_bulk_decref(desc); + if (desc) + ptlrpc_free_bulk(desc); out: RETURN(rc); } + + +int ptlbd_srv_flush_req(ptlbd_cmd_t cmd, __u16 index, + struct ptlrpc_request *req) +{ + struct ptlbd_rsp *rsp; + struct file *filp = req->rq_obd->u.ptlbd.filp; + int size[1], rc, status; + ENTRY; + + size[0] = sizeof(struct ptlbd_rsp); + rc = lustre_pack_msg(1, size, NULL, &req->rq_replen, &req->rq_repmsg); + if ( rc ) + RETURN(rc); + + rsp = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*rsp)); + if ( rsp == NULL ) + RETURN(-EINVAL); + + if (! 
(filp) && (filp->f_op) && (filp->f_op->fsync) && + (filp->f_dentry)) + GOTO(out_reply, status = -EINVAL); + + status = filp->f_op->fsync(filp, filp->f_dentry, 1); + +out_reply: + rsp->r_error_cnt = 0; + rsp->r_status = status; + req->rq_status = 0; + + ptlrpc_reply(req); + RETURN(0); +} + + +int ptlbd_handle(struct ptlrpc_request *req) +{ + struct ptlbd_op *op; + int swab; + int rc; + ENTRY; + + swab = lustre_msg_swabbed (req->rq_reqmsg); + + if (req->rq_reqmsg->opc == PTLBD_CONNECT) { + rc = target_handle_connect(req, ptlbd_handle); + target_send_reply(req, rc, OBD_FAIL_PTLRPC); + RETURN(0); + } + if (req->rq_reqmsg->opc == PTLBD_DISCONNECT) { + rc = target_handle_disconnect(req); + target_send_reply(req, rc, OBD_FAIL_PTLRPC); + RETURN(0); + } + op = lustre_swab_reqbuf (req, 0, sizeof (*op), + lustre_swab_ptlbd_op); + if (op == NULL) + RETURN(-EFAULT); + + switch (op->op_cmd) { + case PTLBD_READ: + case PTLBD_WRITE: + rc = ptlbd_srv_rw_req(op->op_cmd, op->op_lun, req, + swab); + break; + + case PTLBD_FLUSH: + rc = ptlbd_srv_flush_req(op->op_cmd, op->op_lun, req); + break; + default: + rc = -EINVAL; + } + + RETURN(rc); +} diff --git a/lustre/ptlbd/server.c b/lustre/ptlbd/server.c index e4a7046..34ec737 100644 --- a/lustre/ptlbd/server.c +++ b/lustre/ptlbd/server.c @@ -52,7 +52,7 @@ static int ptlbd_sv_setup(struct obd_device *obddev, obd_count len, void *buf) ptlrpc_init_svc(PTLBD_NEVENTS, PTLBD_NBUFS, PTLBD_BUFSIZE, PTLBD_MAXREQSIZE, PTLBD_REQUEST_PORTAL, PTLBD_REPLY_PORTAL, - ptlbd_parse_req, "ptlbd_sv"); + ptlbd_handle, "ptlbd_sv", obddev); if (ptlbd->ptlbd_service == NULL) GOTO(out_filp, rc = -ENOMEM); @@ -74,7 +74,7 @@ out_filp: RETURN(rc); } -static int ptlbd_sv_cleanup(struct obd_device *obddev) +static int ptlbd_sv_cleanup(struct obd_device *obddev, int force, int failover) { struct ptlbd_obd *ptlbd = &obddev->u.ptlbd; ENTRY; @@ -94,6 +94,8 @@ static struct obd_ops ptlbd_sv_obd_ops = { o_owner: THIS_MODULE, o_setup: ptlbd_sv_setup, o_cleanup: 
ptlbd_sv_cleanup, + o_connect: class_connect, + o_disconnect: class_disconnect, }; int ptlbd_sv_init(void) diff --git a/lustre/ptlrpc/Makefile.am b/lustre/ptlrpc/Makefile.am index 446f110..eb44329 100644 --- a/lustre/ptlrpc/Makefile.am +++ b/lustre/ptlrpc/Makefile.am @@ -7,13 +7,16 @@ DEFS= if LIBLUSTRE lib_LIBRARIES = libptlrpc.a -libptlrpc_a_SOURCES = client.c niobuf.c pack_generic.c recovd.c recover.c connection.c rpc.c events.c # lproc_ptlrpc.c service.c +libptlrpc_a_SOURCES = client.c niobuf.c pack_generic.c recover.c connection.c \ +ptlrpc_module.c events.c ptlrpc_lib.c else MODULE = ptlrpc modulefs_DATA = ptlrpc.o EXTRA_PROGRAMS = ptlrpc -ptlrpc_SOURCES = recovd.c recover.c connection.c rpc.c events.c service.c client.c niobuf.c pack_generic.c lproc_ptlrpc.c +ptlrpc_SOURCES = recover.c connection.c ptlrpc_module.c events.c service.c \ +client.c niobuf.c pack_generic.c lproc_ptlrpc.c pinger.c ptlrpc_lib.c \ +ptlrpc_internal.h endif include $(top_srcdir)/Rules diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 998c462..94a068d 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -33,6 +33,8 @@ #include <linux/lustre_ha.h> #include <linux/lustre_import.h> +#include "ptlrpc_internal.h" + void ptlrpc_init_client(int req_portal, int rep_portal, char *name, struct ptlrpc_client *cl) { @@ -70,7 +72,8 @@ struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid) return c; } -void ptlrpc_readdress_connection(struct ptlrpc_connection *conn,struct obd_uuid *uuid) +void ptlrpc_readdress_connection(struct ptlrpc_connection *conn, + struct obd_uuid *uuid) { struct ptlrpc_peer peer; int err; @@ -85,69 +88,123 @@ void ptlrpc_readdress_connection(struct ptlrpc_connection *conn,struct obd_uuid return; } -struct ptlrpc_bulk_desc *ptlrpc_prep_bulk(struct ptlrpc_connection *conn) +static inline struct ptlrpc_bulk_desc *new_bulk(void) { struct ptlrpc_bulk_desc *desc; OBD_ALLOC(desc, sizeof(*desc)); - if (desc != NULL) { - 
desc->bd_connection = ptlrpc_connection_addref(conn); - atomic_set(&desc->bd_refcount, 1); - init_waitqueue_head(&desc->bd_waitq); - INIT_LIST_HEAD(&desc->bd_page_list); - INIT_LIST_HEAD(&desc->bd_set_chain); - ptl_set_inv_handle(&desc->bd_md_h); - ptl_set_inv_handle(&desc->bd_me_h); - } + if (!desc) + return NULL; + + spin_lock_init (&desc->bd_lock); + init_waitqueue_head(&desc->bd_waitq); + INIT_LIST_HEAD(&desc->bd_page_list); + desc->bd_md_h = PTL_HANDLE_NONE; + desc->bd_me_h = PTL_HANDLE_NONE; return desc; } -int ptlrpc_bulk_error(struct ptlrpc_bulk_desc *desc) +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req, + int type, int portal) { - int rc = 0; - if (desc->bd_flags & PTL_RPC_FL_TIMEOUT) { - rc = (desc->bd_flags & PTL_RPC_FL_INTR ? -ERESTARTSYS : - -ETIMEDOUT); - } - return rc; + struct obd_import *imp = req->rq_import; + unsigned long flags; + struct ptlrpc_bulk_desc *desc; + + LASSERT (type == BULK_PUT_SINK || type == BULK_GET_SOURCE); + + desc = new_bulk(); + if (desc == NULL) + RETURN(NULL); + + /* Is this sampled at the right place? Do we want to get the import + * generation just before we send? Should it match the generation of + * the request? 
*/ + spin_lock_irqsave(&imp->imp_lock, flags); + desc->bd_import_generation = imp->imp_generation; + spin_unlock_irqrestore(&imp->imp_lock, flags); + + desc->bd_import = class_import_get(imp); + desc->bd_req = req; + desc->bd_type = type; + desc->bd_portal = portal; + + /* This makes req own desc, and free it when she frees herself */ + req->rq_bulk = desc; + + return desc; } -struct ptlrpc_bulk_page *ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc) +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_exp (struct ptlrpc_request *req, + int type, int portal) +{ + struct obd_export *exp = req->rq_export; + struct ptlrpc_bulk_desc *desc; + + LASSERT (type == BULK_PUT_SOURCE || type == BULK_GET_SINK); + + desc = new_bulk(); + if (desc == NULL) + RETURN(NULL); + + desc->bd_export = class_export_get(exp); + desc->bd_req = req; + desc->bd_type = type; + desc->bd_portal = portal; + + /* NB we don't assign rq_bulk here; server-side requests are + * re-used, and the handler frees the bulk desc explicitly. 
*/ + + return desc; +} + +int ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len) { struct ptlrpc_bulk_page *bulk; OBD_ALLOC(bulk, sizeof(*bulk)); - if (bulk != NULL) { - bulk->bp_desc = desc; - list_add_tail(&bulk->bp_link, &desc->bd_page_list); - desc->bd_page_count++; - } - return bulk; + if (bulk == NULL) + return (-ENOMEM); + + LASSERT (page != NULL); + LASSERT (pageoffset >= 0); + LASSERT (len > 0); + LASSERT (pageoffset + len <= PAGE_SIZE); + + bulk->bp_page = page; + bulk->bp_pageoffset = pageoffset; + bulk->bp_buflen = len; + + bulk->bp_desc = desc; + list_add_tail(&bulk->bp_link, &desc->bd_page_list); + desc->bd_page_count++; + return 0; } void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) { struct list_head *tmp, *next; ENTRY; - if (desc == NULL) { - EXIT; - return; - } - - LASSERT(list_empty(&desc->bd_set_chain)); - - if (atomic_read(&desc->bd_refcount) != 0) - CERROR("freeing desc %p with refcount %d!\n", desc, - atomic_read(&desc->bd_refcount)); + LASSERT (desc != NULL); + LASSERT (desc->bd_page_count != 0x5a5a5a5a); /* not freed already */ + LASSERT (!desc->bd_network_rw); /* network hands off or */ + list_for_each_safe(tmp, next, &desc->bd_page_list) { struct ptlrpc_bulk_page *bulk; bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link); ptlrpc_free_bulk_page(bulk); } - ptlrpc_put_connection(desc->bd_connection); + LASSERT (desc->bd_page_count == 0); + LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); + + if (desc->bd_export) + class_export_put(desc->bd_export); + else + class_import_put(desc->bd_import); OBD_FREE(desc, sizeof(*desc)); EXIT; @@ -155,168 +212,666 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) void ptlrpc_free_bulk_page(struct ptlrpc_bulk_page *bulk) { - ENTRY; - if (bulk == NULL) { - EXIT; - return; - } - + LASSERT (bulk != NULL); + list_del(&bulk->bp_link); bulk->bp_desc->bd_page_count--; OBD_FREE(bulk, sizeof(*bulk)); - EXIT; } -static int 
ll_sync_brw_timeout(void *data) +struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, + int count, int *lengths, char **bufs) { - struct obd_brw_set *set = data; - struct list_head *tmp; - int failed = 0; + struct ptlrpc_request *request; + int rc; ENTRY; - LASSERT(set); + LASSERT((unsigned long)imp > 0x1000); - set->brw_flags |= PTL_RPC_FL_TIMEOUT; + OBD_ALLOC(request, sizeof(*request)); + if (!request) { + CERROR("request allocation out of memory\n"); + RETURN(NULL); + } - list_for_each(tmp, &set->brw_desc_head) { - struct ptlrpc_bulk_desc *desc = - list_entry(tmp, struct ptlrpc_bulk_desc, bd_set_chain); + rc = lustre_pack_msg(count, lengths, bufs, + &request->rq_reqlen, &request->rq_reqmsg); + if (rc) { + CERROR("cannot pack request %d\n", rc); + OBD_FREE(request, sizeof(*request)); + RETURN(NULL); + } - /* Skip descriptors that were completed successfully. */ - if (desc->bd_flags & (PTL_BULK_FL_RCVD | PTL_BULK_FL_SENT)) - continue; + request->rq_timeout = obd_timeout; + request->rq_level = LUSTRE_CONN_FULL; + request->rq_type = PTL_RPC_MSG_REQUEST; + request->rq_import = class_import_get(imp); + request->rq_phase = RQ_PHASE_NEW; + + /* XXX FIXME bug 249 */ + request->rq_request_portal = imp->imp_client->cli_request_portal; + request->rq_reply_portal = imp->imp_client->cli_reply_portal; - LASSERT(desc->bd_connection); - - /* If PtlMDUnlink succeeds, then bulk I/O on the MD hasn't - * even started yet. XXX where do we kunmup the thing? - * - * If it fail with PTL_MD_BUSY, then the network is still - * reading/writing the buffers and we must wait for it to - * complete (which it will within finite time, most - * probably with failure; we really need portals error - * events to detect that). - * - * Otherwise (PTL_INV_MD) it completed after the bd_flags - * test above! 
- */ - if (PtlMDUnlink(desc->bd_md_h) != PTL_OK) { - CERROR("Near-miss on OST %s -- need to adjust " - "obd_timeout?\n", - desc->bd_connection->c_remote_uuid.uuid); - continue; - } + request->rq_connection = ptlrpc_connection_addref(imp->imp_connection); - CERROR("IO of %d pages to/from %s:%d (conn %p) timed out\n", - desc->bd_page_count, - desc->bd_connection->c_remote_uuid.uuid, - desc->bd_portal, desc->bd_connection); + spin_lock_init (&request->rq_lock); + INIT_LIST_HEAD(&request->rq_list); + init_waitqueue_head(&request->rq_wait_for_rep); + request->rq_xid = ptlrpc_next_xid(); + atomic_set(&request->rq_refcount, 1); - /* This one will "never" arrive, don't wait for it. */ - if (atomic_dec_and_test(&set->brw_refcount)) - wake_up(&set->brw_waitq); + request->rq_reqmsg->opc = opcode; + request->rq_reqmsg->flags = 0; - if (class_signal_connection_failure) - class_signal_connection_failure(desc->bd_connection); - else - failed = 1; + RETURN(request); +} + +struct ptlrpc_request_set *ptlrpc_prep_set(void) +{ + struct ptlrpc_request_set *set; + + OBD_ALLOC(set, sizeof *set); + if (!set) + RETURN(NULL); + INIT_LIST_HEAD(&set->set_requests); + init_waitqueue_head(&set->set_waitq); + set->set_remaining = 0; + + RETURN(set); +} + +/* Finish with this set; opposite of prep_set. */ +void ptlrpc_set_destroy(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + struct list_head *next; + int expected_phase; + int n = 0; + ENTRY; + + /* Requests on the set should either all be completed, or all be new */ + expected_phase = (set->set_remaining == 0) ? 
+ RQ_PHASE_COMPLETE : RQ_PHASE_NEW; + list_for_each (tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + LASSERT (req->rq_phase == expected_phase); + n++; + } + + LASSERT (set->set_remaining == 0 || set->set_remaining == n); + + list_for_each_safe(tmp, next, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_set_chain); + list_del_init(&req->rq_set_chain); + + LASSERT (req->rq_phase == expected_phase); + + if (req->rq_phase == RQ_PHASE_NEW) { + + if (req->rq_interpret_reply != NULL) { + int (*interpreter)(struct ptlrpc_request *, void *, int) = + req->rq_interpret_reply; + + /* higher level (i.e. LOV) failed; + * let the sub reqs clean up */ + req->rq_status = -EBADR; + interpreter(req, &req->rq_async_args, req->rq_status); + } + set->set_remaining--; + } + + req->rq_set = NULL; + ptlrpc_req_finished (req); } - /* 0 = We go back to sleep, until we're resumed or interrupted */ - /* 1 = We can't be recovered, just abort the syscall with -ETIMEDOUT */ - RETURN(failed); + LASSERT(set->set_remaining == 0); + + OBD_FREE(set, sizeof(*set)); + EXIT; } -static int ll_sync_brw_intr(void *data) +void ptlrpc_set_add_req(struct ptlrpc_request_set *set, + struct ptlrpc_request *req) { - struct obd_brw_set *set = data; + /* The set takes over the caller's request reference */ + list_add_tail(&req->rq_set_chain, &set->set_requests); + req->rq_set = set; + set->set_remaining++; +} +static int ptlrpc_check_reply(struct ptlrpc_request *req) +{ + unsigned long flags; + int rc = 0; ENTRY; - set->brw_flags |= PTL_RPC_FL_INTR; - RETURN(1); /* ignored, as of this writing */ + + /* serialise with network callback */ + spin_lock_irqsave (&req->rq_lock, flags); + + if (req->rq_replied) { + DEBUG_REQ(D_NET, req, "REPLIED:"); + GOTO(out, rc = 1); + } + + if (req->rq_err) { + DEBUG_REQ(D_ERROR, req, "ABORTED:"); + GOTO(out, rc = 1); + } + + if (req->rq_resend) { + DEBUG_REQ(D_ERROR, 
req, "RESEND:"); + GOTO(out, rc = 1); + } + + if (req->rq_restart) { + DEBUG_REQ(D_ERROR, req, "RESTART:"); + GOTO(out, rc = 1); + } + EXIT; + out: + spin_unlock_irqrestore (&req->rq_lock, flags); + DEBUG_REQ(D_NET, req, "rc = %d for", rc); + return rc; } -int ll_brw_sync_wait(struct obd_brw_set *set, int phase) +static int ptlrpc_check_status(struct ptlrpc_request *req) { - struct l_wait_info lwi; - struct list_head *tmp, *next; - int rc = 0; + int err; + ENTRY; + + err = req->rq_repmsg->status; + if (req->rq_repmsg->type == PTL_RPC_MSG_ERR) { + DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR (%d)", err); + if (err >= 0) + CERROR("Error Reply has >= zero status\n"); + RETURN(err < 0 ? err : -EINVAL); + } + + if (err < 0) { + DEBUG_REQ(D_INFO, req, "status is %d", err); + } else if (err > 0) { + /* XXX: translate this error from net to host */ + DEBUG_REQ(D_INFO, req, "status is %d", err); + } + + RETURN(err); +} + +#warning this needs to change after robert fixes eviction handling +static int +after_reply(struct ptlrpc_request *req, int *restartp) +{ + unsigned long flags; + struct obd_import *imp = req->rq_import; + int rc; ENTRY; - obd_brw_set_addref(set); - switch(phase) { - case CB_PHASE_START: - lwi = LWI_TIMEOUT_INTR(obd_timeout * HZ, ll_sync_brw_timeout, - ll_sync_brw_intr, set); - rc = l_wait_event(set->brw_waitq, - atomic_read(&set->brw_desc_count) == 0, &lwi); - - list_for_each_safe(tmp, next, &set->brw_desc_head) { - struct ptlrpc_bulk_desc *desc = - list_entry(tmp, struct ptlrpc_bulk_desc, - bd_set_chain); - list_del_init(&desc->bd_set_chain); - ptlrpc_bulk_decref(desc); + LASSERT (!req->rq_receiving_reply); + LASSERT (req->rq_replied); + + if (restartp != NULL) + *restartp = 0; + + /* NB Until this point, the whole of the incoming message, + * including buflens, status etc is in the sender's byte order. 
*/ + +#if SWAB_PARANOIA + /* Clear reply swab mask; this is a new reply in sender's byte order */ + req->rq_rep_swab_mask = 0; +#endif + rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen); + if (rc) { + CERROR("unpack_rep failed: %d\n", rc); + RETURN (-EPROTO); + } + + if (req->rq_repmsg->type != PTL_RPC_MSG_REPLY && + req->rq_repmsg->type != PTL_RPC_MSG_ERR) { + CERROR("invalid packet type received (type=%u)\n", + req->rq_repmsg->type); + RETURN (-EPROTO); + } + + /* Store transno in reqmsg for replay. */ + req->rq_reqmsg->transno = req->rq_transno = req->rq_repmsg->transno; + + rc = ptlrpc_check_status(req); + + /* Either we've been evicted, or the server has failed for + * some reason. Try to reconnect, and if that fails, punt to + * upcall */ + if (rc == -ENOTCONN) { + if (req->rq_level < LUSTRE_CONN_FULL || req->rq_no_recov || + imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { + RETURN(-ENOTCONN); } - break; - case CB_PHASE_FINISH: - if (atomic_dec_and_test(&set->brw_desc_count)) - wake_up(&set->brw_waitq); - break; - default: + + rc = ptlrpc_request_handle_eviction(req); + if (rc) + CERROR("can't reconnect to %s@%s: %d\n", + imp->imp_target_uuid.uuid, + imp->imp_connection->c_remote_uuid.uuid, rc); + else + ptlrpc_wake_delayed(imp); + + if (req->rq_err) + RETURN(-EIO); + + if (req->rq_resend) { + if (restartp == NULL) + LBUG(); /* async resend not supported yet */ + spin_lock_irqsave (&req->rq_lock, flags); + req->rq_resend = 0; + spin_unlock_irqrestore (&req->rq_lock, flags); + *restartp = 1; + lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); + DEBUG_REQ(D_HA, req, "resending: "); + RETURN (0); + } + + CERROR("request should be err or resend: %p\n", req); LBUG(); } - obd_brw_set_decref(set); + if (req->rq_import->imp_replayable) { + spin_lock_irqsave(&imp->imp_lock, flags); + if ((req->rq_replay || req->rq_transno != 0) && rc >= 0) + ptlrpc_retain_replayable_request(req, imp); + + if (req->rq_transno > imp->imp_max_transno) + imp->imp_max_transno = 
req->rq_transno; + + /* Replay-enabled imports return commit-status information. */ + if (req->rq_repmsg->last_committed) { + if (req->rq_repmsg->last_committed < + imp->imp_peer_committed_transno) { + CERROR("%s went back in time (transno "LPD64 + " was committed, server claims "LPD64 + ")! is shared storage not coherent?\n", + imp->imp_target_uuid.uuid, + imp->imp_peer_committed_transno, + req->rq_repmsg->last_committed); + } + imp->imp_peer_committed_transno = + req->rq_repmsg->last_committed; + } + ptlrpc_free_committed(imp); + spin_unlock_irqrestore(&imp->imp_lock, flags); + } + RETURN(rc); } -struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, - int count, int *lengths, char **bufs) +static int check_set(struct ptlrpc_request_set *set) { - struct ptlrpc_connection *conn; - struct ptlrpc_request *request; - int rc; + unsigned long flags; + struct list_head *tmp; + ENTRY; + + if (set->set_remaining == 0) + RETURN(1); + + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_set_chain); + struct obd_import *imp = req->rq_import; + int rc = 0; + + LASSERT (req->rq_phase == RQ_PHASE_RPC || + req->rq_phase == RQ_PHASE_BULK || + req->rq_phase == RQ_PHASE_COMPLETE); + + if (req->rq_phase == RQ_PHASE_COMPLETE) + continue; + + if (req->rq_err) { + ptlrpc_unregister_reply(req); + if (req->rq_status == 0) + req->rq_status = -EIO; + req->rq_phase = RQ_PHASE_INTERPRET; + + spin_lock_irqsave(&imp->imp_lock, flags); + list_del_init(&req->rq_list); + spin_unlock_irqrestore(&imp->imp_lock, flags); + + GOTO (interpret, req->rq_status); + } + + if (req->rq_intr) { + /* NB could be on delayed list */ + ptlrpc_unregister_reply(req); + req->rq_status = -EINTR; + req->rq_phase = RQ_PHASE_INTERPRET; + + spin_lock_irqsave(&imp->imp_lock, flags); + list_del_init(&req->rq_list); + spin_unlock_irqrestore(&imp->imp_lock, flags); + + GOTO (interpret, req->rq_status); + } + + if (req->rq_phase == 
RQ_PHASE_RPC) { + int do_restart = 0; + if (req->rq_waiting || req->rq_resend) { + spin_lock_irqsave(&imp->imp_lock, flags); + + if (req->rq_level > imp->imp_level) { + spin_unlock_irqrestore(&imp->imp_lock, + flags); + continue; + } + + list_del(&req->rq_list); + list_add_tail(&req->rq_list, + &imp->imp_sending_list); + spin_unlock_irqrestore(&imp->imp_lock, flags); + + req->rq_waiting = 0; + if (req->rq_resend) { + lustre_msg_add_flags(req->rq_reqmsg, + MSG_RESENT); + spin_lock_irqsave(&req->rq_lock, flags); + req->rq_resend = 0; + spin_unlock_irqrestore(&req->rq_lock, + flags); + ptlrpc_unregister_reply(req); + if (req->rq_bulk) + ptlrpc_unregister_bulk(req); + } + + rc = ptl_send_rpc(req); + if (rc) { + req->rq_status = rc; + req->rq_phase = RQ_PHASE_INTERPRET; + GOTO (interpret, req->rq_status); + } + + } + + /* Ensure the network callback returned */ + spin_lock_irqsave (&req->rq_lock, flags); + if (!req->rq_replied) { + spin_unlock_irqrestore (&req->rq_lock, flags); + continue; + } + spin_unlock_irqrestore (&req->rq_lock, flags); + + spin_lock_irqsave(&imp->imp_lock, flags); + list_del_init(&req->rq_list); + spin_unlock_irqrestore(&imp->imp_lock, flags); + + req->rq_status = after_reply(req, &do_restart); + if (do_restart) { + req->rq_resend = 1; /* ugh */ + continue; + } + + if (req->rq_bulk == NULL) { + req->rq_phase = RQ_PHASE_INTERPRET; + GOTO (interpret, req->rq_status); + } + + req->rq_phase = RQ_PHASE_BULK; + } + + LASSERT (req->rq_phase == RQ_PHASE_BULK); + if (!ptlrpc_bulk_complete (req->rq_bulk)) + continue; + + req->rq_phase = RQ_PHASE_INTERPRET; + + interpret: + LASSERT (req->rq_phase == RQ_PHASE_INTERPRET); + LASSERT (!req->rq_receiving_reply); + + if (req->rq_bulk != NULL) + ptlrpc_unregister_bulk (req); + + if (req->rq_interpret_reply != NULL) { + int (*interpreter)(struct ptlrpc_request *, void *, int) = + req->rq_interpret_reply; + req->rq_status = interpreter(req, &req->rq_async_args, + req->rq_status); + } + + CDEBUG(D_RPCTRACE, "Completed 
RPC pname:cluuid:pid:xid:ni:nid:" + "opc %s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm, + imp->imp_obd->obd_uuid.uuid, req->rq_reqmsg->status, + req->rq_xid, + imp->imp_connection->c_peer.peer_ni->pni_name, + imp->imp_connection->c_peer.peer_nid, + req->rq_reqmsg->opc); + + req->rq_phase = RQ_PHASE_COMPLETE; + set->set_remaining--; + } + + RETURN (set->set_remaining == 0); +} + +static int expire_one_request(struct ptlrpc_request *req) +{ + unsigned long flags; + struct obd_import *imp = req->rq_import; + ENTRY; + + DEBUG_REQ(D_ERROR, req, "timeout"); + + spin_lock_irqsave (&req->rq_lock, flags); + req->rq_timedout = 1; + spin_unlock_irqrestore (&req->rq_lock, flags); + + ptlrpc_unregister_reply (req); + + if (imp == NULL) { + DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); + RETURN(1); + } + + /* The DLM server doesn't want recovery run on its imports. */ + if (imp->imp_dlm_fake) + RETURN(1); + + /* If this request is for recovery or other primordial tasks, + * don't go back to sleep, and don't start recovery again.. */ + if (req->rq_level < LUSTRE_CONN_FULL || req->rq_no_recov || + imp->imp_obd->obd_no_recov) + RETURN(1); + + ptlrpc_fail_import(imp, req->rq_import_generation); + + RETURN(0); +} + +static int expired_set(void *data) +{ + struct ptlrpc_request_set *set = data; + struct list_head *tmp; + time_t now = LTIME_S (CURRENT_TIME); ENTRY; - LASSERT((unsigned long)imp > 0x1000); - conn = imp->imp_connection; + LASSERT (set != NULL); + CERROR("EXPIRED SET %p\n", set); + + /* A timeout expired; see which reqs it applies to... */ + list_for_each (tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + /* request in-flight? 
*/ + if (!((req->rq_phase == RQ_PHASE_RPC && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK))) + continue; + + if (req->rq_timedout || /* already dealt with */ + req->rq_sent + req->rq_timeout > now) /* not expired */ + continue; + + /* deal with this guy */ + expire_one_request (req); + } + + /* When waiting for a whole set, we always need to break out of the + * sleep so we can recalculate the timeout, or enable interrupts + * iff everyone's timed out. + */ + RETURN(1); +} + +static void interrupted_set(void *data) +{ + struct ptlrpc_request_set *set = data; + struct list_head *tmp; + unsigned long flags; + + LASSERT (set != NULL); + CERROR("INTERRUPTED SET %p\n", set); + + list_for_each(tmp, &set->set_requests) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + if (req->rq_phase != RQ_PHASE_RPC) + continue; + + spin_lock_irqsave (&req->rq_lock, flags); + req->rq_intr = 1; + spin_unlock_irqrestore (&req->rq_lock, flags); + } +} + +int ptlrpc_set_wait(struct ptlrpc_request_set *set) +{ + struct list_head *tmp; + struct obd_import *imp; + struct ptlrpc_request *req; + struct l_wait_info lwi; + unsigned long flags; + int rc; + time_t now; + time_t deadline; + int timeout; + ENTRY; + + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + LASSERT (req->rq_level == LUSTRE_CONN_FULL); + LASSERT (req->rq_phase == RQ_PHASE_NEW); + req->rq_phase = RQ_PHASE_RPC; + + imp = req->rq_import; + spin_lock_irqsave(&imp->imp_lock, flags); + + if (imp->imp_invalid) { + spin_unlock_irqrestore(&imp->imp_lock, flags); + req->rq_status = -EIO; + req->rq_phase = RQ_PHASE_INTERPRET; + continue; + } - OBD_ALLOC(request, sizeof(*request)); - if (!request) { - CERROR("request allocation out of memory\n"); - RETURN(NULL); - } + if (req->rq_level > imp->imp_level) { + if (req->rq_no_recov || imp->imp_obd->obd_no_recov || + imp->imp_dlm_fake) { + spin_unlock_irqrestore(&imp->imp_lock, flags); + 
req->rq_status = -EWOULDBLOCK; + req->rq_phase = RQ_PHASE_INTERPRET; + continue; + } + + spin_lock (&req->rq_lock); + req->rq_waiting = 1; + spin_unlock (&req->rq_lock); + LASSERT (list_empty (&req->rq_list)); + // list_del(&req->rq_list); + list_add_tail(&req->rq_list, &imp->imp_delayed_list); + spin_unlock_irqrestore(&imp->imp_lock, flags); + continue; + } - rc = lustre_pack_msg(count, lengths, bufs, - &request->rq_reqlen, &request->rq_reqmsg); - if (rc) { - CERROR("cannot pack request %d\n", rc); - OBD_FREE(request, sizeof(*request)); - RETURN(NULL); - } + /* XXX this is the same as ptlrpc_queue_wait */ + LASSERT(list_empty(&req->rq_list)); + list_add_tail(&req->rq_list, &imp->imp_sending_list); + req->rq_import_generation = imp->imp_generation; + spin_unlock_irqrestore(&imp->imp_lock, flags); - request->rq_timeout = obd_timeout; - request->rq_level = LUSTRE_CONN_FULL; - request->rq_type = PTL_RPC_MSG_REQUEST; - request->rq_import = imp; + CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:ni:nid:opc" + " %s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm, + imp->imp_obd->obd_uuid.uuid, req->rq_reqmsg->status, + req->rq_xid, + imp->imp_connection->c_peer.peer_ni->pni_name, + imp->imp_connection->c_peer.peer_nid, + req->rq_reqmsg->opc); - /* XXX FIXME bug 625069, now 249 */ - request->rq_request_portal = imp->imp_client->cli_request_portal; - request->rq_reply_portal = imp->imp_client->cli_reply_portal; + rc = ptl_send_rpc(req); + if (rc) { + req->rq_status = rc; + req->rq_phase = RQ_PHASE_INTERPRET; + } + } - request->rq_connection = ptlrpc_connection_addref(conn); + do { + now = LTIME_S (CURRENT_TIME); + timeout = 0; + list_for_each (tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); - INIT_LIST_HEAD(&request->rq_list); - atomic_set(&request->rq_refcount, 1); + /* request in-flight? 
*/ + if (!((req->rq_phase == RQ_PHASE_RPC && + !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK))) + continue; - request->rq_reqmsg->magic = PTLRPC_MSG_MAGIC; - request->rq_reqmsg->version = PTLRPC_MSG_VERSION; - request->rq_reqmsg->opc = HTON__u32(opcode); - request->rq_reqmsg->flags = 0; + if (req->rq_timedout) /* already timed out */ + continue; + + deadline = req->rq_sent + req->rq_timeout; + if (deadline <= now) /* actually expired already */ + timeout = 1; /* ASAP */ + else if (timeout == 0 || timeout > deadline - now) + timeout = deadline - now; + } - ptlrpc_hdl2req(request, &imp->imp_handle); - RETURN(request); + /* wait until all complete, interrupted, or an in-flight + * req times out */ + CDEBUG(D_HA, "set %p going to sleep for %d seconds\n", + set, timeout); + lwi = LWI_TIMEOUT_INTR(timeout * HZ, + expired_set, interrupted_set, set); + rc = l_wait_event(set->set_waitq, check_set(set), &lwi); + + LASSERT (rc == 0 || rc == -EINTR || rc == -ETIMEDOUT); + + /* -EINTR => all requests have been flagged rq_intr so next + * check completes. + * -ETIMEDOUT => someone timed out. When all reqs have + * timed out, signals are enabled allowing completion with + * EINTR. + * I don't really care if we go once more round the loop in + * the error cases -eeb. 
*/ + } while (rc != 0); + + LASSERT (set->set_remaining == 0); + + rc = 0; + list_for_each(tmp, &set->set_requests) { + req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); + + LASSERT (req->rq_phase == RQ_PHASE_COMPLETE); + if (req->rq_status != 0) + rc = req->rq_status; + } + + if (set->set_interpret != NULL) { + int (*interpreter)(struct ptlrpc_request_set *set, void *, int) = + set->set_interpret; + rc = interpreter (set, &set->set_args, rc); + } + + RETURN(rc); } static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) @@ -327,9 +882,11 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) return; } + LASSERT (!request->rq_receiving_reply); + /* We must take it off the imp_replay_list first. Otherwise, we'll set * request->rq_reqmsg to NULL while osc_close is dereferencing it. */ - if (request->rq_import) { + if (request->rq_import != NULL) { unsigned long flags = 0; if (!locked) spin_lock_irqsave(&request->rq_import->imp_lock, flags); @@ -340,23 +897,29 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) } if (atomic_read(&request->rq_refcount) != 0) { - CERROR("freeing request %p (%d->%s:%d) with refcount %d\n", - request, request->rq_reqmsg->opc, - request->rq_connection->c_remote_uuid.uuid, - request->rq_import->imp_client->cli_request_portal, - atomic_read (&request->rq_refcount)); + DEBUG_REQ(D_ERROR, request, + "freeing request with nonzero refcount"); LBUG(); } if (request->rq_repmsg != NULL) { OBD_FREE(request->rq_repmsg, request->rq_replen); request->rq_repmsg = NULL; - request->rq_reply_md.start = NULL; } if (request->rq_reqmsg != NULL) { OBD_FREE(request->rq_reqmsg, request->rq_reqlen); request->rq_reqmsg = NULL; } + if (request->rq_export != NULL) { + class_export_put(request->rq_export); + request->rq_export = NULL; + } + if (request->rq_import != NULL) { + class_import_put(request->rq_import); + request->rq_import = NULL; + } + if (request->rq_bulk != NULL) + 
ptlrpc_free_bulk(request->rq_bulk); ptlrpc_put_connection(request->rq_connection); OBD_FREE(request, sizeof(*request)); @@ -396,81 +959,81 @@ void ptlrpc_req_finished(struct ptlrpc_request *request) __ptlrpc_req_finished(request, 0); } -static int ptlrpc_check_reply(struct ptlrpc_request *req) +static void ptlrpc_cleanup_request_buf(struct ptlrpc_request *request) { - int rc = 0; - - ENTRY; - if (req->rq_repmsg != NULL) { - req->rq_transno = NTOH__u64(req->rq_repmsg->transno); - /* Store transno in reqmsg for replay. */ - req->rq_reqmsg->transno = req->rq_repmsg->transno; - req->rq_flags |= PTL_RPC_FL_REPLIED; - GOTO(out, rc = 1); - } - - if (req->rq_flags & PTL_RPC_FL_RESEND) { - DEBUG_REQ(D_ERROR, req, "RESEND:"); - GOTO(out, rc = 1); - } - - if (req->rq_flags & PTL_RPC_FL_ERR) { - ENTRY; - DEBUG_REQ(D_ERROR, req, "ABORTED:"); - GOTO(out, rc = 1); - } - - if (req->rq_flags & PTL_RPC_FL_RESTART) { - DEBUG_REQ(D_ERROR, req, "RESTART:"); - GOTO(out, rc = 1); - } - EXIT; - out: - DEBUG_REQ(D_NET, req, "rc = %d for", rc); - return rc; + OBD_FREE(request->rq_reqmsg, request->rq_reqlen); + request->rq_reqmsg = NULL; + request->rq_reqlen = 0; } -static int ptlrpc_check_status(struct ptlrpc_request *req) +/* Disengage the client's reply buffer from the network + * NB does _NOT_ unregister any client-side bulk. + * IDEMPOTENT, but _not_ safe against concurrent callers. + * The request owner (i.e. the thread doing the I/O) must call... + */ +void ptlrpc_unregister_reply (struct ptlrpc_request *request) { - int err; + unsigned long flags; + int rc; ENTRY; - err = req->rq_repmsg->status; - if (req->rq_repmsg->type == NTOH__u32(PTL_RPC_MSG_ERR)) { - DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR (%d)", err); - RETURN(err ? 
err : -EINVAL); - } + LASSERT (!in_interrupt ()); /* might sleep */ - if (err < 0) { - DEBUG_REQ(D_INFO, req, "status is %d", err); - } else if (err > 0) { - /* XXX: translate this error from net to host */ - DEBUG_REQ(D_INFO, req, "status is %d", err); + spin_lock_irqsave (&request->rq_lock, flags); + if (!request->rq_receiving_reply) { /* not waiting for a reply */ + spin_unlock_irqrestore (&request->rq_lock, flags); + EXIT; + /* NB reply buffer not freed here */ + return; } - RETURN(err); -} - -static void ptlrpc_cleanup_request_buf(struct ptlrpc_request *request) -{ - OBD_FREE(request->rq_reqmsg, request->rq_reqlen); - request->rq_reqmsg = NULL; - request->rq_reqlen = 0; -} - -/* Abort this request and cleanup any resources associated with it. */ -int ptlrpc_abort(struct ptlrpc_request *request) -{ - /* First remove the ME for the reply; in theory, this means - * that we can tear down the buffer safely. */ - if (PtlMEUnlink(request->rq_reply_me_h) != PTL_OK) - RETURN(0); - OBD_FREE(request->rq_reply_md.start, request->rq_replen); + LASSERT (!request->rq_replied); /* callback hasn't completed */ + spin_unlock_irqrestore (&request->rq_lock, flags); + + rc = PtlMDUnlink (request->rq_reply_md_h); + switch (rc) { + default: + LBUG (); + + case PTL_OK: /* unlinked before completion */ + LASSERT (request->rq_receiving_reply); + LASSERT (!request->rq_replied); + spin_lock_irqsave (&request->rq_lock, flags); + request->rq_receiving_reply = 0; + spin_unlock_irqrestore (&request->rq_lock, flags); + OBD_FREE(request->rq_repmsg, request->rq_replen); + request->rq_repmsg = NULL; + EXIT; + return; + + case PTL_MD_INUSE: /* callback in progress */ + for (;;) { + /* Network access will complete in finite time but + * the timeout lets us CERROR for visibility */ + struct l_wait_info lwi = LWI_TIMEOUT (10 * HZ, NULL, NULL); + + rc = l_wait_event (request->rq_wait_for_rep, + request->rq_replied, &lwi); + LASSERT (rc == 0 || rc == -ETIMEDOUT); + if (rc == 0) { + spin_lock_irqsave 
(&request->rq_lock, flags); + /* Ensure the callback has completed scheduling me + * and taken its hands off the request */ + spin_unlock_irqrestore (&request->rq_lock, flags); + break; + } + + CERROR ("Unexpectedly long timeout: req %p\n", request); + } + /* fall through */ - memset(&request->rq_reply_me_h, 0, sizeof(request->rq_reply_me_h)); - request->rq_reply_md.start = NULL; - request->rq_repmsg = NULL; - return 0; + case PTL_INV_MD: /* callback completed */ + LASSERT (!request->rq_receiving_reply); + LASSERT (request->rq_replied); + EXIT; + return; + } + /* Not Reached */ } /* caller must hold imp->imp_lock */ @@ -478,6 +1041,7 @@ void ptlrpc_free_committed(struct obd_import *imp) { struct list_head *tmp, *saved; struct ptlrpc_request *req; + struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ ENTRY; LASSERT(imp != NULL); @@ -492,7 +1056,11 @@ void ptlrpc_free_committed(struct obd_import *imp) list_for_each_safe(tmp, saved, &imp->imp_replay_list) { req = list_entry(tmp, struct ptlrpc_request, rq_list); - if (req->rq_flags & PTL_RPC_FL_REPLAY) { + /* XXX ok to remove when 1357 resolved - rread 05/29/03 */ + LASSERT (req != last_req); + last_req = req; + + if (req->rq_replay) { DEBUG_REQ(D_HA, req, "keeping (FL_REPLAY)"); continue; } @@ -515,104 +1083,67 @@ void ptlrpc_free_committed(struct obd_import *imp) void ptlrpc_cleanup_client(struct obd_import *imp) { - struct list_head *tmp, *saved; - struct ptlrpc_request *req; - struct ptlrpc_connection *conn = imp->imp_connection; - unsigned long flags; ENTRY; - - LASSERT(conn); - - spin_lock_irqsave(&imp->imp_lock, flags); - list_for_each_safe(tmp, saved, &imp->imp_replay_list) { - req = list_entry(tmp, struct ptlrpc_request, rq_list); - - /* XXX we should make sure that nobody's sleeping on these! 
*/ - DEBUG_REQ(D_HA, req, "cleaning up from sending list"); - list_del_init(&req->rq_list); - req->rq_import = NULL; - __ptlrpc_req_finished(req, 0); - } - spin_unlock_irqrestore(&imp->imp_lock, flags); - EXIT; return; } -void ptlrpc_continue_req(struct ptlrpc_request *req) -{ - DEBUG_REQ(D_HA, req, "continuing delayed request"); - req->rq_reqmsg->addr = req->rq_import->imp_handle.addr; - req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie; - wake_up(&req->rq_wait_for_rep); -} - void ptlrpc_resend_req(struct ptlrpc_request *req) { + unsigned long flags; + DEBUG_REQ(D_HA, req, "resending"); - req->rq_reqmsg->addr = req->rq_import->imp_handle.addr; - req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie; + req->rq_reqmsg->handle.cookie = 0; + ptlrpc_put_connection(req->rq_connection); + req->rq_connection = + ptlrpc_connection_addref(req->rq_import->imp_connection); req->rq_status = -EAGAIN; - req->rq_level = LUSTRE_CONN_RECOVD; - req->rq_flags |= PTL_RPC_FL_RESEND; - req->rq_flags &= ~PTL_RPC_FL_TIMEOUT; - wake_up(&req->rq_wait_for_rep); + + spin_lock_irqsave (&req->rq_lock, flags); + req->rq_resend = 1; + req->rq_timedout = 0; + if (req->rq_set != NULL) + wake_up (&req->rq_set->set_waitq); + else + wake_up(&req->rq_wait_for_rep); + spin_unlock_irqrestore (&req->rq_lock, flags); } +/* XXX: this function and rq_status are currently unused */ void ptlrpc_restart_req(struct ptlrpc_request *req) { + unsigned long flags; + DEBUG_REQ(D_HA, req, "restarting (possibly-)completed request"); req->rq_status = -ERESTARTSYS; - req->rq_flags |= PTL_RPC_FL_RESTART; - req->rq_flags &= ~PTL_RPC_FL_TIMEOUT; - wake_up(&req->rq_wait_for_rep); + + spin_lock_irqsave (&req->rq_lock, flags); + req->rq_restart = 1; + req->rq_timedout = 0; + if (req->rq_set != NULL) + wake_up (&req->rq_set->set_waitq); + else + wake_up(&req->rq_wait_for_rep); + spin_unlock_irqrestore (&req->rq_lock, flags); } static int expired_request(void *data) { struct ptlrpc_request *req = data; - ENTRY; - 
if (!req) { - CERROR("NULL req!"); - LBUG(); - RETURN(0); - } - - DEBUG_REQ(D_ERROR, req, "timeout"); - ptlrpc_abort(req); - req->rq_flags |= PTL_RPC_FL_TIMEOUT; - - if (!req->rq_import) { - DEBUG_REQ(D_HA, req, "NULL import; already cleaned up?"); - RETURN(1); - } - - if (!req->rq_import->imp_connection) { - DEBUG_REQ(D_ERROR, req, "NULL connection"); - LBUG(); - RETURN(0); - } - - if (!req->rq_import->imp_connection->c_recovd_data.rd_recovd) - RETURN(1); - - recovd_conn_fail(req->rq_import->imp_connection); - /* If this request is for recovery or other primordial tasks, - * don't go back to sleep. - */ - if (req->rq_level < LUSTRE_CONN_FULL) - RETURN(1); - RETURN(0); + RETURN(expire_one_request(req)); } -static int interrupted_request(void *data) +static void interrupted_request(void *data) { + unsigned long flags; + struct ptlrpc_request *req = data; - ENTRY; - req->rq_flags |= PTL_RPC_FL_INTR; - RETURN(1); /* ignored, as of this writing */ + DEBUG_REQ(D_HA, req, "request interrupted"); + spin_lock_irqsave (&req->rq_lock, flags); + req->rq_intr = 1; + spin_unlock_irqrestore (&req->rq_lock, flags); } struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) @@ -631,7 +1162,7 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, LASSERT(spin_is_locked(&imp->imp_lock)); #endif - LASSERT(imp->imp_flags & IMP_REPLAYABLE); + LASSERT(imp->imp_replayable); /* Balanced in ptlrpc_free_committed, usually. */ ptlrpc_request_addref(req); list_for_each_prev(tmp, &imp->imp_replay_list) { @@ -642,6 +1173,7 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, * open a file, or for closes retained if to match creating * opens, so use req->rq_xid as a secondary key. * (See bugs 684, 685, and 428.) + * XXX no longer needed, but all opens need transnos! 
*/ if (iter->rq_transno > req->rq_transno) continue; @@ -662,196 +1194,228 @@ void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, int ptlrpc_queue_wait(struct ptlrpc_request *req) { int rc = 0; + int brc; struct l_wait_info lwi; struct obd_import *imp = req->rq_import; + struct obd_device *obd = imp->imp_obd; struct ptlrpc_connection *conn = imp->imp_connection; unsigned int flags; + int do_restart = 0; + int timeout = 0; ENTRY; - init_waitqueue_head(&req->rq_wait_for_rep); - - req->rq_xid = HTON__u32(ptlrpc_next_xid()); - + LASSERT (req->rq_set == NULL); + LASSERT (!req->rq_receiving_reply); + /* for distributed debugging */ - req->rq_reqmsg->status = HTON__u32(current->pid); - CDEBUG(D_RPCTRACE, "Sending RPC pid:xid:nid:opc %d:"LPU64":%s:"LPX64 - ":%d\n", NTOH__u32(req->rq_reqmsg->status), req->rq_xid, + req->rq_reqmsg->status = current->pid; + LASSERT(imp->imp_obd != NULL); + CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:ni:nid:opc " + "%s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm, + imp->imp_obd->obd_uuid.uuid, + req->rq_reqmsg->status, req->rq_xid, conn->c_peer.peer_ni->pni_name, conn->c_peer.peer_nid, - NTOH__u32(req->rq_reqmsg->opc)); - - spin_lock_irqsave(&imp->imp_lock, flags); + req->rq_reqmsg->opc); + /* Mark phase here for a little debug help */ + req->rq_phase = RQ_PHASE_RPC; + +restart: /* * If the import has been invalidated (such as by an OST failure), the - * request must fail with -EIO. + * request must fail with -EIO. Recovery requests are allowed to go + * through, though, so that they have a chance to revalidate the + * import. 
*/ - if (req->rq_import->imp_flags & IMP_INVALID) { + spin_lock_irqsave(&imp->imp_lock, flags); + if (req->rq_import->imp_invalid && req->rq_level == LUSTRE_CONN_FULL) { DEBUG_REQ(D_ERROR, req, "IMP_INVALID:"); spin_unlock_irqrestore(&imp->imp_lock, flags); - RETURN(-EIO); + GOTO (out, rc = -EIO); } if (req->rq_level > imp->imp_level) { list_del(&req->rq_list); + if (req->rq_no_recov || obd->obd_no_recov || + imp->imp_dlm_fake) { + spin_unlock_irqrestore(&imp->imp_lock, flags); + GOTO (out, rc = -EWOULDBLOCK); + } + list_add_tail(&req->rq_list, &imp->imp_delayed_list); spin_unlock_irqrestore(&imp->imp_lock, flags); - DEBUG_REQ(D_HA, req, "\"%s\" waiting for recovery: (%d < %d)", + DEBUG_REQ(D_HA, req, "\"%s\" waiting for recovery: (%d > %d)", current->comm, req->rq_level, imp->imp_level); lwi = LWI_INTR(NULL, NULL); rc = l_wait_event(req->rq_wait_for_rep, - (req->rq_level <= imp->imp_level) || - (req->rq_flags & PTL_RPC_FL_ERR), &lwi); - - if (req->rq_flags & PTL_RPC_FL_ERR) - rc = -EIO; - - if (!req->rq_import) - RETURN(rc); + (req->rq_level <= imp->imp_level || + req->rq_err), + &lwi); + DEBUG_REQ(D_HA, req, "\"%s\" awake: (%d > %d)", + current->comm, req->rq_level, imp->imp_level); spin_lock_irqsave(&imp->imp_lock, flags); list_del_init(&req->rq_list); + if (req->rq_err) + rc = -EIO; + if (rc) { spin_unlock_irqrestore(&imp->imp_lock, flags); - RETURN(rc); + GOTO (out, rc); } - + CERROR("process %d resumed\n", current->pid); } - resend: + /* XXX this is the same as ptlrpc_set_wait */ LASSERT(list_empty(&req->rq_list)); list_add_tail(&req->rq_list, &imp->imp_sending_list); + req->rq_import_generation = imp->imp_generation; spin_unlock_irqrestore(&imp->imp_lock, flags); + rc = ptl_send_rpc(req); if (rc) { - CDEBUG(D_HA, "error %d, opcode %d, need recovery\n", rc, - req->rq_reqmsg->opc); - /* sleep for a jiffy, then trigger recovery */ - lwi = LWI_TIMEOUT_INTR(1, expired_request, - interrupted_request, req); + /* The DLM's fake imports want to avoid all forms of + * 
recovery. */ + if (imp->imp_dlm_fake) { + spin_lock_irqsave(&imp->imp_lock, flags); + list_del_init(&req->rq_list); + spin_unlock_irqrestore(&imp->imp_lock, flags); + GOTO(out, rc); + } + + DEBUG_REQ(D_ERROR, req, "send failed (%d); recovering", rc); + + ptlrpc_fail_import(imp, req->rq_import_generation); + + /* If we've been told to not wait, we're done. */ + if (req->rq_level < LUSTRE_CONN_FULL || req->rq_no_recov || + obd->obd_no_recov) { + spin_lock_irqsave(&imp->imp_lock, flags); + list_del_init(&req->rq_list); + spin_unlock_irqrestore(&imp->imp_lock, flags); + GOTO(out, rc); + } + + /* If we errored, allow the user to interrupt immediately */ + timeout = 1; } else { + timeout = req->rq_timeout * HZ; DEBUG_REQ(D_NET, req, "-- sleeping"); - lwi = LWI_TIMEOUT_INTR(req->rq_timeout * HZ, expired_request, - interrupted_request, req); } #ifdef __KERNEL__ + lwi = LWI_TIMEOUT_INTR(timeout, expired_request, interrupted_request, + req); l_wait_event(req->rq_wait_for_rep, ptlrpc_check_reply(req), &lwi); -#else - { +#else + { extern int reply_in_callback(ptl_event_t *ev); ptl_event_t reply_ev; - PtlEQWait(req->rq_connection->c_peer.peer_ni->pni_reply_in_eq_h, &reply_ev); - reply_in_callback(&reply_ev); + PtlEQWait(req->rq_connection->c_peer.peer_ni->pni_reply_in_eq_h, + &reply_ev); + reply_in_callback(&reply_ev); + + LASSERT (reply_ev.mem_desc.user_ptr == (void *)req); + // ptlrpc_check_reply(req); + // not required now it only tests } -#endif +#endif DEBUG_REQ(D_NET, req, "-- done sleeping"); + CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:ni:nid:opc " + "%s:%s:%d:"LPU64":%s:"LPX64":%d\n", current->comm, + imp->imp_obd->obd_uuid.uuid, + req->rq_reqmsg->status, req->rq_xid, + conn->c_peer.peer_ni->pni_name, conn->c_peer.peer_nid, + req->rq_reqmsg->opc); + spin_lock_irqsave(&imp->imp_lock, flags); list_del_init(&req->rq_list); spin_unlock_irqrestore(&imp->imp_lock, flags); - if (req->rq_flags & PTL_RPC_FL_ERR) { - ptlrpc_abort(req); + /* If the reply was received 
normally, this just grabs the spinlock + * (ensuring the reply callback has returned), sees that + * req->rq_receiving_reply is clear and returns. */ + ptlrpc_unregister_reply (req); + + if (req->rq_err) GOTO(out, rc = -EIO); - } - /* Don't resend if we were interrupted. */ - if ((req->rq_flags & (PTL_RPC_FL_RESEND | PTL_RPC_FL_INTR)) == - PTL_RPC_FL_RESEND) { - if (req->rq_flags & PTL_RPC_FL_NO_RESEND) { - ptlrpc_abort(req); /* clean up reply buffers */ - req->rq_flags &= ~PTL_RPC_FL_NO_RESEND; + /* Resend if we need to, unless we were interrupted. */ + if (req->rq_resend && !req->rq_intr) { + /* ...unless we were specifically told otherwise. */ + if (req->rq_no_resend) { + spin_lock_irqsave (&req->rq_lock, flags); + req->rq_no_resend = 0; + spin_unlock_irqrestore (&req->rq_lock, flags); GOTO(out, rc = -ETIMEDOUT); } - req->rq_flags &= ~PTL_RPC_FL_RESEND; + spin_lock_irqsave (&req->rq_lock, flags); + req->rq_resend = 0; + spin_unlock_irqrestore (&req->rq_lock, flags); lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); + + if (req->rq_bulk != NULL) + ptlrpc_unregister_bulk (req); + DEBUG_REQ(D_HA, req, "resending: "); - spin_lock_irqsave(&imp->imp_lock, flags); - goto resend; + goto restart; } - if (req->rq_flags & PTL_RPC_FL_INTR) { - if (!(req->rq_flags & PTL_RPC_FL_TIMEOUT)) - LBUG(); /* should only be interrupted if we timed out */ - /* Clean up the dangling reply buffers */ - ptlrpc_abort(req); + if (req->rq_intr) { + /* Should only be interrupted if we timed out. 
*/ + if (!req->rq_timedout) + DEBUG_REQ(D_ERROR, req, + "rq_intr set but rq_timedout not"); GOTO(out, rc = -EINTR); } - if (req->rq_flags & PTL_RPC_FL_TIMEOUT) + if (req->rq_timedout) { /* non-recoverable timeout */ GOTO(out, rc = -ETIMEDOUT); - - if (!(req->rq_flags & PTL_RPC_FL_REPLIED)) - GOTO(out, rc = req->rq_status); - - rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen); - if (rc) { - CERROR("unpack_rep failed: %d\n", rc); - GOTO(out, rc); } -#if 0 - /* FIXME: Enable when BlueArc makes new release */ - if (req->rq_repmsg->type != PTL_RPC_MSG_REPLY && - req->rq_repmsg->type != PTL_RPC_MSG_ERR) { - CERROR("invalid packet type received (type=%u)\n", - req->rq_repmsg->type); + + if (!req->rq_replied) { + /* How can this be? -eeb */ + DEBUG_REQ(D_ERROR, req, "!rq_replied: "); LBUG(); - GOTO(out, rc = -EINVAL); + GOTO(out, rc = req->rq_status); } -#endif - DEBUG_REQ(D_NET, req, "status %d", req->rq_repmsg->status); - /* We're a rejected connection, need to invalidate and rebuild. */ - if (req->rq_repmsg->status == -ENOTCONN) { - spin_lock_irqsave(&imp->imp_lock, flags); - /* If someone else is reconnecting us (CONN_RECOVD) or has - * already completed it (handle mismatch), then we just need - * to get out. 
- */ - if (imp->imp_level == LUSTRE_CONN_RECOVD || - imp->imp_handle.addr != req->rq_reqmsg->addr || - imp->imp_handle.cookie != req->rq_reqmsg->cookie) { - spin_unlock_irqrestore(&imp->imp_lock, flags); - GOTO(out, rc = -EIO); - } - imp->imp_level = LUSTRE_CONN_RECOVD; - spin_unlock_irqrestore(&imp->imp_lock, flags); - if (imp->imp_recover != NULL) { - rc = imp->imp_recover(imp, PTLRPC_RECOVD_PHASE_NOTCONN); - if (rc) - LBUG(); - } - GOTO(out, rc = -EIO); + rc = after_reply (req, &do_restart); + /* NB may return +ve success rc */ + if (do_restart) { + if (req->rq_bulk != NULL) + ptlrpc_unregister_bulk (req); + DEBUG_REQ(D_HA, req, "resending: "); + goto restart; } - rc = ptlrpc_check_status(req); - - if (req->rq_import->imp_flags & IMP_REPLAYABLE) { - spin_lock_irqsave(&imp->imp_lock, flags); - if ((req->rq_flags & PTL_RPC_FL_REPLAY || req->rq_transno != 0) - && rc >= 0) { - ptlrpc_retain_replayable_request(req, imp); - } - - if (req->rq_transno > imp->imp_max_transno) { - imp->imp_max_transno = req->rq_transno; + out: + if (req->rq_bulk != NULL) { + if (rc >= 0) { /* success so far */ + lwi = LWI_TIMEOUT (timeout, NULL, NULL); + brc = l_wait_event (req->rq_wait_for_rep, + ptlrpc_bulk_complete (req->rq_bulk), &lwi); + if (brc != 0) { + LASSERT (brc == -ETIMEDOUT); + CERROR ("Timed out waiting for bulk\n"); + rc = brc; + } } - - /* Replay-enabled imports return commit-status information. 
*/ - if (req->rq_repmsg->last_committed) { - imp->imp_peer_committed_transno = - req->rq_repmsg->last_committed; + if (rc < 0) { + /* MDS blocks for put ACKs before replying */ + /* OSC sets rq_no_resend for the time being */ + LASSERT (req->rq_no_resend); + ptlrpc_unregister_bulk (req); } - ptlrpc_free_committed(imp); - spin_unlock_irqrestore(&imp->imp_lock, flags); } - - EXIT; - out: - return rc; + + LASSERT (!req->rq_receiving_reply); + req->rq_phase = RQ_PHASE_INTERPRET; + RETURN (rc); } int ptlrpc_replay_req(struct ptlrpc_request *req) @@ -861,15 +1425,22 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) struct l_wait_info lwi; ENTRY; - init_waitqueue_head(&req->rq_wait_for_rep); - DEBUG_REQ(D_NET, req, ""); + /* I don't touch rq_phase here, so the debug log can show what + * state it was left in */ + + /* Not handling automatic bulk replay yet (or ever?) */ + LASSERT (req->rq_bulk == NULL); + + DEBUG_REQ(D_NET, req, "about to replay"); - req->rq_reqmsg->addr = req->rq_import->imp_handle.addr; - req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie; + /* Update request's state, since we might have a new connection. */ + ptlrpc_put_connection(req->rq_connection); + req->rq_connection = + ptlrpc_connection_addref(req->rq_import->imp_connection); /* temporarily set request to RECOVD level (reset at out:) */ old_level = req->rq_level; - if (req->rq_flags & PTL_RPC_FL_REPLIED) + if (req->rq_replied) old_status = req->rq_repmsg->status; req->rq_level = LUSTRE_CONN_RECOVD; rc = ptl_send_rpc(req); @@ -887,18 +1458,40 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) // up(&cli->cli_rpc_sem); - if (!(req->rq_flags & PTL_RPC_FL_REPLIED)) { + /* If the reply was received normally, this just grabs the spinlock + * (ensuring the reply callback has returned), sees that + * req->rq_receiving_reply is clear and returns. 
*/ + ptlrpc_unregister_reply (req); + + if (!req->rq_replied) { CERROR("Unknown reason for wakeup\n"); /* XXX Phil - I end up here when I kill obdctl */ - ptlrpc_abort(req); + /* ...that's because signals aren't all masked in + * l_wait_event() -eeb */ GOTO(out, rc = -EINTR); } +#if SWAB_PARANOIA + /* Clear reply swab mask; this is a new reply in sender's byte order */ + req->rq_rep_swab_mask = 0; +#endif rc = lustre_unpack_msg(req->rq_repmsg, req->rq_replen); if (rc) { CERROR("unpack_rep failed: %d\n", rc); - GOTO(out, rc); + GOTO(out, rc = -EPROTO); } +#if 0 + /* FIXME: Enable when BlueArc makes new release */ + if (req->rq_repmsg->type != PTL_RPC_MSG_REPLY && + req->rq_repmsg->type != PTL_RPC_MSG_ERR) { + CERROR("invalid packet type received (type=%u)\n", + req->rq_repmsg->type); + GOTO(out, rc = -EPROTO); + } +#endif + + /* The transno had better not change over replay. */ + LASSERT(req->rq_reqmsg->transno == req->rq_repmsg->transno); CDEBUG(D_NET, "got rep "LPD64"\n", req->rq_xid); @@ -906,8 +1499,7 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) if (req->rq_replay_cb) req->rq_replay_cb(req); - if ((req->rq_flags & PTL_RPC_FL_REPLIED) && - req->rq_repmsg->status != old_status) { + if (req->rq_replied && req->rq_repmsg->status != old_status) { DEBUG_REQ(D_HA, req, "status %d, old was %d", req->rq_repmsg->status, old_status); } @@ -917,32 +1509,42 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) RETURN(rc); } -/* XXX looks a lot like super.c:invalidate_request_list, don't it? */ -void ptlrpc_abort_inflight(struct obd_import *imp, int dying_import) +void ptlrpc_abort_inflight(struct obd_import *imp) { unsigned long flags; struct list_head *tmp, *n; ENTRY; /* Make sure that no new requests get processed for this import. - * ptlrpc_queue_wait must (and does) hold imp_lock while testing this - * flag and then putting requests on sending_list or delayed_list. 
+ * ptlrpc_{queue,set}_wait must (and do) hold imp_lock while testing + * this flag and then putting requests on sending_list or delayed_list. + */ + spin_lock_irqsave(&imp->imp_lock, flags); + if (!imp->imp_replayable) + /* on b_devel, I moved this line to + ptlrpc_set_import_active because I thought it made + more sense there and possibly not all callers of + this function expect this. I'll leave it here until + I can figure out if it's correct or not. - rread 5/12/03 */ + imp->imp_invalid = 1; + + /* XXX locking? Maybe we should remove each request with the list + * locked? Also, how do we know if the requests on the list are + * being freed at this time? */ - if ((imp->imp_flags & IMP_REPLAYABLE) == 0) { - spin_lock_irqsave(&imp->imp_lock, flags); - imp->imp_flags |= IMP_INVALID; - spin_unlock_irqrestore(&imp->imp_lock, flags); - } - list_for_each_safe(tmp, n, &imp->imp_sending_list) { struct ptlrpc_request *req = list_entry(tmp, struct ptlrpc_request, rq_list); DEBUG_REQ(D_HA, req, "inflight"); - req->rq_flags |= PTL_RPC_FL_ERR; - if (dying_import) - req->rq_import = NULL; - wake_up(&req->rq_wait_for_rep); + + spin_lock (&req->rq_lock); + req->rq_err = 1; + if (req->rq_set != NULL) + wake_up(&req->rq_set->set_waitq); + else + wake_up(&req->rq_wait_for_rep); + spin_unlock (&req->rq_lock); } list_for_each_safe(tmp, n, &imp->imp_delayed_list) { @@ -950,10 +1552,36 @@ void ptlrpc_abort_inflight(struct obd_import *imp, int dying_import) list_entry(tmp, struct ptlrpc_request, rq_list); DEBUG_REQ(D_HA, req, "aborting waiting req"); - req->rq_flags |= PTL_RPC_FL_ERR; - if (dying_import) - req->rq_import = NULL; - wake_up(&req->rq_wait_for_rep); + + spin_lock (&req->rq_lock); + req->rq_err = 1; + if (req->rq_set != NULL) + wake_up(&req->rq_set->set_waitq); + else + wake_up(&req->rq_wait_for_rep); + spin_unlock (&req->rq_lock); } + + /* Last chance to free reqs left on the replay list, but we + * will still leak reqs that haven't committed. 
*/ + if (imp->imp_replayable) + ptlrpc_free_committed(imp); + + spin_unlock_irqrestore(&imp->imp_lock, flags); + EXIT; } + +static __u64 ptlrpc_last_xid = 0; +static spinlock_t ptlrpc_last_xid_lock = SPIN_LOCK_UNLOCKED; + +__u64 ptlrpc_next_xid(void) +{ + __u64 tmp; + spin_lock(&ptlrpc_last_xid_lock); + tmp = ++ptlrpc_last_xid; + spin_unlock(&ptlrpc_last_xid_lock); + return tmp; +} + + diff --git a/lustre/ptlrpc/connection.c b/lustre/ptlrpc/connection.c index 8f2cc2d..6b7690b 100644 --- a/lustre/ptlrpc/connection.c +++ b/lustre/ptlrpc/connection.c @@ -29,23 +29,25 @@ #include <liblustre.h> #endif +#include "ptlrpc_internal.h" + static spinlock_t conn_lock; static struct list_head conn_list; static struct list_head conn_unused_list; -/* If UUID is NULL, c->c_remote_uuid must be all zeroes - * If UUID is non-NULL, c->c_remote_uuid must match. */ -static int match_connection_uuid(struct ptlrpc_connection *c, - struct obd_uuid *uuid) +void ptlrpc_dump_connections(void) { - struct obd_uuid zero_uuid; - memset(&zero_uuid, 0, sizeof(zero_uuid)); - - if (uuid) - return memcmp(c->c_remote_uuid.uuid, uuid->uuid, - sizeof(uuid->uuid)); + struct list_head *tmp; + struct ptlrpc_connection *c; + ENTRY; - return memcmp(c->c_remote_uuid.uuid, &zero_uuid, sizeof(zero_uuid)); + list_for_each(tmp, &conn_list) { + c = list_entry(tmp, struct ptlrpc_connection, c_link); + CERROR("Connection %p/%s has refcount %d (nid="LPX64" on %s)\n", + c, c->c_remote_uuid.uuid, atomic_read(&c->c_refcount), + c->c_peer.peer_nid, c->c_peer.peer_ni->pni_name); + } + EXIT; } struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer, @@ -55,15 +57,22 @@ struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer, struct ptlrpc_connection *c; ENTRY; + CDEBUG(D_INFO, "peer is "LPX64" on %s\n", peer->peer_nid, peer->peer_ni->pni_name); spin_lock(&conn_lock); + if (list_empty(&conn_list)) { + if (!ptlrpc_get_ldlm_hooks()) { + spin_unlock(&conn_lock); + RETURN(NULL); + } + } + 
list_for_each(tmp, &conn_list) { c = list_entry(tmp, struct ptlrpc_connection, c_link); if (peer->peer_nid == c->c_peer.peer_nid && - peer->peer_ni == c->c_peer.peer_ni && - !match_connection_uuid(c, uuid)) { + peer->peer_ni == c->c_peer.peer_ni) { ptlrpc_connection_addref(c); GOTO(out, c); } @@ -72,8 +81,7 @@ struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer, list_for_each_safe(tmp, pos, &conn_unused_list) { c = list_entry(tmp, struct ptlrpc_connection, c_link); if (peer->peer_nid == c->c_peer.peer_nid && - peer->peer_ni == c->c_peer.peer_ni && - !match_connection_uuid(c, uuid)) { + peer->peer_ni == c->c_peer.peer_ni) { ptlrpc_connection_addref(c); list_del(&c->c_link); list_add(&c->c_link, &conn_list); @@ -91,13 +99,8 @@ struct ptlrpc_connection *ptlrpc_get_connection(struct ptlrpc_peer *peer, c->c_epoch = 1; c->c_bootcount = 0; c->c_flags = 0; - if (uuid->uuid) + if (uuid && uuid->uuid) /* XXX ???? */ obd_str2uuid(&c->c_remote_uuid, uuid->uuid); - INIT_LIST_HEAD(&c->c_imports); - INIT_LIST_HEAD(&c->c_exports); - INIT_LIST_HEAD(&c->c_sb_chain); - INIT_LIST_HEAD(&c->c_recovd_data.rd_managed_chain); - INIT_LIST_HEAD(&c->c_delayed_head); atomic_set(&c->c_refcount, 0); memcpy(&c->c_peer, peer, sizeof(c->c_peer)); spin_lock_init(&c->c_lock); @@ -123,14 +126,16 @@ int ptlrpc_put_connection(struct ptlrpc_connection *c) } CDEBUG (D_INFO, "connection=%p refcount %d to "LPX64" on %s\n", - c, atomic_read(&c->c_refcount), c->c_peer.peer_nid, + c, atomic_read(&c->c_refcount) - 1, c->c_peer.peer_nid, c->c_peer.peer_ni->pni_name); if (atomic_dec_and_test(&c->c_refcount)) { - recovd_conn_unmanage(c); spin_lock(&conn_lock); list_del(&c->c_link); list_add(&c->c_link, &conn_unused_list); + if (list_empty(&conn_list)) { + ptlrpc_put_ldlm_hooks(); + } spin_unlock(&conn_lock); rc = 1; } diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 4a6eb67..167898a 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -42,7 +42,7 @@ static int 
request_out_callback(ptl_event_t *ev) ENTRY; /* requests always contiguous */ - LASSERT((ev->mem_desc.options & PTL_MD_IOV) == 0); + LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0); if (ev->type != PTL_EVENT_SENT) { // XXX make sure we understand all events, including ACK's @@ -50,33 +50,34 @@ static int request_out_callback(ptl_event_t *ev) LBUG(); } - /* this balances the atomic_inc in ptl_send_rpc */ + /* this balances the atomic_inc in ptl_send_rpc() */ ptlrpc_req_finished(req); RETURN(1); } - /* * Free the packet when it has gone out */ static int reply_out_callback(ptl_event_t *ev) { + struct ptlrpc_request *req = ev->mem_desc.user_ptr; + unsigned long flags; ENTRY; /* replies always contiguous */ - LASSERT((ev->mem_desc.options & PTL_MD_IOV) == 0); + LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0); if (ev->type == PTL_EVENT_SENT) { + /* NB don't even know if this is the current reply! In fact + * we can't touch any state in the request, since the + * service handler zeros it on each incoming request. 
*/ OBD_FREE(ev->mem_desc.start, ev->mem_desc.length); } else if (ev->type == PTL_EVENT_ACK) { - struct ptlrpc_request *req = ev->mem_desc.user_ptr; - if (req->rq_flags & PTL_RPC_FL_WANT_ACK) { - req->rq_flags &= ~PTL_RPC_FL_WANT_ACK; - wake_up(&req->rq_wait_for_rep); - } else { - DEBUG_REQ(D_ERROR, req, - "ack received for reply, not wanted"); - } + LASSERT(req->rq_want_ack); + spin_lock_irqsave(&req->rq_lock, flags); + req->rq_want_ack = 0; + wake_up(&req->rq_wait_for_rep); + spin_unlock_irqrestore(&req->rq_lock, flags); } else { // XXX make sure we understand all events CERROR("Unknown event %d\n", ev->type); @@ -92,10 +93,11 @@ static int reply_out_callback(ptl_event_t *ev) int reply_in_callback(ptl_event_t *ev) { struct ptlrpc_request *req = ev->mem_desc.user_ptr; + unsigned long flags; ENTRY; /* replies always contiguous */ - LASSERT((ev->mem_desc.options & PTL_MD_IOV) == 0); + LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0); if (req->rq_xid == 0x5a5a5a5a5a5a5a5a) { CERROR("Reply received for freed request! 
Probably a missing " @@ -109,11 +111,21 @@ int reply_in_callback(ptl_event_t *ev) } if (ev->type == PTL_EVENT_PUT) { - req->rq_repmsg = ev->mem_desc.start + ev->offset; - barrier(); - wake_up(&req->rq_wait_for_rep); + /* Bug 1190: should handle non-zero offset as a protocol + * error */ + LASSERT (ev->offset == 0); + + spin_lock_irqsave (&req->rq_lock, flags); + LASSERT (req->rq_receiving_reply); + req->rq_receiving_reply = 0; + req->rq_replied = 1; + if (req->rq_set != NULL) + wake_up(&req->rq_set->set_waitq); + else + wake_up(&req->rq_wait_for_rep); + spin_unlock_irqrestore (&req->rq_lock, flags); } else { - // XXX make sure we understand all events, including ACK's + // XXX make sure we understand all events, including ACKs CERROR("Unknown event %d\n", ev->type); LBUG(); } @@ -128,7 +140,7 @@ int request_in_callback(ptl_event_t *ev) struct ptlrpc_service *service = srv_ni->sni_service; /* requests always contiguous */ - LASSERT((ev->mem_desc.options & PTL_MD_IOV) == 0); + LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0); /* we only enable puts */ LASSERT(ev->type == PTL_EVENT_PUT); LASSERT(atomic_read(&srv_ni->sni_nrqbds_receiving) > 0); @@ -138,14 +150,14 @@ int request_in_callback(ptl_event_t *ev) CERROR("Warning: Possibly truncated rpc (%d/%d)\n", ev->mlength, ev->rlength); - if (ptl_is_valid_handle(&ev->unlinked_me)) { + if (!PtlHandleEqual (ev->unlinked_me, PTL_HANDLE_NONE)) { /* This is the last request to be received into this * request buffer. We don't bump the refcount, since the * thread servicing this event is effectively taking over * portals' reference. 
*/ -#warning ev->unlinked_me.nal_idx is not set properly in a callback - LASSERT(ev->unlinked_me.handle_idx==rqbd->rqbd_me_h.handle_idx); + /* NB ev->unlinked_me.nal_idx is not set properly in a callback */ + LASSERT(ev->unlinked_me.cookie==rqbd->rqbd_me_h.cookie); /* we're off the air */ /* we'll probably start dropping packets in portals soon */ @@ -163,10 +175,8 @@ int request_in_callback(ptl_event_t *ev) static int bulk_put_source_callback(ptl_event_t *ev) { + unsigned long flags; struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr; - struct ptlrpc_bulk_page *bulk; - struct list_head *tmp; - struct list_head *next; ENTRY; CDEBUG(D_NET, "got %s event %d\n", @@ -175,80 +185,77 @@ static int bulk_put_source_callback(ptl_event_t *ev) LASSERT(ev->type == PTL_EVENT_SENT || ev->type == PTL_EVENT_ACK); - LASSERT(atomic_read(&desc->bd_source_callback_count) > 0 && - atomic_read(&desc->bd_source_callback_count) <= 2); - /* 1 fragment for each page always */ LASSERT(ev->mem_desc.niov == desc->bd_page_count); - if (atomic_dec_and_test(&desc->bd_source_callback_count)) { - void (*event_handler)(struct ptlrpc_bulk_desc *); - - list_for_each_safe(tmp, next, &desc->bd_page_list) { - bulk = list_entry(tmp, struct ptlrpc_bulk_page, - bp_link); - - if (bulk->bp_cb != NULL) - bulk->bp_cb(bulk); - } - - /* We need to make a note of whether there's an event handler - * before we call wake_up, because if there is no event handler, - * 'desc' might be freed before we're scheduled again. 
*/ - event_handler = desc->bd_ptl_ev_hdlr; - - desc->bd_flags |= PTL_BULK_FL_SENT; + spin_lock_irqsave (&desc->bd_lock, flags); + + LASSERT(desc->bd_callback_count > 0 && + desc->bd_callback_count <= 2); + + if (--desc->bd_callback_count == 0) { + desc->bd_network_rw = 0; + desc->bd_complete = 1; wake_up(&desc->bd_waitq); - if (event_handler) { - LASSERT(desc->bd_ptl_ev_hdlr == event_handler); - event_handler(desc); - } } + spin_unlock_irqrestore (&desc->bd_lock, flags); RETURN(0); } +struct ptlrpc_bulk_desc ptlrpc_bad_desc; +ptl_event_t ptlrpc_bad_event; + static int bulk_put_sink_callback(ptl_event_t *ev) { struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr; - struct ptlrpc_bulk_page *bulk; - struct list_head *tmp; - struct list_head *next; - ptl_size_t total = 0; - void (*event_handler)(struct ptlrpc_bulk_desc *); + unsigned long flags; ENTRY; LASSERT(ev->type == PTL_EVENT_PUT); - /* put with zero offset */ - LASSERT(ev->offset == 0); /* used iovs */ - LASSERT((ev->mem_desc.options & PTL_MD_IOV) != 0); + LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == + PTL_MD_KIOV); + /* Honestly, it's best to find out early. 
*/ + if (desc->bd_page_count == 0x5a5a5a5a5a || + desc->bd_page_count != ev->mem_desc.niov || + ev->mem_desc.start != &desc->bd_iov) { + /* not guaranteed (don't LASSERT) but good for this bug hunt */ + ptlrpc_bad_event = *ev; + ptlrpc_bad_desc = *desc; + CERROR ("XXX ev %p type %d portal %d match "LPX64", seq %ld\n", + ev, ev->type, ev->portal, ev->match_bits, ev->sequence); + CERROR ("XXX desc %p, export %p import %p gen %d " + " portal %d\n", + desc, desc->bd_export, + desc->bd_import, desc->bd_import_generation, + desc->bd_portal); + RETURN (0); + } + + LASSERT(desc->bd_page_count != 0x5a5a5a5a); /* 1 fragment for each page always */ LASSERT(ev->mem_desc.niov == desc->bd_page_count); - - list_for_each_safe (tmp, next, &desc->bd_page_list) { - bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link); - - total += bulk->bp_buflen; - - if (bulk->bp_cb != NULL) - bulk->bp_cb(bulk); + LASSERT(ev->match_bits == desc->bd_req->rq_xid); + + /* peer must put with zero offset */ + if (ev->offset != 0) { + /* Bug 1190: handle this as a protocol failure */ + CERROR ("Bad offset %d\n", ev->offset); + LBUG (); } - LASSERT(ev->mem_desc.length == total); - - /* We need to make a note of whether there's an event handler - * before we call wake_up, because if there is no event - * handler, 'desc' might be freed before we're scheduled again. 
*/ - event_handler = desc->bd_ptl_ev_hdlr; + /* No check for total # bytes; this could be a short read */ - desc->bd_flags |= PTL_BULK_FL_RCVD; - wake_up(&desc->bd_waitq); - if (event_handler) { - LASSERT(desc->bd_ptl_ev_hdlr == event_handler); - event_handler(desc); - } + spin_lock_irqsave (&desc->bd_lock, flags); + desc->bd_network_rw = 0; + desc->bd_complete = 1; + if (desc->bd_req->rq_set != NULL) + wake_up (&desc->bd_req->rq_set->set_waitq); + else + wake_up (&desc->bd_req->rq_wait_for_rep); + spin_unlock_irqrestore (&desc->bd_lock, flags); RETURN(1); } @@ -258,122 +265,108 @@ static int bulk_get_source_callback(ptl_event_t *ev) struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr; struct ptlrpc_bulk_page *bulk; struct list_head *tmp; - struct list_head *next; + unsigned long flags; ptl_size_t total = 0; - void (*event_handler)(struct ptlrpc_bulk_desc *); ENTRY; LASSERT(ev->type == PTL_EVENT_GET); - /* put with zero offset */ - LASSERT(ev->offset == 0); /* used iovs */ - LASSERT((ev->mem_desc.options & PTL_MD_IOV) != 0); + LASSERT((ev->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == + PTL_MD_KIOV); /* 1 fragment for each page always */ LASSERT(ev->mem_desc.niov == desc->bd_page_count); + LASSERT(ev->match_bits == desc->bd_req->rq_xid); - list_for_each_safe (tmp, next, &desc->bd_page_list) { + /* peer must get with zero offset */ + if (ev->offset != 0) { + /* Bug 1190: handle this as a protocol failure */ + CERROR ("Bad offset %d\n", ev->offset); + LBUG (); + } + + list_for_each (tmp, &desc->bd_page_list) { bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link); total += bulk->bp_buflen; - - if (bulk->bp_cb != NULL) - bulk->bp_cb(bulk); } - LASSERT(ev->mem_desc.length == total); - - /* We need to make a note of whether there's an event handler - * before we call wake_up, because if there is no event - * handler, 'desc' might be freed before we're scheduled again. 
*/ - event_handler = desc->bd_ptl_ev_hdlr; - - desc->bd_flags |= PTL_BULK_FL_SENT; - wake_up(&desc->bd_waitq); - if (event_handler) { - LASSERT(desc->bd_ptl_ev_hdlr == event_handler); - event_handler(desc); + /* peer must get everything */ + if (ev->mem_desc.length != total) { + /* Bug 1190: handle this as a protocol failure */ + CERROR ("Bad length/total %d/%d\n", ev->mem_desc.length, total); + LBUG (); } + spin_lock_irqsave (&desc->bd_lock, flags); + desc->bd_network_rw = 0; + desc->bd_complete = 1; + if (desc->bd_req->rq_set != NULL) + wake_up (&desc->bd_req->rq_set->set_waitq); + else + wake_up (&desc->bd_req->rq_wait_for_rep); + spin_unlock_irqrestore (&desc->bd_lock, flags); + RETURN(1); } - static int bulk_get_sink_callback(ptl_event_t *ev) { struct ptlrpc_bulk_desc *desc = ev->mem_desc.user_ptr; - struct ptlrpc_bulk_page *bulk; - struct list_head *tmp; - struct list_head *next; + unsigned long flags; ENTRY; CDEBUG(D_NET, "got %s event %d\n", (ev->type == PTL_EVENT_SENT) ? "SENT" : - (ev->type == PTL_EVENT_REPLY) ? "REPLY" : "UNEXPECTED", + (ev->type == PTL_EVENT_REPLY) ? "REPLY" : "UNEXPECTED", ev->type); LASSERT(ev->type == PTL_EVENT_SENT || ev->type == PTL_EVENT_REPLY); - LASSERT(atomic_read(&desc->bd_source_callback_count) > 0 && - atomic_read(&desc->bd_source_callback_count) <= 2); - /* 1 fragment for each page always */ LASSERT(ev->mem_desc.niov == desc->bd_page_count); - if (atomic_dec_and_test(&desc->bd_source_callback_count)) { - void (*event_handler)(struct ptlrpc_bulk_desc *); - - list_for_each_safe(tmp, next, &desc->bd_page_list) { - bulk = list_entry(tmp, struct ptlrpc_bulk_page, - bp_link); + spin_lock_irqsave (&desc->bd_lock, flags); + LASSERT(desc->bd_callback_count > 0 && + desc->bd_callback_count <= 2); - if (bulk->bp_cb != NULL) - bulk->bp_cb(bulk); - } - - /* We need to make a note of whether there's an event handler - * before we call wake_up, because if there is no event handler, - * 'desc' might be freed before we're scheduled again. 
*/ - event_handler = desc->bd_ptl_ev_hdlr; - - desc->bd_flags |= PTL_BULK_FL_RCVD; + if (--desc->bd_callback_count == 0) { + desc->bd_network_rw = 0; + desc->bd_complete = 1; wake_up(&desc->bd_waitq); - if (event_handler) { - LASSERT(desc->bd_ptl_ev_hdlr == event_handler); - event_handler(desc); - } } + spin_unlock_irqrestore (&desc->bd_lock, flags); RETURN(0); } -int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer) +int ptlrpc_uuid_to_peer (struct obd_uuid *uuid, struct ptlrpc_peer *peer) { struct ptlrpc_ni *pni; struct lustre_peer lpeer; int i; int rc = lustre_uuid_to_peer (uuid->uuid, &lpeer); - + if (rc != 0) RETURN (rc); - + for (i = 0; i < ptlrpc_ninterfaces; i++) { pni = &ptlrpc_interfaces[i]; - if (!memcmp (&lpeer.peer_ni, &pni->pni_ni_h, - sizeof (lpeer.peer_ni))) { + if (!memcmp(&lpeer.peer_ni, &pni->pni_ni_h, + sizeof (lpeer.peer_ni))) { peer->peer_nid = lpeer.peer_nid; peer->peer_ni = pni; return (0); } } - - CERROR ("Can't find ptlrpc interface for "LPX64" ni handle %08lx %08lx\n", - lpeer.peer_nid, lpeer.peer_ni.nal_idx, lpeer.peer_ni.handle_idx); + + CERROR("Can't find ptlrpc interface for "LPX64" ni handle %08lx."LPX64"\n", + lpeer.peer_nid, lpeer.peer_ni.nal_idx, lpeer.peer_ni.cookie); return (-ENOENT); } -void ptlrpc_ni_fini (struct ptlrpc_ni *pni) +void ptlrpc_ni_fini(struct ptlrpc_ni *pni) { PtlEQFree(pni->pni_request_out_eq_h); PtlEQFree(pni->pni_reply_out_eq_h); @@ -382,111 +375,116 @@ void ptlrpc_ni_fini (struct ptlrpc_ni *pni) PtlEQFree(pni->pni_bulk_put_sink_eq_h); PtlEQFree(pni->pni_bulk_get_source_eq_h); PtlEQFree(pni->pni_bulk_get_sink_eq_h); - - inter_module_put(pni->pni_name); + + kportal_put_ni (pni->pni_number); } -int ptlrpc_ni_init (char *name, struct ptlrpc_ni *pni) +int ptlrpc_ni_init(int number, char *name, struct ptlrpc_ni *pni) { int rc; - ptl_handle_ni_t *nip; + ptl_handle_ni_t *nip = kportal_get_ni (number); - nip = (ptl_handle_ni_t *)inter_module_get (name); if (nip == NULL) { CDEBUG (D_NET, "Network 
interface %s not loaded\n", name); return (-ENOENT); } - - CDEBUG (D_NET, "init %s: nal_idx %ld\n", name, nip->nal_idx); - + + CDEBUG (D_NET, "init %d %s: nal_idx %ld\n", number, name, nip->nal_idx); + pni->pni_name = name; + pni->pni_number = number; pni->pni_ni_h = *nip; - ptl_set_inv_handle (&pni->pni_request_out_eq_h); - ptl_set_inv_handle (&pni->pni_reply_out_eq_h); - ptl_set_inv_handle (&pni->pni_reply_in_eq_h); - ptl_set_inv_handle (&pni->pni_bulk_put_source_eq_h); - ptl_set_inv_handle (&pni->pni_bulk_put_sink_eq_h); - ptl_set_inv_handle (&pni->pni_bulk_get_source_eq_h); - ptl_set_inv_handle (&pni->pni_bulk_get_sink_eq_h); - + pni->pni_request_out_eq_h = PTL_HANDLE_NONE; + pni->pni_reply_out_eq_h = PTL_HANDLE_NONE; + pni->pni_reply_in_eq_h = PTL_HANDLE_NONE; + pni->pni_bulk_put_source_eq_h = PTL_HANDLE_NONE; + pni->pni_bulk_put_sink_eq_h = PTL_HANDLE_NONE; + pni->pni_bulk_get_source_eq_h = PTL_HANDLE_NONE; + pni->pni_bulk_get_sink_eq_h = PTL_HANDLE_NONE; + /* NB We never actually PtlEQGet() out of these events queues since * we're only interested in the event callback, so we can just let * them wrap. Their sizes aren't a big deal, apart from providing * a little history for debugging... 
*/ - - rc = PtlEQAlloc(pni->pni_ni_h, 1024, request_out_callback, + + rc = PtlEQAlloc(pni->pni_ni_h, 1024, request_out_callback, &pni->pni_request_out_eq_h); if (rc != PTL_OK) GOTO (fail, rc = -ENOMEM); - - rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_out_callback, + + rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_out_callback, &pni->pni_reply_out_eq_h); if (rc != PTL_OK) GOTO (fail, rc = -ENOMEM); - + rc = PtlEQAlloc(pni->pni_ni_h, 1024, reply_in_callback, &pni->pni_reply_in_eq_h); if (rc != PTL_OK) GOTO (fail, rc = -ENOMEM); - + rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_source_callback, &pni->pni_bulk_put_source_eq_h); if (rc != PTL_OK) GOTO (fail, rc = -ENOMEM); - + rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_put_sink_callback, &pni->pni_bulk_put_sink_eq_h); if (rc != PTL_OK) GOTO (fail, rc = -ENOMEM); - + rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_source_callback, &pni->pni_bulk_get_source_eq_h); if (rc != PTL_OK) GOTO (fail, rc = -ENOMEM); - + rc = PtlEQAlloc(pni->pni_ni_h, 1024, bulk_get_sink_callback, &pni->pni_bulk_get_sink_eq_h); if (rc != PTL_OK) GOTO (fail, rc = -ENOMEM); - + return (0); - fail: + fail: CERROR ("Failed to initialise network interface %s: %d\n", name, rc); - /* OK to do complete teardown since we invalidated the handles above... */ + /* OK to do complete teardown since we invalidated the handles above */ ptlrpc_ni_fini (pni); return (rc); } int ptlrpc_init_portals(void) { - /* Add new portals network interface names here. + /* Add new portals network interfaces here. * Order is irrelevent! 
*/ - char *ni_names[] = { "kqswnal_ni", - "kgmnal_ni", - "ksocknal_ni", - "ktoenal_ni", - "tcpnal_ni", - NULL }; + static struct { + int number; + char *name; + } ptl_nis[] = { + {QSWNAL, "qswnal"}, + {SOCKNAL, "socknal"}, + {GMNAL, "gmnal"}, + {TOENAL, "toenal"}, + {TCPNAL, "tcpnal"}, + {SCIMACNAL, "scimacnal"}}; int rc; int i; - - LASSERT (ptlrpc_ninterfaces == 0); - - for (i = 0; ni_names[i] != NULL; i++) { - LASSERT (ptlrpc_ninterfaces < - sizeof (ptlrpc_interfaces)/sizeof (ptlrpc_interfaces[0])); - - rc = ptlrpc_ni_init (ni_names[i], - &ptlrpc_interfaces[ptlrpc_ninterfaces]); + + LASSERT(ptlrpc_ninterfaces == 0); + + for (i = 0; i < sizeof (ptl_nis) / sizeof (ptl_nis[0]); i++) { + LASSERT(ptlrpc_ninterfaces < (sizeof(ptlrpc_interfaces) / + sizeof(ptlrpc_interfaces[0]))); + + rc = ptlrpc_ni_init(ptl_nis[i].number, ptl_nis[i].name, + &ptlrpc_interfaces[ptlrpc_ninterfaces]); if (rc == 0) ptlrpc_ninterfaces++; } - + if (ptlrpc_ninterfaces == 0) { - CERROR("network initialisation failed: is a NAL module loaded?\n"); + CERROR("network initialisation failed: is a NAL module " + "loaded?\n"); return -EIO; } return 0; diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 1b3532e..cc9982c 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -21,21 +21,138 @@ */ #define DEBUG_SUBSYSTEM S_CLASS +#include <linux/obd_support.h> +#include <linux/obd.h> #include <linux/lprocfs_status.h> +#include <linux/lustre_idl.h> +#include <linux/lustre_net.h> +#include "ptlrpc_internal.h" + + +struct ll_rpc_opcode { + __u32 opcode; + const char *opname; +} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = { + { OST_REPLY, "ost_reply" }, + { OST_GETATTR, "ost_getattr" }, + { OST_SETATTR, "ost_setattr" }, + { OST_READ, "ost_read" }, + { OST_WRITE, "ost_write" }, + { OST_CREATE , "ost_create" }, + { OST_DESTROY, "ost_destroy" }, + { OST_GET_INFO, "ost_get_info" }, + { OST_CONNECT, "ost_connect" }, + { OST_DISCONNECT, "ost_disconnect" }, + { 
OST_PUNCH, "ost_punch" }, + { OST_OPEN, "ost_open" }, + { OST_CLOSE, "ost_close" }, + { OST_STATFS, "ost_statfs" }, + { OST_SAN_READ, "ost_san_read" }, + { OST_SAN_WRITE, "ost_san_write" }, + { OST_SYNCFS, "ost_syncfs" }, + { MDS_GETATTR, "mds_getattr" }, + { MDS_GETATTR_NAME, "mds_getattr_name" }, + { MDS_CLOSE, "mds_close" }, + { MDS_REINT, "mds_reint" }, + { MDS_READPAGE, "mds_readpage" }, + { MDS_CONNECT, "mds_connect" }, + { MDS_DISCONNECT, "mds_disconnect" }, + { MDS_GETSTATUS, "mds_getstatus" }, + { MDS_STATFS, "mds_statfs" }, + { MDS_GETLOVINFO, "mds_getlovinfo" }, + { LDLM_ENQUEUE, "ldlm_enqueue" }, + { LDLM_CONVERT, "ldlm_convert" }, + { LDLM_CANCEL, "ldlm_cancel" }, + { LDLM_BL_CALLBACK, "ldlm_bl_callback" }, + { LDLM_CP_CALLBACK, "ldlm_cp_callback" }, + { PTLBD_QUERY, "ptlbd_query" }, + { PTLBD_READ, "ptlbd_read" }, + { PTLBD_WRITE, "ptlbd_write" }, + { PTLBD_FLUSH, "ptlbd_flush" }, + { OBD_PING, "obd_ping" } +}; + +const char* ll_opcode2str(__u32 opcode) +{ + /* When one of the assertions below fail, chances are that: + * 1) A new opcode was added in lustre_idl.h, but was + * is missing from the table above. + * or 2) The opcode space was renumbered or rearranged, + * and the opcode_offset() function in + * ptlrpc_internals.h needs to be modified. 
+ */ + __u32 offset = opcode_offset(opcode); + LASSERT(offset < LUSTRE_MAX_OPCODES); + LASSERT(ll_rpc_opcode_table[offset].opcode == opcode); + return ll_rpc_opcode_table[offset].opname; +} #ifndef LPROCFS -struct lprocfs_vars lprocfs_obd_vars[] = { {0} }; -struct lprocfs_vars lprocfs_module_vars[] = { {0} }; +void ptlrpc_lprocfs_register_service(struct obd_device *obddev, + struct ptlrpc_service *svc) { return ; } +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc) { return; } #else -struct lprocfs_vars lprocfs_obd_vars[] = { - { "uuid", lprocfs_rd_uuid, 0, 0}, - { 0 } -}; -struct lprocfs_vars lprocfs_module_vars[] = { - { "num_refs", lprocfs_rd_numrefs, 0, 0}, - { 0 } -}; +void ptlrpc_lprocfs_register_service(struct obd_device *obddev, + struct ptlrpc_service *svc) +{ + struct proc_dir_entry *svc_procroot; + struct lprocfs_counters *svc_cntrs; + int i, rc; + unsigned int svc_counter_config = LPROCFS_CNTR_EXTERNALLOCK | + LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV; + + LASSERT(svc->svc_procroot == NULL); + LASSERT(svc->svc_counters == NULL); + + svc_procroot = lprocfs_register(svc->srv_name, obddev->obd_proc_entry, + NULL, NULL); + if (svc_procroot == NULL) + return; + + svc_cntrs = + lprocfs_alloc_counters(PTLRPC_LAST_CNTR+LUSTRE_MAX_OPCODES); + if (svc_cntrs == NULL) { + lprocfs_remove(svc_procroot); + return; + } + + LPROCFS_COUNTER_INIT(&svc_cntrs->cntr[PTLRPC_REQWAIT_CNTR], + svc_counter_config, &svc->srv_lock, + "req_waittime", "cycles"); + LPROCFS_COUNTER_INIT(&svc_cntrs->cntr[PTLRPC_SVCEQDEPTH_CNTR], + svc_counter_config, &svc->srv_lock, + "svc_eqdepth", "reqs"); + /* no stddev on idletime */ + LPROCFS_COUNTER_INIT(&svc_cntrs->cntr[PTLRPC_SVCIDLETIME_CNTR], + (LPROCFS_CNTR_EXTERNALLOCK | LPROCFS_CNTR_AVGMINMAX), + &svc->srv_lock, "svc_idletime", "cycles"); + for (i=0; i < LUSTRE_MAX_OPCODES; i++) { + __u32 opcode = ll_rpc_opcode_table[i].opcode; + LPROCFS_COUNTER_INIT(&svc_cntrs->cntr[PTLRPC_LAST_CNTR+i], + svc_counter_config, 
&svc->srv_lock, + ll_opcode2str(opcode), "cycles"); + } + rc = lprocfs_register_counters(svc_procroot, "service_stats", + svc_cntrs); + if (rc < 0) { + lprocfs_remove(svc_procroot); + lprocfs_free_counters(svc_cntrs); + } else { + svc->svc_procroot = svc_procroot; + svc->svc_counters = svc_cntrs; + } +} +void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc) +{ + if (svc->svc_procroot) { + lprocfs_remove(svc->svc_procroot); + svc->svc_procroot = NULL; + } + if (svc->svc_counters) { + lprocfs_free_counters(svc->svc_counters); + svc->svc_counters = NULL; + } +} #endif /* LPROCFS */ -LPROCFS_INIT_VARS(lprocfs_module_vars, lprocfs_obd_vars) diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 3b1d32f..017fb8b 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -34,12 +34,14 @@ static int ptl_send_buf(struct ptlrpc_request *request, struct ptlrpc_connection *conn, int portal) { int rc; + int rc2; ptl_process_id_t remote_id; ptl_handle_md_t md_h; ptl_ack_req_t ack_req; - LASSERT(conn); - CDEBUG (D_INFO, "conn=%p ni %s nid "LPX64" on %s\n", + LASSERT (portal != 0); + LASSERT (conn != NULL); + CDEBUG (D_INFO, "conn=%p ni %s nid "LPX64" on %s\n", conn, conn->c_peer.peer_ni->pni_name, conn->c_peer.peer_nid, conn->c_peer.peer_ni->pni_name); @@ -47,23 +49,26 @@ static int ptl_send_buf(struct ptlrpc_request *request, switch (request->rq_type) { case PTL_RPC_MSG_REQUEST: - request->rq_reqmsg->type = HTON__u32(request->rq_type); + request->rq_reqmsg->type = request->rq_type; request->rq_req_md.start = request->rq_reqmsg; request->rq_req_md.length = request->rq_reqlen; - request->rq_req_md.eventq = conn->c_peer.peer_ni->pni_request_out_eq_h; + request->rq_req_md.eventq = + conn->c_peer.peer_ni->pni_request_out_eq_h; + LASSERT (!request->rq_want_ack); break; case PTL_RPC_MSG_ERR: case PTL_RPC_MSG_REPLY: - request->rq_repmsg->type = HTON__u32(request->rq_type); + request->rq_repmsg->type = request->rq_type; request->rq_req_md.start = 
request->rq_repmsg; request->rq_req_md.length = request->rq_replen; - request->rq_req_md.eventq = conn->c_peer.peer_ni->pni_reply_out_eq_h; + request->rq_req_md.eventq = + conn->c_peer.peer_ni->pni_reply_out_eq_h; break; default: LBUG(); return -1; /* notreached */ } - if (request->rq_flags & PTL_RPC_FL_WANT_ACK) { + if (request->rq_want_ack) { request->rq_req_md.threshold = 2; /* SENT and ACK */ ack_req = PTL_ACK_REQ; } else { @@ -78,12 +83,18 @@ static int ptl_send_buf(struct ptlrpc_request *request, obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED; } - rc = PtlMDBind(conn->c_peer.peer_ni->pni_ni_h, request->rq_req_md, &md_h); - if (rc != 0) { + /* NB if the send fails, we back out of the send and return + * failure; it's down to the caller to handle missing callbacks */ + + rc = PtlMDBind(conn->c_peer.peer_ni->pni_ni_h, request->rq_req_md, + &md_h); + if (rc != PTL_OK) { CERROR("PtlMDBind failed: %d\n", rc); - LBUG(); - return rc; + LASSERT (rc == PTL_NOSPACE); + RETURN (-ENOMEM); } + if (request->rq_type != PTL_RPC_MSG_REQUEST) + memcpy(&request->rq_reply_md_h, &md_h, sizeof(md_h)); remote_id.nid = conn->c_peer.peer_nid; remote_id.pid = 0; @@ -91,27 +102,27 @@ static int ptl_send_buf(struct ptlrpc_request *request, CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n", request->rq_req_md.length, portal, request->rq_xid); - if (!portal) - LBUG(); rc = PtlPut(md_h, ack_req, remote_id, portal, 0, request->rq_xid, 0, 0); if (rc != PTL_OK) { CERROR("PtlPut("LPU64", %d, "LPD64") failed: %d\n", remote_id.nid, portal, request->rq_xid, rc); - PtlMDUnlink(md_h); + rc2 = PtlMDUnlink(md_h); + LASSERT (rc2 == PTL_OK); + RETURN ((rc == PTL_NOSPACE) ? 
-ENOMEM : -ECOMM); } - return rc; + return 0; } -static inline struct iovec * +static inline ptl_kiov_t * ptlrpc_get_bulk_iov (struct ptlrpc_bulk_desc *desc) { - struct iovec *iov; + ptl_kiov_t *iov; - if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (struct iovec)) + if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (*iov)) return (desc->bd_iov); - OBD_ALLOC (iov, desc->bd_page_count * sizeof (struct iovec)); + OBD_ALLOC (iov, desc->bd_page_count * sizeof (*iov)); if (iov == NULL) LBUG(); @@ -119,39 +130,45 @@ ptlrpc_get_bulk_iov (struct ptlrpc_bulk_desc *desc) } static inline void -ptlrpc_put_bulk_iov (struct ptlrpc_bulk_desc *desc, struct iovec *iov) +ptlrpc_put_bulk_iov (struct ptlrpc_bulk_desc *desc, ptl_kiov_t *iov) { - if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (struct iovec)) + if (desc->bd_page_count <= sizeof (desc->bd_iov)/sizeof (*iov)) return; - OBD_FREE (iov, desc->bd_page_count * sizeof (struct iovec)); + OBD_FREE (iov, desc->bd_page_count * sizeof (*iov)); } int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc) { int rc; + int rc2; struct ptlrpc_peer *peer; struct list_head *tmp, *next; ptl_process_id_t remote_id; - __u32 xid = 0; - struct iovec *iov; + ptl_kiov_t *iov; + __u64 xid; ENTRY; + /* NB no locking required until desc is on the network */ + LASSERT (!desc->bd_network_rw); + LASSERT (desc->bd_type == BULK_PUT_SOURCE); + desc->bd_complete = 0; + iov = ptlrpc_get_bulk_iov (desc); if (iov == NULL) RETURN (-ENOMEM); - peer = &desc->bd_connection->c_peer; + peer = &desc->bd_export->exp_connection->c_peer; desc->bd_md.start = iov; desc->bd_md.niov = 0; desc->bd_md.length = 0; desc->bd_md.eventq = peer->peer_ni->pni_bulk_put_source_eq_h; desc->bd_md.threshold = 2; /* SENT and ACK */ - desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_IOV; + desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_KIOV; desc->bd_md.user_ptr = desc; - atomic_set(&desc->bd_source_callback_count, 2); + desc->bd_callback_count = 2; list_for_each_safe(tmp, next, 
&desc->bd_page_list) { struct ptlrpc_bulk_page *bulk; @@ -159,26 +176,19 @@ int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc) LASSERT(desc->bd_md.niov < desc->bd_page_count); - if (desc->bd_md.niov == 0) - xid = bulk->bp_xid; - LASSERT(xid == bulk->bp_xid); /* should all be the same */ - - iov[desc->bd_md.niov].iov_base = bulk->bp_buf; - iov[desc->bd_md.niov].iov_len = bulk->bp_buflen; - if (iov[desc->bd_md.niov].iov_len <= 0) { - CERROR("bad bp_buflen[%d] @ %p: %d\n", desc->bd_md.niov, - bulk->bp_buf, bulk->bp_buflen); - CERROR("desc: xid %u, pages %d, ptl %d, ref %d\n", - xid, desc->bd_page_count, desc->bd_portal, - atomic_read(&desc->bd_refcount)); - LBUG(); - } + iov[desc->bd_md.niov].kiov_page = bulk->bp_page; + iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset; + iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen; + + LASSERT (iov[desc->bd_md.niov].kiov_offset + + iov[desc->bd_md.niov].kiov_len <= PAGE_SIZE); desc->bd_md.niov++; desc->bd_md.length += bulk->bp_buflen; } + /* NB total length may be 0 for a read past EOF, so we send a 0 + * length bulk, since the client expects a bulk event. 
*/ LASSERT(desc->bd_md.niov == desc->bd_page_count); - LASSERT(desc->bd_md.niov != 0); rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md, &desc->bd_md_h); @@ -187,27 +197,31 @@ int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc) if (rc != PTL_OK) { CERROR("PtlMDBind failed: %d\n", rc); - LBUG(); - RETURN(rc); + LASSERT (rc == PTL_NOSPACE); + RETURN(-ENOMEM); } + /* Client's bulk and reply matchbits are the same */ + xid = desc->bd_req->rq_xid; remote_id.nid = peer->peer_nid; remote_id.pid = 0; CDEBUG(D_NET, "Sending %u pages %u bytes to portal %d on %s " - "nid "LPX64" pid %d xid %d\n", + "nid "LPX64" pid %d xid "LPX64"\n", desc->bd_md.niov, desc->bd_md.length, desc->bd_portal, peer->peer_ni->pni_name, remote_id.nid, remote_id.pid, xid); + desc->bd_network_rw = 1; rc = PtlPut(desc->bd_md_h, PTL_ACK_REQ, remote_id, desc->bd_portal, 0, xid, 0, 0); if (rc != PTL_OK) { - CERROR("PtlPut("LPU64", %d, %d) failed: %d\n", + desc->bd_network_rw = 0; + CERROR("PtlPut("LPU64", %d, "LPX64") failed: %d\n", remote_id.nid, desc->bd_portal, xid, rc); - PtlMDUnlink(desc->bd_md_h); - LBUG(); - RETURN(rc); + rc2 = PtlMDUnlink(desc->bd_md_h); + LASSERT (rc2 == PTL_OK); + RETURN((rc == PTL_NOSPACE) ? 
-ENOMEM : -ECOMM); } RETURN(0); @@ -216,28 +230,34 @@ int ptlrpc_bulk_put(struct ptlrpc_bulk_desc *desc) int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc) { int rc; + int rc2; struct ptlrpc_peer *peer; struct list_head *tmp, *next; ptl_process_id_t remote_id; - __u32 xid = 0; - struct iovec *iov; + ptl_kiov_t *iov; + __u64 xid; ENTRY; + /* NB no locking required until desc is on the network */ + LASSERT (!desc->bd_network_rw); + LASSERT (desc->bd_type == BULK_GET_SINK); + desc->bd_complete = 0; + iov = ptlrpc_get_bulk_iov (desc); if (iov == NULL) - RETURN (-ENOMEM); + RETURN(-ENOMEM); - peer = &desc->bd_connection->c_peer; + peer = &desc->bd_export->exp_connection->c_peer; desc->bd_md.start = iov; desc->bd_md.niov = 0; desc->bd_md.length = 0; desc->bd_md.eventq = peer->peer_ni->pni_bulk_get_sink_eq_h; desc->bd_md.threshold = 2; /* SENT and REPLY */ - desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_IOV; + desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_KIOV; desc->bd_md.user_ptr = desc; - atomic_set(&desc->bd_source_callback_count, 2); + desc->bd_callback_count = 2; list_for_each_safe(tmp, next, &desc->bd_page_list) { struct ptlrpc_bulk_page *bulk; @@ -245,20 +265,12 @@ int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc) LASSERT(desc->bd_md.niov < desc->bd_page_count); - if (desc->bd_md.niov == 0) - xid = bulk->bp_xid; - LASSERT(xid == bulk->bp_xid); /* should all be the same */ - - iov[desc->bd_md.niov].iov_base = bulk->bp_buf; - iov[desc->bd_md.niov].iov_len = bulk->bp_buflen; - if (iov[desc->bd_md.niov].iov_len <= 0) { - CERROR("bad bulk %p bp_buflen[%d] @ %p: %d\n", bulk, - desc->bd_md.niov, bulk->bp_buf, bulk->bp_buflen); - CERROR("desc %p: xid %u, pages %d, ptl %d, ref %d\n", - desc, xid, desc->bd_page_count, desc->bd_portal, - atomic_read(&desc->bd_refcount)); - LBUG(); - } + iov[desc->bd_md.niov].kiov_page = bulk->bp_page; + iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen; + iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset; + + LASSERT 
(iov[desc->bd_md.niov].kiov_offset + + iov[desc->bd_md.niov].kiov_len <= PAGE_SIZE); desc->bd_md.niov++; desc->bd_md.length += bulk->bp_buflen; } @@ -266,78 +278,156 @@ int ptlrpc_bulk_get(struct ptlrpc_bulk_desc *desc) LASSERT(desc->bd_md.niov == desc->bd_page_count); LASSERT(desc->bd_md.niov != 0); - rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md, - &desc->bd_md_h); + rc = PtlMDBind(peer->peer_ni->pni_ni_h, desc->bd_md, &desc->bd_md_h); - ptlrpc_put_bulk_iov (desc, iov); /*move down to reduce latency to send*/ + ptlrpc_put_bulk_iov(desc, iov); /*move down to reduce latency to send*/ if (rc != PTL_OK) { CERROR("PtlMDBind failed: %d\n", rc); - LBUG(); - RETURN(rc); + LASSERT (rc == PTL_NOSPACE); + RETURN(-ENOMEM); } - remote_id.nid = desc->bd_connection->c_peer.peer_nid; + /* Client's bulk and reply matchbits are the same */ + xid = desc->bd_req->rq_xid; + remote_id.nid = desc->bd_export->exp_connection->c_peer.peer_nid; remote_id.pid = 0; - CDEBUG(D_NET, "Sending %u pages %u bytes to portal %d on %s " - "nid "LPX64" pid %d xid %d\n", - desc->bd_md.niov, desc->bd_md.length, - desc->bd_portal, peer->peer_ni->pni_name, - remote_id.nid, remote_id.pid, xid); + CDEBUG(D_NET, "Fetching %u pages %u bytes from portal %d on %s " + "nid "LPX64" pid %d xid "LPX64"\n", + desc->bd_md.niov, desc->bd_md.length, desc->bd_portal, + peer->peer_ni->pni_name, remote_id.nid, remote_id.pid, + xid); - rc = PtlGet(desc->bd_md_h, remote_id, desc->bd_portal, 0, xid, 0); + desc->bd_network_rw = 1; + rc = PtlGet(desc->bd_md_h, remote_id, desc->bd_portal, 0, + xid, 0); if (rc != PTL_OK) { - CERROR("PtlGet("LPU64", %d, %d) failed: %d\n", + desc->bd_network_rw = 0; + CERROR("PtlGet("LPU64", %d, "LPX64") failed: %d\n", remote_id.nid, desc->bd_portal, xid, rc); - PtlMDUnlink(desc->bd_md_h); - LBUG(); - RETURN(rc); + rc2 = PtlMDUnlink(desc->bd_md_h); + LASSERT (rc2 == PTL_OK); + RETURN((rc == PTL_NOSPACE) ? 
-ENOMEM : -ECOMM); } RETURN(0); } -static int ptlrpc_register_bulk_shared(struct ptlrpc_bulk_desc *desc) +void ptlrpc_abort_bulk (struct ptlrpc_bulk_desc *desc) +{ + /* Server side bulk abort. Idempotent. Not thread-safe (i.e. only + * serialises with completion callback) */ + unsigned long flags; + struct l_wait_info lwi; + int callback_count; + int rc; + + LASSERT (!in_interrupt ()); /* might sleep */ + + /* NB. server-side bulk gets 2 events, so we have to keep trying to + * unlink the MD until all callbacks have happened, or + * PtlMDUnlink() returns OK or INVALID */ + again: + spin_lock_irqsave (&desc->bd_lock, flags); + if (!desc->bd_network_rw) { + /* completed or never even registered. NB holding bd_lock + * guarantees callback has completed if it ran. */ + spin_unlock_irqrestore (&desc->bd_lock, flags); + return; + } + + /* sample callback count while we have the lock */ + callback_count = desc->bd_callback_count; + spin_unlock_irqrestore (&desc->bd_lock, flags); + + rc = PtlMDUnlink (desc->bd_md_h); + switch (rc) { + default: + CERROR("PtlMDUnlink returned %d\n", rc); + LBUG (); + case PTL_OK: /* Won the race with the network */ + LASSERT (!desc->bd_complete); /* Not all callbacks ran */ + desc->bd_network_rw = 0; + return; + + case PTL_MD_INUSE: /* MD is being accessed right now */ + for (;;) { + /* Network access will complete in finite time but the + * timeout lets us CERROR for visibility */ + lwi = LWI_TIMEOUT (10 * HZ, NULL, NULL); + rc = l_wait_event(desc->bd_waitq, + desc->bd_callback_count != + callback_count, &lwi); + if (rc == -ETIMEDOUT) { + CERROR("Unexpectedly long timeout: desc %p\n", + desc); + continue; + } + LASSERT (rc == 0); + break; + } + /* go back and try again... 
*/ + goto again; + + case PTL_INV_MD: /* Lost the race with completion */ + LASSERT (desc->bd_complete); /* Callbacks all ran */ + LASSERT (!desc->bd_network_rw); + return; + } +} + +int ptlrpc_register_bulk (struct ptlrpc_request *req) { + struct ptlrpc_bulk_desc *desc = req->rq_bulk; struct ptlrpc_peer *peer; struct list_head *tmp, *next; int rc; - __u32 xid = 0; - struct iovec *iov; + int rc2; + ptl_kiov_t *iov; ptl_process_id_t source_id; ENTRY; - if (desc->bd_page_count > PTL_MD_MAX_IOV) { - CERROR("iov longer than %d pages not supported (count=%d)\n", - PTL_MD_MAX_IOV, desc->bd_page_count); - RETURN(-EINVAL); - } + /* NB no locking required until desc is on the network */ + LASSERT (!desc->bd_network_rw); + LASSERT (desc->bd_page_count <= PTL_MD_MAX_IOV); + LASSERT (desc->bd_req != NULL); + LASSERT (desc->bd_type == BULK_PUT_SINK || + desc->bd_type == BULK_GET_SOURCE); + + desc->bd_complete = 0; iov = ptlrpc_get_bulk_iov (desc); if (iov == NULL) return (-ENOMEM); - peer = &desc->bd_connection->c_peer; - + peer = &desc->bd_import->imp_connection->c_peer; + desc->bd_md.start = iov; desc->bd_md.niov = 0; desc->bd_md.length = 0; desc->bd_md.threshold = 1; desc->bd_md.user_ptr = desc; + if (desc->bd_type == BULK_GET_SOURCE) { + desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_KIOV; + desc->bd_md.eventq = peer->peer_ni->pni_bulk_get_source_eq_h; + } else { + desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_KIOV; + desc->bd_md.eventq = peer->peer_ni->pni_bulk_put_sink_eq_h; + } + list_for_each_safe(tmp, next, &desc->bd_page_list) { struct ptlrpc_bulk_page *bulk; bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link); LASSERT(desc->bd_md.niov < desc->bd_page_count); - if (desc->bd_md.niov == 0) - xid = bulk->bp_xid; - LASSERT(xid == bulk->bp_xid); /* should all be the same */ + iov[desc->bd_md.niov].kiov_page = bulk->bp_page; + iov[desc->bd_md.niov].kiov_len = bulk->bp_buflen; + iov[desc->bd_md.niov].kiov_offset = bulk->bp_pageoffset; - iov[desc->bd_md.niov].iov_base = 
bulk->bp_buf; - iov[desc->bd_md.niov].iov_len = bulk->bp_buflen; + LASSERT (bulk->bp_pageoffset + bulk->bp_buflen <= PAGE_SIZE); desc->bd_md.niov++; desc->bd_md.length += bulk->bp_buflen; } @@ -345,157 +435,145 @@ static int ptlrpc_register_bulk_shared(struct ptlrpc_bulk_desc *desc) LASSERT(desc->bd_md.niov == desc->bd_page_count); LASSERT(desc->bd_md.niov != 0); - source_id.nid = desc->bd_connection->c_peer.peer_nid; + /* XXX Registering the same xid on retried bulk makes my head + * explode trying to understand how the original request's bulk + * might interfere with the retried request -eeb */ + LASSERT (!desc->bd_registered || req->rq_xid != desc->bd_last_xid); + desc->bd_registered = 1; + desc->bd_last_xid = desc->bd_last_xid; + + source_id.nid = desc->bd_import->imp_connection->c_peer.peer_nid; source_id.pid = PTL_PID_ANY; rc = PtlMEAttach(peer->peer_ni->pni_ni_h, - desc->bd_portal, source_id, xid, 0, + desc->bd_portal, source_id, req->rq_xid, 0, PTL_UNLINK, PTL_INS_AFTER, &desc->bd_me_h); if (rc != PTL_OK) { CERROR("PtlMEAttach failed: %d\n", rc); - LBUG(); - GOTO(cleanup, rc); + LASSERT (rc == PTL_NOSPACE); + GOTO(out, rc = -ENOMEM); } + /* About to let the network at it... */ + desc->bd_network_rw = 1; rc = PtlMDAttach(desc->bd_me_h, desc->bd_md, PTL_UNLINK, &desc->bd_md_h); if (rc != PTL_OK) { CERROR("PtlMDAttach failed: %d\n", rc); - LBUG(); - GOTO(cleanup, rc); + LASSERT (rc == PTL_NOSPACE); + desc->bd_network_rw = 0; + rc2 = PtlMEUnlink (desc->bd_me_h); + LASSERT (rc2 == PTL_OK); + GOTO(out, rc = -ENOMEM); } + rc = 0; - ptlrpc_put_bulk_iov (desc, iov); - - CDEBUG(D_NET, "Setup bulk sink buffers: %u pages %u bytes, xid %u, " - "portal %u on %s\n", desc->bd_md.niov, desc->bd_md.length, - xid, desc->bd_portal, peer->peer_ni->pni_name); - - RETURN(0); + CDEBUG(D_NET, "Setup bulk %s buffers: %u pages %u bytes, xid "LPX64", " + "portal %u on %s\n", + desc->bd_type == BULK_GET_SOURCE ? 
"get-source" : "put-sink", + desc->bd_md.niov, desc->bd_md.length, + req->rq_xid, desc->bd_portal, peer->peer_ni->pni_name); - cleanup: + out: ptlrpc_put_bulk_iov (desc, iov); - ptlrpc_abort_bulk(desc); - - return rc; -} - -int ptlrpc_register_bulk_get(struct ptlrpc_bulk_desc *desc) -{ - desc->bd_md.options = PTL_MD_OP_GET | PTL_MD_IOV; - desc->bd_md.eventq = - desc->bd_connection->c_peer.peer_ni->pni_bulk_get_source_eq_h; - - return ptlrpc_register_bulk_shared(desc); -} - -int ptlrpc_register_bulk_put(struct ptlrpc_bulk_desc *desc) -{ - desc->bd_md.options = PTL_MD_OP_PUT | PTL_MD_IOV; - desc->bd_md.eventq = - desc->bd_connection->c_peer.peer_ni->pni_bulk_put_sink_eq_h; - - return ptlrpc_register_bulk_shared(desc); -} - -int ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc) -{ - int rc1, rc2; - /* This should be safe: these handles are initialized to be - * invalid in ptlrpc_prep_bulk() */ - rc1 = PtlMDUnlink(desc->bd_md_h); - if (rc1 != PTL_OK) - CERROR("PtlMDUnlink: %d\n", rc1); - rc2 = PtlMEUnlink(desc->bd_me_h); - if (rc2 != PTL_OK) - CERROR("PtlMEUnlink: %d\n", rc2); - - return rc1 ? 
rc1 : rc2; -} - -void obd_brw_set_addref(struct obd_brw_set *set) -{ - atomic_inc(&set->brw_refcount); -} - -void obd_brw_set_add(struct obd_brw_set *set, struct ptlrpc_bulk_desc *desc) -{ - LASSERT(list_empty(&desc->bd_set_chain)); - - ptlrpc_bulk_addref(desc); - atomic_inc(&set->brw_desc_count); - desc->bd_brw_set = set; - list_add(&desc->bd_set_chain, &set->brw_desc_head); -} - -void obd_brw_set_del(struct ptlrpc_bulk_desc *desc) -{ - atomic_dec(&desc->bd_brw_set->brw_desc_count); - list_del_init(&desc->bd_set_chain); - ptlrpc_bulk_decref(desc); + RETURN(rc); } -struct obd_brw_set *obd_brw_set_new(void) +void ptlrpc_unregister_bulk (struct ptlrpc_request *req) { - struct obd_brw_set *set; - - OBD_ALLOC(set, sizeof(*set)); - - if (set != NULL) { - init_waitqueue_head(&set->brw_waitq); - INIT_LIST_HEAD(&set->brw_desc_head); - atomic_set(&set->brw_refcount, 1); - atomic_set(&set->brw_desc_count, 0); + /* Disconnect a bulk desc from the network. Idempotent. Not + * thread-safe (i.e. only interlocks with completion callback). */ + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + wait_queue_head_t *wq; + unsigned long flags; + struct l_wait_info lwi; + int rc; + + LASSERT (!in_interrupt ()); /* might sleep */ + + spin_lock_irqsave (&desc->bd_lock, flags); + if (!desc->bd_network_rw) { /* completed or never even registered */ + spin_unlock_irqrestore (&desc->bd_lock, flags); + return; } - - return set; -} - -static void obd_brw_set_free(struct obd_brw_set *set) -{ - struct list_head *tmp, *next; - ENTRY; - - list_for_each_safe(tmp, next, &set->brw_desc_head) { - struct ptlrpc_bulk_desc *desc = - list_entry(tmp, struct ptlrpc_bulk_desc, bd_set_chain); - - CERROR("Unfinished bulk descriptor: %p\n", desc); - - ptlrpc_abort_bulk(desc); + spin_unlock_irqrestore (&desc->bd_lock, flags); + + LASSERT (desc->bd_req == req); /* NB bd_req NULL until registered */ + + /* NB... + * 1. If the MD unlink is successful, the ME gets unlinked too. + * 2. 
Since client-side bulk only gets a single event and a + * .. threshold of 1. If the MD was inuse at the first link + * .. attempt, the callback is due any minute, and the MD/ME will + * .. unlink themselves. + */ + rc = PtlMDUnlink (desc->bd_md_h); + switch (rc) { + default: + CERROR("PtlMDUnlink returned %d\n", rc); + LBUG (); + case PTL_OK: /* Won the race with completion */ + LASSERT (!desc->bd_complete); /* Callback hasn't happened */ + desc->bd_network_rw = 0; + return; + case PTL_MD_INUSE: /* MD is being accessed right now */ + for (;;) { + /* Network access will complete in finite time but the + * timeout lets us CERROR for visibility */ + if (desc->bd_req->rq_set != NULL) + wq = &req->rq_set->set_waitq; + else + wq = &req->rq_wait_for_rep; + lwi = LWI_TIMEOUT (10 * HZ, NULL, NULL); + rc = l_wait_event(*wq, ptlrpc_bulk_complete(desc), &lwi); + LASSERT (rc == 0 || rc == -ETIMEDOUT); + if (rc == 0) + break; + CERROR ("Unexpectedly long timeout: desc %p\n", desc); + LBUG(); + } + /* Fall through */ + case PTL_INV_MD: /* Lost the race with completion */ + LASSERT (desc->bd_complete);/* Callback has run to completion */ + LASSERT (!desc->bd_network_rw); + return; } - OBD_FREE(set, sizeof(*set)); - EXIT; - return; } -void obd_brw_set_decref(struct obd_brw_set *set) +int ptlrpc_reply(struct ptlrpc_request *req) { - ENTRY; - if (atomic_dec_and_test(&set->brw_refcount)) - obd_brw_set_free(set); - EXIT; -} + unsigned long flags; + int rc; -int ptlrpc_reply(struct ptlrpc_service *svc, struct ptlrpc_request *req) -{ - if (req->rq_repmsg == NULL) { - CERROR("bad: someone called ptlrpc_reply when they meant " - "ptlrpc_error\n"); - return -EINVAL; - } + /* We must already have a reply buffer (only ptlrpc_error() may be + * called without one). We must also have a request buffer which + * is either the actual (swabbed) incoming request, or a saved copy + * if this is a req saved in target_queue_final_reply(). 
*/ + LASSERT (req->rq_repmsg != NULL); + LASSERT (req->rq_reqmsg != NULL); /* FIXME: we need to increment the count of handled events */ if (req->rq_type != PTL_RPC_MSG_ERR) req->rq_type = PTL_RPC_MSG_REPLY; - //req->rq_repmsg->conn = req->rq_connection->c_remote_conn; - //req->rq_repmsg->token = req->rq_connection->c_remote_token; - req->rq_repmsg->status = HTON__u32(req->rq_status); - return ptl_send_buf(req, req->rq_connection, svc->srv_rep_portal); + + req->rq_repmsg->status = req->rq_status; + req->rq_repmsg->opc = req->rq_reqmsg->opc; + + init_waitqueue_head(&req->rq_wait_for_rep); + rc = ptl_send_buf(req, req->rq_connection, req->rq_svc->srv_rep_portal); + if (rc != 0) { + /* Do what the callback handler would have done */ + OBD_FREE (req->rq_repmsg, req->rq_replen); + + spin_lock_irqsave (&req->rq_lock, flags); + req->rq_want_ack = 0; + spin_unlock_irqrestore (&req->rq_lock, flags); + } + return rc; } -int ptlrpc_error(struct ptlrpc_service *svc, struct ptlrpc_request *req) +int ptlrpc_error(struct ptlrpc_request *req) { int rc; ENTRY; @@ -510,94 +588,108 @@ int ptlrpc_error(struct ptlrpc_service *svc, struct ptlrpc_request *req) req->rq_type = PTL_RPC_MSG_ERR; - rc = ptlrpc_reply(svc, req); + rc = ptlrpc_reply(req); RETURN(rc); } int ptl_send_rpc(struct ptlrpc_request *request) { int rc; - char *repbuf; + int rc2; + unsigned long flags; ptl_process_id_t source_id; - + ptl_handle_me_t reply_me_h; ENTRY; - if (request->rq_type != PTL_RPC_MSG_REQUEST) { - CERROR("wrong packet type sent %d\n", - NTOH__u32(request->rq_reqmsg->type)); - LBUG(); - RETURN(EINVAL); + LASSERT (request->rq_type == PTL_RPC_MSG_REQUEST); + + /* If this is a re-transmit, we're required to have disengaged + * cleanly from the previous attempt */ + LASSERT (!request->rq_receiving_reply); + + if (request->rq_bulk != NULL) { + rc = ptlrpc_register_bulk (request); + if (rc != 0) + RETURN(rc); } + request->rq_reqmsg->handle = request->rq_import->imp_remote_handle; + source_id.nid = 
request->rq_connection->c_peer.peer_nid; source_id.pid = PTL_PID_ANY; - /* add a ref, which will be balanced in request_out_callback */ - ptlrpc_request_addref(request); - if (request->rq_replen != 0) { - if (request->rq_reply_md.start != NULL) { - rc = PtlMEUnlink(request->rq_reply_me_h); - if (rc != PTL_OK && rc != PTL_INV_ME) { - CERROR("rc %d\n", rc); - LBUG(); - } - repbuf = (char *)request->rq_reply_md.start; - request->rq_repmsg = NULL; - } else { - OBD_ALLOC(repbuf, request->rq_replen); - if (!repbuf) { - LBUG(); - RETURN(ENOMEM); - } - } + LASSERT (request->rq_replen != 0); + OBD_ALLOC(request->rq_repmsg, request->rq_replen); + if (request->rq_repmsg == NULL) { + LBUG(); + RETURN(-ENOMEM); + } - rc = PtlMEAttach(request->rq_connection->c_peer.peer_ni->pni_ni_h, - request->rq_reply_portal,/* XXX FIXME bug 625069 */ - source_id, request->rq_xid, 0, PTL_UNLINK, - PTL_INS_AFTER, &request->rq_reply_me_h); - if (rc != PTL_OK) { - CERROR("PtlMEAttach failed: %d\n", rc); - LBUG(); - GOTO(cleanup, rc); - } + rc = PtlMEAttach(request->rq_connection->c_peer.peer_ni->pni_ni_h, + request->rq_reply_portal, /* XXX FIXME bug 249 */ + source_id, request->rq_xid, 0, PTL_UNLINK, + PTL_INS_AFTER, &reply_me_h); + if (rc != PTL_OK) { + CERROR("PtlMEAttach failed: %d\n", rc); + LASSERT (rc == PTL_NOSPACE); + LBUG(); + GOTO(cleanup, rc = -ENOMEM); + } - request->rq_reply_md.start = repbuf; - request->rq_reply_md.length = request->rq_replen; - request->rq_reply_md.threshold = 1; - request->rq_reply_md.options = PTL_MD_OP_PUT; - request->rq_reply_md.user_ptr = request; - request->rq_reply_md.eventq = - request->rq_connection->c_peer.peer_ni->pni_reply_in_eq_h; - - rc = PtlMDAttach(request->rq_reply_me_h, request->rq_reply_md, - PTL_UNLINK, NULL); - if (rc != PTL_OK) { - CERROR("PtlMDAttach failed: %d\n", rc); - LBUG(); - GOTO(cleanup2, rc); - } + request->rq_reply_md.start = request->rq_repmsg; + request->rq_reply_md.length = request->rq_replen; + request->rq_reply_md.threshold = 
1; + request->rq_reply_md.options = PTL_MD_OP_PUT; + request->rq_reply_md.user_ptr = request; + request->rq_reply_md.eventq = + request->rq_connection->c_peer.peer_ni->pni_reply_in_eq_h; - CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64 - ", portal %u on %s\n", - request->rq_replen, request->rq_xid, - request->rq_reply_portal, - request->rq_connection->c_peer.peer_ni->pni_name); + rc = PtlMDAttach(reply_me_h, request->rq_reply_md, + PTL_UNLINK, &request->rq_reply_md_h); + if (rc != PTL_OK) { + CERROR("PtlMDAttach failed: %d\n", rc); + LASSERT (rc == PTL_NOSPACE); + LBUG(); + GOTO(cleanup2, rc -ENOMEM); } - /* Clear any flags that may be present from previous sends, - * except for REPLAY, NO_RESEND and WANT_ACK. */ - request->rq_flags &= (PTL_RPC_FL_REPLAY | PTL_RPC_FL_NO_RESEND | - PTL_RPC_FL_WANT_ACK); + CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64 + ", portal %u on %s\n", + request->rq_replen, request->rq_xid, + request->rq_reply_portal, + request->rq_connection->c_peer.peer_ni->pni_name); + + ptlrpc_request_addref(request); /* 1 ref for the SENT callback */ + + spin_lock_irqsave (&request->rq_lock, flags); + request->rq_receiving_reply = 1; + /* Clear any flags that may be present from previous sends. 
*/ + request->rq_replied = 0; + request->rq_err = 0; + request->rq_timedout = 0; + request->rq_resend = 0; + request->rq_restart = 0; + spin_unlock_irqrestore (&request->rq_lock, flags); + + request->rq_sent = LTIME_S(CURRENT_TIME); rc = ptl_send_buf(request, request->rq_connection, request->rq_request_portal); - RETURN(rc); + if (rc == 0) + RETURN(rc); + spin_lock_irqsave (&request->rq_lock, flags); + request->rq_receiving_reply = 0; + spin_unlock_irqrestore (&request->rq_lock, flags); + ptlrpc_req_finished (request); /* drop callback ref */ cleanup2: - PtlMEUnlink(request->rq_reply_me_h); + /* MEUnlink is safe; the PUT didn't even get off the ground, and + * nobody apart from the PUT's target has the right nid+XID to + * access the reply buffer. */ + rc2 = PtlMEUnlink(reply_me_h); + LASSERT (rc2 == PTL_OK); cleanup: - OBD_FREE(repbuf, request->rq_replen); - // up(&request->rq_client->cli_rpc_sem); - + OBD_FREE(request->rq_repmsg, request->rq_replen); + request->rq_repmsg = NULL; return rc; } @@ -612,10 +704,10 @@ void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd) LASSERT(atomic_read(&rqbd->rqbd_refcount) == 0); - CDEBUG(D_NET, "PtlMEAttach: portal %d on %s h %lx.%lx\n", + CDEBUG(D_NET, "PtlMEAttach: portal %d on %s h %lx."LPX64"\n", service->srv_req_portal, srv_ni->sni_ni->pni_name, srv_ni->sni_ni->pni_ni_h.nal_idx, - srv_ni->sni_ni->pni_ni_h.handle_idx); + srv_ni->sni_ni->pni_ni_h.cookie); /* Attach the leading ME on which we build the ring */ rc = PtlMEAttach(srv_ni->sni_ni->pni_ni_h, service->srv_req_portal, @@ -623,6 +715,7 @@ void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd) PTL_UNLINK, PTL_INS_AFTER, &rqbd->rqbd_me_h); if (rc != PTL_OK) { CERROR("PtlMEAttach failed: %d\n", rc); + /* BUG 1191 */ LBUG(); } @@ -640,8 +733,9 @@ void ptlrpc_link_svc_me(struct ptlrpc_request_buffer_desc *rqbd) rc = PtlMDAttach(rqbd->rqbd_me_h, dummy, PTL_UNLINK, &md_h); if (rc != PTL_OK) { CERROR("PtlMDAttach failed: %d\n", rc); + LASSERT (rc == 
PTL_NOSPACE); LBUG(); -#warning proper cleanup required + /* BUG 1191 */ PtlMEUnlink (rqbd->rqbd_me_h); atomic_set(&rqbd->rqbd_refcount, 0); atomic_dec(&srv_ni->sni_nrqbds_receiving); diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 12be831..3811d2a 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -1,7 +1,10 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com> + * Copyright (C) 2001-2003 Cluster File Systems, Inc. + * Author: Peter J. Braam <braam@clusterfs.com> + * Author: Phil Schwan <phil@clusterfs.com> + * Author: Eric Barton <eeb@clusterfs.com> * * This file is part of Lustre, http://www.lustre.org. * @@ -30,6 +33,10 @@ #include <linux/obd_support.h> #include <linux/lustre_net.h> + +#define HDR_SIZE(count) \ + size_round(offsetof (struct lustre_msg, buflens[(count)])) + int lustre_pack_msg(int count, int *lens, char **bufs, int *len, struct lustre_msg **msg) { @@ -37,26 +44,30 @@ int lustre_pack_msg(int count, int *lens, char **bufs, int *len, struct lustre_msg *m; int size = 0, i; + size = HDR_SIZE (count); for (i = 0; i < count; i++) size += size_round(lens[i]); - *len = size_round(sizeof(*m) + count * sizeof(__u32)) + size; + *len = size; OBD_ALLOC(*msg, *len); if (!*msg) RETURN(-ENOMEM); m = *msg; - m->bufcount = HTON__u32(count); + m->magic = PTLRPC_MSG_MAGIC; + m->version = PTLRPC_MSG_VERSION; + m->bufcount = count; for (i = 0; i < count; i++) - m->buflens[i] = HTON__u32(lens[i]); + m->buflens[i] = lens[i]; - ptr = (char *)m + size_round(sizeof(*m) + count * sizeof(__u32)); + ptr = (char *)m + HDR_SIZE(count); for (i = 0; i < count; i++) { char *tmp = NULL; if (bufs) tmp = bufs[i]; LOGL(tmp, lens[i], ptr); + } return 0; @@ -66,38 +77,84 @@ int lustre_pack_msg(int count, int *lens, char **bufs, int *len, * with the given sub-buffer lengths. 
*/ int lustre_msg_size(int count, int *lengths) { - int size = 0, i; + int size; + int i; + size = HDR_SIZE (count); for (i = 0; i < count; i++) size += size_round(lengths[i]); - size += size_round(sizeof(struct lustre_msg) + count * sizeof(__u32)); - return size; } int lustre_unpack_msg(struct lustre_msg *m, int len) { - int required_len, i; + int flipped; + int required_len; + int i; ENTRY; - required_len = size_round(sizeof(*m)); - if (len < required_len) - RETURN(-EINVAL); + /* We can provide a slightly better error log, if we check the + * message magic and version first. In the future, struct + * lustre_msg may grow, and we'd like to log a version mismatch, + * rather than a short message. + * + */ + required_len = MAX (offsetof (struct lustre_msg, version) + + sizeof (m->version), + offsetof (struct lustre_msg, magic) + + sizeof (m->magic)); + if (len < required_len) { + /* can't even look inside the message */ + CERROR ("message length %d too small for magic/version check\n", + len); + RETURN (-EINVAL); + } + + flipped = lustre_msg_swabbed(m); + if (flipped) + __swab32s (&m->version); + else if (m->magic != PTLRPC_MSG_MAGIC) { + CERROR("wrong lustre_msg magic %#08x\n", m->magic); + RETURN (-EINVAL); + } - m->opc = NTOH__u32(m->opc); - m->status = NTOH__u32(m->status); - m->type = NTOH__u32(m->type); - m->bufcount = NTOH__u32(m->bufcount); - m->last_xid = NTOH__u64(m->last_xid); - m->last_committed = NTOH__u64(m->last_committed); + if (m->version != PTLRPC_MSG_VERSION) { + CERROR("wrong lustre_msg version %#08x\n", m->version); + RETURN (-EINVAL); + } + + /* Now we know the sender speaks my language (but possibly flipped)...*/ + required_len = HDR_SIZE(0); + if (len < required_len) { + /* can't even look inside the message */ + CERROR ("message length %d too small for lustre_msg\n", len); + RETURN (-EINVAL); + } + + if (flipped) { + __swab32s (&m->type); + __swab32s (&m->opc); + __swab64s (&m->last_xid); + __swab64s (&m->last_committed); + __swab64s 
(&m->transno); + __swab32s (&m->status); + __swab32s (&m->bufcount); + __swab32s (&m->flags); + } + + required_len = HDR_SIZE(m->bufcount); - required_len = size_round(sizeof(*m) + m->bufcount * sizeof(__u32)); - if (len < required_len) + if (len < required_len) { + /* didn't receive all the buffer lengths */ + CERROR ("message length %d too small for %d buflens\n", + len, m->bufcount); RETURN(-EINVAL); + } for (i = 0; i < m->bufcount; i++) { - m->buflens[i] = NTOH__u32(m->buflens[i]); + if (flipped) + __swab32s (&m->buflens[i]); required_len += size_round(m->buflens[i]); } @@ -112,33 +169,924 @@ int lustre_unpack_msg(struct lustre_msg *m, int len) RETURN(0); } -void *lustre_msg_buf(struct lustre_msg *m, int n) +void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size) { - int i, offset; + int i; + int offset; + int buflen; + int bufcount; + + LASSERT (m != NULL); + LASSERT (n >= 0); - if (!m) { - CERROR("no message buffer!\n"); - LBUG(); + bufcount = m->bufcount; + if (n >= bufcount) { + CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", + m, n, bufcount); return NULL; } - if (n < 0 || n >= m->bufcount) { - CERROR("referencing bad sub buffer in %p (want %d, count " - "%d)!\n", m, n, m->bufcount); - LBUG(); + buflen = m->buflens[n]; + if (buflen == 0) { + CERROR("msg %p buffer[%d] is zero length\n", m, n); return NULL; } - if (m->buflens[n] == 0) { - CERROR("zero-length buffer requested for buffer %d in %p\n", - n, m); + if (buflen < min_size) { + CERROR("msg %p buffer[%d] size %d too small (required %d)\n", + m, n, buflen, min_size); return NULL; } - offset = size_round(sizeof(*m) + m->bufcount * sizeof(__u32)); - + offset = HDR_SIZE(bufcount); for (i = 0; i < n; i++) offset += size_round(m->buflens[i]); return (char *)m + offset; } + +char *lustre_msg_string (struct lustre_msg *m, int index, int max_len) +{ + /* max_len == 0 means the string should fill the buffer */ + char *str = lustre_msg_buf (m, index, 0); + int slen; + int blen; + + if (str 
== NULL) { + CERROR ("can't unpack string in msg %p buffer[%d]\n", m, index); + return (NULL); + } + + blen = m->buflens[index]; + slen = strnlen (str, blen); + + if (slen == blen) { /* not NULL terminated */ + CERROR ("can't unpack non-NULL terminated string in " + "msg %p buffer[%d] len %d\n", m, index, blen); + return (NULL); + } + + if (max_len == 0) { + if (slen != blen - 1) { + CERROR ("can't unpack short string in msg %p " + "buffer[%d] len %d: strlen %d\n", + m, index, blen, slen); + return (NULL); + } + } else if (slen > max_len) { + CERROR ("can't unpack oversized string in msg %p " + "buffer[%d] len %d strlen %d: max %d expected\n", + m, index, blen, slen, max_len); + return (NULL); + } + + return (str); +} + +/* Wrap up the normal fixed length case */ +void *lustre_swab_reqbuf (struct ptlrpc_request *req, int index, int min_size, + void *swabber) +{ + void *ptr; + + LASSERT_REQSWAB (req, index); + + ptr = lustre_msg_buf(req->rq_reqmsg, index, min_size); + if (ptr == NULL) + return (NULL); + + if (swabber != NULL && + lustre_msg_swabbed (req->rq_reqmsg)) + ((void (*)(void *))swabber)(ptr); + + return (ptr); +} + +/* Wrap up the normal fixed length case */ +void *lustre_swab_repbuf (struct ptlrpc_request *req, int index, int min_size, + void *swabber) +{ + void *ptr; + + LASSERT_REPSWAB (req, index); + + ptr = lustre_msg_buf (req->rq_repmsg, index, min_size); + if (ptr == NULL) + return (NULL); + + if (swabber != NULL && + lustre_msg_swabbed (req->rq_repmsg)) + ((void (*)(void *))swabber)(ptr); + + return (ptr); +} + +/* byte flipping routines for all wire types declared in + * lustre_idl.h implemented here. 
+ */ + +void lustre_swab_obdo (struct obdo *o) +{ + __swab64s (&o->o_id); + __swab64s (&o->o_gr); + __swab64s (&o->o_atime); + __swab64s (&o->o_mtime); + __swab64s (&o->o_ctime); + __swab64s (&o->o_size); + __swab64s (&o->o_blocks); + __swab64s (&o->o_rdev); + __swab32s (&o->o_blksize); + __swab32s (&o->o_mode); + __swab32s (&o->o_uid); + __swab32s (&o->o_gid); + __swab32s (&o->o_flags); + __swab32s (&o->o_nlink); + __swab32s (&o->o_generation); + __swab32s (&o->o_valid); + __swab32s (&o->o_obdflags); + __swab32s (&o->o_easize); + /* o_inline is opaque */ +} + +void lustre_swab_obd_statfs (struct obd_statfs *os) +{ + __swab64s (&os->os_type); + __swab64s (&os->os_blocks); + __swab64s (&os->os_bfree); + __swab64s (&os->os_bavail); + __swab64s (&os->os_ffree); + /* no need to swap os_fsid */ + __swab32s (&os->os_bsize); + __swab32s (&os->os_namelen); + /* no need to swap os_spare */ +} + +void lustre_swab_obd_ioobj (struct obd_ioobj *ioo) +{ + __swab64s (&ioo->ioo_id); + __swab64s (&ioo->ioo_gr); + __swab32s (&ioo->ioo_type); + __swab32s (&ioo->ioo_bufcnt); +} + +void lustre_swab_niobuf_remote (struct niobuf_remote *nbr) +{ + __swab64s (&nbr->offset); + __swab32s (&nbr->len); + __swab32s (&nbr->flags); +} + +void lustre_swab_ost_body (struct ost_body *b) +{ + lustre_swab_obdo (&b->oa); +} + +void lustre_swab_ll_fid (struct ll_fid *fid) +{ + __swab64s (&fid->id); + __swab32s (&fid->generation); + __swab32s (&fid->f_type); +} + +void lustre_swab_mds_status_req (struct mds_status_req *r) +{ + __swab32s (&r->flags); + __swab32s (&r->repbuf); +} + +void lustre_swab_mds_fileh_body (struct mds_fileh_body *f) +{ + lustre_swab_ll_fid (&f->f_fid); +} + +void lustre_swab_mds_body (struct mds_body *b) +{ + lustre_swab_ll_fid (&b->fid1); + lustre_swab_ll_fid (&b->fid2); + /* handle is opaque */ + __swab64s (&b->size); + __swab64s (&b->blocks); + __swab32s (&b->ino); + __swab32s (&b->valid); + __swab32s (&b->fsuid); + __swab32s (&b->fsgid); + __swab32s (&b->capability); + 
__swab32s (&b->mode); + __swab32s (&b->uid); + __swab32s (&b->gid); + __swab32s (&b->mtime); + __swab32s (&b->ctime); + __swab32s (&b->atime); + __swab32s (&b->flags); + __swab32s (&b->rdev); + __swab32s (&b->nlink); + __swab32s (&b->generation); + __swab32s (&b->suppgid); + __swab32s (&b->eadatasize); +} + +void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa) +{ + __swab32s (&sa->sa_opcode); + __swab32s (&sa->sa_fsuid); + __swab32s (&sa->sa_fsgid); + __swab32s (&sa->sa_cap); + __swab32s (&sa->sa_reserved); + __swab32s (&sa->sa_valid); + lustre_swab_ll_fid (&sa->sa_fid); + __swab32s (&sa->sa_mode); + __swab32s (&sa->sa_uid); + __swab32s (&sa->sa_gid); + __swab32s (&sa->sa_attr_flags); + __swab64s (&sa->sa_size); + __swab64s (&sa->sa_atime); + __swab64s (&sa->sa_mtime); + __swab64s (&sa->sa_ctime); + __swab32s (&sa->sa_suppgid); +} + +void lustre_swab_mds_rec_create (struct mds_rec_create *cr) +{ + __swab32s (&cr->cr_opcode); + __swab32s (&cr->cr_fsuid); + __swab32s (&cr->cr_fsgid); + __swab32s (&cr->cr_cap); + __swab32s (&cr->cr_flags); /* for use with open */ + __swab32s (&cr->cr_mode); + lustre_swab_ll_fid (&cr->cr_fid); + lustre_swab_ll_fid (&cr->cr_replayfid); + __swab32s (&cr->cr_uid); + __swab32s (&cr->cr_gid); + __swab64s (&cr->cr_time); + __swab64s (&cr->cr_rdev); + __swab32s (&cr->cr_suppgid); +} + +void lustre_swab_mds_rec_link (struct mds_rec_link *lk) +{ + __swab32s (&lk->lk_opcode); + __swab32s (&lk->lk_fsuid); + __swab32s (&lk->lk_fsgid); + __swab32s (&lk->lk_cap); + __swab32s (&lk->lk_suppgid1); + __swab32s (&lk->lk_suppgid2); + lustre_swab_ll_fid (&lk->lk_fid1); + lustre_swab_ll_fid (&lk->lk_fid2); +} + +void lustre_swab_mds_rec_unlink (struct mds_rec_unlink *ul) +{ + __swab32s (&ul->ul_opcode); + __swab32s (&ul->ul_fsuid); + __swab32s (&ul->ul_fsgid); + __swab32s (&ul->ul_cap); + __swab32s (&ul->ul_reserved); + __swab32s (&ul->ul_mode); + __swab32s (&ul->ul_suppgid); + lustre_swab_ll_fid (&ul->ul_fid1); + lustre_swab_ll_fid (&ul->ul_fid2); 
+} + +void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn) +{ + __swab32s (&rn->rn_opcode); + __swab32s (&rn->rn_fsuid); + __swab32s (&rn->rn_fsgid); + __swab32s (&rn->rn_cap); + __swab32s (&rn->rn_suppgid1); + __swab32s (&rn->rn_suppgid2); + lustre_swab_ll_fid (&rn->rn_fid1); + lustre_swab_ll_fid (&rn->rn_fid2); +} + +void lustre_swab_lov_desc (struct lov_desc *ld) +{ + __swab32s (&ld->ld_tgt_count); + __swab32s (&ld->ld_active_tgt_count); + __swab32s (&ld->ld_default_stripe_count); + __swab64s (&ld->ld_default_stripe_size); + __swab64s (&ld->ld_default_stripe_offset); + __swab32s (&ld->ld_pattern); + /* uuid endian insensitive */ +} + +void lustre_swab_ldlm_res_id (struct ldlm_res_id *id) +{ + int i; + + for (i = 0; i < RES_NAME_SIZE; i++) + __swab64s (&id->name[i]); +} + +void lustre_swab_ldlm_extent (struct ldlm_extent *e) +{ + __swab64s (&e->start); + __swab64s (&e->end); +} + +void lustre_swab_ldlm_intent (struct ldlm_intent *i) +{ + __swab64s (&i->opc); +} + +void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r) +{ + int i; + + __swab32s (&r->lr_type); + lustre_swab_ldlm_res_id (&r->lr_name); + for (i = 0; i < RES_VERSION_SIZE; i++) + __swab32s (&r->lr_version[i]); +} + +void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l) +{ + int i; + + lustre_swab_ldlm_resource_desc (&l->l_resource); + __swab32s (&l->l_req_mode); + __swab32s (&l->l_granted_mode); + lustre_swab_ldlm_extent (&l->l_extent); + for (i = 0; i < RES_VERSION_SIZE; i++) + __swab32s (&l->l_version[i]); +} + +void lustre_swab_ldlm_request (struct ldlm_request *rq) +{ + __swab32s (&rq->lock_flags); + lustre_swab_ldlm_lock_desc (&rq->lock_desc); + /* lock_handle1 opaque */ + /* lock_handle2 opaque */ +} + +void lustre_swab_ldlm_reply (struct ldlm_reply *r) +{ + __swab32s (&r->lock_flags); + __swab32s (&r->lock_mode); + lustre_swab_ldlm_res_id (&r->lock_resource_name); + /* lock_handle opaque */ + lustre_swab_ldlm_extent (&r->lock_extent); + __swab64s (&r->lock_policy_res1); + 
__swab64s (&r->lock_policy_res2); +} + +void lustre_swab_ptlbd_op (struct ptlbd_op *op) +{ + __swab16s (&op->op_cmd); + __swab16s (&op->op_lun); + __swab16s (&op->op_niob_cnt); + /* ignore op__padding */ + __swab32s (&op->op_block_cnt); +} + +void lustre_swab_ptlbd_niob (struct ptlbd_niob *n) +{ + __swab64s (&n->n_xid); + __swab64s (&n->n_block_nr); + __swab32s (&n->n_offset); + __swab32s (&n->n_length); +} + +void lustre_swab_ptlbd_rsp (struct ptlbd_rsp *r) +{ + __swab16s (&r->r_status); + __swab16s (&r->r_error_cnt); +} + +void lustre_assert_wire_constants (void) +{ +#if BUG_1343 + /* Wire protocol assertions generated by 'wirecheck' */ + + /* Constants... */ + LASSERT (PTLRPC_MSG_MAGIC == 0x0BD00BD0); + LASSERT (PTLRPC_MSG_VERSION == 0x00040002); + LASSERT (PTL_RPC_MSG_REQUEST == 4711); + LASSERT (PTL_RPC_MSG_ERR == 4712); + LASSERT (PTL_RPC_MSG_REPLY == 4713); + LASSERT (MSG_LAST_REPLAY == 1); + LASSERT (MSG_RESENT == 2); + LASSERT (MSG_CONNECT_RECOVERING == 1); + LASSERT (MSG_CONNECT_RECONNECT == 2); + LASSERT (MSG_CONNECT_REPLAYABLE == 4); + LASSERT (OST_REPLY == 0); + LASSERT (OST_GETATTR == 1); + LASSERT (OST_SETATTR == 2); + LASSERT (OST_READ == 3); + LASSERT (OST_WRITE == 4); + LASSERT (OST_CREATE == 5); + LASSERT (OST_DESTROY == 6); + LASSERT (OST_GET_INFO == 7); + LASSERT (OST_CONNECT == 8); + LASSERT (OST_DISCONNECT == 9); + LASSERT (OST_PUNCH == 10); + LASSERT (OST_OPEN == 11); + LASSERT (OST_CLOSE == 12); + LASSERT (OST_STATFS == 13); + LASSERT (OST_SAN_READ == 14); + LASSERT (OST_SAN_WRITE == 15); + LASSERT (OST_SYNCFS == 16); + LASSERT (OST_LAST_OPC == 17); + LASSERT (OST_FIRST_OPC == 0); + LASSERT (OBD_FL_INLINEDATA == 1); + LASSERT (OBD_FL_OBDMDEXISTS == 2); + LASSERT (LOV_MAGIC == 198183888); + LASSERT (OBD_MD_FLALL == -1); + LASSERT (OBD_MD_FLID == 1); + LASSERT (OBD_MD_FLATIME == 2); + LASSERT (OBD_MD_FLMTIME == 4); + LASSERT (OBD_MD_FLCTIME == 8); + LASSERT (OBD_MD_FLSIZE == 16); + LASSERT (OBD_MD_FLBLOCKS == 32); + LASSERT (OBD_MD_FLBLKSZ == 
64); + LASSERT (OBD_MD_FLMODE == 128); + LASSERT (OBD_MD_FLTYPE == 256); + LASSERT (OBD_MD_FLUID == 512); + LASSERT (OBD_MD_FLGID == 1024); + LASSERT (OBD_MD_FLFLAGS == 2048); + LASSERT (OBD_MD_FLOBDFLG == 4096); + LASSERT (OBD_MD_FLNLINK == 8192); + LASSERT (OBD_MD_FLGENER == 16384); + LASSERT (OBD_MD_FLINLINE == 32768); + LASSERT (OBD_MD_FLRDEV == 65536); + LASSERT (OBD_MD_FLEASIZE == 131072); + LASSERT (OBD_MD_LINKNAME == 262144); + LASSERT (OBD_MD_FLHANDLE == 524288); + LASSERT (OBD_MD_FLCKSUM == 1048576); + LASSERT (OBD_BRW_READ == 1); + LASSERT (OBD_BRW_WRITE == 2); + LASSERT (OBD_BRW_CREATE == 4); + LASSERT (OBD_BRW_SYNC == 8); + LASSERT (OBD_OBJECT_EOF == 0xffffffffffffffffULL); + LASSERT (OST_REQ_HAS_OA1 == 1); + LASSERT (MDS_GETATTR == 33); + LASSERT (MDS_GETATTR_NAME == 34); + LASSERT (MDS_CLOSE == 35); + LASSERT (MDS_REINT == 36); + LASSERT (MDS_READPAGE == 37); + LASSERT (MDS_CONNECT == 38); + LASSERT (MDS_DISCONNECT == 39); + LASSERT (MDS_GETSTATUS == 40); + LASSERT (MDS_STATFS == 41); + LASSERT (MDS_GETLOVINFO == 42); + LASSERT (MDS_LAST_OPC == 43); + LASSERT (MDS_FIRST_OPC == 33); + LASSERT (REINT_SETATTR == 1); + LASSERT (REINT_CREATE == 2); + LASSERT (REINT_LINK == 3); + LASSERT (REINT_UNLINK == 4); + LASSERT (REINT_RENAME == 5); + LASSERT (REINT_OPEN == 6); + LASSERT (REINT_MAX == 6); + LASSERT (IT_INTENT_EXEC == 1); + LASSERT (IT_OPEN_LOOKUP == 2); + LASSERT (IT_OPEN_NEG == 4); + LASSERT (IT_OPEN_POS == 8); + LASSERT (IT_OPEN_CREATE == 16); + LASSERT (IT_OPEN_OPEN == 32); + LASSERT (MDS_STATUS_CONN == 1); + LASSERT (MDS_STATUS_LOV == 2); + LASSERT (MDS_OPEN_HAS_EA == 1); + LASSERT (LOV_RAID0 == 0); + LASSERT (LOV_RAIDRR == 1); + LASSERT (LDLM_ENQUEUE == 101); + LASSERT (LDLM_CONVERT == 102); + LASSERT (LDLM_CANCEL == 103); + LASSERT (LDLM_BL_CALLBACK == 104); + LASSERT (LDLM_CP_CALLBACK == 105); + LASSERT (LDLM_LAST_OPC == 106); + LASSERT (LDLM_FIRST_OPC == 101); + LASSERT (PTLBD_QUERY == 200); + LASSERT (PTLBD_READ == 201); + LASSERT 
(PTLBD_WRITE == 202); + LASSERT (PTLBD_FLUSH == 203); + LASSERT (PTLBD_CONNECT == 204); + LASSERT (PTLBD_DISCONNECT == 205); + LASSERT (PTLBD_LAST_OPC == 204); + LASSERT (PTLBD_FIRST_OPC == 200); + LASSERT (OBD_PING == 400); + /* Sizes and Offsets */ + + + /* Checks for struct lustre_handle */ + LASSERT (sizeof (struct lustre_handle) == 8); + LASSERT (offsetof (struct lustre_handle, cookie) == 0); + LASSERT (sizeof (((struct lustre_handle *)0)->cookie) == 8); + + /* Checks for struct lustre_msg */ + LASSERT (sizeof (struct lustre_msg) == 60); + LASSERT (offsetof (struct lustre_msg, handle) == 0); + LASSERT (sizeof (((struct lustre_msg *)0)->handle) == 8); + LASSERT (offsetof (struct lustre_msg, magic) == 8); + LASSERT (sizeof (((struct lustre_msg *)0)->magic) == 4); + LASSERT (offsetof (struct lustre_msg, type) == 12); + LASSERT (sizeof (((struct lustre_msg *)0)->type) == 4); + LASSERT (offsetof (struct lustre_msg, version) == 16); + LASSERT (sizeof (((struct lustre_msg *)0)->version) == 4); + LASSERT (offsetof (struct lustre_msg, opc) == 20); + LASSERT (sizeof (((struct lustre_msg *)0)->opc) == 4); + LASSERT (offsetof (struct lustre_msg, last_xid) == 24); + LASSERT (sizeof (((struct lustre_msg *)0)->last_xid) == 8); + LASSERT (offsetof (struct lustre_msg, last_committed) == 32); + LASSERT (sizeof (((struct lustre_msg *)0)->last_committed) == 8); + LASSERT (offsetof (struct lustre_msg, transno) == 40); + LASSERT (sizeof (((struct lustre_msg *)0)->transno) == 8); + LASSERT (offsetof (struct lustre_msg, status) == 48); + LASSERT (sizeof (((struct lustre_msg *)0)->status) == 4); + LASSERT (offsetof (struct lustre_msg, flags) == 52); + LASSERT (sizeof (((struct lustre_msg *)0)->flags) == 4); + LASSERT (offsetof (struct lustre_msg, bufcount) == 56); + LASSERT (sizeof (((struct lustre_msg *)0)->bufcount) == 4); + LASSERT (offsetof (struct lustre_msg, buflens[7]) == 88); + LASSERT (sizeof (((struct lustre_msg *)0)->buflens[7]) == 4); + + /* Checks for struct obdo */ + 
LASSERT (sizeof (struct obdo) == 164); + LASSERT (offsetof (struct obdo, o_id) == 0); + LASSERT (sizeof (((struct obdo *)0)->o_id) == 8); + LASSERT (offsetof (struct obdo, o_gr) == 8); + LASSERT (sizeof (((struct obdo *)0)->o_gr) == 8); + LASSERT (offsetof (struct obdo, o_atime) == 16); + LASSERT (sizeof (((struct obdo *)0)->o_atime) == 8); + LASSERT (offsetof (struct obdo, o_mtime) == 24); + LASSERT (sizeof (((struct obdo *)0)->o_mtime) == 8); + LASSERT (offsetof (struct obdo, o_ctime) == 32); + LASSERT (sizeof (((struct obdo *)0)->o_ctime) == 8); + LASSERT (offsetof (struct obdo, o_size) == 40); + LASSERT (sizeof (((struct obdo *)0)->o_size) == 8); + LASSERT (offsetof (struct obdo, o_blocks) == 48); + LASSERT (sizeof (((struct obdo *)0)->o_blocks) == 8); + LASSERT (offsetof (struct obdo, o_rdev) == 56); + LASSERT (sizeof (((struct obdo *)0)->o_rdev) == 8); + LASSERT (offsetof (struct obdo, o_blksize) == 64); + LASSERT (sizeof (((struct obdo *)0)->o_blksize) == 4); + LASSERT (offsetof (struct obdo, o_mode) == 68); + LASSERT (sizeof (((struct obdo *)0)->o_mode) == 4); + LASSERT (offsetof (struct obdo, o_uid) == 72); + LASSERT (sizeof (((struct obdo *)0)->o_uid) == 4); + LASSERT (offsetof (struct obdo, o_gid) == 76); + LASSERT (sizeof (((struct obdo *)0)->o_gid) == 4); + LASSERT (offsetof (struct obdo, o_flags) == 80); + LASSERT (sizeof (((struct obdo *)0)->o_flags) == 4); + LASSERT (offsetof (struct obdo, o_nlink) == 84); + LASSERT (sizeof (((struct obdo *)0)->o_nlink) == 4); + LASSERT (offsetof (struct obdo, o_generation) == 88); + LASSERT (sizeof (((struct obdo *)0)->o_generation) == 4); + LASSERT (offsetof (struct obdo, o_valid) == 92); + LASSERT (sizeof (((struct obdo *)0)->o_valid) == 4); + LASSERT (offsetof (struct obdo, o_obdflags) == 96); + LASSERT (sizeof (((struct obdo *)0)->o_obdflags) == 4); + LASSERT (offsetof (struct obdo, o_easize) == 100); + LASSERT (sizeof (((struct obdo *)0)->o_easize) == 4); + LASSERT (offsetof (struct obdo, o_inline) == 104); + 
LASSERT (sizeof (((struct obdo *)0)->o_inline) == 60); + + /* Checks for struct obd_statfs */ + LASSERT (sizeof (struct obd_statfs) == 144); + LASSERT (offsetof (struct obd_statfs, os_type) == 0); + LASSERT (sizeof (((struct obd_statfs *)0)->os_type) == 8); + LASSERT (offsetof (struct obd_statfs, os_blocks) == 8); + LASSERT (sizeof (((struct obd_statfs *)0)->os_blocks) == 8); + LASSERT (offsetof (struct obd_statfs, os_bfree) == 16); + LASSERT (sizeof (((struct obd_statfs *)0)->os_bfree) == 8); + LASSERT (offsetof (struct obd_statfs, os_bavail) == 24); + LASSERT (sizeof (((struct obd_statfs *)0)->os_bavail) == 8); + LASSERT (offsetof (struct obd_statfs, os_ffree) == 40); + LASSERT (sizeof (((struct obd_statfs *)0)->os_ffree) == 8); + LASSERT (offsetof (struct obd_statfs, os_fsid) == 48); + LASSERT (sizeof (((struct obd_statfs *)0)->os_fsid) == 40); + LASSERT (offsetof (struct obd_statfs, os_bsize) == 88); + LASSERT (sizeof (((struct obd_statfs *)0)->os_bsize) == 4); + LASSERT (offsetof (struct obd_statfs, os_namelen) == 92); + LASSERT (sizeof (((struct obd_statfs *)0)->os_namelen) == 4); + + /* Checks for struct obd_ioobj */ + LASSERT (sizeof (struct obd_ioobj) == 24); + LASSERT (offsetof (struct obd_ioobj, ioo_id) == 0); + LASSERT (sizeof (((struct obd_ioobj *)0)->ioo_id) == 8); + LASSERT (offsetof (struct obd_ioobj, ioo_gr) == 8); + LASSERT (sizeof (((struct obd_ioobj *)0)->ioo_gr) == 8); + LASSERT (offsetof (struct obd_ioobj, ioo_type) == 16); + LASSERT (sizeof (((struct obd_ioobj *)0)->ioo_type) == 4); + LASSERT (offsetof (struct obd_ioobj, ioo_bufcnt) == 20); + LASSERT (sizeof (((struct obd_ioobj *)0)->ioo_bufcnt) == 4); + + /* Checks for struct niobuf_remote */ + LASSERT (sizeof (struct niobuf_remote) == 16); + LASSERT (offsetof (struct niobuf_remote, offset) == 0); + LASSERT (sizeof (((struct niobuf_remote *)0)->offset) == 8); + LASSERT (offsetof (struct niobuf_remote, len) == 8); + LASSERT (sizeof (((struct niobuf_remote *)0)->len) == 4); + LASSERT (offsetof 
(struct niobuf_remote, flags) == 12); + LASSERT (sizeof (((struct niobuf_remote *)0)->flags) == 4); + + /* Checks for struct ost_body */ + LASSERT (sizeof (struct ost_body) == 164); + LASSERT (offsetof (struct ost_body, oa) == 0); + LASSERT (sizeof (((struct ost_body *)0)->oa) == 164); + + /* Checks for struct ll_fid */ + LASSERT (sizeof (struct ll_fid) == 16); + LASSERT (offsetof (struct ll_fid, id) == 0); + LASSERT (sizeof (((struct ll_fid *)0)->id) == 8); + LASSERT (offsetof (struct ll_fid, generation) == 8); + LASSERT (sizeof (((struct ll_fid *)0)->generation) == 4); + LASSERT (offsetof (struct ll_fid, f_type) == 12); + LASSERT (sizeof (((struct ll_fid *)0)->f_type) == 4); + + /* Checks for struct mds_status_req */ + LASSERT (sizeof (struct mds_status_req) == 8); + LASSERT (offsetof (struct mds_status_req, flags) == 0); + LASSERT (sizeof (((struct mds_status_req *)0)->flags) == 4); + LASSERT (offsetof (struct mds_status_req, repbuf) == 4); + LASSERT (sizeof (((struct mds_status_req *)0)->repbuf) == 4); + + /* Checks for struct mds_fileh_body */ + LASSERT (sizeof (struct mds_fileh_body) == 24); + LASSERT (offsetof (struct mds_fileh_body, f_fid) == 0); + LASSERT (sizeof (((struct mds_fileh_body *)0)->f_fid) == 16); + + /* Checks for struct mds_body */ + LASSERT (sizeof (struct mds_body) == 124); + LASSERT (offsetof (struct mds_body, fid1) == 0); + LASSERT (sizeof (((struct mds_body *)0)->fid1) == 16); + LASSERT (offsetof (struct mds_body, fid2) == 16); + LASSERT (sizeof (((struct mds_body *)0)->fid2) == 16); + LASSERT (offsetof (struct mds_body, handle) == 32); + LASSERT (sizeof (((struct mds_body *)0)->handle) == 8); + LASSERT (offsetof (struct mds_body, size) == 40); + LASSERT (sizeof (((struct mds_body *)0)->size) == 8); + LASSERT (offsetof (struct mds_body, blocks) == 48); + LASSERT (sizeof (((struct mds_body *)0)->blocks) == 8); + LASSERT (offsetof (struct mds_body, ino) == 56); + LASSERT (sizeof (((struct mds_body *)0)->ino) == 4); + LASSERT (offsetof 
(struct mds_body, valid) == 60); + LASSERT (sizeof (((struct mds_body *)0)->valid) == 4); + LASSERT (offsetof (struct mds_body, fsuid) == 64); + LASSERT (sizeof (((struct mds_body *)0)->fsuid) == 4); + LASSERT (offsetof (struct mds_body, fsgid) == 68); + LASSERT (sizeof (((struct mds_body *)0)->fsgid) == 4); + LASSERT (offsetof (struct mds_body, capability) == 72); + LASSERT (sizeof (((struct mds_body *)0)->capability) == 4); + LASSERT (offsetof (struct mds_body, mode) == 76); + LASSERT (sizeof (((struct mds_body *)0)->mode) == 4); + LASSERT (offsetof (struct mds_body, uid) == 80); + LASSERT (sizeof (((struct mds_body *)0)->uid) == 4); + LASSERT (offsetof (struct mds_body, gid) == 84); + LASSERT (sizeof (((struct mds_body *)0)->gid) == 4); + LASSERT (offsetof (struct mds_body, mtime) == 88); + LASSERT (sizeof (((struct mds_body *)0)->mtime) == 4); + LASSERT (offsetof (struct mds_body, ctime) == 92); + LASSERT (sizeof (((struct mds_body *)0)->ctime) == 4); + LASSERT (offsetof (struct mds_body, atime) == 96); + LASSERT (sizeof (((struct mds_body *)0)->atime) == 4); + LASSERT (offsetof (struct mds_body, flags) == 100); + LASSERT (sizeof (((struct mds_body *)0)->flags) == 4); + LASSERT (offsetof (struct mds_body, rdev) == 104); + LASSERT (sizeof (((struct mds_body *)0)->rdev) == 4); + LASSERT (offsetof (struct mds_body, nlink) == 108); + LASSERT (sizeof (((struct mds_body *)0)->nlink) == 4); + LASSERT (offsetof (struct mds_body, generation) == 112); + LASSERT (sizeof (((struct mds_body *)0)->generation) == 4); + LASSERT (offsetof (struct mds_body, suppgid) == 116); + LASSERT (sizeof (((struct mds_body *)0)->suppgid) == 4); + + /* Checks for struct mds_rec_setattr */ + LASSERT (sizeof (struct mds_rec_setattr) == 92); + LASSERT (offsetof (struct mds_rec_setattr, sa_opcode) == 0); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_opcode) == 4); + LASSERT (offsetof (struct mds_rec_setattr, sa_fsuid) == 4); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_fsuid) == 
4); + LASSERT (offsetof (struct mds_rec_setattr, sa_fsgid) == 8); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_fsgid) == 4); + LASSERT (offsetof (struct mds_rec_setattr, sa_cap) == 12); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_cap) == 4); + LASSERT (offsetof (struct mds_rec_setattr, sa_reserved) == 16); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_reserved) == 4); + LASSERT (offsetof (struct mds_rec_setattr, sa_valid) == 20); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_valid) == 4); + LASSERT (offsetof (struct mds_rec_setattr, sa_fid) == 24); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_fid) == 16); + LASSERT (offsetof (struct mds_rec_setattr, sa_mode) == 40); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_mode) == 4); + LASSERT (offsetof (struct mds_rec_setattr, sa_uid) == 44); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_uid) == 4); + LASSERT (offsetof (struct mds_rec_setattr, sa_gid) == 48); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_gid) == 4); + LASSERT (offsetof (struct mds_rec_setattr, sa_attr_flags) == 52); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_attr_flags) == 4); + LASSERT (offsetof (struct mds_rec_setattr, sa_size) == 56); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_size) == 8); + LASSERT (offsetof (struct mds_rec_setattr, sa_atime) == 64); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_atime) == 8); + LASSERT (offsetof (struct mds_rec_setattr, sa_mtime) == 72); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_mtime) == 8); + LASSERT (offsetof (struct mds_rec_setattr, sa_ctime) == 80); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_ctime) == 8); + LASSERT (offsetof (struct mds_rec_setattr, sa_suppgid) == 88); + LASSERT (sizeof (((struct mds_rec_setattr *)0)->sa_suppgid) == 4); + + /* Checks for struct mds_rec_create */ + LASSERT (sizeof (struct mds_rec_create) == 84); + LASSERT (offsetof (struct mds_rec_create, cr_opcode) == 0); + LASSERT (sizeof 
(((struct mds_rec_create *)0)->cr_opcode) == 4); + LASSERT (offsetof (struct mds_rec_create, cr_fsuid) == 4); + LASSERT (sizeof (((struct mds_rec_create *)0)->cr_fsuid) == 4); + LASSERT (offsetof (struct mds_rec_create, cr_fsgid) == 8); + LASSERT (sizeof (((struct mds_rec_create *)0)->cr_fsgid) == 4); + LASSERT (offsetof (struct mds_rec_create, cr_cap) == 12); + LASSERT (sizeof (((struct mds_rec_create *)0)->cr_cap) == 4); + LASSERT (offsetof (struct mds_rec_create, cr_flags) == 16); + LASSERT (sizeof (((struct mds_rec_create *)0)->cr_flags) == 4); + LASSERT (offsetof (struct mds_rec_create, cr_mode) == 20); + LASSERT (sizeof (((struct mds_rec_create *)0)->cr_mode) == 4); + LASSERT (offsetof (struct mds_rec_create, cr_fid) == 24); + LASSERT (sizeof (((struct mds_rec_create *)0)->cr_fid) == 16); + LASSERT (offsetof (struct mds_rec_create, cr_replayfid) == 40); + LASSERT (sizeof (((struct mds_rec_create *)0)->cr_replayfid) == 16); + LASSERT (offsetof (struct mds_rec_create, cr_uid) == 56); + LASSERT (sizeof (((struct mds_rec_create *)0)->cr_uid) == 4); + LASSERT (offsetof (struct mds_rec_create, cr_gid) == 60); + LASSERT (sizeof (((struct mds_rec_create *)0)->cr_gid) == 4); + LASSERT (offsetof (struct mds_rec_create, cr_time) == 64); + LASSERT (sizeof (((struct mds_rec_create *)0)->cr_time) == 8); + LASSERT (offsetof (struct mds_rec_create, cr_rdev) == 72); + LASSERT (sizeof (((struct mds_rec_create *)0)->cr_rdev) == 8); + LASSERT (offsetof (struct mds_rec_create, cr_suppgid) == 80); + LASSERT (sizeof (((struct mds_rec_create *)0)->cr_suppgid) == 4); + + /* Checks for struct mds_rec_link */ + LASSERT (sizeof (struct mds_rec_link) == 56); + LASSERT (offsetof (struct mds_rec_link, lk_opcode) == 0); + LASSERT (sizeof (((struct mds_rec_link *)0)->lk_opcode) == 4); + LASSERT (offsetof (struct mds_rec_link, lk_fsuid) == 4); + LASSERT (sizeof (((struct mds_rec_link *)0)->lk_fsuid) == 4); + LASSERT (offsetof (struct mds_rec_link, lk_fsgid) == 8); + LASSERT (sizeof (((struct 
mds_rec_link *)0)->lk_fsgid) == 4); + LASSERT (offsetof (struct mds_rec_link, lk_cap) == 12); + LASSERT (sizeof (((struct mds_rec_link *)0)->lk_cap) == 4); + LASSERT (offsetof (struct mds_rec_link, lk_suppgid1) == 16); + LASSERT (sizeof (((struct mds_rec_link *)0)->lk_suppgid1) == 4); + LASSERT (offsetof (struct mds_rec_link, lk_suppgid2) == 20); + LASSERT (sizeof (((struct mds_rec_link *)0)->lk_suppgid2) == 4); + LASSERT (offsetof (struct mds_rec_link, lk_fid1) == 24); + LASSERT (sizeof (((struct mds_rec_link *)0)->lk_fid1) == 16); + LASSERT (offsetof (struct mds_rec_link, lk_fid2) == 40); + LASSERT (sizeof (((struct mds_rec_link *)0)->lk_fid2) == 16); + + /* Checks for struct mds_rec_unlink */ + LASSERT (sizeof (struct mds_rec_unlink) == 60); + LASSERT (offsetof (struct mds_rec_unlink, ul_opcode) == 0); + LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_opcode) == 4); + LASSERT (offsetof (struct mds_rec_unlink, ul_fsuid) == 4); + LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_fsuid) == 4); + LASSERT (offsetof (struct mds_rec_unlink, ul_fsgid) == 8); + LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_fsgid) == 4); + LASSERT (offsetof (struct mds_rec_unlink, ul_cap) == 12); + LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_cap) == 4); + LASSERT (offsetof (struct mds_rec_unlink, ul_reserved) == 16); + LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_reserved) == 4); + LASSERT (offsetof (struct mds_rec_unlink, ul_mode) == 20); + LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_mode) == 4); + LASSERT (offsetof (struct mds_rec_unlink, ul_suppgid) == 24); + LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_suppgid) == 4); + LASSERT (offsetof (struct mds_rec_unlink, ul_fid1) == 28); + LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_fid1) == 16); + LASSERT (offsetof (struct mds_rec_unlink, ul_fid2) == 44); + LASSERT (sizeof (((struct mds_rec_unlink *)0)->ul_fid2) == 16); + + /* Checks for struct mds_rec_rename */ + LASSERT (sizeof (struct mds_rec_rename) == 
56); + LASSERT (offsetof (struct mds_rec_rename, rn_opcode) == 0); + LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_opcode) == 4); + LASSERT (offsetof (struct mds_rec_rename, rn_fsuid) == 4); + LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_fsuid) == 4); + LASSERT (offsetof (struct mds_rec_rename, rn_fsgid) == 8); + LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_fsgid) == 4); + LASSERT (offsetof (struct mds_rec_rename, rn_cap) == 12); + LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_cap) == 4); + LASSERT (offsetof (struct mds_rec_rename, rn_suppgid1) == 16); + LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_suppgid1) == 4); + LASSERT (offsetof (struct mds_rec_rename, rn_suppgid2) == 20); + LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_suppgid2) == 4); + LASSERT (offsetof (struct mds_rec_rename, rn_fid1) == 24); + LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_fid1) == 16); + LASSERT (offsetof (struct mds_rec_rename, rn_fid2) == 40); + LASSERT (sizeof (((struct mds_rec_rename *)0)->rn_fid2) == 16); + + /* Checks for struct lov_desc */ + LASSERT (sizeof (struct lov_desc) == 72); + LASSERT (offsetof (struct lov_desc, ld_tgt_count) == 0); + LASSERT (sizeof (((struct lov_desc *)0)->ld_tgt_count) == 4); + LASSERT (offsetof (struct lov_desc, ld_active_tgt_count) == 4); + LASSERT (sizeof (((struct lov_desc *)0)->ld_active_tgt_count) == 4); + LASSERT (offsetof (struct lov_desc, ld_default_stripe_count) == 8); + LASSERT (sizeof (((struct lov_desc *)0)->ld_default_stripe_count) == 4); + LASSERT (offsetof (struct lov_desc, ld_default_stripe_size) == 12); + LASSERT (sizeof (((struct lov_desc *)0)->ld_default_stripe_size) == 8); + LASSERT (offsetof (struct lov_desc, ld_default_stripe_offset) == 20); + LASSERT (sizeof (((struct lov_desc *)0)->ld_default_stripe_offset) == 8); + LASSERT (offsetof (struct lov_desc, ld_pattern) == 28); + LASSERT (sizeof (((struct lov_desc *)0)->ld_pattern) == 4); + LASSERT (offsetof (struct lov_desc, ld_uuid) == 32); + LASSERT 
(sizeof (((struct lov_desc *)0)->ld_uuid) == 37); + + /* Checks for struct ldlm_res_id */ + LASSERT (sizeof (struct ldlm_res_id) == 24); + LASSERT (offsetof (struct ldlm_res_id, name[3]) == 24); + LASSERT (sizeof (((struct ldlm_res_id *)0)->name[3]) == 8); + + /* Checks for struct ldlm_extent */ + LASSERT (sizeof (struct ldlm_extent) == 16); + LASSERT (offsetof (struct ldlm_extent, start) == 0); + LASSERT (sizeof (((struct ldlm_extent *)0)->start) == 8); + LASSERT (offsetof (struct ldlm_extent, end) == 8); + LASSERT (sizeof (((struct ldlm_extent *)0)->end) == 8); + + /* Checks for struct ldlm_intent */ + LASSERT (sizeof (struct ldlm_intent) == 8); + LASSERT (offsetof (struct ldlm_intent, opc) == 0); + LASSERT (sizeof (((struct ldlm_intent *)0)->opc) == 8); + + /* Checks for struct ldlm_resource_desc */ + LASSERT (sizeof (struct ldlm_resource_desc) == 44); + LASSERT (offsetof (struct ldlm_resource_desc, lr_type) == 0); + LASSERT (sizeof (((struct ldlm_resource_desc *)0)->lr_type) == 4); + LASSERT (offsetof (struct ldlm_resource_desc, lr_name) == 4); + LASSERT (sizeof (((struct ldlm_resource_desc *)0)->lr_name) == 24); + LASSERT (offsetof (struct ldlm_resource_desc, lr_version[4]) == 44); + LASSERT (sizeof (((struct ldlm_resource_desc *)0)->lr_version[4]) == 4); + + /* Checks for struct ldlm_lock_desc */ + LASSERT (sizeof (struct ldlm_lock_desc) == 84); + LASSERT (offsetof (struct ldlm_lock_desc, l_resource) == 0); + LASSERT (sizeof (((struct ldlm_lock_desc *)0)->l_resource) == 44); + LASSERT (offsetof (struct ldlm_lock_desc, l_req_mode) == 44); + LASSERT (sizeof (((struct ldlm_lock_desc *)0)->l_req_mode) == 4); + LASSERT (offsetof (struct ldlm_lock_desc, l_granted_mode) == 48); + LASSERT (sizeof (((struct ldlm_lock_desc *)0)->l_granted_mode) == 4); + LASSERT (offsetof (struct ldlm_lock_desc, l_extent) == 52); + LASSERT (sizeof (((struct ldlm_lock_desc *)0)->l_extent) == 16); + LASSERT (offsetof (struct ldlm_lock_desc, l_version[4]) == 84); + LASSERT (sizeof 
(((struct ldlm_lock_desc *)0)->l_version[4]) == 4); + + /* Checks for struct ldlm_request */ + LASSERT (sizeof (struct ldlm_request) == 104); + LASSERT (offsetof (struct ldlm_request, lock_flags) == 0); + LASSERT (sizeof (((struct ldlm_request *)0)->lock_flags) == 4); + LASSERT (offsetof (struct ldlm_request, lock_desc) == 4); + LASSERT (sizeof (((struct ldlm_request *)0)->lock_desc) == 84); + LASSERT (offsetof (struct ldlm_request, lock_handle1) == 88); + LASSERT (sizeof (((struct ldlm_request *)0)->lock_handle1) == 8); + LASSERT (offsetof (struct ldlm_request, lock_handle2) == 96); + LASSERT (sizeof (((struct ldlm_request *)0)->lock_handle2) == 8); + + /* Checks for struct ldlm_reply */ + LASSERT (sizeof (struct ldlm_reply) == 72); + LASSERT (offsetof (struct ldlm_reply, lock_flags) == 0); + LASSERT (sizeof (((struct ldlm_reply *)0)->lock_flags) == 4); + LASSERT (offsetof (struct ldlm_reply, lock_mode) == 4); + LASSERT (sizeof (((struct ldlm_reply *)0)->lock_mode) == 4); + LASSERT (offsetof (struct ldlm_reply, lock_resource_name) == 8); + LASSERT (sizeof (((struct ldlm_reply *)0)->lock_resource_name) == 24); + LASSERT (offsetof (struct ldlm_reply, lock_handle) == 32); + LASSERT (sizeof (((struct ldlm_reply *)0)->lock_handle) == 8); + LASSERT (offsetof (struct ldlm_reply, lock_extent) == 40); + LASSERT (sizeof (((struct ldlm_reply *)0)->lock_extent) == 16); + LASSERT (offsetof (struct ldlm_reply, lock_policy_res1) == 56); + LASSERT (sizeof (((struct ldlm_reply *)0)->lock_policy_res1) == 8); + LASSERT (offsetof (struct ldlm_reply, lock_policy_res2) == 64); + LASSERT (sizeof (((struct ldlm_reply *)0)->lock_policy_res2) == 8); + + /* Checks for struct ptlbd_op */ + LASSERT (sizeof (struct ptlbd_op) == 12); + LASSERT (offsetof (struct ptlbd_op, op_cmd) == 0); + LASSERT (sizeof (((struct ptlbd_op *)0)->op_cmd) == 2); + LASSERT (offsetof (struct ptlbd_op, op_lun) == 2); + LASSERT (sizeof (((struct ptlbd_op *)0)->op_lun) == 2); + LASSERT (offsetof (struct ptlbd_op, 
op_niob_cnt) == 4); + LASSERT (sizeof (((struct ptlbd_op *)0)->op_niob_cnt) == 2); + LASSERT (offsetof (struct ptlbd_op, op__padding) == 6); + LASSERT (sizeof (((struct ptlbd_op *)0)->op__padding) == 2); + LASSERT (offsetof (struct ptlbd_op, op_block_cnt) == 8); + LASSERT (sizeof (((struct ptlbd_op *)0)->op_block_cnt) == 4); + + /* Checks for struct ptlbd_niob */ + LASSERT (sizeof (struct ptlbd_niob) == 24); + LASSERT (offsetof (struct ptlbd_niob, n_xid) == 0); + LASSERT (sizeof (((struct ptlbd_niob *)0)->n_xid) == 8); + LASSERT (offsetof (struct ptlbd_niob, n_block_nr) == 8); + LASSERT (sizeof (((struct ptlbd_niob *)0)->n_block_nr) == 8); + LASSERT (offsetof (struct ptlbd_niob, n_offset) == 16); + LASSERT (sizeof (((struct ptlbd_niob *)0)->n_offset) == 4); + LASSERT (offsetof (struct ptlbd_niob, n_length) == 20); + LASSERT (sizeof (((struct ptlbd_niob *)0)->n_length) == 4); + + /* Checks for struct ptlbd_rsp */ + LASSERT (sizeof (struct ptlbd_rsp) == 4); + LASSERT (offsetof (struct ptlbd_rsp, r_status) == 0); + LASSERT (sizeof (((struct ptlbd_rsp *)0)->r_status) == 2); + LASSERT (offsetof (struct ptlbd_rsp, r_error_cnt) == 2); + LASSERT (sizeof (((struct ptlbd_rsp *)0)->r_error_cnt) == 2); +#endif +} diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c new file mode 100644 index 0000000..51a0cad --- /dev/null +++ b/lustre/ptlrpc/pinger.c @@ -0,0 +1,174 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Portal-RPC reconnection and replay operations, for use in recovery. + * + * Copyright (c) 2003 Cluster File Systems, Inc. + * Author: Phil Schwan <phil@clusterfs.com> + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/version.h>
+#include <asm/semaphore.h>
+
+#define DEBUG_SUBSYSTEM S_RPC
+#include <linux/obd_support.h>
+#include <linux/obd_class.h>
+#include "ptlrpc_internal.h"
+
+/* The single pinger thread descriptor; NULL whenever no pinger exists. */
+static struct ptlrpc_thread *pinger_thread = NULL;
+/* Protects pinger_imports.  Never held across a blocking call. */
+static spinlock_t pinger_lock = SPIN_LOCK_UNLOCKED;
+static struct list_head pinger_imports = LIST_HEAD_INIT(pinger_imports);
+/* Serializes ptlrpc_pinger_start() and ptlrpc_stop_pinger().  Both wait for
+ * the pinger thread to change state, so they need a lock that may be held
+ * while sleeping; a spinlock is not suitable for that. */
+static DECLARE_MUTEX(pinger_sem);
+
+/* Chain @imp onto the list of imports known to the pinger.
+ *
+ * Returns 0 on success, or -EALREADY if imp->imp_pinger_chain is already
+ * linked into a list.  The membership test is made under pinger_lock so two
+ * concurrent callers cannot both decide to add the same import. */
+int ptlrpc_pinger_add_import(struct obd_import *imp)
+{
+        int rc = 0;
+        ENTRY;
+
+        spin_lock(&pinger_lock);
+        if (list_empty(&imp->imp_pinger_chain))
+                list_add(&imp->imp_pinger_chain, &pinger_imports);
+        else
+                rc = -EALREADY;
+        spin_unlock(&pinger_lock);
+        RETURN(rc);
+}
+
+/* Unchain @imp from the pinger's import list.
+ *
+ * Returns 0 on success, or -EALREADY if the import is not on any list.  As
+ * in ptlrpc_pinger_add_import(), the test is made under pinger_lock. */
+int ptlrpc_pinger_del_import(struct obd_import *imp)
+{
+        int rc = 0;
+        ENTRY;
+
+        spin_lock(&pinger_lock);
+        if (list_empty(&imp->imp_pinger_chain))
+                rc = -EALREADY;
+        else
+                list_del_init(&imp->imp_pinger_chain);
+        spin_unlock(&pinger_lock);
+        RETURN(rc);
+}
+
+/* Periodic work of the pinger thread.  Currently an empty placeholder. */
+static void ptlrpc_pinger_do_stuff(void)
+{
+
+
+
+}
+
+/* Body of the pinger kernel thread.
+ *
+ * @arg is the struct ptlrpc_svc_data built on ptlrpc_pinger_start()'s stack;
+ * the starter only waits until SVC_RUNNING is set, so @arg must not be
+ * dereferenced after that flag has been signalled.
+ *
+ * The thread wakes every 5 seconds (or as soon as SVC_STOPPING is set),
+ * calls ptlrpc_pinger_do_stuff(), and exits once asked to stop, announcing
+ * that with SVC_STOPPED.  Always returns 0. */
+static int ptlrpc_pinger_main(void *arg)
+{
+        struct ptlrpc_svc_data *data = (struct ptlrpc_svc_data *)arg;
+        struct ptlrpc_thread *thread = data->thread;
+        unsigned long flags;
+        int rc = 0;
+        ENTRY;
+
+        lock_kernel();
+        ptlrpc_daemonize();
+
+        /* Block every signal for this thread. */
+        SIGNAL_MASK_LOCK(current, flags);
+        sigfillset(&current->blocked);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+
+#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20))
+        sprintf(current->comm, "%s|%d", data->name,current->thread.extern_pid);
+#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+        sprintf(current->comm, "%s|%d", data->name,
+                current->thread.mode.tt.extern_pid);
+#else
+        strcpy(current->comm, data->name);
+#endif
+        unlock_kernel();
+
+        /* Record that the thread is running */
+        thread->t_flags = SVC_RUNNING;
+        wake_up(&thread->t_ctl_waitq);
+
+        /* And now, loop forever on requests */
+        while (1) {
+                struct l_wait_info lwi = LWI_TIMEOUT(5 * HZ, NULL, NULL);
+                l_wait_event(thread->t_ctl_waitq,
+                             thread->t_flags & SVC_STOPPING, &lwi);
+
+                if (thread->t_flags & SVC_STOPPING) {
+                        thread->t_flags &= ~SVC_STOPPING;
+                        EXIT;
+                        break;
+                }
+                ptlrpc_pinger_do_stuff();
+        }
+
+        thread->t_flags = SVC_STOPPED;
+        wake_up(&thread->t_ctl_waitq);
+
+        CDEBUG(D_NET, "pinger thread exiting, process %d: rc = %d\n",
+               current->pid, rc);
+        return rc;
+}
+
+/* Create the pinger thread and wait until it reports SVC_RUNNING.
+ *
+ * Returns 0 on success, -EALREADY if a pinger already exists, -ENOMEM if the
+ * thread descriptor cannot be allocated, or the negative error returned by
+ * kernel_thread().  On any failure pinger_thread is left NULL so that a
+ * later start can be attempted. */
+int ptlrpc_pinger_start(void)
+{
+        struct l_wait_info lwi = { 0 };
+        struct ptlrpc_svc_data d;
+        int rc;
+        ENTRY;
+
+        down(&pinger_sem);
+        if (pinger_thread != NULL)
+                GOTO(out, rc = -EALREADY);
+
+        OBD_ALLOC(pinger_thread, sizeof(*pinger_thread));
+        if (pinger_thread == NULL)
+                GOTO(out, rc = -ENOMEM);
+        init_waitqueue_head(&pinger_thread->t_ctl_waitq);
+
+        d.name = "Lustre pinger";
+        d.thread = pinger_thread;
+
+        /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
+         * just drop the VM and FILES in ptlrpc_daemonize() right away.
+         */
+        rc = kernel_thread(ptlrpc_pinger_main, &d, CLONE_VM | CLONE_FILES);
+        if (rc < 0) {
+                CERROR("cannot start thread: %d\n", rc);
+                OBD_FREE(pinger_thread, sizeof(*pinger_thread));
+                /* Forget the freed descriptor so start can be retried. */
+                pinger_thread = NULL;
+                GOTO(out, rc);
+        }
+        /* 'd' lives on this stack frame: do not return before the thread
+         * has finished reading it and announced SVC_RUNNING. */
+        l_wait_event(pinger_thread->t_ctl_waitq,
+                     pinger_thread->t_flags & SVC_RUNNING, &lwi);
+        /* kernel_thread() returned the child's pid; callers get 0. */
+        rc = 0;
+
+ out:
+        up(&pinger_sem);
+        RETURN(rc);
+}
+
+/* Ask the pinger thread to exit, wait for SVC_STOPPED, and free its
+ * descriptor.
+ *
+ * Returns 0 on success or -EALREADY if no pinger is running.  pinger_thread
+ * is reset to NULL afterwards so the pinger can be started again and a
+ * repeated stop does not touch freed memory. */
+int ptlrpc_stop_pinger(void)
+{
+        struct l_wait_info lwi = { 0 };
+        int rc = 0;
+        ENTRY;
+
+        down(&pinger_sem);
+        if (pinger_thread == NULL)
+                GOTO(out, rc = -EALREADY);
+
+        pinger_thread->t_flags = SVC_STOPPING;
+        wake_up(&pinger_thread->t_ctl_waitq);
+        l_wait_event(pinger_thread->t_ctl_waitq,
+                     (pinger_thread->t_flags & SVC_STOPPED), &lwi);
+
+        OBD_FREE(pinger_thread, sizeof(*pinger_thread));
+        pinger_thread = NULL;
+
+ out:
+        up(&pinger_sem);
+        RETURN(rc);
+}
diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h
new file mode 100644
index 0000000..7100707
--- /dev/null
+++ b/lustre/ptlrpc/ptlrpc_internal.h
@@ -0,0 +1,93 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2003 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+/* Intramodule declarations for ptlrpc. 
*/
+
+#ifndef PTLRPC_INTERNAL_H
+#define PTLRPC_INTERNAL_H
+
+struct ldlm_namespace;
+struct obd_import;
+struct ldlm_res_id;
+
+/* ldlm hooks that we need, managed via inter_module_{get,put} */
+extern int (*ptlrpc_ldlm_namespace_cleanup)(struct ldlm_namespace *, int);
+extern int (*ptlrpc_ldlm_cli_cancel_unused)(struct ldlm_namespace *,
+                                            struct ldlm_res_id *, int);
+extern int (*ptlrpc_ldlm_replay_locks)(struct obd_import *);
+
+int ptlrpc_get_ldlm_hooks(void);
+void ptlrpc_daemonize(void);
+
+int ptlrpc_request_handle_eviction(struct ptlrpc_request *);
+void lustre_assert_wire_constants (void);
+
+void ptlrpc_lprocfs_register_service(struct obd_device *obddev,
+                                     struct ptlrpc_service *svc);
+void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc);
+
+
+/* Map a wire opcode onto a dense, zero-based index.
+ *
+ * Every opcode family occupies the half-open range [X_FIRST_OPC, X_LAST_OPC)
+ * and is given (X_LAST_OPC - X_FIRST_OPC) consecutive slots, in the order
+ * OST, MDS, LDLM, PTLBD; OBD_PING takes the single slot after them.  The
+ * largest index returned is therefore LUSTRE_MAX_OPCODES - 1.
+ *
+ * Both ends of each range are checked, so an opcode that lies below a
+ * family's first opcode or in a gap between two families yields -1 rather
+ * than an index that aliases another opcode's slot.
+ *
+ * Returns the index, or -1 for an unknown opcode. */
+static inline int opcode_offset(__u32 opc)
+{
+        int base = 0;
+
+        /* OST opcode */
+        if (opc >= OST_FIRST_OPC && opc < OST_LAST_OPC)
+                return base + (opc - OST_FIRST_OPC);
+        base += OST_LAST_OPC - OST_FIRST_OPC;
+
+        /* MDS opcode */
+        if (opc >= MDS_FIRST_OPC && opc < MDS_LAST_OPC)
+                return base + (opc - MDS_FIRST_OPC);
+        base += MDS_LAST_OPC - MDS_FIRST_OPC;
+
+        /* LDLM Opcode */
+        if (opc >= LDLM_FIRST_OPC && opc < LDLM_LAST_OPC)
+                return base + (opc - LDLM_FIRST_OPC);
+        base += LDLM_LAST_OPC - LDLM_FIRST_OPC;
+
+        /* Portals Block Device */
+        if (opc >= PTLBD_FIRST_OPC && opc < PTLBD_LAST_OPC)
+                return base + (opc - PTLBD_FIRST_OPC);
+        base += PTLBD_LAST_OPC - PTLBD_FIRST_OPC;
+
+        /* OBD Ping */
+        if (opc == OBD_PING)
+                return base;
+
+        /* Unknown Opcode */
+        return -1;
+}
+
+/* Number of distinct indices opcode_offset() can return: one slot per opcode
+ * in each family, plus one for OBD_PING. */
+#define LUSTRE_MAX_OPCODES (1 + (PTLBD_LAST_OPC - PTLBD_FIRST_OPC) \
+                              + (LDLM_LAST_OPC - LDLM_FIRST_OPC)   \
+                              + (MDS_LAST_OPC - MDS_FIRST_OPC)     \
+                              + (OST_LAST_OPC - OST_FIRST_OPC))
+
+/* Counter indices; PTLRPC_LAST_CNTR is the number of counters. */
+enum {
+        PTLRPC_REQWAIT_CNTR     = 0,
+        PTLRPC_SVCEQDEPTH_CNTR  = 1,
+        PTLRPC_SVCIDLETIME_CNTR = 2,
+        PTLRPC_LAST_CNTR        = 3
+};
+
+#endif /* 
PTLRPC_INTERNAL_H */ diff --git a/lustre/ptlrpc/ptlrpc_lib.c b/lustre/ptlrpc/ptlrpc_lib.c new file mode 100644 index 0000000..71142fa --- /dev/null +++ b/lustre/ptlrpc/ptlrpc_lib.c @@ -0,0 +1,119 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */
+
+#define EXPORT_SYMTAB
+#define DEBUG_SUBSYSTEM S_RPC
+
+#ifdef __KERNEL__
+# include <linux/module.h>
+#else
+# include <liblustre.h>
+#endif
+#include <linux/obd.h>
+#include <linux/obd_ost.h>
+#include <linux/lustre_net.h>
+#include <linux/lustre_dlm.h>
+
+/* Common setup for client obd devices.
+ *
+ * @buf is a struct obd_ioctl_data whose inline buffer 1 carries the target
+ * UUID and inline buffer 2 the server UUID; each must be 1..37 bytes long.
+ * A device whose type implements o_brw is set up as an "osc" (OST portals,
+ * OST_CONNECT), any other device as an "mdc" (MDS portals, MDS_CONNECT).
+ *
+ * On success a new import describing the connection is stored in
+ * obddev->u.cli.cl_import.
+ *
+ * Returns 0 on success, -EINVAL if either UUID is missing or too long,
+ * -ENOENT if no connection can be found for the server UUID, or -ENOMEM if
+ * the import cannot be allocated. */
+int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
+{
+        struct ptlrpc_connection *conn;
+        struct obd_ioctl_data* data = buf;
+        struct client_obd *cli = &obddev->u.cli;
+        struct obd_import *imp;
+        struct obd_uuid server_uuid;
+        int rq_portal, rp_portal, connect_op;
+        char *name;
+        ENTRY;
+
+        if (obddev->obd_type->typ_ops->o_brw) {
+                rq_portal = OST_REQUEST_PORTAL;
+                rp_portal = OSC_REPLY_PORTAL;
+                name = "osc";
+                connect_op = OST_CONNECT;
+        } else {
+                rq_portal = MDS_REQUEST_PORTAL;
+                rp_portal = MDC_REPLY_PORTAL;
+                name = "mdc";
+                connect_op = MDS_CONNECT;
+        }
+
+        /* Inline buffer 1 is the target UUID (copied to imp_target_uuid). */
+        if (data->ioc_inllen1 < 1) {
+                CERROR("requires a TARGET UUID\n");
+                RETURN(-EINVAL);
+        }
+
+        if (data->ioc_inllen1 > 37) {
+                CERROR("target UUID must be less than 38 characters\n");
+                RETURN(-EINVAL);
+        }
+
+        /* Inline buffer 2 is the server UUID (used to find the connection). */
+        if (data->ioc_inllen2 < 1) {
+                CERROR("setup requires a SERVER UUID\n");
+                RETURN(-EINVAL);
+        }
+
+        if (data->ioc_inllen2 > 37) {
+                CERROR("server UUID must be less than 38 characters\n");
+                RETURN(-EINVAL);
+        }
+
+        sema_init(&cli->cl_sem, 1);
+        cli->cl_conn_count = 0;
+        /* server_uuid is a stack variable: clear it and keep the final byte
+         * zero so the UUID is NUL-terminated even if the caller's buffer is
+         * not. */
+        memset(&server_uuid, 0, sizeof(server_uuid));
+        memcpy(server_uuid.uuid, data->ioc_inlbuf2,
+               MIN(data->ioc_inllen2, sizeof(server_uuid.uuid) - 1));
+
+        conn = ptlrpc_uuid_to_connection(&server_uuid);
+        if (conn == NULL)
+                RETURN(-ENOENT);
+
+        ptlrpc_init_client(rq_portal, rp_portal, name,
+                           &obddev->obd_ldlm_client);
+
+        imp = class_new_import();
+        if (imp == NULL) {
+                ptlrpc_put_connection(conn);
+                RETURN(-ENOMEM);
+        }
+        imp->imp_connection = conn;
+        imp->imp_client = &obddev->obd_ldlm_client;
+        imp->imp_obd = obddev;
+        imp->imp_connect_op = connect_op;
+        imp->imp_generation = 0;
+        memcpy(imp->imp_target_uuid.uuid, data->ioc_inlbuf1, data->ioc_inllen1);
+        /* NOTE(review): presumably class_new_import() returns an extra
+         * reference that is dropped here while the import stays alive for
+         * cl_import -- confirm against class_new_import(). */
+        class_import_put(imp);
+
+        cli->cl_import = imp;
+        cli->cl_max_mds_easize = sizeof(struct lov_mds_md);
+        cli->cl_sandev = to_kdev_t(0);
+
+        RETURN(0);
+}
+
+/* Undo client_obd_setup(): destroy the device's import.
+ *
+ * @force and @failover are not used here.
+ *
+ * Returns 0 on success or -EINVAL if the device has no import. */
+int client_obd_cleanup(struct obd_device *obddev, int force, int failover)
+{
+        struct client_obd *client = &obddev->u.cli;
+        ENTRY;
+
+        if (!client->cl_import)
+                RETURN(-EINVAL);
+        class_destroy_import(client->cl_import);
+        client->cl_import = NULL;
+        RETURN(0);
+}
diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c
new file mode 100644
index 0000000..01ba349
--- /dev/null
+++ b/lustre/ptlrpc/ptlrpc_module.c
@@ -0,0 +1,237 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define EXPORT_SYMTAB
+#define DEBUG_SUBSYSTEM S_RPC
+
+#ifdef __KERNEL__
+# include <linux/module.h>
+# include <linux/init.h>
+#else
+# include <liblustre.h>
+#endif
+
+#include <linux/obd_support.h>
+#include <linux/obd_class.h>
+#include <linux/lustre_net.h>
+
+#include "ptlrpc_internal.h"
+
+extern int ptlrpc_init_portals(void);
+extern void ptlrpc_exit_portals(void);
+
+/* Non-zero once every ldlm hook below has been resolved. */
+static int ldlm_hooks_referenced = 0;
+
+/* Entry points into ldlm, looked up through inter_module_get(); NULL until
+ * ptlrpc_get_ldlm_hooks() has resolved them. */
+int (*ptlrpc_ldlm_namespace_cleanup)(struct ldlm_namespace *, int);
+int (*ptlrpc_ldlm_replay_locks)(struct obd_import *);
+
+/* Resolve ptlrpc_<name> with inter_module_get() unless it is already set.
+ * If the lookup fails, log it and make the ENCLOSING function return 0. */
+#define GET_HOOK(name)                                                         \
+do {                                                                           \
+        if (!ptlrpc_##name) {                                                  \
+                if (!(ptlrpc_##name = inter_module_get(#name))) {              \
+                        CERROR("can't i_m_g(\"" #name "\")\n");                \
+                        return 0;                                              \
+                }                                                              \
+        }                                                                      \
+} while (0)
+
+/* This is called from ptlrpc_get_connection, which runs after all the modules
+ * are loaded, but before anything else interesting happens.
+ *
+ * Returns 1 when all hooks are available and 0 if one could not be found.
+ * A hook resolved before the failure stays set; ptlrpc_put_ldlm_hooks()
+ * releases it.
+ */
+int ptlrpc_get_ldlm_hooks(void)
+{
+        if (ldlm_hooks_referenced)
+                return 1;
+
+        GET_HOOK(ldlm_namespace_cleanup);
+        GET_HOOK(ldlm_replay_locks);
+
+        ldlm_hooks_referenced = 1;
+        return 1;
+}
+
+#undef GET_HOOK
+
+/* Release ptlrpc_<hook> with inter_module_put() if it is set, and clear it. */
+#define PUT_HOOK(hook)                                                         \
+do {                                                                           \
+        if (ptlrpc_##hook) {                                                   \
+                inter_module_put(#hook);                                       \
+                ptlrpc_##hook = NULL;                                          \
+        }                                                                      \
+} while (0)
+
+/* Drop every ldlm hook that is currently held.
+ *
+ * Each hook is tested individually, so this also releases a hook left
+ * behind when ptlrpc_get_ldlm_hooks() failed part-way through, and it is
+ * harmless to call when nothing is held. */
+void ptlrpc_put_ldlm_hooks(void)
+{
+        ENTRY;
+
+        PUT_HOOK(ldlm_namespace_cleanup);
+        PUT_HOOK(ldlm_replay_locks);
+        ldlm_hooks_referenced = 0;
+        EXIT;
+}
+
+#undef PUT_HOOK
+
+/* Returns non-zero if the full set of ldlm hooks is currently resolved. */
+int ptlrpc_ldlm_hooks_referenced(void)
+{
+        return ldlm_hooks_referenced;
+}
+
+/* Module initialisation: check the wire-format constants, bring up portals
+ * and the connection layer, and publish the connection-put and
+ * abort-inflight entry points through the *_superhack pointers.
+ *
+ * Returns 0 on success or the error from ptlrpc_init_portals(). */
+__init int ptlrpc_init(void)
+{
+        int rc;
+        ENTRY;
+
+        lustre_assert_wire_constants ();
+
+        rc = ptlrpc_init_portals();
+        if (rc)
+                RETURN(rc);
+
+        ptlrpc_init_connection();
+
+        ptlrpc_put_connection_superhack = ptlrpc_put_connection;
+        ptlrpc_abort_inflight_superhack = ptlrpc_abort_inflight;
+        RETURN(0);
+}
+
+/* Module teardown: shut down portals, then the connection layer. */
+static void __exit ptlrpc_exit(void)
+{
+        ptlrpc_exit_portals();
+        ptlrpc_cleanup_connection();
+}
+
+/* connection.c */
+EXPORT_SYMBOL(ptlrpc_dump_connections); +EXPORT_SYMBOL(ptlrpc_readdress_connection); +EXPORT_SYMBOL(ptlrpc_get_connection); +EXPORT_SYMBOL(ptlrpc_put_connection); +EXPORT_SYMBOL(ptlrpc_connection_addref); +EXPORT_SYMBOL(ptlrpc_init_connection); +EXPORT_SYMBOL(ptlrpc_cleanup_connection); + +/* niobuf.c */ +EXPORT_SYMBOL(ptlrpc_bulk_put); +EXPORT_SYMBOL(ptlrpc_bulk_get); +EXPORT_SYMBOL(ptlrpc_abort_bulk); +EXPORT_SYMBOL(ptlrpc_register_bulk); +EXPORT_SYMBOL(ptlrpc_unregister_bulk); +EXPORT_SYMBOL(ptlrpc_reply); +EXPORT_SYMBOL(ptlrpc_error); +EXPORT_SYMBOL(ptlrpc_resend_req); +EXPORT_SYMBOL(ptl_send_rpc); +EXPORT_SYMBOL(ptlrpc_link_svc_me); + +/* client.c */ +EXPORT_SYMBOL(ptlrpc_init_client); +EXPORT_SYMBOL(ptlrpc_cleanup_client); +EXPORT_SYMBOL(ptlrpc_req_to_uuid); +EXPORT_SYMBOL(ptlrpc_uuid_to_connection); +EXPORT_SYMBOL(ptlrpc_queue_wait); +EXPORT_SYMBOL(ptlrpc_replay_req); +EXPORT_SYMBOL(ptlrpc_restart_req); +EXPORT_SYMBOL(ptlrpc_prep_req); +EXPORT_SYMBOL(ptlrpc_free_req); +EXPORT_SYMBOL(ptlrpc_unregister_reply); +EXPORT_SYMBOL(ptlrpc_req_finished); +EXPORT_SYMBOL(ptlrpc_request_addref); +EXPORT_SYMBOL(ptlrpc_prep_bulk_imp); +EXPORT_SYMBOL(ptlrpc_prep_bulk_exp); +EXPORT_SYMBOL(ptlrpc_free_bulk); +EXPORT_SYMBOL(ptlrpc_prep_bulk_page); +EXPORT_SYMBOL(ptlrpc_free_bulk_page); +EXPORT_SYMBOL(ptlrpc_abort_inflight); +EXPORT_SYMBOL(ptlrpc_retain_replayable_request); +EXPORT_SYMBOL(ptlrpc_next_xid); + +EXPORT_SYMBOL(ptlrpc_prep_set); +EXPORT_SYMBOL(ptlrpc_set_add_req); +EXPORT_SYMBOL(ptlrpc_set_destroy); +EXPORT_SYMBOL(ptlrpc_set_wait); + +/* service.c */ +EXPORT_SYMBOL(ptlrpc_init_svc); +EXPORT_SYMBOL(ptlrpc_stop_all_threads); +EXPORT_SYMBOL(ptlrpc_start_thread); +EXPORT_SYMBOL(ptlrpc_unregister_service); + +/* pack_generic.c */ +EXPORT_SYMBOL(lustre_pack_msg); +EXPORT_SYMBOL(lustre_msg_size); +EXPORT_SYMBOL(lustre_unpack_msg); +EXPORT_SYMBOL(lustre_msg_buf); +EXPORT_SYMBOL(lustre_msg_string); +EXPORT_SYMBOL(lustre_swab_reqbuf); +EXPORT_SYMBOL(lustre_swab_repbuf); 
+EXPORT_SYMBOL(lustre_swab_obdo); +EXPORT_SYMBOL(lustre_swab_obd_statfs); +EXPORT_SYMBOL(lustre_swab_obd_ioobj); +EXPORT_SYMBOL(lustre_swab_niobuf_remote); +EXPORT_SYMBOL(lustre_swab_ost_body); +EXPORT_SYMBOL(lustre_swab_ll_fid); +EXPORT_SYMBOL(lustre_swab_mds_status_req); +EXPORT_SYMBOL(lustre_swab_mds_fileh_body); +EXPORT_SYMBOL(lustre_swab_mds_body); +EXPORT_SYMBOL(lustre_swab_mds_rec_setattr); +EXPORT_SYMBOL(lustre_swab_mds_rec_create); +EXPORT_SYMBOL(lustre_swab_mds_rec_link); +EXPORT_SYMBOL(lustre_swab_mds_rec_unlink); +EXPORT_SYMBOL(lustre_swab_mds_rec_rename); +EXPORT_SYMBOL(lustre_swab_lov_desc); +EXPORT_SYMBOL(lustre_swab_ldlm_res_id); +EXPORT_SYMBOL(lustre_swab_ldlm_extent); +EXPORT_SYMBOL(lustre_swab_ldlm_intent); +EXPORT_SYMBOL(lustre_swab_ldlm_resource_desc); +EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc); +EXPORT_SYMBOL(lustre_swab_ldlm_request); +EXPORT_SYMBOL(lustre_swab_ldlm_reply); +EXPORT_SYMBOL(lustre_swab_ptlbd_op); +EXPORT_SYMBOL(lustre_swab_ptlbd_niob); +EXPORT_SYMBOL(lustre_swab_ptlbd_rsp); + +/* ptlrpc_module.c */ +EXPORT_SYMBOL(ptlrpc_put_ldlm_hooks); +EXPORT_SYMBOL(ptlrpc_ldlm_hooks_referenced); + +/* recover.c */ +EXPORT_SYMBOL(ptlrpc_run_recovery_over_upcall); +EXPORT_SYMBOL(ptlrpc_run_failed_import_upcall); +EXPORT_SYMBOL(ptlrpc_reconnect_import); +EXPORT_SYMBOL(ptlrpc_replay); +EXPORT_SYMBOL(ptlrpc_resend); +EXPORT_SYMBOL(ptlrpc_wake_delayed); +EXPORT_SYMBOL(ptlrpc_set_import_active); +EXPORT_SYMBOL(ptlrpc_fail_import); +EXPORT_SYMBOL(ptlrpc_fail_export); +EXPORT_SYMBOL(ptlrpc_recover_import); + +/*ptlrpc_lib.c*/ +EXPORT_SYMBOL(client_obd_setup); +EXPORT_SYMBOL(client_obd_cleanup); + +#ifdef __KERNEL__ +MODULE_AUTHOR("Cluster File Systems, Inc. 
<info@clusterfs.com>"); +MODULE_DESCRIPTION("Lustre Request Processor"); +MODULE_LICENSE("GPL"); + +module_init(ptlrpc_init); +module_exit(ptlrpc_exit); +#endif diff --git a/lustre/ptlrpc/recovd.c b/lustre/ptlrpc/recovd.c deleted file mode 100644 index 21cb3fe..0000000 --- a/lustre/ptlrpc/recovd.c +++ /dev/null @@ -1,372 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * obd/rpc/recovd.c - * - * Lustre High Availability Daemon - * - * Copyright (C) 2001, 2002 Cluster File Systems, Inc. - * - * This code is issued under the GNU General Public License. - * See the file COPYING in this distribution - * - * by Peter Braam <braam@clusterfs.com> - * - */ - -#define DEBUG_SUBSYSTEM S_RPC -#ifndef __KERNEL__ -#include <liblustre.h> -#include <linux/obd.h> -#include <linux/obd_class.h> -#else -#include <linux/lustre_lite.h> -#endif - -#include <linux/lustre_ha.h> -#include <linux/obd_support.h> - -/* dump_connection_list, but shorter for nicer debugging logs */ -static void d_c_l(struct list_head *head) -{ - struct list_head *tmp; - - list_for_each(tmp, head) { - struct ptlrpc_connection *conn = - list_entry(tmp, struct ptlrpc_connection, - c_recovd_data.rd_managed_chain); - CDEBUG(D_HA, " %p = %s (%d/%d)\n", conn, - conn->c_remote_uuid.uuid, - conn->c_recovd_data.rd_phase, - conn->c_recovd_data.rd_next_phase); - } -} - -static void dump_lists(struct recovd_obd *recovd) -{ - CDEBUG(D_HA, "managed: \n"); - d_c_l(&recovd->recovd_managed_items); - CDEBUG(D_HA, "troubled: \n"); - d_c_l(&recovd->recovd_troubled_items); -} - -void recovd_conn_manage(struct ptlrpc_connection *conn, - struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) -{ - struct recovd_data *rd = &conn->c_recovd_data; - ENTRY; - if (!recovd || !recover) { - EXIT; - return; - } - - if (!list_empty(&rd->rd_managed_chain)) { - if (rd->rd_recovd == recovd && rd->rd_recover == recover) { - CDEBUG(D_HA, "conn %p/%s already setup for recovery\n", 
- conn, conn->c_remote_uuid.uuid); - EXIT; - return; - } - CDEBUG(D_HA, - "conn %p/%s has recovery items %p/%p, making %p/%p\n", - conn, conn->c_remote_uuid.uuid, rd->rd_recovd, rd->rd_recover, - recovd, recover); - spin_lock(&rd->rd_recovd->recovd_lock); - list_del_init(&rd->rd_managed_chain); - spin_unlock(&rd->rd_recovd->recovd_lock); - } - - rd->rd_recovd = recovd; - rd->rd_recover = recover; - rd->rd_phase = RD_IDLE; - rd->rd_next_phase = RD_TROUBLED; - - spin_lock(&recovd->recovd_lock); - list_add(&rd->rd_managed_chain, &recovd->recovd_managed_items); - dump_lists(recovd); - spin_unlock(&recovd->recovd_lock); - - EXIT; -} - -void recovd_conn_unmanage(struct ptlrpc_connection *conn) -{ - struct recovd_data *rd = &conn->c_recovd_data; - struct recovd_obd *recovd = rd->rd_recovd; - ENTRY; - - if (recovd) { - spin_lock(&recovd->recovd_lock); - list_del_init(&rd->rd_managed_chain); - rd->rd_recovd = NULL; - spin_unlock(&recovd->recovd_lock); - } - /* should be safe enough, right? */ - rd->rd_recover = NULL; - rd->rd_next_phase = RD_IDLE; - rd->rd_next_phase = RD_TROUBLED; -} - -void recovd_conn_fail(struct ptlrpc_connection *conn) -{ - struct recovd_data *rd = &conn->c_recovd_data; - struct recovd_obd *recovd = rd->rd_recovd; - ENTRY; - - if (!recovd) { - CERROR("no recovd for connection %p\n", conn); - EXIT; - return; - } - - spin_lock(&recovd->recovd_lock); - if (rd->rd_phase == RD_TROUBLED || rd->rd_phase == RD_PREPARING) { - CDEBUG(D_HA, "connection %p to %s already in recovery\n", - conn, conn->c_remote_uuid.uuid); - spin_unlock(&recovd->recovd_lock); - EXIT; - return; - } - - CERROR("connection %p to %s nid "LPX64" on %s failed\n", conn, - conn->c_remote_uuid.uuid, conn->c_peer.peer_nid, - conn->c_peer.peer_ni->pni_name); - list_del(&rd->rd_managed_chain); - list_add_tail(&rd->rd_managed_chain, &recovd->recovd_troubled_items); - if (rd->rd_phase != RD_IDLE) { - CDEBUG(D_HA, - "connection %p to %s failed in recovery: restarting\n", - conn, 
conn->c_remote_uuid.uuid); - /* XXX call callback with PHASE_FAILED? */ - rd->rd_next_phase = RD_TROUBLED; - } - rd->rd_phase = RD_TROUBLED; - dump_lists(recovd); - spin_unlock(&recovd->recovd_lock); - - wake_up(&recovd->recovd_waitq); - - EXIT; -} - -void recovd_conn_fixed(struct ptlrpc_connection *conn) -{ - struct recovd_data *rd = &conn->c_recovd_data; - ENTRY; - - CDEBUG(D_HA, "connection %p (now to %s) fixed\n", - conn, conn->c_remote_uuid.uuid); - spin_lock(&rd->rd_recovd->recovd_lock); - list_del(&rd->rd_managed_chain); - rd->rd_phase = RD_IDLE; - rd->rd_next_phase = RD_TROUBLED; - list_add(&rd->rd_managed_chain, &rd->rd_recovd->recovd_managed_items); - dump_lists(rd->rd_recovd); - spin_unlock(&rd->rd_recovd->recovd_lock); - - EXIT; -} - -static int recovd_check_event(struct recovd_obd *recovd) -{ - int rc = 0; - struct list_head *tmp; - - ENTRY; - - spin_lock(&recovd->recovd_lock); - - if (recovd->recovd_state == RECOVD_STOPPING) - GOTO(out, rc = 1); - - list_for_each(tmp, &recovd->recovd_troubled_items) { - - struct recovd_data *rd = list_entry(tmp, struct recovd_data, - rd_managed_chain); - - if (rd->rd_phase == rd->rd_next_phase || - rd->rd_phase == RD_FAILED) - GOTO(out, rc = 1); - } - - out: - spin_unlock(&recovd->recovd_lock); - RETURN(rc); -} - -static int recovd_handle_event(struct recovd_obd *recovd) -{ - struct list_head *tmp, *n; - int rc = 0; - ENTRY; - - spin_lock(&recovd->recovd_lock); - - dump_lists(recovd); - - /* - * We use _safe here because one of the callbacks, expecially - * FAILURE or PREPARED, could move list items around. - */ - list_for_each_safe(tmp, n, &recovd->recovd_troubled_items) { - struct recovd_data *rd = list_entry(tmp, struct recovd_data, - rd_managed_chain); - - if (rd->rd_phase != RD_FAILED && - rd->rd_phase != rd->rd_next_phase) - continue; - - switch (rd->rd_phase) { - case RD_FAILED: - cb_failed: /* must always reach here with recovd_lock held! 
*/ - CERROR("recovery FAILED for rd %p (conn %p): %d\n", - rd, class_rd2conn(rd), rc); - - spin_unlock(&recovd->recovd_lock); - (void)rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_FAILURE); - spin_lock(&recovd->recovd_lock); - break; - - case RD_TROUBLED: - if (!rd->rd_recover) { - CERROR("no rd_recover for rd %p (conn %p)\n", - rd, class_rd2conn(rd)); - rc = -EINVAL; - break; - } - CERROR("starting recovery for rd %p (conn %p)\n", - rd, class_rd2conn(rd)); - rd->rd_phase = RD_PREPARING; - rd->rd_next_phase = RD_PREPARED; - - spin_unlock(&recovd->recovd_lock); - rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_PREPARE); - spin_lock(&recovd->recovd_lock); - if (rc) - goto cb_failed; - - break; - - case RD_PREPARED: - - CERROR("recovery prepared for rd %p (conn %p)\n", - rd, class_rd2conn(rd)); - rd->rd_phase = RD_RECOVERING; - rd->rd_next_phase = RD_RECOVERED; - - spin_unlock(&recovd->recovd_lock); - rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_RECOVER); - spin_lock(&recovd->recovd_lock); - if (rc) - goto cb_failed; - - break; - - case RD_RECOVERED: - rd->rd_phase = RD_IDLE; - rd->rd_next_phase = RD_TROUBLED; - - CERROR("recovery complete for rd %p (conn %p)\n", - rd, class_rd2conn(rd)); - break; - - default: - break; - } - } - spin_unlock(&recovd->recovd_lock); - RETURN(0); -} - -#ifdef __KERNEL__ -static int recovd_main(void *arg) -{ - struct recovd_obd *recovd = (struct recovd_obd *)arg; - unsigned long flags; - ENTRY; - - lock_kernel(); - daemonize(); - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) - sigfillset(&current->blocked); - recalc_sigpending(); -#else - spin_lock_irqsave(&current->sigmask_lock, flags); - sigfillset(&current->blocked); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); -#endif - - sprintf(current->comm, "lustre_recovd"); - unlock_kernel(); - - /* Signal that the thread is running. */ - recovd->recovd_thread = current; - recovd->recovd_state = RECOVD_READY; - wake_up(&recovd->recovd_ctl_waitq); - - /* And now, loop forever on requests. 
*/ - while (1) { - wait_event(recovd->recovd_waitq, recovd_check_event(recovd)); - if (recovd->recovd_state == RECOVD_STOPPING) - break; - recovd_handle_event(recovd); - } - - recovd->recovd_thread = NULL; - recovd->recovd_state = RECOVD_STOPPED; - wake_up(&recovd->recovd_ctl_waitq); - CDEBUG(D_HA, "mgr exiting process %d\n", current->pid); - RETURN(0); -} - -int recovd_setup(struct recovd_obd *recovd) -{ - int rc = 0; /* initialize for Liblustre */ - - ENTRY; - - INIT_LIST_HEAD(&recovd->recovd_managed_items); - INIT_LIST_HEAD(&recovd->recovd_troubled_items); - spin_lock_init(&recovd->recovd_lock); - - init_waitqueue_head(&recovd->recovd_waitq); - init_waitqueue_head(&recovd->recovd_recovery_waitq); - init_waitqueue_head(&recovd->recovd_ctl_waitq); - - rc = kernel_thread(recovd_main, (void *)recovd, - CLONE_VM | CLONE_FS | CLONE_FILES); - if (rc < 0) { - CERROR("cannot start thread\n"); - RETURN(-EINVAL); - } - wait_event(recovd->recovd_ctl_waitq, - recovd->recovd_state == RECOVD_READY); - - ptlrpc_recovd = recovd; - class_signal_connection_failure = recovd_conn_fail; - - RETURN(0); -} -#else -int recovd_setup(struct recovd_obd *recovd) -{ - return 0; -} -#endif - -int recovd_cleanup(struct recovd_obd *recovd) -{ - ENTRY; - spin_lock(&recovd->recovd_lock); - recovd->recovd_state = RECOVD_STOPPING; - wake_up(&recovd->recovd_waitq); - spin_unlock(&recovd->recovd_lock); - - wait_event(recovd->recovd_ctl_waitq, - (recovd->recovd_state == RECOVD_STOPPED)); - RETURN(0); -} - -struct recovd_obd *ptlrpc_recovd; diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index a1464a3..a90df0e 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -27,90 +27,87 @@ #include <linux/config.h> #include <linux/module.h> #include <linux/kmod.h> -#else +#else #include <liblustre.h> #endif +#include <linux/obd_support.h> #include <linux/lustre_ha.h> #include <linux/lustre_net.h> +#include <linux/lustre_import.h> +#include <linux/lustre_export.h> #include 
<linux/obd.h> +#include <linux/obd_class.h> +#include <linux/obd_lov.h> /* for IOC_LOV_SET_OSC_ACTIVE */ -int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc, +#include "ptlrpc_internal.h" + +int ptlrpc_reconnect_import(struct obd_import *imp, struct ptlrpc_request **reqptr) { struct obd_device *obd = imp->imp_obd; - struct client_obd *cli = &obd->u.cli; - int size[] = { sizeof(cli->cl_target_uuid), sizeof(obd->obd_uuid) }; - char *tmp[] = {cli->cl_target_uuid.uuid, obd->obd_uuid.uuid}; + int flags, rc, size[] = {sizeof(imp->imp_target_uuid), + sizeof(obd->obd_uuid), + sizeof(imp->imp_dlm_handle)}; + char *tmp[] = {imp->imp_target_uuid.uuid, + obd->obd_uuid.uuid, + (char *)&imp->imp_dlm_handle}; struct ptlrpc_connection *conn = imp->imp_connection; struct ptlrpc_request *req; - struct obd_export *ldlmexp; struct lustre_handle old_hdl; - int rc; - req = ptlrpc_prep_req(imp, rq_opc, 2, size, tmp); + spin_lock_irqsave(&imp->imp_lock, flags); + imp->imp_generation++; + spin_unlock_irqrestore(&imp->imp_lock, flags); + + CERROR("reconnect handle "LPX64"\n", + imp->imp_dlm_handle.cookie); + + req = ptlrpc_prep_req(imp, imp->imp_connect_op, 3, size, tmp); if (!req) RETURN(-ENOMEM); req->rq_level = LUSTRE_CONN_NEW; req->rq_replen = lustre_msg_size(0, NULL); - /* - * This address is the export that represents our client-side LDLM - * service (for ASTs). We should only have one on this list, so we - * just grab the first one. - * - * XXX tear down export, call class_obd_connect? 
- */ - ldlmexp = list_entry(obd->obd_exports.next, struct obd_export, - exp_obd_chain); - req->rq_reqmsg->addr = (__u64)(unsigned long)ldlmexp; - req->rq_reqmsg->cookie = ldlmexp->exp_cookie; rc = ptlrpc_queue_wait(req); if (rc) { CERROR("cannot connect to %s@%s: rc = %d\n", - cli->cl_target_uuid.uuid, conn->c_remote_uuid.uuid, rc); + imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid, rc); GOTO(out_disc, rc); } + if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT) { memset(&old_hdl, 0, sizeof(old_hdl)); - if (!memcmp(&old_hdl.addr, &req->rq_repmsg->addr, - sizeof (old_hdl.addr)) && - !memcmp(&old_hdl.cookie, &req->rq_repmsg->cookie, - sizeof (old_hdl.cookie))) { - CERROR("%s@%s didn't like our handle "LPX64"/"LPX64 - ", failed\n", cli->cl_target_uuid.uuid, + if (!memcmp(&old_hdl, &req->rq_repmsg->handle, + sizeof (old_hdl))) { + CERROR("%s@%s didn't like our handle "LPX64 + ", failed\n", imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid, - (__u64)(unsigned long)ldlmexp, - ldlmexp->exp_cookie); + imp->imp_dlm_handle.cookie); GOTO(out_disc, rc = -ENOTCONN); } - old_hdl.addr = req->rq_repmsg->addr; - old_hdl.cookie = req->rq_repmsg->cookie; - if (memcmp(&imp->imp_handle, &old_hdl, sizeof(old_hdl))) { - CERROR("%s@%s changed handle from "LPX64"/"LPX64 - " to "LPX64"/"LPX64"; " - "copying, but this may foreshadow disaster\n", - cli->cl_target_uuid.uuid, + if (memcmp(&imp->imp_remote_handle, &req->rq_repmsg->handle, + sizeof(imp->imp_remote_handle))) { + CERROR("%s@%s changed handle from "LPX64" to "LPX64 + "; copying, but this may foreshadow disaster\n", + imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid, - old_hdl.addr, old_hdl.cookie, - imp->imp_handle.addr, imp->imp_handle.cookie); - imp->imp_handle.addr = req->rq_repmsg->addr; - imp->imp_handle.cookie = req->rq_repmsg->cookie; + imp->imp_remote_handle.cookie, + req->rq_repmsg->handle.cookie); + imp->imp_remote_handle = req->rq_repmsg->handle; GOTO(out_disc, rc = 0); } CERROR("reconnected to 
%s@%s after partition\n", - cli->cl_target_uuid.uuid, conn->c_remote_uuid.uuid); + imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid); GOTO(out_disc, rc = 0); } - old_hdl = imp->imp_handle; - imp->imp_handle.addr = req->rq_repmsg->addr; - imp->imp_handle.cookie = req->rq_repmsg->cookie; - CERROR("reconnected to %s@%s ("LPX64"/"LPX64", was "LPX64"/" - LPX64")!\n", cli->cl_target_uuid.uuid, conn->c_remote_uuid.uuid, - imp->imp_handle.addr, imp->imp_handle.cookie, - old_hdl.addr, old_hdl.cookie); + old_hdl = imp->imp_remote_handle; + imp->imp_remote_handle = req->rq_repmsg->handle; + CERROR("reconnected to %s@%s ("LPX64", was "LPX64")!\n", + imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid, + imp->imp_remote_handle.cookie, old_hdl.cookie); GOTO(out_disc, rc = 0); out_disc: @@ -118,37 +115,62 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc, return rc; } -int ptlrpc_run_recovery_upcall(struct ptlrpc_connection *conn) +void ptlrpc_run_recovery_over_upcall(struct obd_device *obd) { - char *argv[3]; + char *argv[4]; char *envp[3]; int rc; ENTRY; - argv[0] = obd_recovery_upcall; - argv[1] = conn->c_remote_uuid.uuid; - argv[2] = NULL; + argv[0] = obd_lustre_upcall; + argv[1] = "RECOVERY_OVER"; + argv[2] = obd->obd_uuid.uuid; + argv[3] = NULL; envp[0] = "HOME=/"; envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; - rc = call_usermodehelper(argv[0], argv, envp); + rc = USERMODEHELPER(argv[0], argv, envp); if (rc < 0) { - CERROR("Error invoking recovery upcall %s for %s: %d\n", - argv[0], argv[1], rc); - CERROR("Check /proc/sys/lustre/recovery_upcall?\n"); + CERROR("Error invoking recovery upcall %s %s %s: %d; check " + "/proc/sys/lustre/upcall\n", + argv[0], argv[1], argv[2], rc); + } else { - CERROR("Invoked upcall %s for connection %s\n", - argv[0], argv[1]); + CERROR("Invoked upcall %s %s %s", + argv[0], argv[1], argv[2]); } +} - /* - * We don't want to make this a "failed" recovery, because the system - * administrator -- or, perhaps, 
tester -- may well be able to rescue - * things by running the correct upcall. - */ - RETURN(0); +void ptlrpc_run_failed_import_upcall(struct obd_import* imp) +{ + char *argv[6]; + char *envp[3]; + int rc; + + ENTRY; + argv[0] = obd_lustre_upcall; + argv[1] = "FAILED_IMPORT"; + argv[2] = imp->imp_target_uuid.uuid; + argv[3] = imp->imp_obd->obd_uuid.uuid; + argv[4] = imp->imp_connection->c_remote_uuid.uuid; + argv[5] = NULL; + + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + rc = USERMODEHELPER(argv[0], argv, envp); + if (rc < 0) { + CERROR("Error invoking recovery upcall %s %s %s %s %s: %d; check " + "/proc/sys/lustre/lustre_upcall\n", + argv[0], argv[1], argv[2], argv[3], argv[4],rc); + + } else { + CERROR("Invoked upcall %s %s %s %s %s\n", + argv[0], argv[1], argv[2], argv[3], argv[4]); + } } int ptlrpc_replay(struct obd_import *imp) @@ -164,119 +186,404 @@ int ptlrpc_replay(struct obd_import *imp) * get rid of them now. */ spin_lock_irqsave(&imp->imp_lock, flags); - ptlrpc_free_committed(imp); + spin_unlock_irqrestore(&imp->imp_lock, flags); CDEBUG(D_HA, "import %p from %s has committed "LPD64"\n", - imp, imp->imp_obd->u.cli.cl_target_uuid.uuid, committed); + imp, imp->imp_target_uuid.uuid, committed); list_for_each(tmp, &imp->imp_replay_list) { req = list_entry(tmp, struct ptlrpc_request, rq_list); DEBUG_REQ(D_HA, req, "RETAINED: "); } + /* Do I need to hold a lock across this iteration? We shouldn't be + * racing with any additions to the list, because we're in recovery + * and are therefore not processing additional requests to add. Calls + * to ptlrpc_free_committed might commit requests, but nothing "newer" + * than the one we're replaying (it can't be committed until it's + * replayed, and we're doing that here). l_f_e_safe protects against + * problems with the current request being committed, in the unlikely + * event of that race. 
So, in conclusion, I think that it's safe to + * perform this list-walk without the imp_lock held. + * + * But, the {mdc,osc}_replay_open callbacks both iterate + * request lists, and have comments saying they assume the + * imp_lock is being held by ptlrpc_replay, but it's not. it's + * just a little race... + */ list_for_each_safe(tmp, pos, &imp->imp_replay_list) { req = list_entry(tmp, struct ptlrpc_request, rq_list); DEBUG_REQ(D_HA, req, "REPLAY:"); - /* XXX locking WRT failure during replay? */ rc = ptlrpc_replay_req(req); - + if (rc) { CERROR("recovery replay error %d for req "LPD64"\n", rc, req->rq_xid); - GOTO(out, rc); + RETURN(rc); } } - out: - spin_unlock_irqrestore(&imp->imp_lock, flags); - return rc; + RETURN(0); } -#define NO_RESEND 0 /* No action required. */ -#define RESEND 1 /* Resend required. */ -#define RESEND_IGNORE 2 /* Resend, ignore the reply (already saw it). */ -#define RESTART 3 /* Have to restart the call, sorry! */ +int ptlrpc_resend(struct obd_import *imp) +{ + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + unsigned long flags; -static int resend_type(struct ptlrpc_request *req, __u64 committed) + ENTRY; + + /* As long as we're in recovery, nothing should be added to the sending + * list, so we don't need to hold the lock during this iteration and + * resend process. + */ + spin_lock_irqsave(&imp->imp_lock, flags); + LASSERT(imp->imp_level < LUSTRE_CONN_FULL); + spin_unlock_irqrestore(&imp->imp_lock, flags); + + list_for_each_safe(tmp, pos, &imp->imp_sending_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + ptlrpc_resend_req(req); + } + + RETURN(0); +} + +void ptlrpc_wake_delayed(struct obd_import *imp) { - if (req->rq_transno && req->rq_transno < committed) { - if (req->rq_flags & PTL_RPC_FL_REPLIED) { - /* Saw the reply and it was committed, no biggie. 
*/ - DEBUG_REQ(D_HA, req, "NO_RESEND"); - return NO_RESEND; + unsigned long flags; + struct list_head *tmp, *pos; + struct ptlrpc_request *req; + + spin_lock_irqsave(&imp->imp_lock, flags); + list_for_each_safe(tmp, pos, &imp->imp_delayed_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + + ptlrpc_put_connection(req->rq_connection); + req->rq_connection = + ptlrpc_connection_addref(req->rq_import->imp_connection); + + if (req->rq_set) { + DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set); + wake_up(&req->rq_set->set_waitq); + } else { + DEBUG_REQ(D_HA, req, "waking:"); + wake_up(&req->rq_wait_for_rep); } - /* Request committed, but no reply: have to restart. */ - return RESTART; } + spin_unlock_irqrestore(&imp->imp_lock, flags); +} - if (req->rq_flags & PTL_RPC_FL_REPLIED) { - /* Saw reply, so resend and ignore new reply. */ - return RESEND_IGNORE; - } +inline void ptlrpc_invalidate_import_state(struct obd_import *imp) +{ + struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; + if (ptlrpc_ldlm_namespace_cleanup == NULL) + CERROR("ptlrpc/ldlm hook is NULL! Please tell phil\n"); + else + ptlrpc_ldlm_namespace_cleanup(ns, 1 /* no network ops */); + ptlrpc_abort_inflight(imp); +} + +int ptlrpc_request_handle_eviction(struct ptlrpc_request *failed_req) +{ + int rc = 0, in_recovery = 0; + struct obd_import *imp= failed_req->rq_import; + unsigned long flags; + struct ptlrpc_request *req; + + spin_lock_irqsave(&imp->imp_lock, flags); + + if (imp->imp_level == LUSTRE_CONN_NOTCONN) + in_recovery = 1; + + if (failed_req->rq_import_generation == imp->imp_generation) + imp->imp_level = LUSTRE_CONN_NOTCONN; + else + in_recovery = 1; - /* Didn't see reply either, so resend. 
*/ - return RESEND; + spin_unlock_irqrestore(&imp->imp_lock, flags); + + if (in_recovery) { + ptlrpc_resend_req(failed_req); + RETURN(rc); + } + CDEBUG(D_HA, "import %s of %s@%s evicted: reconnecting\n", + imp->imp_obd->obd_name, + imp->imp_target_uuid.uuid, + imp->imp_connection->c_remote_uuid.uuid); + rc = ptlrpc_reconnect_import(imp, &req); + if (rc) { + ptlrpc_resend_req(failed_req); + ptlrpc_fail_import(imp, imp->imp_generation); + } else { + spin_lock_irqsave (&failed_req->rq_lock, flags); + failed_req->rq_err = 1; + spin_unlock_irqrestore (&failed_req->rq_lock, flags); + spin_lock_irqsave(&imp->imp_lock, flags); + imp->imp_level = LUSTRE_CONN_FULL; + imp->imp_invalid = 0; + spin_unlock_irqrestore(&imp->imp_lock, flags); + ptlrpc_invalidate_import_state(imp/*, req->rq_import_generation*/); + } + ptlrpc_req_finished(req); + RETURN(rc); } -int ptlrpc_resend(struct obd_import *imp) +int ptlrpc_set_import_active(struct obd_import *imp, int active) { - int rc = 0; - struct list_head *tmp, *pos; - struct ptlrpc_request *req; + struct obd_device *notify_obd; unsigned long flags; - __u64 committed = imp->imp_peer_committed_transno; + int rc; + + LASSERT(imp->imp_obd); + + notify_obd = imp->imp_obd->u.cli.cl_containing_lov; + + /* When deactivating, mark import invalid, and + abort in-flight requests. */ + if (!active) { + spin_lock_irqsave(&imp->imp_lock, flags); + imp->imp_invalid = 1; + spin_unlock_irqrestore(&imp->imp_lock, flags); + + ptlrpc_abort_inflight(imp); + } + + imp->imp_invalid = !active; + + if (notify_obd == NULL) + GOTO(out, rc = 0); + + /* How gross is _this_? 
*/ + if (!list_empty(&notify_obd->obd_exports)) { + struct lustre_handle fakeconn; + struct obd_ioctl_data ioc_data = { 0 }; + struct obd_export *exp = + list_entry(notify_obd->obd_exports.next, + struct obd_export, exp_obd_chain); + + fakeconn.cookie = exp->exp_handle.h_cookie; + ioc_data.ioc_inlbuf1 = (char *)&imp->imp_target_uuid; + ioc_data.ioc_offset = active; + rc = obd_iocontrol(IOC_LOV_SET_OSC_ACTIVE, &fakeconn, + sizeof ioc_data, &ioc_data, NULL); + if (rc) + CERROR("error %sabling %s on LOV %p/%s: %d\n", + active ? "en" : "dis", + imp->imp_target_uuid.uuid, notify_obd, + notify_obd->obd_uuid.uuid, rc); + } else { + CDEBUG(D_HA, "No exports for obd %p/%s, can't notify about " + "%p\n", notify_obd, notify_obd->obd_uuid.uuid, + imp->imp_obd->obd_uuid.uuid); + rc = -ENOENT; + } +out: + /* When activating, mark import valid */ + if (active) { + spin_lock_irqsave(&imp->imp_lock, flags); + imp->imp_invalid = 0; + spin_unlock_irqrestore(&imp->imp_lock, flags); + } + + RETURN(rc); +} + +void ptlrpc_fail_import(struct obd_import *imp, int generation) +{ + unsigned long flags; + int in_recovery = 0; ENTRY; + LASSERT (!imp->imp_dlm_fake); + + /* If we were already in recovery, or if the import's connection to its + * service is newer than the failing operation's original attempt, then + * we don't want to recover again. 
*/ spin_lock_irqsave(&imp->imp_lock, flags); - list_for_each_safe(tmp, pos, &imp->imp_sending_list) { - req = list_entry(tmp, struct ptlrpc_request, rq_list); - switch(resend_type(req, committed)) { - case NO_RESEND: - break; - - case RESTART: - ptlrpc_restart_req(req); - break; - - case RESEND_IGNORE: - rc = ptlrpc_replay_req(req); - if (rc) { - DEBUG_REQ(D_ERROR, req, "error %d resending:", - rc); - ptlrpc_restart_req(req); /* might as well */ - } - break; - - case RESEND: - ptlrpc_resend_req(req); - break; - - default: - LBUG(); - } + if (imp->imp_level == LUSTRE_CONN_RECOVD) + in_recovery = 1; + + if (generation == imp->imp_generation) { + imp->imp_level = LUSTRE_CONN_RECOVD; + imp->imp_generation++; + } else { + in_recovery = 1; } spin_unlock_irqrestore(&imp->imp_lock, flags); + + if (in_recovery) { + EXIT; + return; + } + + if (!imp->imp_replayable) { + CDEBUG(D_HA, + "import %s@%s for %s not replayable, deactivating\n", + imp->imp_target_uuid.uuid, + imp->imp_connection->c_remote_uuid.uuid, + imp->imp_obd->obd_name); + ptlrpc_set_import_active(imp, 0); + } + + ptlrpc_run_failed_import_upcall(imp); + EXIT; +} + +static int signal_completed_replay(struct obd_import *imp) +{ + struct ptlrpc_request *req; + int rc; + ENTRY; + + req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL, NULL); + if (!req) + RETURN(-ENOMEM); + + req->rq_replen = lustre_msg_size(0, NULL); + req->rq_level = LUSTRE_CONN_RECOVD; + req->rq_reqmsg->flags |= MSG_LAST_REPLAY; + + rc = ptlrpc_queue_wait(req); + + ptlrpc_req_finished(req); RETURN(rc); } -void ptlrpc_wake_delayed(struct obd_import *imp) +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid) { + int msg_flags = 0, rc; unsigned long flags; - struct list_head *tmp, *pos; struct ptlrpc_request *req; + ENTRY; spin_lock_irqsave(&imp->imp_lock, flags); - list_for_each_safe(tmp, pos, &imp->imp_delayed_list) { - req = list_entry(tmp, struct ptlrpc_request, rq_list); - DEBUG_REQ(D_HA, req, "waking:"); - wake_up(&req->rq_wait_for_rep); 
+ if (imp->imp_level == LUSTRE_CONN_FULL) { + imp->imp_level = LUSTRE_CONN_RECOVD; + imp->imp_generation++; + } + spin_unlock_irqrestore(&imp->imp_lock, flags); + + if (new_uuid) { + struct ptlrpc_connection *conn; + struct obd_uuid uuid; + struct ptlrpc_peer peer; + struct obd_export *dlmexp; + + obd_str2uuid(&uuid, new_uuid); + if (ptlrpc_uuid_to_peer(&uuid, &peer)) { + CERROR("no connection found for UUID %s\n", new_uuid); + RETURN(-EINVAL); + } + + conn = ptlrpc_get_connection(&peer, &uuid); + if (!conn) + RETURN(-ENOMEM); + + CDEBUG(D_HA, "switching import %s/%s from %s to %s\n", + imp->imp_target_uuid.uuid, imp->imp_obd->obd_name, + imp->imp_connection->c_remote_uuid.uuid, + conn->c_remote_uuid.uuid); + + /* Switch the import's connection and the DLM export's + * connection (which are almost certainly the same, but we + * keep distinct refs just to make things clearer. I think. */ + if (imp->imp_connection) + ptlrpc_put_connection(imp->imp_connection); + /* We hand off the ref from ptlrpc_get_connection. */ + imp->imp_connection = conn; + + dlmexp = class_conn2export(&imp->imp_dlm_handle); + if (dlmexp->exp_connection) + ptlrpc_put_connection(dlmexp->exp_connection); + dlmexp->exp_connection = ptlrpc_connection_addref(conn); + class_export_put(dlmexp); + + } + + rc = ptlrpc_reconnect_import(imp, &req); + + if (rc) { + CERROR("failed to reconnect to %s@%s: %d\n", + imp->imp_target_uuid.uuid, + imp->imp_connection->c_remote_uuid.uuid, rc); + RETURN(rc); + } + + if (req->rq_repmsg) + msg_flags = lustre_msg_get_op_flags(req->rq_repmsg); + + if (msg_flags & MSG_CONNECT_RECOVERING) { + CDEBUG(D_HA, "replay requested by %s\n", + imp->imp_target_uuid.uuid); + rc = ptlrpc_replay(imp); + if (rc) + GOTO(out, rc); + + if (ptlrpc_ldlm_replay_locks == NULL) + CERROR("ptlrpc/ldlm hook is NULL! 
Please tell phil\n"); + else + rc = ptlrpc_ldlm_replay_locks(imp); + if (rc) + GOTO(out, rc); + + rc = signal_completed_replay(imp); + if (rc) + GOTO(out, rc); + } else if (msg_flags & MSG_CONNECT_RECONNECT) { + CDEBUG(D_HA, "reconnected to %s@%s\n", + imp->imp_target_uuid.uuid, + imp->imp_connection->c_remote_uuid.uuid); + } else { + CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", + imp->imp_target_uuid.uuid, + imp->imp_connection->c_remote_uuid.uuid); + ptlrpc_invalidate_import_state(imp); } + + rc = ptlrpc_resend(imp); + + spin_lock_irqsave(&imp->imp_lock, flags); + imp->imp_level = LUSTRE_CONN_FULL; + imp->imp_invalid = 0; spin_unlock_irqrestore(&imp->imp_lock, flags); + + ptlrpc_wake_delayed(imp); + EXIT; + out: + ptlrpc_req_finished(req); + return rc; +} + +void ptlrpc_fail_export(struct obd_export *exp) +{ + int rc, already_failed; + struct lustre_handle hdl; + unsigned long flags; + + spin_lock_irqsave(&exp->exp_lock, flags); + already_failed = exp->exp_failed; + exp->exp_failed = 1; + spin_unlock_irqrestore(&exp->exp_lock, flags); + + if (already_failed) { + CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n", + exp, exp->exp_client_uuid.uuid); + return; + } + + CDEBUG(D_HA, "disconnecting export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + hdl.cookie = exp->exp_handle.h_cookie; + rc = obd_disconnect(&hdl, 0); + if (rc) + CERROR("disconnecting export %p failed: %d\n", exp, rc); } diff --git a/lustre/ptlrpc/rpc.c b/lustre/ptlrpc/rpc.c deleted file mode 100644 index c0d5ba5..0000000 --- a/lustre/ptlrpc/rpc.c +++ /dev/null @@ -1,312 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002, 2003 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. 
- * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#define EXPORT_SYMTAB -#define DEBUG_SUBSYSTEM S_RPC - -#ifdef __KERNEL__ -# include <linux/module.h> -# include <linux/init.h> -#else -# include <liblustre.h> -#endif -#include <linux/obd.h> -#include <linux/obd_support.h> -#include <linux/obd_class.h> -#include <linux/lustre_lib.h> -#include <linux/lustre_ha.h> -#include <linux/lustre_net.h> -#include <linux/lprocfs_status.h> - -extern int ptlrpc_init_portals(void); -extern void ptlrpc_exit_portals(void); - -static __u32 ptlrpc_last_xid = 0; -static spinlock_t ptlrpc_last_xid_lock = SPIN_LOCK_UNLOCKED; - -__u32 ptlrpc_next_xid(void) -{ - __u32 tmp; - spin_lock(&ptlrpc_last_xid_lock); - tmp = ++ptlrpc_last_xid; - spin_unlock(&ptlrpc_last_xid_lock); - return tmp; -} - -int connmgr_setup(struct obd_device *obddev, obd_count len, void *buf) -{ - struct recovd_obd *recovd = &obddev->u.recovd; - int err; - ENTRY; - - memset(recovd, 0, sizeof(*recovd)); - - err = recovd_setup(recovd); - RETURN(err); -} - -int connmgr_cleanup(struct obd_device *dev) -{ - struct recovd_obd *recovd = &dev->u.recovd; - int err; - - err = recovd_cleanup(recovd); - RETURN(err); -} - -int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, - void *karg, void *uarg) -{ - struct ptlrpc_connection *conn = NULL; - struct obd_device *obd = class_conn2obd(hdl); - struct recovd_obd *recovd = 
&obd->u.recovd; - struct obd_ioctl_data *data = karg; - struct list_head *tmp; - int rc = 0; - - ENTRY; - - if (cmd != OBD_IOC_RECOVD_NEWCONN && cmd != OBD_IOC_RECOVD_FAILCONN) - RETURN(-EINVAL); /* XXX ENOSYS? */ - - /* Find the connection that's been rebuilt or has failed. */ - spin_lock(&recovd->recovd_lock); - list_for_each(tmp, &recovd->recovd_troubled_items) { - conn = list_entry(tmp, struct ptlrpc_connection, - c_recovd_data.rd_managed_chain); - - LASSERT(conn->c_recovd_data.rd_recovd == recovd); /* sanity */ -#warning check buffer overflow in next line - if (!strcmp(conn->c_remote_uuid.uuid, data->ioc_inlbuf1)) - break; - conn = NULL; - } - - if (!conn) { - if (cmd == OBD_IOC_RECOVD_NEWCONN) - GOTO(out, rc = -EINVAL); - /* XXX macroize/inline and share with loop above */ - list_for_each(tmp, &recovd->recovd_managed_items) { - conn = list_entry(tmp, struct ptlrpc_connection, - c_recovd_data.rd_managed_chain); - - LASSERT(conn->c_recovd_data.rd_recovd == recovd); - -#warning check buffer overflow in next line - if (!strcmp(conn->c_remote_uuid.uuid, - data->ioc_inlbuf1)) - break; - conn = NULL; - } - if (!conn) - GOTO(out, rc = -EINVAL); - } - - if (cmd == OBD_IOC_RECOVD_FAILCONN) { - spin_unlock(&recovd->recovd_lock); - recovd_conn_fail(conn); - spin_lock(&recovd->recovd_lock); - goto out; - } - - - /* else (NEWCONN) */ - spin_lock(&conn->c_lock); - - /* whatever happens, reset the INVALID flag */ - conn->c_flags &= ~CONN_INVALID; - - /* XXX is this a good check? should we allow readdressing of - * XXX conns that aren't in recovery? 
- */ - if (conn->c_recovd_data.rd_phase != RD_PREPARING) { - spin_unlock(&conn->c_lock); - GOTO(out, rc = -EALREADY); - } - - if (data->ioc_inllen2) { - CERROR("conn %p UUID change %s -> %s\n", - conn, conn->c_remote_uuid.uuid, data->ioc_inlbuf2); - obd_str2uuid(&conn->c_remote_uuid, data->ioc_inlbuf2); - } else { - CERROR("conn %p UUID %s reconnected\n", conn, - conn->c_remote_uuid.uuid); - } - ptlrpc_readdress_connection(conn, &conn->c_remote_uuid); - spin_unlock(&conn->c_lock); - - conn->c_recovd_data.rd_phase = RD_PREPARED; - wake_up(&recovd->recovd_waitq); - out: - spin_unlock(&recovd->recovd_lock); - RETURN(rc); -} - -static int connmgr_connect(struct lustre_handle *conn, struct obd_device *src, - struct obd_uuid *cluuid, struct recovd_obd *recovd, - ptlrpc_recovery_cb_t recover) -{ - return class_connect(conn, src, cluuid); -} - -int connmgr_attach(struct obd_device *dev, obd_count len, void *data) -{ - struct lprocfs_static_vars lvars; - int rc = 0; - - lprocfs_init_vars(&lvars); - rc = lprocfs_obd_attach(dev, lvars.obd_vars); - return rc; -} - -int conmgr_detach(struct obd_device *dev) -{ - return lprocfs_obd_detach(dev); -} - -/* use obd ops to offer management infrastructure */ -static struct obd_ops recovd_obd_ops = { - o_owner: THIS_MODULE, - o_attach: connmgr_attach, - o_detach: conmgr_detach, - o_setup: connmgr_setup, - o_cleanup: connmgr_cleanup, - o_iocontrol: connmgr_iocontrol, - o_connect: connmgr_connect, - o_disconnect: class_disconnect -}; - - - -__init int ptlrpc_init(void) -{ - struct lprocfs_static_vars lvars; - int rc; - ENTRY; - - rc = ptlrpc_init_portals(); - if (rc) - RETURN(rc); - ptlrpc_init_connection(); - - lprocfs_init_vars(&lvars); - rc = class_register_type(&recovd_obd_ops, lvars.module_vars, - LUSTRE_HA_NAME); - if (rc) - RETURN(rc); - ptlrpc_put_connection_superhack = ptlrpc_put_connection; - ptlrpc_abort_inflight_superhack = ptlrpc_abort_inflight; - RETURN(0); -} - -static void __exit ptlrpc_exit(void) -{ - 
class_unregister_type(LUSTRE_HA_NAME); - ptlrpc_exit_portals(); - ptlrpc_cleanup_connection(); -} - -/* rpc.c */ -EXPORT_SYMBOL(ptlrpc_next_xid); - -/* recovd.c */ -EXPORT_SYMBOL(ptlrpc_recovd); -EXPORT_SYMBOL(recovd_conn_fail); -EXPORT_SYMBOL(recovd_conn_manage); -EXPORT_SYMBOL(recovd_conn_fixed); -EXPORT_SYMBOL(recovd_setup); -EXPORT_SYMBOL(recovd_cleanup); - -/* connection.c */ -EXPORT_SYMBOL(ptlrpc_readdress_connection); -EXPORT_SYMBOL(ptlrpc_get_connection); -EXPORT_SYMBOL(ptlrpc_put_connection); -EXPORT_SYMBOL(ptlrpc_connection_addref); -EXPORT_SYMBOL(ptlrpc_init_connection); -EXPORT_SYMBOL(ptlrpc_cleanup_connection); - -/* niobuf.c */ -EXPORT_SYMBOL(ptlrpc_bulk_put); -EXPORT_SYMBOL(ptlrpc_bulk_get); -EXPORT_SYMBOL(ptlrpc_register_bulk_put); -EXPORT_SYMBOL(ptlrpc_register_bulk_get); -EXPORT_SYMBOL(ptlrpc_abort_bulk); -EXPORT_SYMBOL(ptlrpc_reply); -EXPORT_SYMBOL(ptlrpc_error); -EXPORT_SYMBOL(ptlrpc_resend_req); -EXPORT_SYMBOL(ptl_send_rpc); -EXPORT_SYMBOL(ptlrpc_link_svc_me); -EXPORT_SYMBOL(obd_brw_set_new); -EXPORT_SYMBOL(obd_brw_set_add); -EXPORT_SYMBOL(obd_brw_set_del); -EXPORT_SYMBOL(obd_brw_set_decref); -EXPORT_SYMBOL(obd_brw_set_addref); - -/* client.c */ -EXPORT_SYMBOL(ptlrpc_init_client); -EXPORT_SYMBOL(ptlrpc_cleanup_client); -EXPORT_SYMBOL(ptlrpc_req_to_uuid); -EXPORT_SYMBOL(ptlrpc_uuid_to_connection); -EXPORT_SYMBOL(ptlrpc_queue_wait); -EXPORT_SYMBOL(ptlrpc_continue_req); -EXPORT_SYMBOL(ptlrpc_replay_req); -EXPORT_SYMBOL(ptlrpc_restart_req); -EXPORT_SYMBOL(ptlrpc_prep_req); -EXPORT_SYMBOL(ptlrpc_free_req); -EXPORT_SYMBOL(ptlrpc_abort); -EXPORT_SYMBOL(ptlrpc_req_finished); -EXPORT_SYMBOL(ptlrpc_request_addref); -EXPORT_SYMBOL(ptlrpc_prep_bulk); -EXPORT_SYMBOL(ptlrpc_free_bulk); -EXPORT_SYMBOL(ptlrpc_prep_bulk_page); -EXPORT_SYMBOL(ptlrpc_free_bulk_page); -EXPORT_SYMBOL(ll_brw_sync_wait); -EXPORT_SYMBOL(ptlrpc_abort_inflight); -EXPORT_SYMBOL(ptlrpc_retain_replayable_request); - -/* service.c */ -EXPORT_SYMBOL(ptlrpc_init_svc); 
-EXPORT_SYMBOL(ptlrpc_stop_all_threads); -EXPORT_SYMBOL(ptlrpc_start_thread); -EXPORT_SYMBOL(ptlrpc_unregister_service); - -/* pack_generic.c */ -EXPORT_SYMBOL(lustre_pack_msg); -EXPORT_SYMBOL(lustre_msg_size); -EXPORT_SYMBOL(lustre_unpack_msg); -EXPORT_SYMBOL(lustre_msg_buf); - -/* recover.c */ -EXPORT_SYMBOL(ptlrpc_run_recovery_upcall); -EXPORT_SYMBOL(ptlrpc_reconnect_import); -EXPORT_SYMBOL(ptlrpc_replay); -EXPORT_SYMBOL(ptlrpc_resend); -EXPORT_SYMBOL(ptlrpc_wake_delayed); - -#ifdef __KERNEL__ -MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>"); -MODULE_DESCRIPTION("Lustre Request Processor"); -MODULE_LICENSE("GPL"); - -module_init(ptlrpc_init); -module_exit(ptlrpc_exit); -#endif diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 3338445..f9475b0 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -28,6 +28,8 @@ #include <linux/obd_support.h> #include <linux/obd_class.h> #include <linux/lustre_net.h> +#include <portals/types.h> +#include "ptlrpc_internal.h" extern int request_in_callback(ptl_event_t *ev); @@ -52,11 +54,10 @@ static int ptlrpc_check_event(struct ptlrpc_service *svc, idx = (svc->srv_interface_rover + i) % ptlrpc_ninterfaces; srv_ni = &svc->srv_interfaces[idx]; - LASSERT (ptl_is_valid_handle (&srv_ni->sni_eq_h)); + LASSERT (!PtlHandleEqual (srv_ni->sni_eq_h, PTL_HANDLE_NONE)); rc = PtlEQGet(srv_ni->sni_eq_h, event); - switch (rc) - { + switch (rc) { case PTL_OK: /* next time start with the next interface */ svc->srv_interface_rover = (idx+1) % ptlrpc_ninterfaces; @@ -72,6 +73,7 @@ static int ptlrpc_check_event(struct ptlrpc_service *svc, } } rc = 0; + EXIT; out: spin_unlock(&svc->srv_lock); return rc; @@ -81,12 +83,10 @@ struct ptlrpc_service * ptlrpc_init_svc(__u32 nevents, __u32 nbufs, __u32 bufsize, __u32 max_req_size, int req_portal, int rep_portal, - svc_handler_t handler, char *name) + svc_handler_t handler, char *name, + struct obd_device *obddev) { - int ssize; - int rc; - int i; - int j; + 
int i, j, ssize, rc; struct ptlrpc_service *service; struct ptlrpc_srv_ni *srv_ni; ENTRY; @@ -118,7 +118,7 @@ ptlrpc_init_svc(__u32 nevents, __u32 nbufs, srv_ni->sni_service = service; srv_ni->sni_ni = &ptlrpc_interfaces[i]; - ptl_set_inv_handle (&srv_ni->sni_eq_h); + srv_ni->sni_eq_h = PTL_HANDLE_NONE; INIT_LIST_HEAD(&srv_ni->sni_rqbds); srv_ni->sni_nrqbds = 0; atomic_set(&srv_ni->sni_nrqbds_receiving, 0); @@ -152,7 +152,7 @@ ptlrpc_init_svc(__u32 nevents, __u32 nbufs, } rqbd->rqbd_srv_ni = srv_ni; - ptl_set_inv_handle(&rqbd->rqbd_me_h); + rqbd->rqbd_me_h = PTL_HANDLE_NONE; atomic_set(&rqbd->rqbd_refcount, 0); OBD_ALLOC(rqbd->rqbd_buffer, service->srv_buf_size); @@ -171,6 +171,8 @@ ptlrpc_init_svc(__u32 nevents, __u32 nbufs, } } + ptlrpc_lprocfs_register_service(obddev, service); + CDEBUG(D_NET, "%s: Started on %d interfaces, listening on portal %d\n", service->srv_name, ptlrpc_ninterfaces, service->srv_req_portal); @@ -192,12 +194,13 @@ static int handle_incoming_request(struct obd_device *obddev, * on the stack of mds_handle instead. 
*/ LASSERT (atomic_read (&rqbd->rqbd_refcount) > 0); - LASSERT ((event->mem_desc.options & PTL_MD_IOV) == 0); + LASSERT ((event->mem_desc.options & (PTL_MD_IOV | PTL_MD_KIOV)) == 0); LASSERT (rqbd->rqbd_srv_ni->sni_service == svc); LASSERT (rqbd->rqbd_buffer == event->mem_desc.start); LASSERT (event->offset + event->mlength <= svc->srv_buf_size); memset(request, 0, sizeof(*request)); + spin_lock_init (&request->rq_lock); INIT_LIST_HEAD(&request->rq_list); request->rq_svc = svc; request->rq_obd = obddev; @@ -205,55 +208,37 @@ static int handle_incoming_request(struct obd_device *obddev, request->rq_reqmsg = event->mem_desc.start + event->offset; request->rq_reqlen = event->mlength; - rc = -EINVAL; - - if (request->rq_reqlen < sizeof(struct lustre_msg)) { - CERROR("incomplete request (%d): ptl %d from "LPX64" xid " - LPU64"\n", - request->rq_reqlen, svc->srv_req_portal, +#if SWAB_PARANOIA + /* Clear request swab mask; this is a new request */ + request->rq_req_swab_mask = 0; +#endif + rc = lustre_unpack_msg (request->rq_reqmsg, request->rq_reqlen); + if (rc != 0) { + CERROR ("error unpacking request: ptl %d from "LPX64 + " xid "LPU64"\n", svc->srv_req_portal, event->initiator.nid, request->rq_xid); goto out; } - - CDEBUG(D_RPCTRACE, "Handling RPC ni:pid:xid:nid:opc %d:%d:"LPU64":" - LPX64":%d\n", (int)(rqbd->rqbd_srv_ni - svc->srv_interfaces), - NTOH__u32(request->rq_reqmsg->status), request->rq_xid, - event->initiator.nid, NTOH__u32(request->rq_reqmsg->opc)); - - if (NTOH__u32(request->rq_reqmsg->type) != PTL_RPC_MSG_REQUEST) { + rc = -EINVAL; + if (request->rq_reqmsg->type != PTL_RPC_MSG_REQUEST) { CERROR("wrong packet type received (type=%u)\n", request->rq_reqmsg->type); goto out; } - if (request->rq_reqmsg->magic != PTLRPC_MSG_MAGIC) { - CERROR("wrong lustre_msg magic %d: ptl %d from "LPX64" xid " - LPD64"\n", - request->rq_reqmsg->magic, svc->srv_req_portal, - event->initiator.nid, request->rq_xid); - goto out; - } - - if (request->rq_reqmsg->version != 
PTLRPC_MSG_VERSION) { - CERROR("wrong lustre_msg version %d: ptl %d from "LPX64" xid " - LPD64"\n", - request->rq_reqmsg->version, svc->srv_req_portal, - event->initiator.nid, request->rq_xid); - goto out; - } - CDEBUG(D_NET, "got req "LPD64" (md: %p + %d)\n", request->rq_xid, event->mem_desc.start, event->offset); request->rq_peer.peer_nid = event->initiator.nid; request->rq_peer.peer_ni = rqbd->rqbd_srv_ni->sni_ni; - request->rq_export = class_conn2export((struct lustre_handle *) - request->rq_reqmsg); + request->rq_export = class_conn2export(&request->rq_reqmsg->handle); if (request->rq_export) { request->rq_connection = request->rq_export->exp_connection; ptlrpc_connection_addref(request->rq_connection); + request->rq_export->exp_last_request_time = + LTIME_S(CURRENT_TIME); } else { /* create a (hopefully temporary) connection that will be used * to send the reply if this call doesn't create an export. @@ -262,8 +247,28 @@ static int handle_incoming_request(struct obd_device *obddev, ptlrpc_get_connection(&request->rq_peer, NULL); } + CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid:pid:xid:ni:nid:opc %s:%s:%d:" + LPU64":%s:"LPX64":%d\n", + current->comm, + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + request->rq_reqmsg->status, request->rq_xid, + rqbd->rqbd_srv_ni->sni_ni->pni_name, event->initiator.nid, + request->rq_reqmsg->opc); + rc = svc->srv_handler(request); + CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid:pid:xid:ni:nid:opc %s:%s:%d:" + LPU64":%s:"LPX64":%d\n", + current->comm, + (request->rq_export ? + (char *)request->rq_export->exp_client_uuid.uuid : "0"), + request->rq_reqmsg->status, request->rq_xid, + rqbd->rqbd_srv_ni->sni_ni->pni_name, event->initiator.nid, + request->rq_reqmsg->opc); + ptlrpc_put_connection(request->rq_connection); + if (request->rq_export != NULL) + class_export_put(request->rq_export); out: if (atomic_dec_and_test (&rqbd->rqbd_refcount)) /* last reference? 
*/ @@ -272,8 +277,8 @@ static int handle_incoming_request(struct obd_device *obddev, return rc; } -/* Don't use daemonize, it removes fs struct from new thread (bug 418) */ -static void ptlrpc_daemonize(void) +/* Don't use daemonize, it removes fs struct from new thread (bug 418) */ +void ptlrpc_daemonize(void) { exit_mm(current); @@ -295,25 +300,23 @@ static int ptlrpc_main(void *arg) ptl_event_t *event; int rc = 0; unsigned long flags; + cycles_t workdone_time; + cycles_t svc_workcycles; ENTRY; lock_kernel(); ptlrpc_daemonize(); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) + SIGNAL_MASK_LOCK(current, flags); sigfillset(&current->blocked); - recalc_sigpending(); -#else - spin_lock_irqsave(&current->sigmask_lock, flags); - sigfillset(&current->blocked); - recalc_sigpending(current); - spin_unlock_irqrestore(&current->sigmask_lock, flags); -#endif + RECALC_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, flags); -#ifdef __arch_um__ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) sprintf(current->comm, "%s|%d", data->name,current->thread.extern_pid); -#endif +#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) + sprintf(current->comm, "%s|%d", data->name, + current->thread.mode.tt.extern_pid); #else strcpy(current->comm, data->name); #endif @@ -328,6 +331,7 @@ static int ptlrpc_main(void *arg) /* Record that the thread is running */ thread->t_flags = SVC_RUNNING; + svc_workcycles = workdone_time = 0; wake_up(&thread->t_ctl_waitq); /* XXX maintain a list of all managed devices: insert here */ @@ -348,12 +352,43 @@ static int ptlrpc_main(void *arg) } if (thread->t_flags & SVC_EVENT) { + cycles_t workstart_time; spin_lock(&svc->srv_lock); thread->t_flags &= ~SVC_EVENT; + /* Update Service Statistics */ + workstart_time = get_cycles(); + if (workdone_time && (svc->svc_counters != NULL)) { + /* Stats for req(n) are updated just before + * req(n+1) is executed. 
This avoids need to + * reacquire svc->srv_lock after + * call to handling_request(). + */ + int opc_offset; + /* req_waittime */ + LPROCFS_COUNTER_INCR(&svc->svc_counters->cntr[PTLRPC_REQWAIT_CNTR], + (workstart_time - + event->arrival_time)); + /* svc_eqdepth */ + LPROCFS_COUNTER_INCR(&svc->svc_counters->cntr[PTLRPC_SVCEQDEPTH_CNTR], + 0); /* Wait for b_eq branch */ + /* svc_idletime */ + LPROCFS_COUNTER_INCR(&svc->svc_counters->cntr[PTLRPC_SVCIDLETIME_CNTR], + (workstart_time - + workdone_time)); + /* previous request */ + opc_offset = + opcode_offset(request->rq_reqmsg->opc); + if (opc_offset >= 0) { + LASSERT(opc_offset < LUSTRE_MAX_OPCODES); + LPROCFS_COUNTER_INCR(&svc->svc_counters->cntr[PTLRPC_LAST_CNTR+opc_offset], svc_workcycles); + } + } spin_unlock(&svc->srv_lock); rc = handle_incoming_request(obddev, svc, event, request); + workdone_time = get_cycles(); + svc_workcycles = workdone_time - workstart_time; continue; } @@ -363,6 +398,10 @@ static int ptlrpc_main(void *arg) break; } + /* NB should wait for all SENT callbacks to complete before exiting + * here. Unfortunately at this time there is no way to track this + * state. + */ OBD_FREE(request, sizeof(*request)); out_event: OBD_FREE(event, sizeof(*event)); @@ -415,10 +454,8 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, ENTRY; OBD_ALLOC(thread, sizeof(*thread)); - if (thread == NULL) { - LBUG(); + if (thread == NULL) RETURN(-ENOMEM); - } init_waitqueue_head(&thread->t_ctl_waitq); d.dev = dev; @@ -433,9 +470,9 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we * just drop the VM and FILES in ptlrpc_daemonize() right away. 
*/ - rc = kernel_thread(ptlrpc_main, (void *) &d, CLONE_VM | CLONE_FILES); + rc = kernel_thread(ptlrpc_main, &d, CLONE_VM | CLONE_FILES); if (rc < 0) { - CERROR("cannot start thread\n"); + CERROR("cannot start thread: %d\n", rc); OBD_FREE(thread, sizeof(*thread)); RETURN(rc); } @@ -446,8 +483,7 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, int ptlrpc_unregister_service(struct ptlrpc_service *service) { - int i; - int rc; + int i, rc; struct ptlrpc_srv_ni *srv_ni; LASSERT (list_empty (&service->srv_threads)); @@ -490,7 +526,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) LASSERT (srv_ni->sni_nrqbds == 0); - if (ptl_is_valid_handle (&srv_ni->sni_eq_h)) { + if (!PtlHandleEqual (srv_ni->sni_eq_h, PTL_HANDLE_NONE)) { rc = PtlEQFree(srv_ni->sni_eq_h); if (rc) CERROR("%s.%d: PtlEQFree failed on %s: %d\n", @@ -499,6 +535,8 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) } } + ptlrpc_lprocfs_unregister_service(service); + OBD_FREE(service, offsetof (struct ptlrpc_service, srv_interfaces[ptlrpc_ninterfaces])); diff --git a/lustre/scripts/llite-group.sh b/lustre/scripts/llite-group.sh new file mode 100644 index 0000000..ed914e8 --- /dev/null +++ b/lustre/scripts/llite-group.sh @@ -0,0 +1,67 @@ +#!/bin/sh +# +# llite-group.sh : Cluster Manager service script for Lustre +# +# This must be named llite-<group>.sh, where group is the device +# group that is being managed by the cluster manager service. +# + +set -e +set -vx + +[ -f ${LUSTRE_CFG:=/etc/lustre/lustre.cfg} ] && . 
${LUSTRE_CFG} + +LDAPURL=${LDAPURL:-ldap://localhost} +CONFIG=${CONFIG:-test23} + +LACTIVE=${LACTIVE:-/usr/sbin/lactive} +LCONF=${LCONF:-/usr/sbin/lconf} + +group=`basename $0 .sh| cut -d- -f2` +confopt="--ldapurl $LDAPURL --config $CONFIG" + +[ -z "$group" ] && exit 0 + +node=`hostname -s` + +[ -d ${STATUS_DIR:=/var/lustre} ] || mkdir -p $STATUS_DIR + +start() { + echo -n "Starting $SERVICE: " + python2 $LACTIVE $confopt --group $group --active $node + python2 $LCONF -v $confopt + RETVAL=$? + echo done +} + +stop() { + echo -n "Shutting down $SERVICE: " + python2 $LCONF -v --cleanup --force --failover $confopt + RETVAL=$? + echo done +} + +status() { + RETVAL=0 +} + + +case "$1" in + start) + start + ;; + stop) + stop + ;; + restart) + restart + ;; + status) + status $SERVICE + ;; + *) + echo "Usage: $0 {start|stop|status}" + exit 1 +esac + +exit $RETVAL diff --git a/lustre/scripts/lustre.spec.in b/lustre/scripts/lustre.spec.in index 3657c7a..40e627d 100644 --- a/lustre/scripts/lustre.spec.in +++ b/lustre/scripts/lustre.spec.in @@ -1,10 +1,8 @@ # lustre.spec -%define version HEAD +%define version b_devel %define kversion @RELEASE@ %define linuxdir @LINUX@ -%define portalsdir @PORTALS@ -%define portalslibdir @PORTALSLIB@ -Release: 0302240920chaos +Release: 0305281701chaos Summary: Lustre Lite File System Name: lustre-lite @@ -21,7 +19,7 @@ servers and utilities. %package -n lustre-modules Summary: Kernel Lustre drivers for Linux %{kversion} -Requires: portals-modules +Requires: modutils >= 2.4.10 Group: Development/Kernel %description -n lustre-modules @@ -59,7 +57,6 @@ Group: Development/Kernel %description -n liblustre Lustre lib binary package. - %prep %setup -qn lustre-%{version} %setup -c -n lustre-%{version}-lib @@ -69,12 +66,12 @@ rm -rf $RPM_BUILD_ROOT # Set an explicit path to our Linux tree, if we can. 
cd $RPM_BUILD_DIR/lustre-%{version} -./configure --with-linux='%{linuxdir}' --with-portals='%{portalsdir}' --with-portalslib='%{portalslibdir}' +./configure --with-linux='%{linuxdir}' make %ifarch i386 cd $RPM_BUILD_DIR/lustre-%{version}-lib/lustre-%{version} -./configure --with-lib --with-portals='%{portalsdir}' --with-portalslib='%{portalslibdir}' +./configure --with-lib make %endif @@ -87,6 +84,17 @@ cd $RPM_BUILD_DIR/lustre-%{version}-lib/lustre-%{version} make install prefix=$RPM_BUILD_ROOT %endif +%ifarch alpha +# this hurts me + conf_flag= + linuxdir=%{linuxdir} + test -d $linuxdir && conf_flag=--with-linux=$linuxdir + make clean + ./configure --enable-rtscts-myrinet $conf_flag + make + cp linux/rtscts/rtscts.o $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/net/lustre/rtscts_myrinet.o + cp user/myrinet_utils/mcpload $RPM_BUILD_ROOT/usr/sbin/mcpload +%endif # Create the pristine source directory. cd $RPM_BUILD_DIR/lustre-%{version} @@ -107,6 +115,7 @@ mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre %attr(-, root, root) /usr/sbin/lstripe %attr(-, root, root) /usr/sbin/mcreate %attr(-, root, root) /usr/sbin/mkdirmany +%attr(-, root, root) /usr/lib/lustre/python/* %attr(-, root, root) /usr/lib/lustre/examples/llmount.sh %attr(-, root, root) /usr/lib/lustre/examples/llmountcleanup.sh %attr(-, root, root) /usr/lib/lustre/examples/llecho.sh @@ -114,9 +123,19 @@ mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre %attr(-, root, root) /usr/lib/lustre/examples/uml.sh %attr(-, root, root) /usr/lib/lustre/examples/lov.sh %attr(-, root, root) /etc/init.d/lustre +%attr(-, root, root) /usr/sbin/acceptor +%attr(-, root, root) /usr/sbin/ptlctl +%attr(-, root, root) /usr/sbin/debugctl +%attr(-, root, root) /lib/libportals.a +%attr(-, root, root) /lib/libptlctl.a +%attr(-, root, root) /lib/libtcpnal.a +%attr(-, root, root) /usr/include/lustre/*.h +%ifarch alpha +%attr(-, root, root) /usr/sbin/mcpload +%endif %files -n lustre-doc -%attr(-, root, root) %doc COPYING FDL +#%attr(-, root, 
root) %doc COPYING FDL %attr(-, root, root) %doc doc/lustre.pdf doc/lustre-HOWTO.txt %attr(-, root, root) %doc tests/client-echo.cfg tests/client-mount.cfg %attr(-, root, root) %doc tests/client-mount2.cfg @@ -128,12 +147,11 @@ mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre %files -n lustre-modules %attr(-, root, root) %doc COPYING -%attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/extN.o %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/ldlm.o %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/llite.o %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/mdc.o %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/mds.o -%attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/fsfilt_extN.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/fsfilt_ext3.o %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/obdclass.o %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/obdecho.o %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/obdfilter.o @@ -141,6 +159,14 @@ mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/osc.o %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/ost.o %attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/ptlrpc.o +#portals modules +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/lustre/kptlrouter.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/lustre/*nal.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/lustre/portals.o +%ifarch alpha +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/lustre/p3mod.o +%attr(-, root, root) /lib/modules/%{kversion}/kernel/net/lustre/rtscts.o +%endif %files -n lustre-source %attr(-, root, root) /usr/src/lustre-%{version} @@ -179,6 +205,9 @@ mkdir -p $RPM_BUILD_ROOT/var/lib/ldap/lustre if [ ! -e /dev/obd ]; then mknod /dev/obd c 10 241 fi +if [ ! 
-e /dev/portals ]; then + mknod /dev/portals c 10 240 +fi depmod -ae || exit 0 grep -q obdclass /etc/modules.conf || \ @@ -190,6 +219,12 @@ grep -q '/dev/obd' /etc/modules.conf || \ grep -q '/dev/lustre' /etc/modules.conf || \ echo 'alias /dev/lustre obdclass' >> /etc/modules.conf +grep -q portals /etc/modules.conf || \ + echo 'alias char-major-10-240 portals' >> /etc/modules.conf + +grep -q '/dev/portals' /etc/modules.conf || \ + echo 'alias /dev/portals portals' >> /etc/modules.conf + %postun depmod -ae || exit 0 @@ -206,7 +241,6 @@ if grep -q slapd-lustre $slapd; then cp $tmp $slapd rm $tmp fi - %clean #rm -rf $RPM_BUILD_ROOT diff --git a/lustre/scripts/version_tag.pl b/lustre/scripts/version_tag.pl index 3575b87..f33443f 100644 --- a/lustre/scripts/version_tag.pl +++ b/lustre/scripts/version_tag.pl @@ -59,6 +59,9 @@ sub get_latest_mtime() $cur_dir =~ s/\/CVS\/Entries$//; my @statbuf = stat("$cur_dir/$file"); my $mtime = $statbuf[9]; + if (!defined($mtime)) { + next; + } my $local_date = gmtime($mtime); if ($local_date ne $date && $file ne "lustre.spec.in") { @@ -100,7 +103,7 @@ sub get_linuxdir() } while (defined($line = <$config>)) { chomp($line); - if ($line =~ /LINUX = (.*)/) { + if ($line =~ /LINUX :?= (.*)/) { $dir = $1; last; } diff --git a/lustre/tests/.cvsignore b/lustre/tests/.cvsignore index 7a18486..5bb1e26 100644 --- a/lustre/tests/.cvsignore +++ b/lustre/tests/.cvsignore @@ -35,3 +35,9 @@ wantedi createtest open_delay statone +opendevunlink +opendirunlink +runas +openfile +unlinkmany +fchdir_test diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index 6d23b3d..470c9de 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -1,32 +1,22 @@ # Lustre test Makefile DEFS= -CPPFLAGS = -I. -I$(PORTALS)/include -I$(top_srcdir)/include -D_LARGEFILE64_SOURCE +CPPFLAGS = -I. 
-I$(top_srcdir)/portals/include/ -I$(top_srcdir)/include -D_LARGEFILE64_SOURCE CFLAGS := -g -Wall # LDADD = -lldap # LDADD := -lreadline -ltermcap # -lefence EXTRA_DIST = $(pkgexample_SCRIPTS) $(noinst_SCRIPTS) $(noinst_DATA) \ - common.sh lustre.cfg \ - client-echo.cfg elan-server.cfg net-client.cfg obdecho.cfg \ - client-mount.cfg ldlm.cfg net-local.cfg obdfilter.cfg \ - client-mount2.cfg lustre.cfg net-server.cfg sanity.sh \ - rundbench mcreate \ - elan-client.cfg mds.cfg trivial.sh -pkgexampledir = '${exec_prefix}/usr/lib/$(PACKAGE)/examples' + sanity.sh rundbench mcreate pkgexample_SCRIPTS = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh local.sh echo.sh uml.sh lov.sh -noinst_SCRIPTS = llsetup.sh llrsetup.sh llcleanup.sh -noinst_DATA = lustre.cfg -noinst_SCRIPTS += fs.sh intent-test.sh intent-test2.sh leak_finder.pl \ - lldlm.sh llecho.sh llext3.sh llmodules.sh llmount-client.sh \ - llmount-server.sh llmount.sh llmountcleanup.sh llrext3.sh \ - llrmount.sh llsimple.sh mdcreq.sh mdcreqcleanup.sh \ - ostreq.sh runfailure-client-mds-recover.sh runfailure-mds \ - runfailure-net runfailure-ost runiozone runregression-net.sh \ - runtests runvmstat snaprun.sh tbox.sh common.sh +noinst_DATA = +noinst_SCRIPTS = leak_finder.pl llecho.sh llmount.sh llmountcleanup.sh tbox.sh \ + llrmount.sh runfailure-mds runvmstat runfailure-net runfailure-ost \ + runiozone runregression-net.sh runtests sanity.sh rundbench noinst_PROGRAMS = openunlink testreq truncate directio openme writeme open_delay noinst_PROGRAMS += munlink tchmod toexcl fsx test_brw openclose createdestroy noinst_PROGRAMS += stat createmany statmany multifstat createtest mlink +noinst_PROGRAMS += opendirunlink opendevunlink unlinkmany fchdir_test # noinst_PROGRAMS += ldaptest -noinst_PROGRAMS += checkstat wantedi statone runas +noinst_PROGRAMS += checkstat wantedi statone runas openfile sbin_PROGRAMS = mcreate mkdirmany # ldaptest_SOURCES = ldaptest.c @@ -48,13 +38,21 @@ createdestroy_SOURCES = 
createdestroy.c stat_SOURCES = stat.c createmany_SOURCES = createmany.c statmany_SOURCES = statmany.c +unlinkmany_SOURCES = unlinkmany.c statone_SOURCES = statone.c mkdirmany_SOURCES = mkdirmany.c multifstat_SOURCES = multifstat.c checkstat_SOURCES = checkstat.c runas_SOURCES = runas.c +openfile_SOURCES = openfile.c wantedi_SOURCES = wantedi.c createtest_SOURCES = createtest.c open_delay_SOURCES = open_delay.c +opendirunlink_SOURCES=opendirunlink.c +opendevunlink_SOURCES=opendevunlink.c +fchdir_test_SOURCES=fchdir_test.c +#mkdirdeep_SOURCES= mkdirdeep.c +#mkdirdeep_LDADD=-L../portals/util -lptlctl +#mkdirdeep_CPPFLAGS=-I$(top_srcdir)/portals/include include $(top_srcdir)/Rules diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh index bee6588..e874f5d 100755 --- a/lustre/tests/acceptance-small.sh +++ b/lustre/tests/acceptance-small.sh @@ -18,11 +18,12 @@ fi [ "$TMP" ] || TMP=/tmp [ "$COUNT" ] || COUNT=1000 [ "$DEBUG_OFF" ] || DEBUG_OFF="eval echo 0 > /proc/sys/portals/debug" +[ "$DEBUG_ON" ] || DEBUG_ON="eval echo -1 > /proc/sys/portals/debug" for NAME in $CONFIGS; do export NAME [ -e $NAME.sh ] && sh $NAME.sh - [ ! -e $NAME.xml ] && echo "no config '$NAME.xml'" 1>&2 && exit 1 + [ ! 
-e $NAME.xml ] && [ -z "$LDAPURL" ] && echo "no config '$NAME.xml'" 1>&2 && exit 1 if [ "$RUNTESTS" != "no" ]; then sh runtests @@ -39,11 +40,13 @@ for NAME in $CONFIGS; do $DEBUG_OFF sh rundbench 1 + $DEBUG_ON sh llmountcleanup.sh sh llrmount.sh if [ $DB_THREADS -gt 1 ]; then $DEBUG_OFF sh rundbench $DB_THREADS + $DEBUG_ON sh llmountcleanup.sh sh llrmount.sh fi @@ -54,6 +57,7 @@ for NAME in $CONFIGS; do mount | grep $MNT || sh llmount.sh $DEBUG_OFF bonnie++ -s 0 -n 10 -u $UID -d $MNT + $DEBUG_ON sh llmountcleanup.sh sh llrmount.sh fi @@ -63,6 +67,7 @@ for NAME in $CONFIGS; do mount | grep $MNT || sh llmount.sh $DEBUG_OFF iozone $IOZONE_OPTS $IOZONE_FILE + $DEBUG_ON sh llmountcleanup.sh sh llrmount.sh fi @@ -75,6 +80,7 @@ for NAME in $CONFIGS; do $DEBUG_OFF iozone -I $IOZONE_OPTS $IOZONE_FILE.odir IOZVER=`iozone -v | awk '/Revision:/ { print $3 }' | tr -d '.'` + $DEBUG_ON sh llmountcleanup.sh sh llrmount.sh if [ "$IOZ_THREADS" -gt 1 -a "$IOZVER" -ge 3145 ]; then @@ -86,6 +92,7 @@ for NAME in $CONFIGS; do THREAD=`expr $THREAD + 1` done iozone -I $IOZONE_OPTS -t $IOZ_THREADS $IOZONE_FILE + $DEBUG_ON sh llmountcleanup.sh sh llrmount.sh elif [ $IOZVER -lt 3145 ]; then @@ -97,6 +104,7 @@ for NAME in $CONFIGS; do mount | grep $MNT || sh llmount.sh $DEBUG_OFF ./fsx -W -c 50 -p 1000 -P $TMP -l 1024000 -N $(($COUNT * 100)) $MNT/fsxfile + $DEBUG_ON sh llmountcleanup.sh #sh llrmount.sh fi diff --git a/lustre/tests/ba-echo.sh b/lustre/tests/ba-echo.sh index c0427fd..b28c5f4 100644 --- a/lustre/tests/ba-echo.sh +++ b/lustre/tests/ba-echo.sh @@ -7,7 +7,7 @@ LMC="save_cmd" TCPBUF=1048576 OST=${OST:-ba-ost-1} -CLIENT=`hostname` +CLIENT=${CLIENT:-`hostname`} UUIDLIST=${UUIDLIST:-/usr/local/admin/ba-ost/UUID.txt} diff --git a/lustre/tests/checkstat.c b/lustre/tests/checkstat.c index f09fde9..c98d6aa 100644 --- a/lustre/tests/checkstat.c +++ b/lustre/tests/checkstat.c @@ -215,7 +215,8 @@ main (int argc, char **argv) } else { - fprintf (stderr, "Can't parse file type %s\n", type); + 
fprintf (stderr, "Can't parse file type %s\n", + type); return (1); } @@ -229,7 +230,8 @@ main (int argc, char **argv) { if (verbose) printf ("%s has perms 0%o, not 0%o\n", - fname, (buf.st_mode & ~S_IFMT), perms); + fname, (buf.st_mode & ~S_IFMT), + perms); return (1); } @@ -244,7 +246,8 @@ main (int argc, char **argv) { if (verbose) printf ("%s has size %Ld, not %Ld\n", - fname, (long long)buf.st_size, size); + fname, (long long)buf.st_size, + size); return (1); } diff --git a/lustre/tests/cobd.sh b/lustre/tests/cobd.sh index 3f6521a..cb4f94d 100755 --- a/lustre/tests/cobd.sh +++ b/lustre/tests/cobd.sh @@ -12,16 +12,6 @@ MDSSIZE=50000 OSTDEV=$TMP/ost1 OSTSIZE=200000 -kver=`uname -r | cut -d "." -f 1,2` - -case $kver in - 2.4) FSTYPE="--fstype=extN" ;; - 2.5) FSTYPE="--fstype=ext3" ;; - *) echo "Kernel version $kver not supported" - exit 1 - ;; -esac - rm -f $config # create nodes ${LMC} --add node --node localhost || exit 10 diff --git a/lustre/tests/createtest.c b/lustre/tests/createtest.c index 5404f13..6223034 100644 --- a/lustre/tests/createtest.c +++ b/lustre/tests/createtest.c @@ -94,7 +94,7 @@ int main(int argc, char *argv[]) argv[0], name, strerror(errno)); exit(11); } - if ((st.st_mode & S_IFMT) != S_IFREG) { + if (!S_ISREG(st.st_mode & S_IFMT)) { fprintf(stderr, "%s: ERROR mode %s: %o != %o", argv[0], name, st.st_mode & S_IFMT, S_IFREG); exit(12); @@ -124,7 +124,7 @@ int main(int argc, char *argv[]) argv[0], name, strerror(errno)); exit(11); } - if ((st.st_mode & S_IFMT) != S_IFDIR) { + if (!S_ISDIR(st.st_mode)) { fprintf(stderr, "%s: ERROR mode %s: %o != %o", argv[0], name, st.st_mode & S_IFMT, S_IFDIR); exit(12); diff --git a/lustre/tests/directio.c b/lustre/tests/directio.c index f529fb0..e660ea4 100644 --- a/lustre/tests/directio.c +++ b/lustre/tests/directio.c @@ -17,24 +17,26 @@ int main(int argc, char **argv) { int fd; char *buf; - int blocks; + int blocks, seek_blocks; long len; - struct stat st; + off64_t seek; + struct stat64 st; int rc; - if 
(argc != 3) { - printf("Usage: %s file nr_blocks\n", argv[0]); + if (argc != 4) { + printf("Usage: %s file seek nr_blocks\n", argv[0]); return 1; } - blocks = strtoul(argv[2], 0, 0); - fd = open(argv[1], O_DIRECT | O_RDWR | O_CREAT, 0644); + seek_blocks = strtoul(argv[2], 0, 0); + blocks = strtoul(argv[3], 0, 0); + fd = open(argv[1], O_LARGEFILE | O_DIRECT | O_RDWR | O_CREAT, 0644); if (fd == -1) { printf("Cannot open %s: %s\n", argv[1], strerror(errno)); return 1; } - if (fstat(fd, &st) < 0) { + if (fstat64(fd, &st) < 0) { printf("Cannot stat %s: %s\n", argv[1], strerror(errno)); return 1; } @@ -42,6 +44,12 @@ int main(int argc, char **argv) printf("directio on %s for %dx%lu blocks \n", argv[1], blocks, st.st_blksize); + seek = (off64_t)seek_blocks * (off64_t)st.st_blksize; + if (lseek64(fd, seek, SEEK_SET) < 0) { + printf("lseek64 failed: %s\n", strerror(errno)); + return 1; + } + len = blocks * st.st_blksize; buf = mmap(0, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, 0, 0); if (!buf) { @@ -56,7 +64,7 @@ int main(int argc, char **argv) return 1; } - if (lseek(fd, 0, SEEK_SET) != 0) { + if (lseek64(fd, seek, SEEK_SET) < 0) { printf("Cannot seek %s\n", strerror(errno)); return 1; } diff --git a/lustre/tests/echo.sh b/lustre/tests/echo.sh index 99e026f..335db41 100755 --- a/lustre/tests/echo.sh +++ b/lustre/tests/echo.sh @@ -16,6 +16,9 @@ TMP=${TMP:-/tmp} SERVER=${SERVER:-localhost} CLIENT=${CLIENT:-localhost} NET=${NET:-tcp} +SERVERNID=${SERVERNID:-$SERVER} +CLIENTNID=${CLIENTNID:-$CLIENT} + # FIXME: make LMC not require MDS for obdecho LOV MDSDEV=${MDSDEV:-$TMP/mds1} @@ -27,7 +30,7 @@ STRIPES_PER_OBJ=2 # 0 means stripe over all OSTs rm -f $config # create nodes $LMC --add node --node $SERVER || exit 1 -$LMC --add net --node $SERVER --nid $SERVER --nettype $NET || exit 2 +$LMC --add net --node $SERVER --nid $SERVERNID --nettype $NET || exit 2 if (($LOV)); then $LMC --add mds --node $SERVER --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 10 @@ -42,7 +45,7 @@ fi 
if [ "$SERVER" != "$CLIENT" ]; then $LMC --add node --node $CLIENT || exit 1 - $LMC --add net --node $CLIENT --nid $CLIENT --nettype $NET || exit 2 + $LMC --add net --node $CLIENT --nid $CLIENTNID --nettype $NET || exit 2 fi $LMC --add echo_client --node $CLIENT --ost ${OBD_NAME} || exit 3 diff --git a/lustre/tests/fchdir_test.c b/lustre/tests/fchdir_test.c new file mode 100644 index 0000000..83c096e --- /dev/null +++ b/lustre/tests/fchdir_test.c @@ -0,0 +1,41 @@ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <errno.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <fcntl.h> + + +int main(int argc, char **argv) +{ + int fd; + int rc; + + fd = open(".", O_RDONLY); + if (fd < 0) { + perror("opening '.' :"); + exit(2); + } + + rc = chdir("/mnt/lustre/subdir/subdir"); + if (rc) { + perror("cannot chdir subdir:"); + exit(3); + } + + rc = fchdir(fd); + if (rc) { + perror("cannot fchdir back\n"); + exit(4); + } + + rc = close(fd); + if (rc) { + perror("cannot close '.'\n"); + exit(5); + } + + return(0); +} diff --git a/lustre/tests/llecho.sh b/lustre/tests/llecho.sh index d2497a4..5afade1 100644 --- a/lustre/tests/llecho.sh +++ b/lustre/tests/llecho.sh @@ -6,9 +6,13 @@ NAME=${NAME:-echo} config=$NAME.xml mkconfig=$NAME.sh -sh $mkconfig $config || exit 1 +if [ "$LUSTRE" ]; then + lustre_opt="--lustre=$LUSTRE" +fi -$LCONF --reformat --gdb $OPTS $config || exit 4 +sh -x $mkconfig $config || exit 1 + +$LCONF $lustre_opt --reformat --gdb $OPTS $config || exit 4 cat <<EOF diff --git a/lustre/tests/llmount.sh b/lustre/tests/llmount.sh index c490856..de20003 100755 --- a/lustre/tests/llmount.sh +++ b/lustre/tests/llmount.sh @@ -15,12 +15,20 @@ if [ "$LUSTRE" ]; then lustre_opt="--lustre=$LUSTRE" fi +if [ "$LDAPURL" ]; then + conf_opt="--ldapurl $LDAPURL --config $NAME" +else + sh $mkconfig $config || exit 1 + conf_opt="$config" +fi + +[ "$NODE" ] && node_opt="--node $NODE" + if [ "$1" = "-v" ]; then verbose="-v" fi [ -x 
$LCONF ] || chmod a+rx $LCONF -sh $mkconfig $config || exit 1 - -${LCONF} $portals_opt $lustre_opt --reformat --gdb $verbose $config || exit 2 +${LCONF} $portals_opt $lustre_opt $node_opt --reformat --gdb \ + $verbose $conf_opt || exit 2 diff --git a/lustre/tests/llmountcleanup.sh b/lustre/tests/llmountcleanup.sh index cd28d21..98d0512 100755 --- a/lustre/tests/llmountcleanup.sh +++ b/lustre/tests/llmountcleanup.sh @@ -15,12 +15,24 @@ if [ "$LUSTRE" ]; then lustre_opt="--lustre=$LUSTRE" fi -if [ ! -f $config ]; then - sh $mkconfig $config || exit 1 +if [ "$1" = "--force" ]; then + force="--force" fi +if [ "$LDAPURL" ]; then + conf_opt="--ldapurl $LDAPURL --config $NAME" +else + if [ ! -f $config -o $mkconfig -nt $config ]; then + sh $mkconfig $config || exit 1 + fi + conf_opt="$config" +fi + +[ "$NODE" ] && node_opt="--node $NODE" + sync; sleep 2; sync -${LCONF} $portals_opt $lustre_opt --cleanup --dump $TMP/debug $config +${LCONF} $portals_opt $lustre_opt $node_opt --cleanup $force \ + --dump $TMP/debug $conf_opt rc=$? BUSY=`dmesg | grep -i destruct` if [ "$BUSY" ]; then @@ -28,7 +40,7 @@ if [ "$BUSY" ]; then mv $TMP/debug $TMP/debug-busy.`date +%s` exit 255 fi -LEAK_LUSTRE=`dmesg | tail -20 | grep -v "leaked: 0" | grep leaked` +LEAK_LUSTRE=`dmesg | grep "obd mem.*leaked" | tail -1 | grep -v "leaked: 0"` LEAK_PORTALS=`dmesg | tail -20 | grep "Portals memory leaked"` if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then echo "$LEAK_LUSTRE" 1>&2 diff --git a/lustre/tests/llrmount.sh b/lustre/tests/llrmount.sh index 6531055..b12c1ae 100755 --- a/lustre/tests/llrmount.sh +++ b/lustre/tests/llrmount.sh @@ -14,8 +14,15 @@ if [ "$LUSTRE" ]; then lustre_opt="--lustre=$LUSTRE" fi -if [ ! -f $config -o $mkconfig -nt $config ]; then - sh $mkconfig $config || exit 1 -fi +if [ "$LDAPURL" ]; then + conf_opt="--ldapurl $LDAPURL --config $NAME" +else + if [ ! 
-f $config -o $mkconfig -nt $config ]; then + sh $mkconfig $config || exit 1 + fi + conf_opt="$config" +fi + +[ "$NODE" ] && node_opt="--node $NODE" -${LCONF} $portals_opt $lustre_opt --gdb $config || exit 2 +${LCONF} $portals_opt $lustre_opt $node_opt --gdb $conf_opt || exit 2 diff --git a/lustre/tests/local.sh b/lustre/tests/local.sh index 2132801..2bd47ae 100755 --- a/lustre/tests/local.sh +++ b/lustre/tests/local.sh @@ -11,17 +11,7 @@ MDSSIZE=${MDSSIZE:-50000} OSTDEV=${OSTDEV:-$TMP/ost1} OSTSIZE=${OSTSIZE:-200000} - -kver=`uname -r | cut -d "." -f 1,2` - -case $kver in - 2.4) FSTYPE="--fstype=extN" ;; - 2.5) FSTYPE="--fstype=ext3" ;; - *) echo "Kernel version $kver not supported" - exit 1 - ;; -esac - +FSTYPE=${FSTYPE:-ext3} rm -f $config @@ -30,10 +20,10 @@ ${LMC} --add node --node localhost || exit 10 ${LMC} --add net --node localhost --nid localhost --nettype tcp || exit 11 # configure mds server -${LMC} --add mds --node localhost --mds mds1 $FSTYPE --dev $MDSDEV --size $MDSSIZE || exit 20 +${LMC} --add mds --node localhost --mds mds1 --fstype $FSTYPE --dev $MDSDEV --size $MDSSIZE || exit 20 # configure ost -${LMC} --add ost --node localhost --ost obd1 $FSTYPE --dev $OSTDEV --size $OSTSIZE || exit 30 +${LMC} --add ost --node localhost --ost ost1 --fstype $FSTYPE --dev $OSTDEV --size $OSTSIZE || exit 30 # create client config -${LMC} --add mtpt --node localhost --path /mnt/lustre --mds mds1 --ost obd1 || exit 40 +${LMC} --add mtpt --node localhost --path /mnt/lustre --mds mds1 --ost ost1 || exit 40 diff --git a/lustre/tests/mcr-routed-config.sh b/lustre/tests/mcr-routed-config.sh index 3b1d961..8d8a100 100755 --- a/lustre/tests/mcr-routed-config.sh +++ b/lustre/tests/mcr-routed-config.sh @@ -11,7 +11,7 @@ save_cmd() { } LMC="save_cmd" -LMC_REAL="../../lustre/utils/lmc -m $config" +LMC_REAL="../utils/lmc -m $config" # TCP/IP servers SERVER_START=0 diff --git a/lustre/tests/mkdirdeep.c b/lustre/tests/mkdirdeep.c new file mode 100644 index 0000000..cfd1535 --- 
/dev/null +++ b/lustre/tests/mkdirdeep.c @@ -0,0 +1,275 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Compile with: + * cc -I../../portals/include -o mkdirdeep mkdirdeep.c + * -L../../portals/linux/utils -lptlctl + */ + +#include <stdio.h> +#include <stdlib.h> +#include <getopt.h> +#include <string.h> +#include <errno.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <linux/limits.h> +#include <portals/lltrace.h> + +static int opt_depth = 1; +static int opt_mknod = 0; +static int opt_verbose = 0; +static int opt_trace = 1; +static char* basepathname = 0; +static char mycwd[PATH_MAX]; +static char* pname = 0; +static char* outputfilename = 0; + +void usage() +{ + fprintf(stderr, "Usage: %s --depth <d> --output <outputtracefilename> " + "[--mknod] [--verbose] [--notrace] <basepath>\n", pname); + exit(1); +} + +int do_mkdir(char* path) +{ + int rc = mkdir(path, 0755); + if (rc!=0) + fprintf(stderr, "mkdir(%s) failed: %s\n", + path, strerror(errno)); + if (opt_verbose) + printf("mkdir %s\n", path); + return rc; +} + + +int do_mknod(char* path) +{ + int rc = mknod(path, S_IFIFO | 0755, 0); /* file type belongs in mode; dev is ignored for a FIFO */ + if (rc!=0) + fprintf(stderr, "mknod(%s) failed: %s\n", + path, strerror(errno)); + if (opt_verbose) + printf("mknod %s\n", path); + return rc; +} + +int do_chdir(char* path) +{ + int rc = chdir(path); + if (rc!=0) + fprintf(stderr, "chdir(%s) failed: %s\n", + path, strerror(errno)); + if (opt_verbose) + printf("chdir %s\n", path); + + return rc; +} + + +int do_stat(char* path) +{ + char mark_buf[PATH_MAX]; + struct stat mystat = { 0 }; /* st_ino is still printed below when stat() fails */ + int rc = stat(path, &mystat); + if (rc!=0) + fprintf(stderr, "stat(%s) failed: %s\n", + path, strerror(errno)); + if (opt_verbose) + printf("stat %s = inode %lu\n", path, mystat.st_ino); + + if (opt_trace) { + snprintf(mark_buf, PATH_MAX, "stat %s = inode %lu", + path, mystat.st_ino); + ltrace_mark(0, mark_buf); + } + + return rc; +} + +int 
main(int argc, char** argv) +{ + int c, opt_index, i, mypid; + + static struct option long_options[] = { + {"depth", 1, 0, 0 }, + {"help", 0, 0, 0 }, + {"mknod", 0, 0, 0 }, + {"verbose", 0, 0, 0 }, + {"notrace", 0, 0, 0 }, + {"output", 1, 0, 0 }, + {0,0,0,0} + }; + + char full_pathname[PATH_MAX]; + char rel_pathname[PATH_MAX]; + char mark_buf[PATH_MAX]; + + pname = strdup(argv[0]); + + while (1) { + c = getopt_long(argc, argv, "d:mhv", long_options, &opt_index); + if (c == -1) + break; + if (c==0) { + if (!strcmp(long_options[opt_index].name, "notrace")) { + opt_trace = 0; + continue; + } + c = long_options[opt_index].name[0]; + } + switch (c) { + case 'd': + opt_depth = atoi(optarg); + if ((opt_depth == 0) || (opt_depth > 100)) + usage(); + break; + case 'm': + opt_mknod = 1; + break; + case 'v': + opt_verbose = 1; + break; + case 'o': + outputfilename = optarg; + break; + case 'h': + case '?': + case ':': + default: + usage(); + break; + } + } + + if (optind != (argc-1)) + usage(); + + if (outputfilename == NULL) + usage(); + + basepathname = argv[optind]; + mypid = getpid(); + + printf("%s(pid=%d) depth=%d mknod=%d, basepathname=%s, " + "trace=%d, outputfilename=%s\n", + pname, mypid, opt_depth, opt_mknod, basepathname, opt_trace, + outputfilename); + + if (!getcwd(&mycwd[0], sizeof(mycwd))) { + fprintf(stderr, "%s: unable to getcwd()\n", pname); + exit(1); + } + + if (opt_trace) { + ltrace_start(); + ltrace_clear(); + snprintf(mark_buf, PATH_MAX, + "Initialize - mkdir %s; chdir %s", + basepathname, basepathname); + ltrace_mark(2, mark_buf); + } + + if (do_mkdir(basepathname)!=0) + exit(1); + if (do_chdir(basepathname)!=0) + exit(1); + + /* Create directory tree with depth level of subdirectories */ + + if (opt_trace) { + snprintf(mark_buf, PATH_MAX, + "Create Directory Tree (depth %d)", opt_depth); + ltrace_mark(2, mark_buf); + } + + for (i=0; i<opt_depth; i++) { + + snprintf(rel_pathname, sizeof(rel_pathname),"%d", i+1); + + if (i == (opt_depth-1)) { + /* Last 
Iteration */ + + if (opt_trace) { + snprintf(mark_buf, PATH_MAX, + "Tree Leaf (%d) %s/stat", i, + (opt_mknod ? "mknod" : "mkdir")); + ltrace_mark(3, mark_buf); + } + + if (opt_mknod) + do_mknod(rel_pathname); + else + do_mkdir(rel_pathname); + /* Now stat it */ + do_stat(rel_pathname); + } + else { + /* Not Leaf */ + + if (opt_trace) { + snprintf(mark_buf, PATH_MAX, + "Tree Level (%d) mkdir/stat/chdir", + i); + ltrace_mark(3, mark_buf); + } + + do_mkdir(rel_pathname); + do_stat(rel_pathname); + do_chdir(rel_pathname); + } + } + + /* Stat through directory tree with fullpaths */ + + if (opt_trace) { + snprintf(mark_buf, PATH_MAX, "Walk Directory Tree"); + ltrace_mark(2, mark_buf); + } + + do_chdir(basepathname); + + strncpy(full_pathname, basepathname, sizeof(full_pathname)); + + for (i=0; i<opt_depth; i++) { + snprintf(rel_pathname, sizeof(rel_pathname),"%d", i+1); + strcat(full_pathname, "/"); + strcat(full_pathname, rel_pathname); + + if (opt_trace) { + snprintf(mark_buf, PATH_MAX, "stat %s", + full_pathname); + ltrace_mark(2, mark_buf); + } + + do_stat(full_pathname); + } + + /* Cleanup */ + + if (opt_trace) { + snprintf(mark_buf, PATH_MAX, "Cleanup"); + ltrace_mark(2, mark_buf); + } + + if (opt_trace) { + ltrace_write_file(outputfilename); + ltrace_add_processnames(outputfilename); + ltrace_stop(); + } + + do_chdir(basepathname); + + snprintf(full_pathname, sizeof(full_pathname), + "rm -rf %s\n", basepathname); + if (opt_verbose) + printf("Cleanup: %s", full_pathname); + + system(full_pathname); + + printf("%s (pid=%d) done.\n", pname, mypid); + return 0; +} diff --git a/lustre/tests/opendevunlink.c b/lustre/tests/opendevunlink.c new file mode 100644 index 0000000..fde7d36 --- /dev/null +++ b/lustre/tests/opendevunlink.c @@ -0,0 +1,111 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/types.h> 
+#include <sys/stat.h> +#include <dirent.h> +#include <string.h> + +int main(int argc, char **argv) +{ + char *dname1, *dname2; + int fddev1, fddev2, rc; + //DIR *dp; + struct stat st1, st2; + + if (argc < 2 || argc > 3) { + fprintf(stderr, "usage: %s filename1 [filename2]\n", argv[0]); + exit(1); + } + + dname1 = argv[1]; + if (argc == 3) + dname2 = argv[2]; + else + dname2 = argv[1]; + + //create the special file (right now only test on pipe) + fprintf(stderr, "creating special file %s\n", dname1); + rc = mknod(dname1, 0777|S_IFIFO, 0); + if (rc == -1) { + fprintf(stderr, "creating %s fails: %s\n", + dname1, strerror(errno)); + exit(1); + } + + // open the special file again + fprintf(stderr, "opening file\n"); + fddev1 = open(dname1, O_RDONLY | O_NONBLOCK); + if (fddev1 == -1) { + fprintf(stderr, "open %s fails: %s\n", + dname1, strerror(errno)); + exit(1); + } + + // doesn't matter if the two dirs are the same?? + fddev2 = open(dname2, O_RDONLY | O_NONBLOCK); + if (fddev2 == -1) { + fprintf(stderr, "open %s fails: %s\n", + dname2, strerror(errno)); + exit(1); + } + + // delete the special file + fprintf (stderr, "unlinking %s\n", dname1); + rc = unlink(dname1); + if (rc) { + fprintf(stderr, "unlink %s error: %s\n", + dname1, strerror(errno)); + exit(1); + } + + if (access(dname2, F_OK) == 0){ + fprintf(stderr, "%s still exists\n", dname2); + exit(1); + } + + if (access(dname1, F_OK) == 0){ + fprintf(stderr, "%s still exists\n", dname1); + exit(1); + } + + // fchmod one special file + rc = fchmod (fddev1, 0777); + if(rc == -1) + { + fprintf(stderr, "fchmod unlinked special file %s fails: %s\n", + dname1, strerror(errno)); + exit(1); + } + + // fstat two files to check if they are the same + rc = fstat(fddev1, &st1); + if(rc == -1) + { + fprintf(stderr, "fstat unlinked special file %s fails: %s\n", + dname1, strerror(errno)); + exit(1); + } + + rc = fstat(fddev2, &st2); + if (rc == -1) { + fprintf(stderr, "fstat file %s fails: %s\n", + dname2, strerror(errno)); + 
exit(1); + } + + if (st1.st_mode != st2.st_mode) { // can we do this? + fprintf(stderr, "fstat different value on %s and %s\n", dname1, dname2); + exit(1); + } + + fprintf(stderr, "Ok, everything goes well.\n"); + return 0; +} + diff --git a/lustre/tests/opendirunlink.c b/lustre/tests/opendirunlink.c new file mode 100644 index 0000000..2664618 --- /dev/null +++ b/lustre/tests/opendirunlink.c @@ -0,0 +1,122 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <dirent.h> +#include <string.h> + +int main(int argc, char **argv) +{ + char *dname1, *dname2; + int fddir1, fddir2, rc; + //DIR *dp; + struct stat st1, st2; + + if (argc < 2 || argc > 3) { + fprintf(stderr, "usage: %s dirname1 [dirname2]\n", argv[0]); + exit(1); + } + + dname1 = argv[1]; + if (argc == 3) + dname2 = argv[2]; + else + dname2 = argv[1]; + + //create the directory + fprintf(stderr, "creating directory %s\n", dname1); + rc = mkdir(dname1, 0744); + if (rc == -1) { + fprintf(stderr, "creating %s fails: %s\n", + dname1, strerror(errno)); + exit(1); + } + + // open the dir again + fprintf(stderr, "opening directory\n"); + fddir1 = open(dname1, O_RDONLY | O_DIRECTORY); + if (fddir1 == -1) { + fprintf(stderr, "open %s fails: %s\n", + dname1, strerror(errno)); + exit(1); + } + + // doesn't matter if the two dirs are the same?? 
+ fddir2 = open(dname2, O_RDONLY | O_DIRECTORY); + if (fddir2 == -1) { + fprintf(stderr, "open %s fails: %s\n", + dname2, strerror(errno)); + exit(1); + } + + // another method +/* + if ( (dp = opendir(dname2)) == NULL) { + fprintf(stderr, "opendir() %s\n", strerror(errno)); + exit(1); + } + fddir = dirfd(dp); +*/ + + // delete the dir + fprintf (stderr, "unlinking %s\n", dname1); + rc = rmdir(dname1); + if (rc) { + fprintf(stderr, "unlink %s error: %s\n", + dname1, strerror(errno)); + exit(1); + } + + if (access(dname2, F_OK) == 0){ + fprintf(stderr, "%s still exists\n", dname2); + exit(1); + } + + if (access(dname1, F_OK) == 0){ + fprintf(stderr, "%s still exists\n", dname1); + exit(1); + } + + // fchmod the dir + rc = fchmod (fddir1, 0777); + if(rc == -1) + { + fprintf(stderr, "fchmod unlinked dir fails %s\n", + strerror(errno)); + exit(1); + } + + // fstat two dirs to check if they are the same + rc = fstat(fddir1, &st1); + if(rc == -1) + { + fprintf(stderr, "fstat unlinked dir %s fails %s\n", + dname1, strerror(errno)); + exit(1); + } + + rc = fstat(fddir2, &st2); + if (rc == -1) { + fprintf(stderr, "fstat dir %s fails %s\n", + dname2, strerror(errno)); + exit(1); + } + + if (st1.st_mode != st2.st_mode) { // can we do this? 
+ fprintf(stderr, "fstat different value on %s and %s\n", dname1, dname2); + exit(1); + } + + fprintf(stderr, "Ok, everything goes well.\n"); + return 0; +} + diff --git a/lustre/tests/openfile.c b/lustre/tests/openfile.c new file mode 100644 index 0000000..ab5cbdb --- /dev/null +++ b/lustre/tests/openfile.c @@ -0,0 +1,162 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ + +#if 0 +#define DEBUG +#endif + +#define _GNU_SOURCE + +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> + +typedef struct flag_mapping { + char string[20]; + int flag; +} FLAG_MAPPING; + +FLAG_MAPPING flag_table[] = { + {"O_RDONLY", O_RDONLY}, + {"O_WRONLY", O_WRONLY}, + {"O_RDWR", O_RDWR}, + {"O_CREAT", O_CREAT}, + {"O_EXCL", O_EXCL}, + {"O_NOCTTY", O_NOCTTY}, + {"O_TRUNC", O_TRUNC}, + {"O_APPEND", O_APPEND}, + {"O_NONBLOCK", O_NONBLOCK}, + {"O_NDELAY", O_NDELAY}, + {"O_SYNC", O_SYNC}, + {"O_NOFOLLOW", O_NOFOLLOW}, + {"O_DIRECTORY", O_DIRECTORY}, + {"O_LARGEFILE", O_LARGEFILE}, + {"", -1} +}; + +void Usage_and_abort(void) +{ + fprintf(stderr, "Usage: openfile -f flags [ -m mode ] filename \n"); + fprintf(stderr, "e.g. 
openfile -f O_RDWR:O_CREAT -m 0755 /etc/passwd\n"); + exit(-1); +} + +int main(int argc, char** argv) +{ + int i; + int flags=0; + mode_t mode=0; + char* fname=NULL; + int mode_set=0; + int flag_set=0; + int file_set=0; + int c; + char* cloned_flags; + + if(argc == 1) { + Usage_and_abort(); + } + + while ((c = getopt (argc, argv, "f:m:")) != -1) { + switch (c) { + case 'f': { + char *tmp; + + cloned_flags = (char*)malloc(strlen(optarg) + 1); + if (cloned_flags==NULL) { + fprintf(stderr, "Insufficient memory.\n"); + exit(-1); + } + + strcpy(cloned_flags, optarg); + tmp = strtok(optarg, ":"); + while (tmp) { + int i = 0; +#ifdef DEBUG + printf("flags = %s\n",tmp); +#endif + flag_set = 1; + while (flag_table[i].flag != -1) { + int r; + r = strncasecmp(tmp, (flag_table[i].string), + strlen((flag_table[i].string)) ); + + if (r == 0) + break; + i++; + } + + if (flag_table[i].flag != -1) { + flags |= flag_table[i].flag; + } else { + fprintf(stderr, "No such flag: %s\n", + tmp); + exit(-1); + } + + tmp = strtok(NULL, ":"); + + } +#ifdef DEBUG + printf("flags = %x\n", flags); +#endif + break; + } + case 'm': +#ifdef DEBUG + printf("mode = %s\n", optarg); +#endif + mode = strtol (optarg, NULL, 8); + mode_set = 1; +#ifdef DEBUG + printf("mode = %o\n", mode); +#endif + break; + default: + fprintf(stderr, "Bad parameters.\n"); + Usage_and_abort(); + } + } + + if (optind == argc) { + fprintf(stderr, "Bad parameters.\n"); + Usage_and_abort(); + } + + fname = argv[optind]; + file_set = 1; + + if (!flag_set || !file_set) { + fprintf(stderr, "Missing flag or file-name\n"); + exit(-1); + } + + + if (mode_set) + i = open(fname, flags, mode); + else + i = open(fname, flags); + + if (i != -1) { + fprintf(stderr, "Succeed in opening file \"%s\"(flags=%s", + fname, cloned_flags); + + if (mode_set) + fprintf(stderr, ", mode=%o", mode); + fprintf(stderr, ")\n"); + close (i); + } else { + fprintf(stderr, "Error in opening file \"%s\"(flags=%s", + fname, cloned_flags); + if 
(mode_set) + fprintf(stderr, ", mode=%o", mode); + fprintf(stderr, ") %s\n", strerror(errno)); + } + return(i); +} diff --git a/lustre/tests/recovery-cleanup.sh b/lustre/tests/recovery-cleanup.sh index 481ebaa..c8f85ee 100755 --- a/lustre/tests/recovery-cleanup.sh +++ b/lustre/tests/recovery-cleanup.sh @@ -3,23 +3,29 @@ set -ex LUSTRE=${LUSTRE:-`dirname $0`/..} +LTESTDIR=${LTESTDIR:-"$LUSTRE/../ltest"} PATH=$PATH:$LUSTRE/utils:$LUSTRE/tests -. $LUSTRE/../ltest/functional/llite/common/common.sh +. $LTESTDIR/functional/llite/common/common.sh + +# Allow us to override the setup if we already have a mounted system by +# setting SETUP=" " and CLEANUP=" " +SETUP=${SETUP:-"setup"} +CLEANUP=${CLEANUP:-"cleanup"} PDSH='pdsh -S -w' # XXX I wish all this stuff was in some default-config.sh somewhere MDSNODE=${MDSNODE:-mdev6} OSTNODE=${OSTNODE:-mdev7} -CLIENT=${CLIENTNODE:-mdev8} +CLIENT=${CLIENT:-mdev8} NETWORKTYPE=${NETWORKTYPE:-tcp} MOUNTPT=${MOUNTPT:-/mnt/lustre} -CONFIG=recovery-small.xml -MDSDEV=/tmp/mds -OSTDEV=/tmp/ost -MDSSIZE=100000 -OSTSIZE=100000 +CONFIG=${CONFIG:-recovery-cleanup.xml} +MDSDEV=${MDSDEV:-/tmp/mds} +OSTDEV=${OSTDEV:-/tmp/ost} +MDSSIZE=${MDSSIZE:-100000} +OSTSIZE=${OSTSIZE:-100000} do_mds() { $PDSH $MDSNODE "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $@" @@ -99,7 +105,7 @@ wait_for_timeout() { try_to_cleanup() { kill -INT $! - unmount_client --force + unmount_client --force --dump /tmp/client-cleanup-`date +%s`.log mount_client --timeout=${TIMEOUT:-5} --recovery_upcall=/bin/true } @@ -108,7 +114,8 @@ if [ ! -z "$ONLY" ]; then exit $? 
fi -setup +$SETUP + drop_request "mcreate /mnt/lustre/1" & wait_for_timeout try_to_cleanup @@ -131,4 +138,4 @@ try_to_cleanup drop_request "munlink /mnt/lustre/link1" & wait_for_timeout try_to_cleanup -cleanup +$CLEANUP '--dump /tmp/`hostname`-cleanup.log' diff --git a/lustre/tests/recovery-small-upcall.sh b/lustre/tests/recovery-small-upcall.sh new file mode 100755 index 0000000..02e9f69 --- /dev/null +++ b/lustre/tests/recovery-small-upcall.sh @@ -0,0 +1,3 @@ +#!/bin/sh +LUSTRE=`dirname $0`/.. +$LUSTRE/utils/lctl --device %$3 recover || logger -p kern.info recovery failed: $@ diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 7425e57..42a1e18 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -3,43 +3,56 @@ set -ex LUSTRE=${LUSTRE:-`dirname $0`/..} +LTESTDIR=${LTESTDIR:-$LUSTRE/../ltest} PATH=$PATH:$LUSTRE/utils:$LUSTRE/tests -. $LUSTRE/../ltest/functional/llite/common/common.sh +RLUSTRE=${RLUSTRE:-$LUSTRE} +RPWD=${RPWD:-$PWD} -PDSH='pdsh -S -w' +. $LTESTDIR/functional/llite/common/common.sh + +# Allow us to override the setup if we already have a mounted system by +# setting SETUP=" " and CLEANUP=" " +SETUP=${SETUP:-"setup"} +CLEANUP=${CLEANUP:-"cleanup"} + +PDSH=${PDSH:-'pdsh -S -w'} # XXX I wish all this stuff was in some default-config.sh somewhere MDSNODE=${MDSNODE:-mdev6} OSTNODE=${OSTNODE:-mdev7} -CLIENT=${CLIENTNODE:-mdev8} +CLIENT=${CLIENT:-mdev8} NETWORKTYPE=${NETWORKTYPE:-tcp} MOUNTPT=${MOUNTPT:-/mnt/lustre} -CONFIG=recovery-small.xml -MDSDEV=/tmp/mds -OSTDEV=/tmp/ost -MDSSIZE=100000 -OSTSIZE=100000 +CONFIG=${CONFIG:-recovery-small.xml} +MDSDEV=${MDSDEV:-/tmp/mds} +OSTDEV=${OSTDEV:-/tmp/ost} +MDSSIZE=${MDSSIZE:-100000} +OSTSIZE=${OSTSIZE:-100000} +UPCALL=${UPCALL:-$RPWD/recovery-small-upcall.sh} +FSTYPE=${FSTYPE:-ext3} do_mds() { - $PDSH $MDSNODE "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $@" + $PDSH $MDSNODE "PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests; cd $RPWD; $@" || exit $? 
} do_client() { - $PDSH $CLIENT "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $@" + $PDSH $CLIENT "PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests; cd $RPWD; $@" || exit $? } do_ost() { - $PDSH $OSTNODE "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $@" + $PDSH $OSTNODE "PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests; cd $RPWD; $@" || exit $? } drop_request() { +# OBD_FAIL_MDS_ALL_REQUEST_NET do_mds "echo 0x121 > /proc/sys/lustre/fail_loc" do_client "$1" do_mds "echo 0 > /proc/sys/lustre/fail_loc" } drop_reply() { +# OBD_FAIL_MDS_ALL_REPLY_NET do_mds "echo 0x120 > /proc/sys/lustre/fail_loc" do_client "$@" do_mds "echo 0 > /proc/sys/lustre/fail_loc" @@ -52,9 +65,9 @@ make_config() { --nettype $NETWORKTYPE || exit 4 done lmc -m $CONFIG --add mds --node $MDSNODE --mds mds1 --dev $MDSDEV \ - --size $MDSSIZE || exit 5 + --size $MDSSIZE --fstype $FSTYPE || exit 5 lmc -m $CONFIG --add ost --node $OSTNODE --ost ost1 --dev $OSTDEV \ - --size $OSTSIZE || exit 6 + --size $OSTSIZE --fstype $FSTYPE || exit 6 lmc -m $CONFIG --add mtpt --node $CLIENT --path $MOUNTPT --mds mds1 \ --ost ost1 || exit 7 } @@ -84,12 +97,11 @@ unmount_client() { } setup() { - make_config - start_mds ${REFORMAT:---reformat} - start_ost ${REFORMAT:---reformat} + start_mds ${REFORMAT} + start_ost ${REFORMAT} # XXX we should write our own upcall, when we move this somewhere better. mount_client --timeout=${TIMEOUT:-5} \ - --recovery_upcall=$PWD/../../ltest/functional/llite/09/client-upcall.sh + --lustre_upcall=$UPCALL } cleanup() { @@ -114,7 +126,11 @@ if [ ! -z "$ONLY" ]; then exit $? 
fi -setup +make_config + +REFORMAT=--reformat $SETUP +unset REFORMAT + drop_request "mcreate /mnt/lustre/1" drop_reply "mcreate /mnt/lustre/2" # replay "mcreate /mnt/lustre/3" @@ -140,5 +156,4 @@ drop_reply "mlink /mnt/lustre/renamed-again /mnt/lustre/link2" drop_request "munlink /mnt/lustre/link1" drop_reply "munlink /mnt/lustre/link2" - -cleanup +$CLEANUP diff --git a/lustre/tests/runas.c b/lustre/tests/runas.c index 3d29f1b..8731699 100644 --- a/lustre/tests/runas.c +++ b/lustre/tests/runas.c @@ -19,7 +19,7 @@ Usage_and_abort() exit(-1); } -// Usage: runas -u user_id [ -g grp_id ] "command_to_be_run" +// Usage: runas -u user_id [ -g grp_id ] [--] command_to_be_run // return: the return value of "command_to_be_run" // NOTE: returning -1 might be the return code of this program itself or // the "command_to_be_run" @@ -30,8 +30,7 @@ Usage_and_abort() int main(int argc, char**argv) { - char command[1024]; - char *cmd_ptr; + char **my_argv; int status; int c,i; int gid_is_set = 0; @@ -44,7 +43,7 @@ main(int argc, char**argv) } // get UID and GID - while ((c = getopt (argc, argv, "u:g:h")) != -1) { + while ((c = getopt (argc, argv, "+u:g:h")) != -1) { switch (c) { case 'u': user_id = (uid_t)atoi(optarg); @@ -79,12 +78,18 @@ main(int argc, char**argv) Usage_and_abort(); } - // assemble the command - cmd_ptr = command ; - for (i = optind; i < argc; i++) - cmd_ptr += sprintf(cmd_ptr, "%s ", argv[i]); - + my_argv = (char**)malloc(sizeof(char*)*(argc+1-optind)); + if(my_argv == NULL) { + fprintf(stderr, "Error in allocating memory. 
(%s)\n", strerror(errno)); + exit(-1); + } + + for(i=optind; i< argc; i++) { + my_argv[i-optind] = argv[i]; +// printf("%s\n",my_argv[i-optind]); + } + my_argv[i-optind]=NULL; #if DEBUG system("whoami"); @@ -94,7 +99,7 @@ main(int argc, char**argv) status = setregid(grp_id, grp_id ); if( status == -1) { fprintf(stderr, "Cannot change grp_ID to %d, errno=%d (%s)\n", - grp_id, errno, strerror(errno) ); + grp_id, errno, strerror(errno) ); exit(-1); } @@ -102,32 +107,24 @@ main(int argc, char**argv) status = setreuid(user_id, user_id ); if(status == -1) { fprintf(stderr,"Cannot change user_ID to %d, errno=%d (%s)\n", - user_id, errno, strerror(errno) ); + user_id, errno, strerror(errno) ); exit(-1); } -#if DEBUG - system("whoami"); -#endif - fprintf(stdout, "running as USER(%d), Grp (%d): \"%s\" \n", - user_id, grp_id, command ); + fprintf(stderr, "running as USER(%d), Grp (%d): ", + user_id, grp_id ); - // run the command - status = system(command); + for(i=0; i<argc-optind; i++) + fprintf(stderr, " [%s]", my_argv[i]); - // pass the return code of command_to_be_run out of this wrapper - if (status == -1) { - fprintf(stderr, "%s: system() command failed to run\n", - argv[0]); - } - else{ - status = WEXITSTATUS(status); - fprintf(stderr, "[%s #%d] \"%s\" returns %d (%s).\n", argv[0], - user_id, argv[optind], status, strerror(status)); + fprintf(stderr, "\n"); + fflush(stderr); - } + // The command to be run + execvp(my_argv[0], my_argv); + fprintf(stderr, "execvp fails running %s\n", my_argv[0]); + exit(-1); - return(status); } diff --git a/lustre/tests/runobdstat b/lustre/tests/runobdstat new file mode 100644 index 0000000..886ce8f2 --- /dev/null +++ b/lustre/tests/runobdstat @@ -0,0 +1,7 @@ +#!/bin/sh +PATH=`dirname $0`/../utils:$PATH + +obdstat filter 1 | while read LINE; do + echo "`date +%s`: $LINE" + [ "$1" ] && echo "`date +%s`: $LINE" >> $1 +done diff --git a/lustre/tests/runvmstat b/lustre/tests/runvmstat index 6bff5ce..3ce6810 100755 --- a/lustre/tests/runvmstat 
+++ b/lustre/tests/runvmstat @@ -1,2 +1,5 @@ #!/bin/sh -vmstat 1 | while read LINE ; do echo "`date +%H:%M:%S`: $LINE" ; done +vmstat 1 | while read LINE ; do + echo "`date +%s`: $LINE" + [ "$1" ] && echo "`date +%s`: $LINE" >> $1 +done diff --git a/lustre/tests/sanity-ldlm.sh b/lustre/tests/sanity-ldlm.sh new file mode 100644 index 0000000..e5bd422 --- /dev/null +++ b/lustre/tests/sanity-ldlm.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +set -e + +SRCDIR=`dirname $0` +PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH + +MOUNT=${MOUNT:-/mnt/lustre} +DIR=${DIR:-$MOUNT} +export NAME=$NAME +clean() { + echo -n "cln.." + sh llmountcleanup.sh > /dev/null || exit 20 +} +CLEAN=${CLEAN:-clean} +start() { + echo -n "mnt.." + sh llrmount.sh > /dev/null || exit 10 + echo "done" +} +START=${START:-start} + +log() { + echo "$*" + lctl mark "$*" || /bin/true +} + +pass() { + echo PASS +} + +mount | grep $MOUNT || sh llmount.sh + +log '== drop ldlm request ======================== test 1' +echo 0x302 > /proc/sys/lustre/fail_loc +echo 3 > /proc/sys/lustre/timeout +touch $DIR/f & +sleep 5 +echo 0 > /proc/sys/lustre/fail_loc +lctl --device 6 recover +pass +$CLEAN +$START + +log '== drop ldlm reply (bug 1139) ================ test 2' +echo 0x213 > /proc/sys/lustre/fail_loc +echo 3 > /proc/sys/lustre/timeout +touch $DIR/f +pass +$CLEAN +$START + +log '== drop reply after completion (bug 1068) ==== test 3' +touch $DIR/f +stat $DIR/f +echo 0x213 > /proc/sys/lustre/fail_loc +echo 3 > /proc/sys/lustre/timeout +echo foo >> $DIR/f +pass +$CLEAN +$START diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index fdaf82e..84572bf 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -1,9 +1,8 @@ #!/bin/bash - set -e SRCDIR=`dirname $0` -PATH=$SRCDIR:$SRCDIR/../utils:$PATH +PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH CHECKSTAT=${CHECKSTAT:-"./checkstat -v"} CREATETEST=${CREATETEST:-createtest} @@ -11,6 +10,7 @@ LFIND=${LFIND:-lfind} LSTRIPE=${LSTRIPE:-lstripe} 
MCREATE=${MCREATE:-mcreate} TOEXCL=${TOEXCL:-toexcl} +TRUNCATE=${TRUNCATE:-truncate} RUNAS_ID=${RUNAS_ID:-500} RUNAS=${RUNAS:-"runas -u $RUNAS_ID"} @@ -32,7 +32,7 @@ START=${START:-start} log() { echo "$*" - lctl mark "$*" + lctl mark "$*" || /bin/true } error() { @@ -46,6 +46,15 @@ pass() { mount | grep $MOUNT || sh llmount.sh +echo preparing for tests involving mounts +EXT2_DEV=/tmp/SANITY.LOOP +dd if=/dev/zero of=$EXT2_DEV bs=1k count=1000 +#losetup /dev/loop0 || losetup /dev/loop0 /tmp/SANITY.LOOP +#mke2fs -c /dev/loop0 100 +#losetup -d /dev/loop0 +mke2fs -F /tmp/SANITY.LOOP + + log '== touch .../f ; rm .../f ======================== test 0' touch $DIR/f $CHECKSTAT -t file $DIR/f || error @@ -301,10 +310,11 @@ $START log '== unpack tar archive as non-root user =========== test 22' mkdir $DIR/d22 -[ $UID -ne 0 ] && RUNAS="" [ $UID -ne 0 ] && RUNAS_ID="$UID" +[ $UID -ne 0 ] && RUNAS="" chown $RUNAS_ID $DIR/d22 -$RUNAS tar cf - /etc/hosts /etc/sysconfig/network | $RUNAS tar xfC - $DIR/d22 +# Tar gets pissy if it can't access $PWD *sigh* +(cd /tmp ; $RUNAS tar cf - /etc/hosts /etc/sysconfig/network | $RUNAS tar xfC - $DIR/d22) ls -lR $DIR/d22/etc $CHECKSTAT -t dir $DIR/d22/etc || error $CHECKSTAT -u \#$RUNAS_ID $DIR/d22/etc || error @@ -516,7 +526,10 @@ pass $CLEAN $START -log "--test 27.8 lfind " +log "--test 27.8 mcreate file without objects to test lfind" +$MCREATE $DIR/d27/fnone || error + +log "--test 27.9 lfind " $LFIND $DIR/d27 pass $CLEAN @@ -554,8 +567,281 @@ log '== open-unlink file ============================== test31' ./openunlink $DIR/f31 $DIR/f31 || error pass + +log '== more mountpoints and symlinks ================= test32' + +log '-- test 32-R1: stat d32/ext2-mountpoint/..' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +mkdir -p $DIR/d32/ext2-mountpoint +mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint +$CHECKSTAT -t dir $DIR/d32/ext2-mountpoint/.. 
|| error +umount $DIR/d32/ext2-mountpoint/ +pass +$CLEAN +$START + +log '-- test 32-R2: open d32/ext2-mountpoint/..' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +mkdir -p $DIR/d32/ext2-mountpoint +mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint +ls -al $DIR/d32/ext2-mountpoint/.. || error +umount $DIR/d32/ext2-mountpoint/ +pass +$CLEAN +$START + +log '-- test 32-R3: stat d32/ext2-mountpoint/../d2/test_dir' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +mkdir -p $DIR/d32/ext2-mountpoint +mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint +mkdir -p $DIR/d32/d2/test_dir +$CHECKSTAT -t dir $DIR/d32/ext2-mountpoint/../d2/test_dir || error +umount $DIR/d32/ext2-mountpoint/ +pass +$CLEAN +$START + +log '-- test 32-R4: open d32/ext2-mountpoint/../d2/test_dir' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +mkdir -p $DIR/d32/ext2-mountpoint +mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint +mkdir -p $DIR/d32/d2/test_dir +ls -al $DIR/d32/ext2-mountpoint/../d2/test_dir || error +umount $DIR/d32/ext2-mountpoint/ +pass +$CLEAN +$START + +log '-- test 32-R5: stat d32/symlink->tmp/symlink->lustre-subdir' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +mkdir -p $DIR/d32/tmp +TMP_DIR=$DIR/d32/tmp +ln -s $DIR/d32 $TMP_DIR/symlink11 +ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01 +$CHECKSTAT -t link $DIR/d32/tmp/symlink11 || error +$CHECKSTAT -t link $DIR/d32/symlink01 || error +pass +$CLEAN +$START + +log '-- test 32-R6: open d32/symlink->tmp/symlink->lustre-subdir' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +mkdir -p $DIR/d32/tmp +TMP_DIR=$DIR/d32/tmp +ln -s $DIR/d32 $TMP_DIR/symlink11 +ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01 +ls $DIR/d32/tmp/symlink11 || error +ls $DIR/d32/symlink01 || error +pass +$CLEAN +$START + +log '-- test 32-R7: stat d32/symlink->tmp/symlink->lustre-subdir/test_dir' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +[ -e $DIR/test_dir ] && rm -fr $DIR/test_dir +mkdir -p $DIR/test_dir +mkdir -p $DIR/d32/tmp +TMP_DIR=$DIR/d32/tmp +ln -s $DIR/test_dir $TMP_DIR/symlink12 +ln -s 
$TMP_DIR/symlink12 $TMP_DIR/../symlink02 +$CHECKSTAT -t link $DIR/d32/tmp/symlink12 || error +$CHECKSTAT -t link $DIR/d32/symlink02 || error +$CHECKSTAT -t dir -f $DIR/d32/tmp/symlink12 || error +$CHECKSTAT -t dir -f $DIR/d32/symlink02 || error +pass +$CLEAN +$START + +log '-- test 32-R8: open d32/symlink->tmp/symlink->lustre-subdir/test_dir' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +[ -e $DIR/test_dir ] && rm -fr $DIR/test_dir +mkdir -p $DIR/test_dir +mkdir -p $DIR/d32/tmp +TMP_DIR=$DIR/d32/tmp +ln -s $DIR/test_dir $TMP_DIR/symlink12 +ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02 +ls $DIR/d32/tmp/symlink12 || error +ls $DIR/d32/symlink02 || error +pass +$CLEAN +$START + +log '-- test 32-R9: stat d32/ext2-mountpoint/../test_file' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +mkdir -p $DIR/d32/ext2-mountpoint +mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint +touch $DIR/d32/test_file +$CHECKSTAT -t file $DIR/d32/ext2-mountpoint/../test_file || error +umount $DIR/d32/ext2-mountpoint +pass +$CLEAN +$START + +log '-- test 32-R10: open d32/ext2-mountpoint/../test_file' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +mkdir -p $DIR/d32/ext2-mountpoint +mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint +touch $DIR/d32/test_file +cat $DIR/d32/ext2-mountpoint/../test_file || error +umount $DIR/d32/ext2-mountpoint/ +pass +$CLEAN +$START + +log '-- test 32-R11: stat d32/ext2-mountpoint/../d2/test_file' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +mkdir -p $DIR/d32/ext2-mountpoint +mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint +mkdir -p $DIR/d32/d2 +touch $DIR/d32/d2/test_file +$CHECKSTAT -t file $DIR/d32/ext2-mountpoint/../d2/test_file || error +umount $DIR/d32/ext2-mountpoint/ +pass +$CLEAN +$START + +log '-- test 32-R12: open d32/ext2-mountpoint/../d2/test_file' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +mkdir -p $DIR/d32/ext2-mountpoint +mount -t ext2 -o loop $EXT2_DEV $DIR/d32/ext2-mountpoint +mkdir -p $DIR/d32/d2 +touch $DIR/d32/d2/test_file +cat 
$DIR/d32/ext2-mountpoint/../d2/test_file || error +umount $DIR/d32/ext2-mountpoint/ +pass +$CLEAN +$START + +log '-- test 32-R13: stat d32/symlink->tmp/symlink->lustre-root' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +mkdir -p $DIR/d32/tmp +TMP_DIR=$DIR/d32/tmp +ln -s $DIR $TMP_DIR/symlink11 +ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01 +$CHECKSTAT -t link $DIR/d32/tmp/symlink11 || error +$CHECKSTAT -t link $DIR/d32/symlink01 || error +pass +$CLEAN +$START + +log '-- test 32-R14: open d32/symlink->tmp/symlink->lustre-root' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +mkdir -p $DIR/d32/tmp +TMP_DIR=$DIR/d32/tmp +ln -s $DIR $TMP_DIR/symlink11 +ln -s $TMP_DIR/symlink11 $TMP_DIR/../symlink01 +ls -l $DIR/d32/tmp/symlink11 || error +ls -l $DIR/d32/symlink01 || error +pass +$CLEAN +$START + +log '-- test 32-R15: stat d32/symlink->tmp/symlink->lustre-root/test_file' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +[ -e $DIR/test_file ] && rm -fr $DIR/test_file +touch $DIR/test_file +mkdir -p $DIR/d32/tmp +TMP_DIR=$DIR/d32/tmp +ln -s $DIR/test_file $TMP_DIR/symlink12 +ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02 +$CHECKSTAT -t link $DIR/d32/tmp/symlink12 || error +$CHECKSTAT -t link $DIR/d32/symlink02 || error +$CHECKSTAT -t file -f $DIR/d32/tmp/symlink12 || error +$CHECKSTAT -t file -f $DIR/d32/symlink02 || error +pass +$CLEAN +$START + +log '-- test 32-R16: open d32/symlink->tmp/symlink->lustre-root/test_file' +[ -e $DIR/d32 ] && rm -fr $DIR/d32 +[ -e $DIR/test_file ] && rm -fr $DIR/test_file +touch $DIR/test_file +mkdir -p $DIR/d32/tmp +TMP_DIR=$DIR/d32/tmp +ln -s $DIR/test_file $TMP_DIR/symlink12 +ln -s $TMP_DIR/symlink12 $TMP_DIR/../symlink02 +cat $DIR/d32/tmp/symlink12 || error +cat $DIR/d32/symlink02 || error +pass +$CLEAN +$START + +log '-- test 33: write file with mode 444 (should return error)' +# chmod 444 /mnt/lustre/somefile +# open(/mnt/lustre/somefile, O_RDWR) +# Should return -1 +[ $UID -ne 0 ] && RUNAS_ID="$UID" +[ $UID -ne 0 ] && RUNAS="" +[ -e $DIR/test_33_file ] && rm -fr 
$DIR/test_33_file +touch $DIR/test_33_file +chmod 444 $DIR/test_33_file +chown $RUNAS_ID $DIR/test_33_file +$RUNAS openfile -f O_RDWR $DIR/test_33_file && error +pass +$CLEAN +$START + +if [ -n "$BUG_1360" ]; then +log '-- test 34: execute a file with mode 444 (should return error)' +[ $UID -ne 0 ] && RUNAS_ID="$UID" +[ $UID -ne 0 ] && RUNAS="" +[ -e $DIR/test_35_file ] && rm -fr $DIR/test_35_file +cp /bin/sh $DIR/test_35_file +chmod 444 $DIR/test_35_file +chown $RUNAS_ID $DIR/test_35_file +$DIR/test_35_file && error +pass +$CLEAN +$START +else +echo "Skipping test for 1360: set \$BUG_1360 to run it (fail cleanup, likely)." +fi + +if [ -n "$BUG_1365" ]; then +log '-- test 35: truncate file that has not been opened' +$MCREATE $DIR/f +$TRUNCATE $DIR/f 100 +rm $DIR/f +pass +$CLEAN +$START +else +echo "Skipping test for 1365: set \$BUG_1365 to run it (and crash, likely)." +fi + +log '-- test 36: cvs operations' +[ $UID -ne 0 ] && RUNAS_ID="$UID" +[ $UID -ne 0 ] && RUNAS="" +mkdir -p $DIR/cvsroot +log '-- test 36-1: cvs init' +cvs -d $DIR/cvsroot init +$CLEAN +$START +log '-- test 36-2: cvs import' +(cd /etc/init.d ; cvs -d $DIR/cvsroot import -m "nomesg" reposname vtag rtag ) +$CLEAN +$START +log '-- test 36-3: cvs checkout' +(cd $DIR ; cvs -d $DIR/cvsroot co reposname ) +$CLEAN +$START +log '-- test 36-4: cvs add' +(cd $DIR/reposname ; touch foo34 ; cvs add -m 'addmsg' foo34 ) +$CLEAN +$START +log '-- test 36-5: cvs update' +(cd $DIR/reposname ; cvs update ) +$CLEAN +$START +log '-- test 36-6: cvs commit' +# +# XXX change this: use a non-root user +(cd $DIR/reposname ; cvs commit -m 'nomsg' foo34 ) +pass +$CLEAN +$START + log '== cleanup =============================================' rm -r $DIR/[Rdfs][1-9]* $DIR/ls echo '======================= finished =======================' -exit diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index a4930de..8145e63 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -80,10 +80,28 @@ echo -n "test 
6: fstat validation on multiple mount points..." ./multifstat $MOUNT1/f6 $MOUNT2/f6 pass +if [ -n "$BUG_1365" ]; then +echo -n "test 7: create a file on one mount, truncate it on the other..." +mcreate $MOUNT1/f1 +truncate $MOUNT2/f1 100 +rm $MOUNT1/f1 +pass +else +echo "Skipping test for 1365: set \$BUG_1365 to run it (and crash, likely)." +fi + echo "test 9: remove of open file on other node..." ./openunlink $MOUNT1/f9 $MOUNT2/f9 || error pass +echo "test 9b: remove of open directory on other node..." +./opendirunlink $MOUNT1/dir1 $MOUNT2/dir1 || error +pass + +#echo "test 9c: remove of open special file on other node..." +#./opendevunlink $MOUNT1/dev1 $MOUNT2/dev1 || error +#pass + echo -n "test 10: append of file with sub-page size on multiple mounts..." MTPT=1 > $MOUNT2/f10 @@ -106,35 +124,8 @@ for C in a b c d e f g h i j k l; do done [ "`cat $MOUNT1/f11`" = "abcdefghijkl" ] && pass || error -echo "test 12: file length and contents across mounts" -dd if=$SHELL of=$MOUNT1/f12 bs=4096 count=1 -$CHECKSTAT -s 4096 $MOUNT1/f12 $MOUNT2/f12 || error -dd if=$SHELL bs=4096 count=1 | \ - md5sum - $MOUNT1/f12 $MOUNT2/f12 | ( \ - read GOODSUM DASH; \ - while read SUM FILE ; do \ - [ $SUM == $GOODSUM ] || exit 2; \ - done; ) || error - -echo "test 13: open(,O_TRUNC,), close() across mounts" -dd if=$SHELL of=$MOUNT1/f13 bs=4096 count=1 -> $MOUNT1/f13 -$CHECKSTAT -s 0 $MOUNT1/f13 $MOUNT2/f13 || error - -echo "test 14: file extension while holding the fd open" -> $MOUNT1/f14 -# ugh. 
-touch $MOUNT1/f14-start -sh -c " - echo -n a; - mv $MOUNT1/f14-start $MOUNT1/f14-going; - while [ -f $MOUNT1/f14-going ] ; do sleep 1; done; - " >> $MOUNT1/f14 & -while [ -f $MOUNT1/f14-start ] ; do sleep 1; done; -$CHECKSTAT -s 1 $MOUNT1/f14 $MOUNT2/f14 || error -rm $MOUNT1/f14-going - rm -f $MOUNT1/f[0-9]* $MOUNT1/lnk + $CLEAN exit diff --git a/lustre/tests/test_brw.c b/lustre/tests/test_brw.c index 396f3b0..6cbfcb5 100644 --- a/lustre/tests/test_brw.c +++ b/lustre/tests/test_brw.c @@ -17,7 +17,8 @@ #define CERROR(fmt, arg...) fprintf(stderr, fmt, ## arg) #ifndef __u64 #define __u64 long long -#define HTON__u64(v) (v) +#define cpu_to_le64(v) (v) +#define le64_to_cpu(v) (v) #endif #ifndef LPU64 @@ -31,8 +32,8 @@ #define LPDS sizeof(__u64) int page_debug_setup(void *addr, int len, __u64 off, __u64 id) { - off = HTON__u64(off); - id = HTON__u64(id); + off = cpu_to_le64(off); + id = cpu_to_le64(id); memcpy(addr, (char *)&off, LPDS); memcpy(addr + LPDS, (char *)&id, LPDS); @@ -48,8 +49,8 @@ int page_debug_check(char *who, void *addr, int size, __u64 off, __u64 id) __u64 ne_off; int err = 0; - ne_off = HTON__u64(off); - id = HTON__u64(id); + ne_off = le64_to_cpu(off); + id = le64_to_cpu(id); if (memcmp(addr, (char *)&ne_off, LPDS)) { CERROR("%s: for offset "LPU64" off: "LPX64" != "LPX64"\n", who, off, *(__u64 *)addr, ne_off); @@ -199,7 +200,7 @@ int main(int argc, char **argv) return 5; } - for (offset = 0; offset < last && cmd && READ; offset += len) { + for (offset = 0; offset < last && cmd & READ; offset += len) { int i; rc = read(fd, buf, len); diff --git a/lustre/tests/uml.sh b/lustre/tests/uml.sh index 599bd21..2b3adc3 100644 --- a/lustre/tests/uml.sh +++ b/lustre/tests/uml.sh @@ -1,15 +1,22 @@ #!/bin/bash -config=${1-uml.xml} -LMC=${LMC-../utils/lmc} +export PATH=`dirname $0`/../utils:$PATH + +config=${1:-uml.xml} +LMC=${LMC:-lmc} TMP=${TMP:-/tmp} MDSDEV=${MDSDEV:-$TMP/mds1} MDSSIZE=${MDSSIZE:-50000} -OSTDEV1=${OSTDEV1:-$TMP/ost1} -OSTDEV2=${OSTDEV2:-$TMP/ost2} 
+OSTDEVBASE=$TMP/ost +#OSTDEV1=${OSTDEV1:-${OSTDEVBASE}1} +#OSTDEV2=${OSTDEV2:-${OSTDEVBASE}2} +#etc OSTSIZE=${OSTSIZE:-100000} +STRIPECNT=${STRIPECNT:-1} + +FSTYPE=${FSTYPE:-ext3} NETTYPE=${NETTYPE:-tcp} @@ -66,17 +73,17 @@ done # configure mds server echo; echo "adding MDS on: $MDSNODE" -${LMC} -m $config --add mds --format --node $MDSNODE --mds mds1 --dev $MDSDEV --size $MDSSIZE ||exit 10 +${LMC} -m $config --add mds --format --node $MDSNODE --mds mds1 --fstype $FSTYPE --dev $MDSDEV --size $MDSSIZE ||exit 10 # configure ost -${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt 1 --stripe_pattern 0 || exit 20 +${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt $STRIPECNT --stripe_pattern 0 || exit 20 COUNT=1 echo -n "adding OST on:" for NODE in $OSTNODES; do eval OSTDEV=\$OSTDEV$COUNT echo -n " $NODE" - OSTDEV=${OSTDEV:-$OSTDEV1} - ${LMC} -m $config --add ost --node $NODE --lov lov1 --dev $OSTDEV --size $OSTSIZE || exit 21 + OSTDEV=${OSTDEV:-$OSTDEVBASE$COUNT} + ${LMC} -m $config --add ost --node $NODE --lov lov1 --fstype $FSTYPE --dev $OSTDEV --size $OSTSIZE || exit 21 COUNT=`expr $COUNT + 1` done diff --git a/lustre/tests/unlinkmany.c b/lustre/tests/unlinkmany.c new file mode 100644 index 0000000..ba1bee7 --- /dev/null +++ b/lustre/tests/unlinkmany.c @@ -0,0 +1,74 @@ +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <time.h> +#include <errno.h> +#include <string.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> + +void usage(char *prog) +{ + printf("usage: %s filenamefmt count\n", prog); + printf(" %s filenamefmt start count\n", prog); +} + +int main(int argc, char ** argv) +{ + int i, rc = 0; + char format[4096], *fmt; + char filename[4096]; + long start, last; + long begin = 0, count; + + if (argc < 3 || argc > 4) { + usage(argv[0]); + return 1; + } + + if (strlen(argv[1]) > 4080) { + printf("name too long\n"); + return 1; + } + + start = last = time(0); + + if 
(argc == 3) { + count = strtol(argv[2], NULL, 0); + if (count < 1) { + printf("count must be at least one\n"); + return 1; + } + } else { + begin = strtol(argv[2], NULL, 0); + count = strtol(argv[3], NULL, 0); + } + + if (strchr(argv[1], '%')) { + fmt = argv[1]; + } else { + sprintf(format, "%s%%d", argv[1]); + fmt = format; + } + for (i = 0; i < count; i++, begin++) { + sprintf(filename, fmt, begin); + rc = unlink(filename); + if (rc) { + printf("unlink(%s) error: %s\n", + filename, strerror(errno)); + rc = errno; + break; + } + if ((i % 10000) == 0) { + printf(" - unlinked %d (time %ld ; total %ld ; last " + "%ld)\n", i, time(0), time(0) - start, + time(0) - last); + last = time(0); + } + } + printf("total: %d unlinks in %ld seconds: %f unlinks/second\n", i, + time(0) - start, ((float)i / (time(0) - start))); + + return rc; +} diff --git a/lustre/tests/writeme.c b/lustre/tests/writeme.c index ab8692f..a376063 100644 --- a/lustre/tests/writeme.c +++ b/lustre/tests/writeme.c @@ -6,27 +6,27 @@ int main(int argc, char **argv) { - int fd, rc; + int fd, rc; int i = 0; char buf[4096]; - + memset(buf, 0, 4096); - if (argc != 2) { - printf("Usage openme <filename>\n"); + if (argc != 2) { + printf("Usage: %s <filename>\n", argv[0]); exit(1); } fd = open(argv[1], O_RDWR | O_CREAT, 0600); - if (fd == -1) { + if (fd == -1) { printf("Error opening %s\n", argv[1]); exit(1); } - while (1) { - sprintf(buf, "write %d\n", i); - rc = write(fd, buf, sizeof(buf)); - sleep(1); + while (1) { + sprintf(buf, "write %d\n", i); + rc = write(fd, buf, sizeof(buf)); + sleep(1); } return 0; } diff --git a/lustre/utils/.cvsignore b/lustre/utils/.cvsignore index 4775289..06a1588 100644 --- a/lustre/utils/.cvsignore +++ b/lustre/utils/.cvsignore @@ -11,8 +11,8 @@ obdctl lctl lfind lstripe -lconf obdstat obdio obdbarrier lload +wirecheck \ No newline at end of file diff --git a/lustre/utils/Lustre/.cvsignore b/lustre/utils/Lustre/.cvsignore new file mode 100644 index 0000000..97e22b9 --- /dev/null 
+++ b/lustre/utils/Lustre/.cvsignore @@ -0,0 +1,4 @@ +Makefile +Makefile.in +.deps +*.pyc diff --git a/lustre/utils/Lustre/Makefile.am b/lustre/utils/Lustre/Makefile.am new file mode 100644 index 0000000..e8e522f --- /dev/null +++ b/lustre/utils/Lustre/Makefile.am @@ -0,0 +1,2 @@ +pymod_SCRIPTS = __init__.py lustredb.py error.py cmdline.py +EXTRA_DIST = $(pymod_SCRIPTS) diff --git a/lustre/utils/Lustre/__init__.py b/lustre/utils/Lustre/__init__.py new file mode 100644 index 0000000..c1b93e6 --- /dev/null +++ b/lustre/utils/Lustre/__init__.py @@ -0,0 +1,7 @@ +__all__ = ["lustredb"] + +from lustredb import LustreDB, LustreDB_XML, LustreDB_LDAP +from error import LconfError, OptionError +from cmdline import Options + +CONFIG_VERSION="2003060501" diff --git a/lustre/utils/Lustre/cmdline.py b/lustre/utils/Lustre/cmdline.py new file mode 100644 index 0000000..53bb6e8 --- /dev/null +++ b/lustre/utils/Lustre/cmdline.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python +# +# Copyright (C) 2002 Cluster File Systems, Inc. +# Author: Robert Read <rread@clusterfs.com> +# This file is part of Lustre, http://www.lustre.org. +# +# Lustre is free software; you can redistribute it and/or +# modify it under the terms of version 2 of the GNU General Public +# License as published by the Free Software Foundation. +# +# Lustre is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Lustre; if not, write to the Free Software +# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +# + +# Standardize the command line handling for all the python tools. 
+
+import sys, getopt, types
+import string
+import error
+
+class Options:
+    """Declarative command line parser built on top of getopt.
+
+    Every option is described by a tuple
+        (names, help [, type [, default]])
+    names   -- "long" or "long,short"
+    help    -- help text printed by usage()
+    type    -- FLAG, PARAM or INTPARAM (FLAG when omitted)
+    default -- value reported when the option is absent (None when omitted)
+    """
+    FLAG = 1
+    PARAM = 2
+    INTPARAM = 3
+    def __init__(self, cmd, remain_help, options):
+        """cmd and remain_help are only used to build the usage line;
+        options is the list of option tuples described on the class."""
+        # Work on a private copy: appending the implicit help option must
+        # not modify the list owned by the caller, otherwise reusing the
+        # same table for a second parser would add 'help' twice.
+        options = list(options)
+        options.append(('help,h', "Print this help"))
+        self.options = options
+        shorts = ""
+        longs = []
+        for opt in options:
+            long = self.long(opt)
+            short = self.short(opt)
+            # getopt marks options that take a value with ':' / '='
+            if self.type(opt) in (Options.PARAM, Options.INTPARAM):
+                if short: short = short + ':'
+                if long: long = long + '='
+            shorts = shorts + short
+            longs.append(long)
+        self.short_opts = shorts
+        self.long_opts = longs
+        self.cmd = cmd
+        self.remain_help = remain_help
+
+    def init_values(self):
+        """Return a dictionary mapping every option key to its default."""
+        values = {}
+        for opt in self.options:
+            values[self.key(opt)] = self.default(opt)
+        return values
+
+    def long(self, option):
+        """Long name: the part of option[0] before the first comma."""
+        n = string.find(option[0], ',')
+        if n < 0: return option[0]
+        else: return option[0][0:n]
+
+    def key(self, option):
+        """Key used in the values dictionary: long name with '-' -> '_'."""
+        key = self.long(option)
+        return string.replace(key, '-', '_')
+
+    def short(self, option):
+        """Short name: the part of option[0] after the first comma, or ''."""
+        n = string.find(option[0], ',')
+        if n < 0: return ''
+        else: return option[0][n+1:]
+
+    def help(self, option):
+        return option[1]
+
+    def type(self, option):
+        if len(option) >= 3:
+            return option[2]
+        return Options.FLAG
+
+    def default(self, option):
+        if len(option) >= 4:
+            return option[3]
+        return None
+
+    def lookup_option(self, key, key_func):
+        """Return the first option for which key_func(option) == key,
+        or None when there is no such option."""
+        for opt in self.options:
+            if key_func(opt) == key:
+                return opt
+
+    def lookup_short(self, key):
+        return self.lookup_option(key, self.short)
+
+    def lookup_long(self, key):
+        return self.lookup_option(key, self.long)
+
+    def handle_opts(self, opts):
+        """Convert the (option, argument) pairs returned by getopt into a
+        dictionary of values, starting from the defaults.
+        Raises error.OptionError for an unknown option or for a
+        non-integer argument given to an INTPARAM option."""
+        values = self.init_values()
+        for o, a in opts:
+            if o[0:2] != '--':
+                option = self.lookup_short(o[1:])
+            else:
+                option = self.lookup_long(o[2:])
+            # lookup_option() returns None when nothing matches; report
+            # that as an option error instead of failing in self.type().
+            if option is None:
+                raise error.OptionError("unknown option: '%s'" % (o,))
+            if self.type(option) == Options.PARAM:
+                val = a
+            elif self.type(option) == Options.INTPARAM:
+                try:
+                    val = int(a)
+                except ValueError, e:
+                    raise error.OptionError("option: '%s' expects integer value, got '%s' " % (o,a))
+            else:
+                # plain flag: presence means true
+                val = 1
+            values[self.key(option)] = val
+        return values
+
+
+    class option_wrapper:
+        """Expose the values dictionary as attributes (conf.verbose)."""
+        def __init__(self, values):
+            # go through __dict__ so that __setattr__ below is bypassed
+            self.__dict__['values'] = values
+        def __getattr__(self, name):
+            if self.values.has_key(name):
+                return self.values[name]
+            else:
+                raise error.OptionError("bad option name: " + name)
+        def __setattr__(self, name, value):
+            self.values[name] = value
+
+    def parse(self, argv):
+        """Parse argv (without the program name).
+        Returns (option_wrapper, remaining_args).  Prints the usage text
+        and exits with status 0 when help was requested.  getopt errors
+        are re-raised as error.OptionError."""
+        try:
+            opts, args = getopt.getopt(argv, self.short_opts, self.long_opts)
+            values = self.handle_opts(opts)
+            if values["help"]:
+                self.usage()
+                sys.exit(0)
+            return self.option_wrapper(values), args
+        except getopt.error, e:
+            raise error.OptionError(str(e))
+
+    def usage(self):
+        """Print the usage line followed by one entry per option."""
+        ret = 'usage: %s [options] %s\n' % (self.cmd, self.remain_help)
+        for opt in self.options:
+            s = self.short(opt)
+            if s: str = "-%s|--%s" % (s,self.long(opt))
+            else: str = "--%s" % (self.long(opt),)
+            if self.type(opt) in (Options.PARAM, Options.INTPARAM):
+                str = "%s <arg>" % (str,)
+            help = self.help(opt)
+            # for multi-line help the default is inserted after line one
+            n = string.find(help, '\n')
+            if self.default(opt) != None:
+                if n < 0:
+                    str = "%-15s %s (default=%s)" %(str, help,
+                                                   self.default(opt))
+                else:
+                    str = "%-15s %s (default=%s)%s" %(str, help[0:n],
+                                                     self.default(opt),
+                                                     help[n:])
+            else:
+                str = "%-15s %s" %(str, help)
+            ret = ret + str + "\n"
+        print ret
+
+# Test driver
+if __name__ == "__main__":
+    cl = Options("test", "xml_file", [
+                  ('verbose,v', "verbose ", Options.FLAG, 0),
+                  ('cleanup,d', "shutdown"),
+                  ('gdb', "Display gdb module file ", Options.FLAG, 0),
+                  ('device', "device path ", Options.PARAM),
+                  ('ldapurl', "LDAP server URL ", Options.PARAM),
+                  ('lustre', "Lustre source dir ", Options.PARAM),
+                  ('portals', "Portals source dir ", Options.PARAM),
+                  ('maxlevel', """Specify the maximum level
+ Levels are aproximatly like:
+ 70 - mountpoint, echo_client, osc, mdc, lov""",
+                   Options.INTPARAM, 100),
+
+                 ])
+
+    conf, args = cl.parse(sys.argv[1:])
+
+    for key in conf.values.keys():
+        print "%-10s = %s" % (key, conf.values[key])
diff --git
a/lustre/utils/Lustre/error.py b/lustre/utils/Lustre/error.py
new file mode 100644
index 0000000..6c30416
--- /dev/null
+++ b/lustre/utils/Lustre/error.py
@@ -0,0 +1,10 @@
+import exceptions
+
+class LconfError (exceptions.Exception):
+    def __init__(self, args):
+        self.args = args
+
+class OptionError (exceptions.Exception):
+    def __init__(self, args):
+        self.args = args
+
diff --git a/lustre/utils/Lustre/lustredb.py b/lustre/utils/Lustre/lustredb.py
new file mode 100644
index 0000000..35bca56
--- /dev/null
+++ b/lustre/utils/Lustre/lustredb.py
@@ -0,0 +1,413 @@
+import sys, types, string, os
+import re, exceptions
+import xml.dom.minidom
+import Lustre
+
+# ============================================================
+# XML processing and query
+
+class LustreDB:
+    """Backend independent view of one node of the configuration.
+
+    Subclasses provide the primitives used here: _lookup_by_uuid,
+    _lookup_by_name, _lookup_by_class, _get_val, _get_class, _get_refs,
+    _get_all_refs, _update_active and getUUID.
+    """
+    def lookup(self, uuid):
+        """ lookup returns a new LustreDB instance"""
+        return self._lookup_by_uuid(uuid)
+
+    def lookup_name(self, name, class_name = ""):
+        """ lookup returns a new LustreDB instance"""
+        return self._lookup_by_name(name, class_name)
+
+    def lookup_class(self, class_name):
+        """ lookup returns a list of new LustreDB instances"""
+        return self._lookup_by_class(class_name)
+
+    def get_val(self, tag, default=None):
+        """Return the value for tag; when it is missing or empty return
+        default (None unless a default is given)."""
+        v = self._get_val(tag)
+        if v:
+            return v
+        if default != None:
+            return default
+        return None
+
+    def get_class(self):
+        return self._get_class()
+
+    def get_val_int(self, tag, default=0):
+        """Return the value for tag converted to int, or default when the
+        value is missing or empty.  Raises Lustre.LconfError when the
+        value is not an integer."""
+        str = self._get_val(tag)
+        try:
+            if str:
+                return int(str)
+            return default
+        except ValueError:
+            # LconfError is only reachable through the Lustre package and
+            # its constructor takes a single argument.
+            raise Lustre.LconfError("text value is not integer: %s" % (str,))
+
+    def get_first_ref(self, tag):
+        """ Get the first uuidref of the type TAG. Only
+        one is expected. Returns the uuid."""
+        uuids = self._get_refs(tag)
+        if len(uuids) > 0:
+            return uuids[0]
+        return None
+
+    def get_refs(self, tag):
+        """ Get all the refs of type TAG. Returns list of uuids. """
+        uuids = self._get_refs(tag)
+        return uuids
+
+    def get_all_refs(self):
+        """ Get all the refs. Returns list of (ref_class, ref_uuid). """
+        uuids = self._get_all_refs()
+        return uuids
+
+    def nid2server(self, nid, net_type):
+        """Return the network entry with the given nid and nettype,
+        or None when there is no such entry."""
+        netlist = self.lookup_class('network')
+        for net_db in netlist:
+            if net_db.get_val('nid') == nid and net_db.get_val('nettype') == net_type:
+                return net_db
+        return None
+
+    # Find the target_device for target on a node
+    # node->profiles->device_refs->target
+    def get_node_tgt_dev(self, node_name, target_uuid):
+        node_db = self.lookup_name(node_name)
+        if not node_db:
+            return None
+        # search the profiles of the node that was looked up, not the
+        # profiles of the object this method was called on
+        return node_db.get_tgt_dev(target_uuid)
+
+    # get all network uuids for this node
+    def get_networks(self):
+        ret = []
+        prof_list = self.get_refs('profile')
+        for prof_uuid in prof_list:
+            prof_db = self.lookup(prof_uuid)
+            net_list = prof_db.get_refs('network')
+            for net_uuid in net_list:
+                ret.append(net_uuid)
+        return ret
+
+    def get_active_dev(self, tgtuuid):
+        """Return the uuid of the first 'active' ref of the target."""
+        tgt = self.lookup(tgtuuid)
+        tgt_dev_uuid =tgt.get_first_ref('active')
+        return tgt_dev_uuid
+
+    def get_tgt_dev(self, tgtuuid):
+        """Return the uuid of the osd/mdsdev device, referenced from one
+        of this node's profiles, whose 'target' ref is tgtuuid; None when
+        no such device exists.  Raises Lustre.LconfError when a
+        referenced profile cannot be found."""
+        prof_list = self.get_refs('profile')
+        for prof_uuid in prof_list:
+            prof_db = self.lookup(prof_uuid)
+            if not prof_db:
+                # neither panic() nor 'profile' is defined in this module
+                raise Lustre.LconfError("profile: %s not found." % (prof_uuid,))
+            for ref_class, ref_uuid in prof_db.get_all_refs():
+                if ref_class in ('osd', 'mdsdev'):
+                    devdb = self.lookup(ref_uuid)
+                    uuid = devdb.get_first_ref('target')
+                    if tgtuuid == uuid:
+                        return ref_uuid
+        return None
+
+    def get_group(self, group):
+        """Return the uuids of all mds and ost entries whose 'group'
+        value equals group."""
+        ret = []
+        devs = self.lookup_class('mds')
+        for tgt in devs:
+            if tgt.get_val('group', "") == group:
+                ret.append(tgt.getUUID())
+        devs = self.lookup_class('ost')
+        for tgt in devs:
+            if tgt.get_val('group', "") == group:
+                ret.append(tgt.getUUID())
+        return ret
+
+    # Change the current active device for a target
+    def update_active(self, tgtuuid, new_uuid):
+        self._update_active(tgtuuid, new_uuid)
+
+    def get_version(self):
+        return self.get_val('version')
+
+class LustreDB_XML(LustreDB):
+    def __init__(self, dom, root_node):
+        # init xmlfile
+        self.dom_node = dom
+        self.root_node = root_node
+
+
def xmltext(self, dom_node, tag):
+        # Text of the first element named tag below dom_node, stripped of
+        # surrounding whitespace.  Falls through (returning None) when
+        # there is no such element or its text is empty.
+        list = dom_node.getElementsByTagName(tag)
+        if len(list) > 0:
+            dom_node = list[0]
+            dom_node.normalize()
+            if dom_node.firstChild:
+                txt = string.strip(dom_node.firstChild.data)
+                if txt:
+                    return txt
+
+    def xmlattr(self, dom_node, attr):
+        # Value of attribute attr on dom_node.
+        return dom_node.getAttribute(attr)
+
+    def _get_val(self, tag):
+        """a value could be an attribute of the current node
+        or the text value in a child node"""
+        # the attribute is tried first, the child element text second
+        ret = self.xmlattr(self.dom_node, tag)
+        if not ret:
+            ret = self.xmltext(self.dom_node, tag)
+        return ret
+
+    def _get_class(self):
+        # the class of an entry is the name of its XML element
+        return self.dom_node.nodeName
+
+    def get_ref_type(self, ref_tag):
+        # the part of the tag before the first '_' ('osd_ref' -> 'osd')
+        res = string.split(ref_tag, '_')
+        return res[0]
+
+    #
+    # [(ref_class, ref_uuid),]
+    def _get_all_refs(self):
+        # One (ref_class, ref_uuid) pair per direct child element,
+        # returned in sorted order.
+        list = []
+        for n in self.dom_node.childNodes:
+            if n.nodeType == n.ELEMENT_NODE:
+                ref_uuid = self.xml_get_ref(n)
+                ref_class = self.get_ref_type(n.nodeName)
+                list.append((ref_class, ref_uuid))
+
+        list.sort()
+        return list
+
+    def _get_refs(self, tag):
+        """ Get all the refs of type TAG. Returns list of uuids. """
+        uuids = []
+        # references are stored in elements named '<tag>_ref'
+        refname = '%s_ref' % tag
+        reflist = self.dom_node.getElementsByTagName(refname)
+        for r in reflist:
+            uuids.append(self.xml_get_ref(r))
+        return uuids
+
+    def xmllookup_by_uuid(self, dom_node, uuid):
+        # Depth-first search below dom_node for the element whose 'uuid'
+        # attribute equals uuid.  Returns the DOM node or None.
+        for n in dom_node.childNodes:
+            if n.nodeType == n.ELEMENT_NODE:
+                if self.xml_get_uuid(n) == uuid:
+                    return n
+                else:
+                    n = self.xmllookup_by_uuid(n, uuid)
+                    if n: return n
+        return None
+
+    def _lookup_by_uuid(self, uuid):
+        # The search always starts at the document root.  Falls through
+        # (returning None) when nothing matches.
+        dom = self.xmllookup_by_uuid(self.root_node, uuid)
+        if dom:
+            return LustreDB_XML(dom, self.root_node)
+
+    def xmllookup_by_name(self, dom_node, name):
+        # Same search as xmllookup_by_uuid, matching the 'name' attribute.
+        for n in dom_node.childNodes:
+            if n.nodeType == n.ELEMENT_NODE:
+                if self.xml_get_name(n) == name:
+                    return n
+                else:
+                    n = self.xmllookup_by_name(n, name)
+                    if n: return n
+        return None
+
+    def _lookup_by_name(self, name, class_name):
+        # NOTE: class_name is accepted but not used to narrow the search.
+        dom = self.xmllookup_by_name(self.root_node, name)
+        if dom:
+            return LustreDB_XML(dom, self.root_node)
+
+    def xmllookup_by_class(self, dom_node, class_name):
+        # all elements below dom_node whose tag name is class_name
+        return dom_node.getElementsByTagName(class_name)
+
+    def _lookup_by_class(self, class_name):
+        # Wrap every matching element in a LustreDB_XML; returns a list.
+        ret = []
+        domlist = self.xmllookup_by_class(self.root_node, class_name)
+        for node in domlist:
+            ret.append(LustreDB_XML(node, self.root_node))
+        return ret
+
+    def xml_get_name(self, n):
+        return n.getAttribute('name')
+
+    def getName(self):
+        return self.xml_get_name(self.dom_node)
+
+    def xml_get_ref(self, n):
+        # the uuid a reference element points to
+        return n.getAttribute('uuidref')
+
+    def xml_get_uuid(self, dom_node):
+        return dom_node.getAttribute('uuid')
+
+    def getUUID(self):
+        return self.xml_get_uuid(self.dom_node)
+
+    # Convert routes from the router to a route that will be used
+    # on the local system. The network type and gw are changed to the
+    # interface on the router the local system will connect to.
+    def get_local_routes(self, type, gw):
+        """ Return the routes as a list of tuples of the form:
+        [(type, gw, tgt_cluster_id, lo, hi),]"""
+        res = []
+        tbl = self.dom_node.getElementsByTagName('routetbl')
+        for t in tbl:
+            routes = t.getElementsByTagName('route')
+            for r in routes:
+                net_type = self.xmlattr(r, 'type')
+                # only routes whose type differs from the local type are
+                # returned; type and gw are the caller's values
+                if type != net_type:
+                    lo = self.xmlattr(r, 'lo')
+                    hi = self.xmlattr(r, 'hi')
+                    tgt_cluster_id = self.xmlattr(r, 'tgtclusterid')
+                    res.append((type, gw, tgt_cluster_id, lo, hi))
+        return res
+
+    def get_route_tbl(self):
+        # One tuple per 'route' element below this node:
+        # (net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
+        ret = []
+        for r in self.dom_node.getElementsByTagName('route'):
+            net_type = self.xmlattr(r, 'type')
+            gw = self.xmlattr(r, 'gw')
+            gw_cluster_id = self.xmlattr(r, 'gwclusterid')
+            tgt_cluster_id = self.xmlattr(r, 'tgtclusterid')
+            lo = self.xmlattr(r, 'lo')
+            hi = self.xmlattr(r, 'hi')
+            ret.append((net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi))
+        return ret
+
+    def _update_active(self, tgt, new):
+        # the XML backend is read-only
+        raise LconfError("updates not implemented for XML")
+
+# ================================================================
+# LDAP Support
+class LustreDB_LDAP(LustreDB):
+    def __init__(self, name, attrs,
+                 base = "fs=lustre",
+                 parent = None,
+                 url = "ldap://localhost",
+                 user = "cn=Manager, fs=lustre",
+                 pw = "secret"
+                 ):
+        self._name = name
+        self._attrs = attrs
+        self._base = base
+        self._parent = parent
+        self._url = url
+        self._user = user
+        self._pw = pw
+        # a child entry shares the connection and base of its parent;
+        # only an entry created without a parent opens a connection
+        if parent:
+            self.l = parent.l
+            self._base = parent._base
+        else:
+            self.open()
+
+    def open(self):
+        # Connect and bind to the server, then load the entry at the base
+        # dn into _name/_attrs.  Failures raise Lustre.LconfError.
+        import ldap
+        try:
+            self.l = ldap.initialize(self._url)
+            # Set LDAP protocol version used
+            self.l.protocol_version=ldap.VERSION3
+            # user and pw only needed if modifying db
+            self.l.bind_s(self._user, self._pw, ldap.AUTH_SIMPLE);
+        except ldap.LDAPError, e:
+            raise Lustre.LconfError('Unable to connection to ldap server')
+
+        try:
+            self._name, self._attrs = self.l.search_s(self._base,
+                                                      ldap.SCOPE_BASE)[0]
+        except ldap.LDAPError, e:
+            raise Lustre.LconfError("no config found in ldap: %s"
+                                    % (self._base,))
+    def close(self):
+        self.l.unbind_s()
+
+    def ldap_search(self, filter):
+        """Return a list of LustreDB_LDAP entries, one for each uuid
+        found one level below the base dn that matches the filter."""
+        import ldap
+        dn = self._base
+        ret = []
+        uuids = []
+        try:
+            for name, attrs in self.l.search_s(dn, ldap.SCOPE_ONELEVEL,
+                                               filter, ["uuid"]):
+                for v in attrs['uuid']:
+                    uuids.append(v)
+        except ldap.NO_SUCH_OBJECT, e:
+            pass
+        except ldap.LDAPError, e:
+            print e # FIXME: die here?
+        if len(uuids) > 0:
+            for uuid in uuids:
+                ret.append(self._lookup_by_uuid(uuid))
+        return ret
+
+    def _lookup_by_name(self, name, class_name):
+        # Only an unambiguous match is returned; zero or several matches
+        # give None.  class_name is not used.
+        list = self.ldap_search("lustreName=%s" %(name))
+        if len(list) == 1:
+            return list[0]
+        return None
+
+    def _lookup_by_class(self, class_name):
+        # object classes are matched by their upper-cased name
+        return self.ldap_search("objectclass=%s" %(string.upper(class_name)))
+
+    def _lookup_by_uuid(self, uuid):
+        # Returns a LustreDB_LDAP sharing this connection, or None when
+        # the entry does not exist or the search fails.
+        import ldap
+        dn = "uuid=%s,%s" % (uuid, self._base)
+        ret = None
+        try:
+            for name, attrs in self.l.search_s(dn, ldap.SCOPE_BASE,
+                                               "objectclass=*"):
+                ret = LustreDB_LDAP(name, attrs, parent = self)
+
+        except ldap.NO_SUCH_OBJECT, e:
+            pass # not found: ret stays None
+        except ldap.LDAPError, e:
+            print e # FIXME: die here?
+        return ret
+
+
+    def _get_val(self, k):
+        # Attribute k as a string; for a list-valued attribute the first
+        # element is used.  None when the attribute is absent.
+        ret = None
+        if self._attrs.has_key(k):
+            v = self._attrs[k]
+            if type(v) == types.ListType:
+                ret = str(v[0])
+            else:
+                ret = str(v)
+        return ret
+
+    def _get_class(self):
+        # lower-cased first objectClass value
+        return string.lower(self._attrs['objectClass'][0])
+
+    def get_ref_type(self, ref_tag):
+        # drop the last three characters, i.e. the 'Ref' suffix
+        return ref_tag[:-3]
+
+    #
+    # [(ref_class, ref_uuid),]
+    def _get_all_refs(self):
+        # every value of every attribute whose name contains 'Ref'
+        list = []
+        for k in self._attrs.keys():
+            if re.search('.*Ref', k):
+                for uuid in self._attrs[k]:
+                    ref_class = self.get_ref_type(k)
+                    list.append((ref_class, uuid))
+        return list
+
+    def _get_refs(self, tag):
+        """ Get all the refs of type TAG. Returns list of uuids. """
+        uuids = []
+        # references are stored in the attribute named '<tag>Ref'
+        refname = '%sRef' % tag
+        if self._attrs.has_key(refname):
+            return self._attrs[refname]
+        return []
+
+    def getName(self):
+        return self._get_val('lustreName')
+
+    def getUUID(self):
+        return self._get_val('uuid')
+
+    def get_route_tbl(self):
+        # no routes are provided by the LDAP backend
+        return []
+
+    def _update_active(self, tgtuuid, newuuid):
+        """Replace the activeRef attribute of the target tgtuuid with
+        newuuid. LDAP errors are printed, not raised. Returns None."""
+        import ldap
+        dn = "uuid=%s,%s" %(tgtuuid, self._base)
+        ret = []
+        uuids = []
+        try:
+            self.l.modify_s(dn, [(ldap.MOD_REPLACE, "activeRef", newuuid)])
+        except ldap.NO_SUCH_OBJECT, e:
+            print e
+        except ldap.LDAPError, e:
+            print e # FIXME: die here?
+        return
diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am
index d345b64..e78bb7d 100644
--- a/lustre/utils/Makefile.am
+++ b/lustre/utils/Makefile.am
@@ -1,13 +1,15 @@
 # Administration utilities Makefile
 DEFS=
+SUBDIRS = Lustre
 
-CFLAGS:=-g -O2 -I$(top_srcdir)/utils -I$(PORTALS)/include -I$(srcdir)/../include -Wall -L$(PORTALSLIB)
+CFLAGS:=-g -O2 -I$(top_srcdir)/utils -I$(top_srcdir)/portals/include -I$(srcdir)/../include -Wall -L../portals/utils
 KFLAGS:=
 CPPFLAGS = $(HAVE_LIBREADLINE)
 lctl_LDADD := $(LIBREADLINE) -lptlctl
 lload_LDADD := -lptlctl
-sbin_PROGRAMS = lctl lfind lstripe obdio obdbarrier obdstat lload
+sbin_PROGRAMS = lctl lfind lstripe obdio obdbarrier obdstat lload wirecheck
 sbin_SCRIPTS = lconf lmc llanalyze
+wirecheck_SOURCES = wirecheck.c
 lctl_SOURCES = parser.c obd.c lctl.c parser.h obdctl.h
 lload_SOURCES = lload.c
 obdio_SOURCES = obdio.c obdiolib.c obdiolib.h
diff --git a/lustre/utils/lactive b/lustre/utils/lactive
new file mode 100644
index 0000000..6d7771d5
--- /dev/null
+++ b/lustre/utils/lactive
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2002 Cluster File Systems, Inc.
+# Author: Robert Read <rread@clusterfs.com>
+# This file is part of Lustre, http://www.lustre.org.
+#
+# Lustre is free software; you can redistribute it and/or
+# modify it under the terms of version 2 of the GNU General Public
+# License as published by the Free Software Foundation.
+#
+# Lustre is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Lustre; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+
+# For all the OST/MDSs that are primary on the --primary node, set
+# them to be active on --active if that OST is available on --active.
+#
+# Make the active node the active node for all devices it shares with the
+# old. The bulk of this code is for figuring out which devices to
+# change, and what to change them to.
+
+# XXX add error checking
+# XXX make this code less ugly
+
+import sys, getopt, types
+import string, os
+import ldap
+import Lustre
+
+lactive_options = [
+    ('ldapurl',"LDAP server URL", Lustre.Options.PARAM,
+     "ldap://localhost"),
+    ('config', "Cluster config name used for LDAP query", Lustre.Options.PARAM),
+    ('group', "The group of devices to update", Lustre.Options.PARAM),
+    ('active', "The active node name", Lustre.Options.PARAM),
+    ]
+
+# Print the message and terminate with exit status 1.
+def fatal(*args):
+    msg = string.join(map(str,args))
+    print "! " + msg
+    sys.exit(1)
+
+
+cl = Lustre.Options("lactive","", lactive_options)
+config, args = cl.parse(sys.argv[1:])
+
+# both options are required: fail when either one is missing
+if not (config.group and config.active):
+    fatal("Must specify both group and active node.")
+
+if not config.config:
+    fatal("Missing config")
+
+base = "config=%s,fs=lustre" % (config.config,)
+db = Lustre.LustreDB_LDAP('', {}, base=base, url = config.ldapurl)
+
+active_node = db.lookup_name(config.active)
+if not active_node:
+    fatal(config.active, "node not found in database.")
+
+devices = db.get_group(config.group)
+# a length is never negative; an empty group is the error case
+if len(devices) == 0:
+    fatal("no devices found for group", config.group)
+
+# for all devices in group
+    # lookup device in active node
+        # update the active device
+for tgtuuid in devices:
+    active_uuid = db.get_active_dev(tgtuuid)
+    new_active_uuid = active_node.get_tgt_dev(tgtuuid)
+    if active_uuid != new_active_uuid:
+        print ("%s: changing active %s to %s:%s"
+               % (tgtuuid, active_uuid,
+                  config.active, new_active_uuid))
+        db.update_active(tgtuuid, new_active_uuid)
+
+
+
+
+
diff --git a/lustre/utils/lconf.in b/lustre/utils/lconf
similarity index 60%
rename from lustre/utils/lconf.in
rename to lustre/utils/lconf
index cbe05dd..7b31fef 100755
--- a/lustre/utils/lconf.in
+++ b/lustre/utils/lconf
@@ -26,7 +26,7 @@
 
 import sys, getopt, types
 import string, os, stat, popen2, socket, time, random, fcntl, select
-import re, exceptions
+import re, exceptions, signal
 import xml.dom.minidom
 
 if sys.version[0] == '1':
@@ -34,6 +34,19 @@ if sys.version[0] == '1':
 else:
     from fcntl import F_GETFL, F_SETFL
 
+PYMOD_DIR = "/usr/lib/lustre/python"
+
+def development_mode():
+    base = os.path.dirname(sys.argv[0])
+    if os.access(base+"/Makefile.am", os.R_OK):
+        return 1
+    return 0
+
+if not development_mode():
+    sys.path.append(PYMOD_DIR)
+
+import Lustre
+
 # Global parameters
 MAXTCPBUF = 1048576
 DEFAULT_TCPBUF = 1048576
@@ -41,7 +54,61 @@ DEFAULT_TCPBUF = 1048576
 # Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand) MAX_LOOP_DEVICES = 256 -PORTALS_DIR = '@PORTALSLOC@' +PORTALS_DIR = 'portals' + + +# Please keep these uptodate with the values in portals/kp30.h +ptldebug_names = { + "trace" : (1 << 0), + "inode" : (1 << 1), + "super" : (1 << 2), + "ext2" : (1 << 3), + "malloc" : (1 << 4), + "cache" : (1 << 5), + "info" : (1 << 6), + "ioctl" : (1 << 7), + "blocks" : (1 << 8), + "net" : (1 << 9), + "warning" : (1 << 10), + "buffs" : (1 << 11), + "other" : (1 << 12), + "dentry" : (1 << 13), + "portals" : (1 << 14), + "page" : (1 << 15), + "dlmtrace" : (1 << 16), + "error" : (1 << 17), + "emerg" : (1 << 18), + "ha" : (1 << 19), + "rpctrace" : (1 << 20), + "vfstrace" : (1 << 21), + } + +subsystem_names = { + "undefined" : (0 << 24), + "mdc" : (1 << 24), + "mds" : (2 << 24), + "osc" : (3 << 24), + "ost" : (4 << 24), + "class" : (5 << 24), + "obdfs" : (6 << 24), + "llite" : (7 << 24), + "rpc" : (8 << 24), + "ext2obd" : (9 << 24), + "portals" : (10 << 24), + "socknal" : (11 << 24), + "qswnal" : (12 << 24), + "pinger" : (13 << 24), + "filter" : (14 << 24), + "trace" : (15 << 24), + "echo" : (16 << 24), + "ldlm" : (17 << 24), + "lov" : (18 << 24), + "gmnal" : (19 << 24), + "ptlrouter" : (20 << 24), + "cobd" : (21 << 24), + "ptlbd" : (22 << 24), + } + first_cleanup_error = 0 def cleanup_error(rc): @@ -49,194 +116,16 @@ def cleanup_error(rc): if not first_cleanup_error: first_cleanup_error = rc - -def usage(): - print """usage: lconf config.xml - -config.xml Lustre configuration in xml format. ---ldapurl LDAP server URL, eg. ldap://localhost ---config Cluster config name used for LDAP query ---node <nodename> Load config for <nodename> ---select service=nodeA,service2=nodeB U --d | --cleanup Cleans up config. 
(Shutdown) --f | --force Forced unmounting and/or obd detach during cleanup --v | --verbose Print system commands as they are run --h | --help Print this help ---gdb Prints message after creating gdb module script - and sleeps for 5 seconds. --n | --noexec Prints the commands and steps that will be run for a - config without executing them. This can used to check if a - config file is doing what it should be doing. (Implies -v) ---nomod Skip load/unload module step. ---nosetup Skip device setup/cleanup step. ---reformat Reformat all devices (without question) ---dump <file> Dump the kernel debug log before portals is unloaded ---minlevel <num> Specify the minimum level of services to configure/cleanup (default 0) ---maxlevel <num> Specify the maximum level of services to configure/cleanup (default 100) - Levels are aproximatly like: - 10 - network - 20 - device, ldlm - 30 - osd, mdd - 40 - mds, ost - 50 - mdc, osc - 60 - lov - 70 - mountpoint, echo_client ---lustre=src_dir Base directory of lustre sources. This parameter will cause lconf - to load modules from a source tree. ---portals=src_dir Portals source directory. If this is a relative path, then it is - assumed to be relative to lustre. 
- -""" - TODO = """ ---ldap server LDAP server with lustre config database ---makeldiff Translate xml source to LDIFF -This are perhaps not needed: -""" - sys.exit() - -# ============================================================ -# Config parameters, encapsulated in a class -class Config: - def __init__(self): - # flags - self._noexec = 0 - self._verbose = 0 - self._reformat = 0 - self._cleanup = 0 - self._gdb = 0 - self._nomod = 0 - self._nosetup = 0 - self._force = 0 - # parameters - self._modules = None - self._node = None - self._url = None - self._gdb_script = '/tmp/ogdb' - self._debug_path = '/tmp/lustre-log' - self._dump_file = None - self._lustre_dir = '' - self._portals_dir = '' - self._minlevel = 0 - self._maxlevel = 100 - self._timeout = 0 - self._recovery_upcall = '' - self._ldapurl = '' - self._config_name = '' - self._select = {} - self._lctl_dump = '' - - def verbose(self, flag = None): - if flag: self._verbose = flag - return self._verbose - - def noexec(self, flag = None): - if flag: self._noexec = flag - return self._noexec - - def reformat(self, flag = None): - if flag: self._reformat = flag - return self._reformat - - def cleanup(self, flag = None): - if flag: self._cleanup = flag - return self._cleanup - - def gdb(self, flag = None): - if flag: self._gdb = flag - return self._gdb - - def nomod(self, flag = None): - if flag: self._nomod = flag - return self._nomod - - def nosetup(self, flag = None): - if flag: self._nosetup = flag - return self._nosetup - - def force(self, flag = None): - if flag: self._force = flag - return self._force - - def node(self, val = None): - if val: self._node = val - return self._node - - def gdb_script(self): - if os.path.isdir('/r'): - return '/r' + self._gdb_script - else: - return self._gdb_script - - def debug_path(self): - if os.path.isdir('/r'): - return '/r' + self._debug_path - else: - return self._debug_path - - def dump_file(self, val = None): - if val: self._dump_file = val - return self._dump_file - 
def minlevel(self, val = None): - if val: self._minlevel = int(val) - return self._minlevel - - def maxlevel(self, val = None): - if val: self._maxlevel = int(val) - return self._maxlevel - - def portals_dir(self, val = None): - if val: self._portals_dir = val - return self._portals_dir - - def lustre_dir(self, val = None): - if val: self._lustre_dir = val - return self._lustre_dir - - def timeout(self, val = None): - if val: self._timeout = val - return self._timeout - - def recovery_upcall(self, val = None): - if val: self._recovery_upcall = val - return self._recovery_upcall - - def ldapurl(self, val = None): - if val: self._ldapurl = val - return self._ldapurl - - def config_name(self, val = None): - if val: self._config_name = val - return self._config_name - - def init_select(self, arg): - # arg = "service=nodeA,service2=nodeB" - list = string.split(arg, ',') - for entry in list: - srv, node = string.split(entry, '=') - self._select[srv] = node - - def select(self, srv): - if self._select.has_key(srv): - return self._select[srv] - return None - - def lctl_dump(self, val = None): - if val: self._lctl_dump = val - return self._lctl_dump - - -config = Config() - # ============================================================ # debugging and error funcs def fixme(msg = "this feature"): - raise LconfError, msg + ' not implmemented yet.' + raise Lustre.LconfError, msg + ' not implmemented yet.' def panic(*args): msg = string.join(map(str,args)) - if not config.noexec(): - raise LconfError(msg) + if not config.noexec: + raise Lustre.LconfError(msg) else: print "! " + msg @@ -249,10 +138,24 @@ def logall(msgs): print string.strip(s) def debug(*args): - if config.verbose(): + if config.verbose: msg = string.join(map(str,args)) print msg + +# ack, python's builtin int() does not support '0x123' syntax. +# eval can do it, although what a hack! 
+def my_int(s): + try: + if s[0:2] == '0x': + return eval(s, {}, {}) + else: + return int(s) + except SyntaxError, e: + raise ValueError("not a number") + except NameError, e: + raise ValueError("not a number") + # ============================================================ # locally defined exceptions class CommandError (exceptions.Exception): @@ -278,10 +181,6 @@ class CommandError (exceptions.Exception): else: print self.cmd_err -class LconfError (exceptions.Exception): - def __init__(self, args): - self.args = args - # ============================================================ # handle daemons, like the acceptor @@ -374,6 +273,14 @@ def run_acceptors(): if not daemon.running(): daemon.start() +def run_one_acceptor(port): + if acceptors.has_key(port): + daemon = acceptors[port] + if not daemon.running(): + daemon.start() + else: + panic("run_one_acceptor: No acceptor defined for port:", port) + def stop_acceptor(port): if acceptors.has_key(port): daemon = acceptors[port] @@ -395,7 +302,7 @@ class LCTLInterface: self.lctl = find_prog(cmd) self.save_file = '' if not self.lctl: - if config.noexec(): + if config.noexec: debug('! lctl not found') self.lctl = 'lctl' else: @@ -422,7 +329,7 @@ class LCTLInterface: cmds = '\n dump ' + self.save_file + cmds debug("+", cmd_line, cmds) - if config.noexec(): return (0, []) + if config.noexec: return (0, []) child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command child.tochild.write(cmds + "\n") @@ -474,19 +381,16 @@ class LCTLInterface: def network(self, net, nid): """ initialized network and add "self" """ - # Idea: "mynid" could be used for all network types to add "self," and then - # this special case would be gone and the "self" hack would be hidden. 
- if net in ('tcp', 'toe'): - cmds = """ + cmds = """ network %s mynid %s quit """ % (net, nid) - self.run(cmds) + self.run(cmds) # create a new connection def connect(self, srv): cmds = "\n add_uuid %s %s %s" % (srv.uuid, srv.nid, srv.net_type) - if srv.net_type in ('tcp', 'toe') and not config.lctl_dump(): + if srv.net_type in ('tcp', 'toe') and not config.lctl_dump: flags = '' if srv.irq_affinity: flags = flags + 'i' @@ -503,6 +407,14 @@ class LCTLInterface: cmds = cmds + "\n quit" self.run(cmds) + + # Recover a device + def recover(self, dev_uuid, new_conn): + cmds = """ + device %%%s + probe + recover %s""" %(dev_uuid, new_conn) + self.run(cmds) # add a route to a range def add_route(self, net, gw, lo, hi): @@ -553,6 +465,13 @@ class LCTLInterface: quit""" % (net, nid, servuuid) self.run(cmds) + def del_uuid(self, servuuid): + cmds = """ + ignore_errors + del_uuid %s + quit""" % (servuuid,) + self.run(cmds) + # disconnect all def disconnectAll(self, net): cmds = """ @@ -572,17 +491,20 @@ class LCTLInterface: self.run(cmds) # cleanup a device - def cleanup(self, name, uuid): + def cleanup(self, name, uuid, force, failover = 0): + if failover: force = 1 cmds = """ ignore_errors device $%s - cleanup %s + cleanup %s %s detach - quit""" % (name, ('', 'force')[config.force()]) + quit""" % (name, ('', 'force')[force], + ('', 'failover')[failover]) self.run(cmds) # create an lov - def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist): + def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, + pattern, devlist): cmds = """ device $%s probe @@ -599,7 +521,13 @@ class LCTLInterface: # get list of devices def device_list(self): - rc, out = self.runcmd('device_list') + try: + rc, out = self.runcmd('device_list') + except CommandError, e: + if config.cleanup: + out = [] + else: + raise e return out # get lustre version @@ -607,6 +535,12 @@ class LCTLInterface: rc, out = self.runcmd('version') return out + # dump mount 
options + def mount_option(self, option): + cmds = """ + mount_option %s + quit""" % (option) + self.run(cmds) # ============================================================ # Various system-level functions # (ideally moved to their own module) @@ -616,7 +550,7 @@ class LCTLInterface: # save it if necessary def runcmd(cmd): debug ("+", cmd) - if config.noexec(): return (0, []) + if config.noexec: return (0, []) f = os.popen(cmd + ' 2>&1') out = f.readlines() ret = f.close() @@ -634,7 +568,7 @@ def run(*args): def run_daemon(*args): cmd = string.join(map(str,args)) debug ("+", cmd) - if config.noexec(): return 0 + if config.noexec: return 0 f = os.popen(cmd + ' 2>&1') ret = f.close() if ret: @@ -649,8 +583,8 @@ def find_prog(cmd): syspath = string.split(os.environ['PATH'], ':') cmdpath = os.path.dirname(sys.argv[0]) syspath.insert(0, cmdpath); - if config.portals_dir(): - syspath.insert(0, os.path.join(config.portals_dir()+'/linux/utils/')) + if config.portals: + syspath.insert(0, os.path.join(config.portals, 'utils/')) for d in syspath: prog = os.path.join(d,cmd) if os.access(prog, os.X_OK): @@ -690,25 +624,32 @@ def is_block(path): # build fs according to type # fixme: dangerous -def mkfs(dev, devsize, fstype): +def mkfs(dev, devsize, fstype,jsize): block_cnt = '' + jopt = '' if devsize: + if devsize < 8000: + panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"% + (dev, devsize)) # devsize is in 1k, and fs block count is in 4k block_cnt = devsize/4 - if(fstype in ('ext3', 'extN')): + if fstype in ('ext3', 'extN'): + # ext3 journal size is in megabytes + if jsize: jopt = "-J size=%d" %(jsize,) mkfs = 'mkfs.ext2 -j -b 4096 -F ' - elif (fstype == 'reiserfs'): + elif fstype == 'reiserfs': + # reiserfs journal size is in blocks + if jsize: jopt = "--journal_size %d" %(jsize,) mkfs = 'mkreiserfs -ff' else: print 'unsupported fs type: ', fstype - (ret, out) = run (mkfs, dev, block_cnt) + (ret, out) = run (mkfs, jopt, dev, block_cnt) if ret: - 
panic("Unable to build fs:", dev) + panic("Unable to build fs:", dev, string.join(out)) # enable hash tree indexing on fsswe - # FIXME: this check can probably go away on 2.5 - if fstype == 'extN': + if fstype in ('ext3', 'extN'): htree = 'echo "feature FEATURE_C5" | debugfs -w' (ret, out) = run (htree, dev) if ret: @@ -731,7 +672,7 @@ def find_loop(file): dev = loop + str(n) if os.access(dev, os.R_OK): (stat, out) = run('losetup', dev) - if (out and stat == 0): + if out and stat == 0: m = re.search(r'\((.*)\)', out[0]) if m and file == m.group(1): return dev @@ -740,18 +681,19 @@ def find_loop(file): return '' # create file if necessary and assign the first free loop device -def init_loop(file, size, fstype): +def init_loop(file, size, fstype, journal_size): dev = find_loop(file) if dev: print 'WARNING file:', file, 'already mapped to', dev return dev - if config.reformat() or not os.access(file, os.R_OK | os.W_OK): + if config.reformat or not os.access(file, os.R_OK | os.W_OK): if size < 8000: - panic(file, "size must be larger than 8MB, currently set to:", size) + panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size)) (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file)) if ret: panic("Unable to create backing store:", file) + mkfs(file, size, fstype, journal_size) loop = loop_base() # find next free loop @@ -759,7 +701,7 @@ def init_loop(file, size, fstype): dev = loop + str(n) if os.access(dev, os.R_OK): (stat, out) = run('losetup', dev) - if (stat): + if stat: run('losetup', dev, file) return dev else: @@ -783,12 +725,12 @@ def need_format(fstype, dev): return 0 # initialize a block device if needed -def block_dev(dev, size, fstype, format): - if config.noexec(): return dev +def block_dev(dev, size, fstype, format, journal_size): + if config.noexec: return dev if not is_block(dev): - dev = init_loop(dev, size, fstype) - if config.reformat() or (need_format(fstype, dev) and format == 'yes'): - 
mkfs(dev, size, fstype) + dev = init_loop(dev, size, fstype, journal_size) + elif config.reformat or (need_format(fstype, dev) and format == 'yes'): + mkfs(dev, size, fstype, journal_size) # else: # panic("device:", dev, @@ -807,8 +749,7 @@ def if2addr(iface): return ip def get_local_nid(net_type, wildcard): - """Return the local nid. First look for an elan interface, - then use the local address. """ + """Return the local nid.""" local = "" if os.access('/proc/elan/device0/position', os.R_OK): local = get_local_address('elan', '*') @@ -843,16 +784,28 @@ def get_local_address(net_type, wildcard): log(e) elif net_type == 'gm': fixme("automatic local address for GM") + elif net_type == 'scimac': + scinode="/opt/scali/sbin/scinode" + if os.path.exists(scinode): + (rc,local) = run(scinode) + else: + panic (scinode, " not found on node with scimac networking") + if rc: + panic (scinode, " failed") + local=string.rstrip(local[0]) + return local def is_prepared(uuid): """Return true if a device exists for the uuid""" - # expect this format: - # 1 UP ldlm ldlm ldlm_UUID 2 - if config.lctl_dump(): + if config.lctl_dump: return 0 + if config.noexec and config.cleanup: + return 1 try: + # expect this format: + # 1 UP ldlm ldlm ldlm_UUID 2 out = lctl.device_list() for s in out: if uuid == string.split(s)[4]: @@ -861,20 +814,27 @@ def is_prepared(uuid): e.dump() return 0 -def is_network_prepared(): - """If the PTLRPC device exists, then assumet that all networking - has been configured""" - if config.lctl_dump(): +def is_prepared_name(name): + """Return true if a device exists for the name""" + if config.lctl_dump: return 0 + if config.noexec and config.cleanup: + return 1 try: + # expect this format: + # 1 UP ldlm ldlm ldlm_UUID 2 out = lctl.device_list() for s in out: - if 'RPCDEV_UUID' == string.split(s)[4]: + if name == string.split(s)[3]: return 1 except CommandError, e: e.dump() return 0 - + +def is_network_prepared(): + """If the LDLM device exists, then assume that all 
networking + has been configured""" + return is_prepared('ldlm_UUID') def fs_is_mounted(path): """Return true if path is a mounted lustre filesystem""" @@ -915,7 +875,7 @@ class Module: """ default cleanup, used for most modules """ self.info() try: - lctl.cleanup(self.name, self.uuid) + lctl.cleanup(self.name, self.uuid, config.force) except CommandError, e: log(self.module_name, "cleanup failed: ", self.name) e.dump() @@ -923,11 +883,11 @@ class Module: def add_portals_module(self, dev_dir, modname): """Append a module to list of modules to load.""" - self.kmodule_list.append((config.portals_dir(), dev_dir, modname)) + self.kmodule_list.append((config.portals, dev_dir, modname)) def add_lustre_module(self, dev_dir, modname): """Append a module to list of modules to load.""" - self.kmodule_list.append((config.lustre_dir(), dev_dir, modname)) + self.kmodule_list.append((config.lustre, dev_dir, modname)) def mod_loaded(self, modname): """Check if a module is already loaded. Look in /proc/modules for it.""" @@ -943,9 +903,9 @@ class Module: """Load all the modules in the list in the order they appear.""" for src_dir, dev_dir, mod in self.kmodule_list: # (rc, out) = run ('/sbin/lsmod | grep -s', mod) - if self.mod_loaded(mod) and not config.noexec(): + if self.mod_loaded(mod) and not config.noexec: continue - log ('loading module:', mod) + log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir) if src_dir: module = find_module(src_dir, dev_dir, mod) if not module: @@ -960,27 +920,34 @@ class Module: def cleanup_module(self): """Unload the modules in the list in reverse order.""" + if not self.safe_to_clean(): + return rev = self.kmodule_list rev.reverse() for src_dir, dev_dir, mod in rev: - if not self.mod_loaded(mod): + if not self.mod_loaded(mod) and not config.noexec: continue # debug hack - if mod == 'portals' and config.dump_file(): - lctl.dump(config.dump_file()) + if mod == 'portals' and config.dump: + lctl.dump(config.dump) log('unloading module:', 
mod) - if config.noexec(): - continue (rc, out) = run('/sbin/rmmod', mod) if rc: log('! unable to unload module:', mod) logall(out) + + def safe_to_clean(self): + return 1 + + def safe_to_clean_modules(self): + return self.safe_to_clean() class Network(Module): def __init__(self,db): Module.__init__(self, 'NETWORK', db) self.net_type = self.db.get_val('nettype') self.nid = self.db.get_val('nid', '*') + self.cluster_id = self.db.get_val('clusterid', "0") self.port = self.db.get_val_int('port', 0) self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF) self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF) @@ -988,7 +955,10 @@ class Network(Module): self.nid_exchange = self.db.get_val_int('nidexchange', 0) if '*' in self.nid: - self.nid = get_local_nid(self.net_type, self.nid) + if self.nid_exchange: + self.nid = get_local_nid(self.net_type, self.nid) + else: + self.nid = get_local_address(self.net_type, self.nid) if not self.nid: panic("unable to set nid for", self.net_type, self.nid) debug("nid:", self.nid) @@ -999,31 +969,88 @@ class Network(Module): if not self.nid: panic("unable to set nid for", self.net_type, self.hostaddr) debug("hostaddr:", self.hostaddr) - # debug ( "hostaddr ", self.hostaddr, "net_type", self.net_type) - self.add_portals_module("linux/oslib", 'portals') + self.add_portals_module("libcfs", 'portals') if node_needs_router(): - self.add_portals_module("linux/router", 'kptlrouter') + self.add_portals_module("router", 'kptlrouter') if self.net_type == 'tcp': - self.add_portals_module("linux/socknal", 'ksocknal') + self.add_portals_module("knals/socknal", 'ksocknal') if self.net_type == 'toe': - self.add_portals_module("/linux/toenal", 'ktoenal') + self.add_portals_module("knals/toenal", 'ktoenal') if self.net_type == 'elan': - self.add_portals_module("/linux/rqswnal", 'kqswnal') + self.add_portals_module("knals/qswnal", 'kqswnal') if self.net_type == 'gm': - self.add_portals_module("/linux/gmnal", 'kgmnal') - 
self.add_lustre_module('obdclass', 'obdclass') + self.add_portals_module("knals/gmnal", 'kgmnal') + if self.net_type == 'scimac': + self.add_portals_module("knals/scimacnal", 'kscimacnal') def prepare(self): if is_network_prepared(): return self.info(self.net_type, self.nid, self.port) lctl.network(self.net_type, self.nid) + if self.port and node_is_router(): + run_one_acceptor(self.port) + self.connect_peer_gateways() + + def connect_peer_gateways(self): + for router in self.db.lookup_class('node'): + if router.get_val_int('router', 0): + # if this is a peer with a nid less than mine, + # then connect. + for netuuid in router.get_networks(): + net = self.db.lookup(netuuid) + gw = Network(net) + if (gw.cluster_id == self.cluster_id and + gw.net_type == self.net_type): + # hack: compare as numbers if possible, this should all + # go away once autoconnect is done. + # This also conveniently prevents us from connecting to ourself. + try: + gw_nid = my_int(gw.nid) + self_nid = my_int(self.nid) + except ValueError, e: + print "Error!", str(e) + gw_nid = gw.nid + self_nid = self.nid + if gw_nid < self_nid: + lctl.connect(gw) + + def disconnect_peer_gateways(self): + for router in self.db.lookup_class('node'): + if router.get_val_int('router', 0): + # if this is a peer with a nid less than mine, + # then connect. + if (gw.cluster_id == self.cluster_id and + gw.net_type == self.net_type): + # hack: compare as numbers if possible, this should all + # go away once autoconnect is done. + # This also conveniently prevents us from connecting to ourself. 
+ try: + gw_nid = my_int(gw.nid) + self_nid = my_int(self.nid) + except ValueError, e: + print "Error!", str(e) + gw_nid = gw.nid + self_nid = self.nid + if gw_nid < self_nid: + try: + lctl.disconnect(router.net_type, router.nid, router.port, + router.uuid) + except CommandError, e: + print "disconnectAll failed: ", self.name + e.dump() + cleanup_error(e.rc) + + def safe_to_clean(self): + return not is_network_prepared() def cleanup(self): self.info(self.net_type, self.nid, self.port) - if self.net_type in ('tcp', 'toe'): + if self.port: stop_acceptor(self.port) + if node_is_router(): + self.disconnect_peer_gateways() try: lctl.disconnectAll(self.net_type) except CommandError, e: @@ -1031,25 +1058,31 @@ class Network(Module): e.dump() cleanup_error(e.rc) -class Router(Module): +class RouteTable(Module): def __init__(self,db): - Module.__init__(self, 'ROUTER', db) + Module.__init__(self, 'ROUTES', db) def prepare(self): if is_network_prepared(): return self.info() - for net_type, gw, lo, hi in self.db.get_route_tbl(): + for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl(): lctl.add_route(net_type, gw, lo, hi) - if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '': + if net_type in ('tcp', 'toe') and local_net_type(net_type) and lo == hi: srvdb = self.db.nid2server(lo, net_type) - if not srvdb: panic("no server for nid", lo) else: srv = Network(srvdb) lctl.connect(srv) + + def safe_to_clean(self): + return not is_network_prepared() + def cleanup(self): - for net_type, gw, lo, hi in self.db.get_route_tbl(): + if is_network_prepared(): + # the network is still being used, don't clean it up + return + for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl(): if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '': srvdb = self.db.nid2server(lo, net_type) if not srvdb: @@ -1072,25 +1105,20 @@ class Router(Module): class LDLM(Module): def __init__(self,db): Module.__init__(self, 
'LDLM', db) + self.add_lustre_module('obdclass', 'obdclass') + self.add_lustre_module('ptlrpc', 'ptlrpc') self.add_lustre_module('ldlm', 'ldlm') + def prepare(self): if is_prepared(self.uuid): return self.info() lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid)) - def cleanup(self): - if is_prepared(self.uuid): - Module.cleanup(self) -class PTLRPC(Module): - def __init__(self,db): - Module.__init__(self, 'PTLRPC', db) - self.add_lustre_module('ptlrpc', 'ptlrpc') - def prepare(self): - if is_prepared(self.uuid): - return - self.info() - lctl.newdev(attach="ptlrpc %s %s" % (self.name, self.uuid)) + def safe_to_clean(self): + out = lctl.device_list() + return len(out) <= 1 + def cleanup(self): if is_prepared(self.uuid): Module.cleanup(self) @@ -1109,7 +1137,7 @@ class LOV(Module): self.devlist = self.db.get_refs('obd') self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist)) self.osclist = [] - self.mdc_uudi = '' + self.mdc_uuid = '' for obd_uuid in self.devlist: obd = self.db.lookup(obd_uuid) osc = get_osc(obd, self.name) @@ -1123,11 +1151,12 @@ class LOV(Module): return for osc in self.osclist: try: - # Ignore connection failures, because the LOV will DTRT with - # an unconnected OSC. - osc.prepare(ignore_connect_failure=1) - except CommandError: + # Only ignore connect failures with --force, which + # isn't implemented here yet. 
+ osc.prepare(ignore_connect_failure=0) + except CommandError, e: print "Error preparing OSC %s (inactive)\n" % osc.uuid + raise e self.mdc_uuid = prepare_mdc(self.db, self.name, self.mds_uuid) self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz, self.stripe_off, self.pattern, self.devlist, self.mds_name) @@ -1178,27 +1207,33 @@ class MDSDEV(Module): Module.__init__(self, 'MDSDEV', db) self.devpath = self.db.get_val('devpath','') self.size = self.db.get_val_int('devsize', 0) + self.journal_size = self.db.get_val_int('journalsize', 0) self.fstype = self.db.get_val('fstype', '') # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid target_uuid = self.db.get_first_ref('target') mds = self.db.lookup(target_uuid) self.name = mds.getName() self.lovconfig_uuids = mds.get_refs('lovconfig') + self.filesystem_uuids = mds.get_refs('filesystem') # FIXME: if fstype not set, then determine based on kernel version self.format = self.db.get_val('autoformat', "no") - - active_uuid = mds.get_active_target() + if mds.get_val('failover', 0): + self.failover_mds = 'f' + else: + self.failover_mds = '' + active_uuid = get_active_target(mds) if not active_uuid: panic("No target device found:", target_uuid) if active_uuid == self.uuid: self.active = 1 else: self.active = 0 + if self.active and config.group and config.group != ost.get_val('group'): + self.active = 0 + self.target_dev_uuid = self.uuid self.uuid = target_uuid # modules - if self.fstype == 'extN': - self.add_lustre_module('extN', 'extN') self.add_lustre_module('mds', 'mds') if self.fstype: self.add_lustre_module('obdclass', 'fsfilt_%s' % (self.fstype)) @@ -1215,7 +1250,8 @@ class MDSDEV(Module): return self.info(self.devpath, self.fstype, self.format) run_acceptors() - blkdev = block_dev(self.devpath, self.size, self.fstype, self.format) + blkdev = block_dev(self.devpath, self.size, self.fstype, self.format, + self.journal_size) if not is_prepared('MDT_UUID'): lctl.newdev(attach="mdt %s %s" % ('MDT', 
'MDT_UUID'), setup ="") @@ -1225,17 +1261,57 @@ class MDSDEV(Module): db = self.db.lookup(uuid) lovconfig = LOVConfig(db) lovconfig.prepare() + if config.mds_ost_conn: + for uuid in self.filesystem_uuids: + log("open clients for filesystem:", uuid) + fs = self.db.lookup(uuid) + obd_uuid = fs.get_first_ref('obd') + client = VOSC(self.db.lookup(obd_uuid), self.name) + client.prepare() + + def msd_remaining(self): + out = lctl.device_list() + for s in out: + if string.split(s)[2] in ('mds',): + return 1 + + def safe_to_clean(self): + return self.active + + def safe_to_clean_modules(self): + return not self.msd_remaining() + def cleanup(self): - if is_prepared('MDT_UUID'): + if not self.active: + debug(self.uuid, "not active") + return + if is_prepared(self.uuid): + self.info() try: - lctl.cleanup("MDT", "MDT_UUID") + lctl.cleanup(self.name, self.uuid, config.force, + config.failover) + except CommandError, e: + log(self.module_name, "cleanup failed: ", self.name) + e.dump() + cleanup_error(e.rc) + Module.cleanup(self) + if config.mds_ost_conn: + for uuid in self.filesystem_uuids: + log("clean clients for filesystem:", uuid) + log("open clients for filesystem:", uuid) + fs = self.db.lookup(uuid) + obd_uuid = fs.get_first_ref('obd') + client = VOSC(self.db.lookup(obd_uuid), self.name) + client.cleanup() + if not self.msd_remaining() and is_prepared('MDT_UUID'): + try: + lctl.cleanup("MDT", "MDT_UUID", config.force, + config.failover) except CommandError, e: print "cleanup failed: ", self.name e.dump() cleanup_error(e.rc) - if is_prepared(self.uuid): - Module.cleanup(self) clean_loop(self.devpath) class OSD(Module): @@ -1244,29 +1320,35 @@ class OSD(Module): self.osdtype = self.db.get_val('osdtype') self.devpath = self.db.get_val('devpath', '') self.size = self.db.get_val_int('devsize', 0) + self.journal_size = self.db.get_val_int('journalsize', 0) self.fstype = self.db.get_val('fstype', '') target_uuid = self.db.get_first_ref('target') ost = self.db.lookup(target_uuid) 
self.name = ost.getName() - # FIXME: if fstype not set, then determine based on kernel version self.format = self.db.get_val('autoformat', 'yes') - if self.fstype == 'extN': - self.add_lustre_module('extN', 'extN') + if ost.get_val('failover', 0): + self.failover_ost = 'f' + else: + self.failover_ost = '' - active_uuid = ost.get_active_target() + active_uuid = get_active_target(ost) if not active_uuid: panic("No target device found:", target_uuid) if active_uuid == self.uuid: self.active = 1 else: self.active = 0 + if self.active and config.group and config.group != ost.get_val('group'): + self.active = 0 + self.target_dev_uuid = self.uuid self.uuid = target_uuid # modules self.add_lustre_module('ost', 'ost') - self.add_lustre_module(self.osdtype, self.osdtype) + # FIXME: should we default to ext3 here? if self.fstype: self.add_lustre_module('obdclass' , 'fsfilt_%s' % (self.fstype)) + self.add_lustre_module(self.osdtype, self.osdtype) def load_module(self): if self.active: @@ -1281,28 +1363,54 @@ class OSD(Module): if not self.active: debug(self.uuid, "not active") return - self.info(self.osdtype, self.devpath, self.size, self.fstype, self.format) + self.info(self.osdtype, self.devpath, self.size, self.fstype, + self.format, self.journal_size) run_acceptors() if self.osdtype == 'obdecho': blkdev = '' else: - blkdev = block_dev(self.devpath, self.size, self.fstype, self.format) + blkdev = block_dev(self.devpath, self.size, self.fstype, + self.format, self.journal_size) lctl.newdev(attach="%s %s %s" % (self.osdtype, self.name, self.uuid), - setup ="%s %s" %(blkdev, self.fstype)) + setup ="%s %s %s" %(blkdev, self.fstype, + self.failover_ost)) if not is_prepared('OSS_UUID'): lctl.newdev(attach="ost %s %s" % ('OSS', 'OSS_UUID'), setup ="") + def osd_remaining(self): + out = lctl.device_list() + for s in out: + if string.split(s)[2] in ('obdfilter', 'obdecho'): + return 1 + + def safe_to_clean(self): + return self.active + + def safe_to_clean_modules(self): + return not 
self.osd_remaining() + def cleanup(self): - if is_prepared('OSS_UUID'): + if not self.active: + debug(self.uuid, "not active") + return + if is_prepared(self.uuid): + self.info() try: - lctl.cleanup("OSS", "OSS_UUID") + lctl.cleanup(self.name, self.uuid, config.force, + config.failover) + except CommandError, e: + log(self.module_name, "cleanup failed: ", self.name) + e.dump() + cleanup_error(e.rc) + if not self.osd_remaining() and is_prepared('OSS_UUID'): + try: + lctl.cleanup("OSS", "OSS_UUID", config.force, + config.failover) except CommandError, e: print "cleanup failed: ", self.name e.dump() cleanup_error(e.rc) - if is_prepared(self.uuid): - Module.cleanup(self) if not self.osdtype == 'obdecho': clean_loop(self.devpath) @@ -1313,7 +1421,7 @@ class Client(Module): self.target_uuid = tgtdb.getUUID() self.db = tgtdb - self.tgt_dev_uuid = tgtdb.get_active_target() + self.tgt_dev_uuid = get_active_target(tgtdb) if not self.tgt_dev_uuid: panic("No target device found for target:", self.target_name) @@ -1323,9 +1431,10 @@ class Client(Module): self.module = module self.module_name = string.upper(module) - self.name = '%s_%s_%s' % (self.module_name, owner, self.target_name) - self.uuid = '%05x%05x_%.14s_%05x%05x' % (int(random.random() * 1048576), - int(random.random() * 1048576),self.name, + self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(), + self.target_name, owner) + self.uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576), + self.name, int(random.random() * 1048576), int(random.random() * 1048576)) self.uuid = self.uuid[0:36] @@ -1334,7 +1443,7 @@ class Client(Module): def lookup_server(self, srv_uuid): """ Lookup a server's network information """ - self._server_nets = self.db.get_ost_net(srv_uuid) + self._server_nets = get_ost_net(self.db, srv_uuid) if len(self._server_nets) == 0: panic ("Unable to find a server for:", srv_uuid) @@ -1342,11 +1451,11 @@ class Client(Module): return self._server_nets def prepare(self, 
ignore_connect_failure = 0): - if is_prepared(self.uuid): - return self.info(self.target_uuid) + if is_prepared_name(self.name): + self.cleanup() try: - srv = local_net(self.get_servers()) + srv = choose_local_server(self.get_servers()) if srv: lctl.connect(srv) else: @@ -1355,34 +1464,28 @@ class Client(Module): lctl.add_route_host(r[0], srv.uuid, r[1], r[2]) else: panic ("no route to", self.target_uuid) - except CommandError: - if (ignore_connect_failure == 0): - pass + except CommandError, e: + if not ignore_connect_failure: + raise e if srv: lctl.newdev(attach="%s %s %s" % (self.module, self.name, self.uuid), setup ="%s %s" %(self.target_uuid, srv.uuid)) def cleanup(self): - Module.cleanup(self) - srv = local_net(self.get_servers()) - if srv: + if is_prepared_name(self.name): + Module.cleanup(self) try: - lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid) + srv = choose_local_server(self.get_servers()) + if srv: + lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid) + else: + srv, r = find_route(self.get_servers()) + if srv: + lctl.del_route_host(r[0], srv.uuid, r[1], r[2]) except CommandError, e: - log(self.module_name, "disconnect failed: ", self.name) + log(self.module_name, "cleanup failed: ", self.name) e.dump() cleanup_error(e.rc) - else: - self.info(self.target_uuid) - srv, r = find_route(self.get_servers()) - if srv: - try: - lctl.del_route_host(r[0], srv.uuid, r[1], r[2]) - except CommandError, e: - print "del_route failed: ", self.name - e.dump() - cleanup_error(e.rc) - class MDC(Client): @@ -1472,8 +1575,10 @@ class Mountpoint(Module): def __init__(self,db): Module.__init__(self, 'MTPT', db) self.path = self.db.get_val('path') - self.mds_uuid = self.db.get_first_ref('mds') - self.obd_uuid = self.db.get_first_ref('obd') + self.fs_uuid = self.db.get_first_ref('filesystem') + fs = self.db.lookup(self.fs_uuid) + self.mds_uuid = fs.get_first_ref('mds') + self.obd_uuid = fs.get_first_ref('obd') obd = self.db.lookup(self.obd_uuid) self.vosc = 
VOSC(obd, self.name) if self.vosc.need_mdc(): @@ -1482,25 +1587,36 @@ class Mountpoint(Module): def prepare(self): + if fs_is_mounted(self.path): + log(self.path, "already mounted.") + return self.vosc.prepare() if self.vosc.need_mdc(): mdc_uuid = prepare_mdc(self.db, self.name, self.mds_uuid) else: mdc_uuid = self.vosc.get_mdc_uuid() if not mdc_uuid: + self.vosc.cleanup() panic("Unable to determine MDC UUID. Probably need to cleanup before re-mounting.") self.info(self.path, self.mds_uuid, self.obd_uuid) + if config.lctl_dump: + cmd = "osc=%s,mdc=%s" % (self.vosc.get_uuid(), mdc_uuid) + lctl.mount_option(cmd) + return cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \ (self.vosc.get_uuid(), mdc_uuid, self.path) run("mkdir", self.path) ret, val = run(cmd) if ret: - panic("mount failed:", self.path) + self.vosc.cleanup() + if self.vosc.need_mdc(): + cleanup_mdc(self.db, self.name, self.mds_uuid) + panic("mount failed:", self.path, ":", string.join(val)) def cleanup(self): self.info(self.path, self.mds_uuid,self.obd_uuid) - if fs_is_mounted(self.path): - if config.force(): + if fs_is_mounted(self.path): + if config.force: (rc, out) = run("umount", "-f", self.path) else: (rc, out) = run("umount", self.path) @@ -1523,426 +1639,64 @@ class Mountpoint(Module): # ============================================================ -# XML processing and query - -class LustreDB: - def lookup(self, uuid): - """ lookup returns a new LustreDB instance""" - return self._lookup_by_uuid(uuid) - - def lookup_name(self, name, class_name = ""): - """ lookup returns a new LustreDB instance""" - return self._lookup_by_name(name, class_name) - - def lookup_class(self, class_name): - """ lookup returns a new LustreDB instance""" - return self._lookup_by_class(class_name) - - def get_val(self, tag, default=None): - v = self._get_val(tag) - if v: - return v - if default != None: - return default - debug("LustreDB", self.getName(), " no value for:", tag) - return None - - def 
get_class(self): - return self._get_class() - - def get_val_int(self, tag, default=0): - str = self._get_val(tag) - try: - if str: - return int(str) - return default - except ValueError: - panic("text value is not integer:", str) - - def get_first_ref(self, tag): - """ Get the first uuidref of the type TAG. Only - one is expected. Returns the uuid.""" - uuids = self._get_refs(tag) - if len(uuids) > 0: - return uuids[0] - return None - - def get_refs(self, tag): - """ Get all the refs of type TAG. Returns list of uuids. """ - uuids = self._get_refs(tag) - return uuids - - def get_all_refs(self): - """ Get all the refs. Returns list of uuids. """ - uuids = self._get_all_refs() - return uuids - - def get_ost_net(self, osd_uuid): - srv_list = [] - if not osd_uuid: - return srv_list - osd = self.lookup(osd_uuid) - node_uuid = osd.get_first_ref('node') - node = self.lookup(node_uuid) - if not node: - panic("unable to find node for osd_uuid:", osd_uuid, - " node_ref:", node_uuid) - for net_uuid in node.get_networks(): - db = node.lookup(net_uuid) - srv_list.append(Network(db)) - return srv_list +# misc query functions - def nid2server(self, nid, net_type): - netlist = self.lookup_class('network') - for net_db in netlist: - if net_db.get_val('nid') == nid and net_db.get_val('nettype') == net_type: - return net_db - return None - - # the tag name is the service type - # fixme: this should do some checks to make sure the dom_node is a service - # - # determine what "level" a particular node is at. - - # the order of iniitailization is based on level. 
- def getServiceLevel(self): - type = self.get_class() - ret=0; - if type in ('network',): - ret = 5 - elif type in ('routetbl',): - ret = 6 - elif type in ('ptlrpc',): - ret = 7 - elif type in ('device', 'ldlm'): - ret = 20 - elif type in ('osd', 'mdd', 'cobd'): - ret = 30 - elif type in ('mdsdev','ost'): - ret = 40 - elif type in ('mdc','osc'): - ret = 50 - elif type in ('lov',): - ret = 60 - elif type in ('mountpoint', 'echoclient'): - ret = 70 - - if ret < config.minlevel() or ret > config.maxlevel(): - ret = 0 - return ret - - # - # return list of services in a profile. list is a list of tuples - # [(level, db_object),] - def getServices(self): - list = [] - for ref_class, ref_uuid in self.get_all_refs(): - servdb = self.lookup(ref_uuid) - if servdb: - level = servdb.getServiceLevel() - if level > 0: - list.append((level, servdb)) - else: - panic('service not found: ' + ref_uuid) - - list.sort() - return list - - # Find the target_device for target on a node - # node->profiles->device_refs->target - def get_target_device(self, target_uuid, node_name): - node_db = self.lookup_name(node_name) - if not node_db: - return None - prof_list = node_db.get_refs('profile') - for prof_uuid in prof_list: - prof_db = node_db.lookup(prof_uuid) - ref_list = prof_db.get_all_refs() - for ref in ref_list: - dev = self.lookup(ref[1]) - if dev and dev.get_first_ref('target') == target_uuid: - return ref[1] - return None - - def get_active_target(self): - target_uuid = self.getUUID() - target_name = self.getName() - node_name = config.select(target_name) - if node_name: - tgt_dev_uuid = self.get_target_device(target_uuid, node_name) - else: - tgt_dev_uuid = self.get_first_ref('active') - return tgt_dev_uuid - - - # get all network uuids for this node - def get_networks(self): - ret = [] - prof_list = self.get_refs('profile') - for prof_uuid in prof_list: - prof_db = self.lookup(prof_uuid) - net_list = prof_db.get_refs('network') - #debug("get_networks():", prof_uuid, net_list) - 
for net_uuid in net_list: - ret.append(net_uuid) - return ret - -class LustreDB_XML(LustreDB): - def __init__(self, dom, root_node): - # init xmlfile - self.dom_node = dom - self.root_node = root_node - - def xmltext(self, dom_node, tag): - list = dom_node.getElementsByTagName(tag) - if len(list) > 0: - dom_node = list[0] - dom_node.normalize() - if dom_node.firstChild: - txt = string.strip(dom_node.firstChild.data) - if txt: - return txt - - def xmlattr(self, dom_node, attr): - return dom_node.getAttribute(attr) - - def _get_val(self, tag): - """a value could be an attribute of the current node - or the text value in a child node""" - ret = self.xmlattr(self.dom_node, tag) - if not ret: - ret = self.xmltext(self.dom_node, tag) - return ret - - def _get_class(self): - return self.dom_node.nodeName - - # - # [(ref_class, ref_uuid),] - def _get_all_refs(self): - list = [] - for n in self.dom_node.childNodes: - if n.nodeType == n.ELEMENT_NODE: - ref_uuid = self.xml_get_ref(n) - ref_class = n.nodeName - list.append((ref_class, ref_uuid)) - - list.sort() - return list - - def _get_refs(self, tag): - """ Get all the refs of type TAG. Returns list of uuids. """ - uuids = [] - refname = '%s_ref' % tag - reflist = self.dom_node.getElementsByTagName(refname) - for r in reflist: - uuids.append(self.xml_get_ref(r)) - return uuids - - def xmllookup_by_uuid(self, dom_node, uuid): - for n in dom_node.childNodes: - if n.nodeType == n.ELEMENT_NODE: - if self.xml_get_uuid(n) == uuid: - return n - else: - n = self.xmllookup_by_uuid(n, uuid) - if n: return n - return None - - def _lookup_by_uuid(self, uuid): - dom = self. 
xmllookup_by_uuid(self.root_node, uuid) - if dom: - return LustreDB_XML(dom, self.root_node) - - def xmllookup_by_name(self, dom_node, name): - for n in dom_node.childNodes: - if n.nodeType == n.ELEMENT_NODE: - if self.xml_get_name(n) == name: - return n - else: - n = self.xmllookup_by_name(n, name) - if n: return n - return None - - def _lookup_by_name(self, name, class_name): - dom = self.xmllookup_by_name(self.root_node, name) - if dom: - return LustreDB_XML(dom, self.root_node) - - def xmllookup_by_class(self, dom_node, class_name): - return dom_node.getElementsByTagName(class_name) - - def _lookup_by_class(self, class_name): - ret = [] - domlist = self.xmllookup_by_class(self.root_node, class_name) - for node in domlist: - ret.append(LustreDB_XML(node, self.root_node)) - return ret - - def xml_get_name(self, n): - return n.getAttribute('name') - - def getName(self): - return self.xml_get_name(self.dom_node) - - def xml_get_ref(self, n): - return n.getAttribute('uuidref') - - def xml_get_uuid(self, dom_node): - return dom_node.getAttribute('uuid') - - def getUUID(self): - return self.xml_get_uuid(self.dom_node) - - def get_routes(self, type, gw): - """ Return the routes as a list of tuples of the form: - [(type, gw, lo, hi),]""" - res = [] - tbl = self.dom_node.getElementsByTagName('routetbl') - for t in tbl: - routes = t.getElementsByTagName('route') - for r in routes: - net_type = self.xmlattr(r, 'type') - if type != net_type: - lo = self.xmlattr(r, 'lo') - hi = self.xmlattr(r, 'hi') - res.append((type, gw, lo, hi)) - return res - - def get_route_tbl(self): - ret = [] - for r in self.dom_node.getElementsByTagName('route'): - net_type = self.xmlattr(r, 'type') - gw = self.xmlattr(r, 'gw') - lo = self.xmlattr(r, 'lo') - hi = self.xmlattr(r, 'hi') - ret.append((net_type, gw, lo, hi)) - return ret - - -# ================================================================ -# LDAP Support -class LustreDB_LDAP(LustreDB): - def __init__(self, name, attrs, - base = 
"fs=lustre", - parent = None, - url = "ldap://localhost", - user = "cn=Manager, fs=lustre", - pw = "secret" - ): - self._name = name - self._attrs = attrs - self._base = base - self._parent = parent - self._url = url - self._user = user - self._pw = pw - if parent: - self.l = parent.l - self._base = parent._base - else: - self.open() - - def open(self): - import ldap - try: - self.l = ldap.initialize(self._url) - # Set LDAP protocol version used - self.l.protocol_version=ldap.VERSION3 - # user and pw only needed if modifying db - self.l.bind_s("", "", ldap.AUTH_SIMPLE); - except ldap.LDAPError, e: - panic(e) - # FIXME, do something useful here - - def close(self): - self.l.unbind_s() - - def ldap_search(self, filter): - """Return list of uuids matching the filter.""" - import ldap - dn = self._base - ret = [] - uuids = [] - try: - for name, attrs in self.l.search_s(dn, ldap.SCOPE_ONELEVEL, - filter, ["uuid"]): - for v in attrs['uuid']: - uuids.append(v) - except ldap.NO_SUCH_OBJECT, e: - pass - except ldap.LDAPError, e: - print e # FIXME: die here? - if len(uuids) > 0: - for uuid in uuids: - ret.append(self._lookup_by_uuid(uuid)) - return ret - - def _lookup_by_name(self, name, class_name): - list = self.ldap_search("lustreName=%s" %(name)) - if len(list) == 1: - return list[0] - return [] - - def _lookup_by_class(self, class_name): - return self.ldap_search("objectclass=%s" %(string.upper(class_name))) - - def _lookup_by_uuid(self, uuid): - import ldap - dn = "uuid=%s,%s" % (uuid, self._base) - ret = None - try: - for name, attrs in self.l.search_s(dn, ldap.SCOPE_BASE, - "objectclass=*"): - ret = LustreDB_LDAP(name, attrs, parent = self) - - except ldap.NO_SUCH_OBJECT, e: - debug("NO_SUCH_OBJECT:", uuid) - pass # just return empty list - except ldap.LDAPError, e: - print e # FIXME: die here? 
- return ret +def get_ost_net(self, osd_uuid): + srv_list = [] + if not osd_uuid: + return srv_list + osd = self.lookup(osd_uuid) + node_uuid = osd.get_first_ref('node') + node = self.lookup(node_uuid) + if not node: + panic("unable to find node for osd_uuid:", osd_uuid, + " node_ref:", node_uuid) + for net_uuid in node.get_networks(): + db = node.lookup(net_uuid) + srv_list.append(Network(db)) + return srv_list + + +# the order of iniitailization is based on level. +def getServiceLevel(self): + type = self.get_class() + ret=0; + if type in ('network',): + ret = 5 + elif type in ('routetbl',): + ret = 6 + elif type in ('ldlm',): + ret = 20 + elif type in ('osd', 'cobd'): + ret = 30 + elif type in ('mdsdev',): + ret = 40 + elif type in ('mountpoint', 'echoclient'): + ret = 70 + else: + panic("Unknown type: ", type) + if ret < config.minlevel or ret > config.maxlevel: + ret = 0 + return ret - def _get_val(self, k): - ret = None - if self._attrs.has_key(k): - v = self._attrs[k] - if type(v) == types.ListType: - ret = str(v[0]) +# +# return list of services in a profile. list is a list of tuples +# [(level, db_object),] +def getServices(self): + list = [] + for ref_class, ref_uuid in self.get_all_refs(): + servdb = self.lookup(ref_uuid) + if servdb: + level = getServiceLevel(servdb) + if level > 0: + list.append((level, servdb)) else: - ret = str(v) - return ret - - def _get_class(self): - return string.lower(self._attrs['objectClass'][0]) - - # - # [(ref_class, ref_uuid),] - def _get_all_refs(self): - list = [] - for k in self._attrs.keys(): - if re.search('.*Ref', k): - for uuid in self._attrs[k]: - list.append((k, uuid)) - return list + panic('service not found: ' + ref_uuid) - def _get_refs(self, tag): - """ Get all the refs of type TAG. Returns list of uuids. 
""" - uuids = [] - refname = '%sRef' % tag - if self._attrs.has_key(refname): - return self._attrs[refname] - return [] + list.sort() + return list - def getName(self): - return self._get_val('lustreName') - - def getUUID(self): - return self._get_val('uuid') - - def get_route_tbl(self): - return [] ############################################################ # MDC UUID hack - @@ -1973,85 +1727,102 @@ def cleanup_mdc(db, owner, mds_uuid): ############################################################ # routing ("rooting") -# -routes = [] -local_node = [] -router_flag = 0 -def add_local_interfaces(node_db): - global local_node +# list of (nettype, cluster_id) +local_clusters = [] + +def find_local_clusters(node_db): + global local_clusters for netuuid in node_db.get_networks(): net = node_db.lookup(netuuid) srv = Network(net) debug("add_local", netuuid) - local_node.append((srv.net_type, srv.nid)) - if acceptors.has_key(srv.port): - panic("duplicate port:", srv.port) - if srv.net_type in ('tcp', 'toe'): + local_clusters.append((srv.net_type, srv.cluster_id)) + if srv.port > 0: + if acceptors.has_key(srv.port): + panic("duplicate port:", srv.port) acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type, srv.send_mem, srv.recv_mem, srv.irq_affinity, srv.nid_exchange) +# This node is a gateway. +is_router = 0 +def node_is_router(): + return is_router + +# If there are any routers found in the config, then this will be true +# and all nodes will load kptlrouter. +needs_router = 0 def node_needs_router(): - return router_flag + return needs_router or is_router + +# list of (nettype, gw, tgt_cluster_id, lo, hi) +# Currently, these local routes are only added to kptlrouter route +# table if they are needed to connect to a specific server. This +# should be changed so all available routes are loaded, and the +# ptlrouter can make all the decisions. +local_routes = [] -def init_route_config(lustre): - """ Scan the lustre config looking for routers. 
Build list of +def find_local_routes(lustre): + """ Scan the lustre config looking for routers . Build list of routes. """ - global routes, router_flag - routes = [] + global local_routes, needs_router + local_routes = [] list = lustre.lookup_class('node') - for node_db in list: - if node_db.get_val_int('router', 0): - router_flag = 1 - #debug("init_route_config: found router", node_db.getName()) - for (local_type, local_nid) in local_node: - #debug("init_route_config:", local_type, local_nid) + for router in list: + if router.get_val_int('router', 0): + needs_router = 1 + for (local_type, local_cluster_id) in local_clusters: gw = None - for netuuid in node_db.get_networks(): - db = node_db.lookup(netuuid) - if local_type == db.get_val('nettype'): + for netuuid in router.get_networks(): + db = router.lookup(netuuid) + if (local_type == db.get_val('nettype') and + local_cluster_id == db.get_val('clusterid')): gw = db.get_val('nid') break - #debug("init_route_config: gw is", gw) - if not gw: - continue - for route in node_db.get_routes(local_type, gw): - routes.append(route) - debug("init_route_config routes:", routes) - - -def local_net(srv_list): - global local_node - for iface in local_node: - for srv in srv_list: - #debug("local_net a:", srv.net_type, "b:", iface[0]) - if srv.net_type == iface[0]: - return srv - return None + if gw: + debug("find_local_routes: gw is", gw) + for route in router.get_local_routes(local_type, gw): + local_routes.append(route) + debug("find_local_routes:", local_routes) + + +def choose_local_server(srv_list): + for srv in srv_list: + if local_net_type(srv.net_type): + return srv def local_net_type(net_type): - global local_node - for iface in local_node: - if net_type == iface[0]: + for cluster in local_clusters: + if net_type == cluster[0]: return 1 return 0 def find_route(srv_list): - global local_node, routes - frm_type = local_node[0][0] + frm_type = local_clusters[0][0] for srv in srv_list: - #debug("find_route: srv:", 
srv.hostaddr, "type: ", srv.net_type) + debug("find_route: srv:", srv.hostaddr, "type: ", srv.net_type) to_type = srv.net_type - to = srv.hostaddr - #debug ('looking for route to', to_type, to) - for r in routes: - #debug("find_route: ", r) - if r[2] == to: + to = srv.hostaddr # XXX should this be hostaddr, or nid? + cluster_id = srv.cluster_id + debug ('looking for route to', to_type, to) + for r in local_routes: + debug("find_route: ", r) + if (r[3] <= to and to <= r[4]) and cluster_id == r[2]: return srv, r return None,None +def get_active_target(db): + target_uuid = db.getUUID() + target_name = db.getName() + node_name = get_select(target_name) + if node_name: + tgt_dev_uuid = db.get_target_device(target_uuid, node_name) + else: + tgt_dev_uuid = db.get_first_ref('active') + return tgt_dev_uuid + ############################################################ # lconf level logic @@ -2062,14 +1833,12 @@ def newService(db): n = None if type == 'ldlm': n = LDLM(db) - elif type == 'ptlrpc': - n = PTLRPC(db) elif type == 'lov': n = LOV(db) elif type == 'network': n = Network(db) elif type == 'routetbl': - n = Router(db) + n = RouteTable(db) elif type == 'osd': n = OSD(db) elif type == 'cobd': @@ -2097,44 +1866,45 @@ def for_each_profile(db, prof_list, operation): prof_db = db.lookup(prof_uuid) if not prof_db: panic("profile:", profile, "not found.") - services = prof_db.getServices() + services = getServices(prof_db) operation(services) def doSetup(services): - if config.nosetup(): + if config.nosetup: return for s in services: n = newService(s[1]) n.prepare() def doModules(services): - if config.nomod(): + if config.nomod: return for s in services: n = newService(s[1]) n.load_module() def doCleanup(services): - if config.nosetup(): + if config.nosetup: return services.reverse() for s in services: n = newService(s[1]) - n.cleanup() + if n.safe_to_clean(): + n.cleanup() def doUnloadModules(services): - if config.nomod(): + if config.nomod: return services.reverse() for s 
in services: n = newService(s[1]) - n.cleanup_module() + if n.safe_to_clean_modules(): + n.cleanup_module() # # Load profile for def doHost(lustreDB, hosts): - global routes - global router_flag + global is_router node_db = None for h in hosts: node_db = lustreDB.lookup_name(h, 'node') @@ -2144,188 +1914,168 @@ def doHost(lustreDB, hosts): print 'No host entry found.' return - router_flag = node_db.get_val_int('router', 0) - recovery_upcall = node_db.get_val('recovery_upcall', '') + is_router = node_db.get_val_int('router', 0) + lustre_upcall = node_db.get_val('lustreUpcall', '') + portals_upcall = node_db.get_val('portalsUpcall', '') timeout = node_db.get_val_int('timeout', 0) - add_local_interfaces(node_db) - if not router_flag: - init_route_config(lustreDB) + find_local_clusters(node_db) + if not is_router: + find_local_routes(lustreDB) # Two step process: (1) load modules, (2) setup lustre # if not cleaning, load modules first. prof_list = node_db.get_refs('profile') - if config.cleanup(): - if config.force(): + if config.recover: + if not (config.tgt_uuid and config.client_uuid and config.conn_uuid): + raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " + + "--client_uuid <UUID> --conn_uuid <UUID>") + doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid, + config.conn_uuid) + elif config.cleanup: + if config.force: # the command line can override this value timeout = 5 # ugly hack, only need to run lctl commands for --dump - if config.lctl_dump(): + if config.lctl_dump: for_each_profile(node_db, prof_list, doCleanup) return sys_set_timeout(timeout) - sys_set_recovery_upcall(recovery_upcall) + sys_set_ptldebug() + sys_set_subsystem() + sys_set_lustre_upcall(lustre_upcall) + sys_set_portals_upcall(portals_upcall) for_each_profile(node_db, prof_list, doCleanup) for_each_profile(node_db, prof_list, doUnloadModules) else: # ugly hack, only need to run lctl commands for --dump - if config.lctl_dump(): + if config.lctl_dump: 
for_each_profile(node_db, prof_list, doSetup) return + sys_make_devices() + sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF) + sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF) + for_each_profile(node_db, prof_list, doModules) sys_set_debug_path() - script = config.gdb_script() + sys_set_ptldebug() + sys_set_subsystem() + script = config.gdb_script run(lctl.lctl, ' modules >', script) - if config.gdb(): + if config.gdb: log ("The GDB module script is in", script) # pause, so user has time to break and # load the script time.sleep(5) sys_set_timeout(timeout) - sys_set_recovery_upcall(recovery_upcall) + sys_set_lustre_upcall(lustre_upcall) + sys_set_portals_upcall(portals_upcall) for_each_profile(node_db, prof_list, doSetup) -############################################################ -# Command line processing -# -def parse_cmdline(argv): - short_opts = "hdnvf" - long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb", - "portals=", "makeldiff", "cleanup", "noexec", - "help", "node=", "nomod", "nosetup", - "dump=", "force", "minlevel=", "maxlevel=", - "timeout=", "recovery_upcall=", - "ldapurl=", "config=", "select=", "lctl_dump="] - opts = [] - args = [] +def doRecovery(db, lctl, tgt_uuid, client_uuid, conn_uuid): + tgt = db.lookup(tgt_uuid) + if not tgt: + raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.") + new_uuid = get_active_target(tgt) + if not new_uuid: + raise Lustre.LconfError("doRecovery: no active target found for: " + + tgt_uuid) + net = choose_local_server(get_ost_net(db, new_uuid)) + if not net: + raise Lustre.LconfError("Unable to find a connection to:" + new_uuid) + # XXX, better to do a full disconnect here + log("Reconnecting", tgt_uuid, " to ", net.uuid); + lctl.del_uuid(conn_uuid) + lctl.connect(net) + lctl.recover(client_uuid, net.uuid) - try: - opts, args = getopt.getopt(argv, short_opts, long_opts) - except getopt.error: - print "invalid opt" - usage() - - for o, a in opts: - if o in ("-h", 
"--help"): - usage() - if o in ("-d","--cleanup"): - config.cleanup(1) - if o in ("-v", "--verbose"): - config.verbose(1) - if o in ("-n", "--noexec"): - config.noexec(1) - if o == "--portals": - config.portals_dir(a) - if o == "--lustre": - config.lustre_dir(a) - if o == "--reformat": - config.reformat(1) - if o == "--node": - config.node(a) - if o == "--gdb": - config.gdb(1) - if o == "--nomod": - config.nomod(1) - if o == "--nosetup": - config.nosetup(1) - if o == "--dump": - config.dump_file(a) - if o in ("-f", "--force"): - config.force(1) - if o == "--minlevel": - config.minlevel(a) - if o == "--maxlevel": - config.maxlevel(a) - if o == "--timeout": - config.timeout(a) - if o == "--recovery_upcall": - config.recovery_upcall(a) - if o == "--ldapurl": - config.ldapurl(a) - if o == "--config": - config.config_name(a) - if o == "--select": - config.init_select(a) - if o == "--lctl_dump": - config.lctl_dump(a) - - return args - -def fetch(url): - import urllib - data = "" - try: - s = urllib.urlopen(url) - data = s.read() - except: - usage() - return data def setupModulePath(cmd, portals_dir = PORTALS_DIR): base = os.path.dirname(cmd) - if os.access(base+"/Makefile", os.R_OK): - if not config.lustre_dir(): - config.lustre_dir(os.path.join(base, "..")) + if development_mode(): + if not config.lustre: + config.lustre = (os.path.join(base, "..")) # normalize the portals dir, using command line arg if set - if config.portals_dir(): - portals_dir = config.portals_dir() - dir = os.path.join(config.lustre_dir(), portals_dir) - config.portals_dir(dir) - elif config.lustre_dir() and config.portals_dir(): + if config.portals: + portals_dir = config.portals + dir = os.path.join(config.lustre, portals_dir) + config.portals = dir + debug('config.portals', config.portals) + elif config.lustre and config.portals: # production mode # if --lustre and --portals, normalize portals # can ignore POTRALS_DIR here, since it is probly useless here - dir = config.portals_dir() - dir = 
os.path.join(config.lustre_dir(), dir) - config.portals_dir(dir) + config.portals = os.path.join(config.lustre, config.portals) + debug('config.portals B', config.portals) def sysctl(path, val): - if config.noexec(): + debug("+ sysctl", path, val) + if config.noexec: return try: fp = open(os.path.join('/proc/sys', path), 'w') fp.write(str(val)) fp.close() except IOError, e: - print e + panic(str(e)) def sys_set_debug_path(): - debug("debug path: ", config.debug_path()) - sysctl('portals/debug_path', config.debug_path()) + sysctl('portals/debug_path', config.debug_path) -def sys_set_recovery_upcall(upcall): +def sys_set_lustre_upcall(upcall): # the command overrides the value in the node config - if config.recovery_upcall(): - upcall = config.recovery_upcall() + if config.lustre_upcall: + upcall = config.lustre_upcall + elif config.upcall: + upcall = config.upcall if upcall: - debug("setting recovery_upcall:", upcall) - sysctl('lustre/recovery_upcall', upcall) + sysctl('lustre/upcall', upcall) + +def sys_set_portals_upcall(upcall): + # the command overrides the value in the node config + if config.portals_upcall: + upcall = config.portals_upcall + elif config.upcall: + upcall = config.upcall + if upcall: + sysctl('portals/upcall', upcall) def sys_set_timeout(timeout): # the command overrides the value in the node config - if config.timeout() > 0: - timeout = config.timeout() - if timeout > 0: - debug("setting timeout:", timeout) + if config.timeout > 0: + timeout = config.timeout + if timeout != None and timeout > 0: sysctl('lustre/timeout', timeout) -def sys_set_ptldebug(ptldebug): - # the command overrides the value in the node config - if config.ptldebug(): - ptldebug = config.ptldebug() - sysctl('portals/debug', ptldebug) +def sys_set_ptldebug(): + if config.ptldebug != None: + try: + val = eval(config.ptldebug, ptldebug_names) + val = "0x%x" % (val,) + sysctl('portals/debug', val) + except NameError, e: + panic(str(e)) + +def sys_set_subsystem(): + if 
config.subsystem != None: + try: + val = eval(config.ptldebug, ptldebug_names) + val = "0x%x" % (val,) + sysctl('portals/subsystem_debug', val) + except NameError, e: + panic(str(e)) def sys_set_netmem_max(path, max): debug("setting", path, "to at least", max) - if config.noexec(): + if config.noexec: return fp = open(path) str = fp.readline() @@ -2351,6 +2101,20 @@ def add_to_path(new_dir): return os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir +def default_debug_path(): + path = '/tmp/lustre-log' + if os.path.isdir('/r'): + return '/r' + path + else: + return path + +def default_gdb_script(): + script = '/tmp/ogdb' + if os.path.isdir('/r'): + return '/r' + script + else: + return script + DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin') # ensure basic elements are in the system path @@ -2358,13 +2122,94 @@ def sanitise_path(): for dir in DEFAULT_PATH: add_to_path(dir) -# Initialize or shutdown lustre according to a configuration file -# * prepare the system for lustre -# * configure devices with lctl -# Shutdown does steps in reverse -# +# global hack for the --select handling +tgt_select = {} +def init_select(arg): + # arg = "service=nodeA,service2=nodeB" + global tgt_select + list = string.split(arg, ',') + for entry in list: + srv, node = string.split(entry, '=') + tgt_select[srv] = node + +def get_select(srv): + if tgt_select.has_key(srv): + return tgt_select[srv] + return None + + +PARAM = Lustre.Options.PARAM +INTPARAM = Lustre.Options.INTPARAM +lconf_options = [ + ('verbose,v', "Print system commands as they are run"), + ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM), + ('config', "Cluster config name used for LDAP query", PARAM), + ('select', "service=nodeA,service2=nodeB ", PARAM), + ('node', "Load config for <nodename>", PARAM), + ('cleanup,d', "Cleans up config. 
(Shutdown)"), + ('force,f', "Forced unmounting and/or obd detach during cleanup", + Lustre.Options.FLAG, 0), + ('mds_ost_conn', "Open connections to OSTs on the MDS"), + ('failover',"""Used to shut down without saving state. + This will allow this node to "give up" a service to a + another node for failover purposes. This will not + be a clean shutdown.""", + Lustre.Options.FLAG, 0), + ('gdb', """Prints message after creating gdb module script + and sleeps for 5 seconds."""), + ('noexec,n', """Prints the commands and steps that will be run for a + config without executing them. This can used to check if a + config file is doing what it should be doing"""), + ('nomod', "Skip load/unload module step."), + ('nosetup', "Skip device setup/cleanup step."), + ('reformat', "Reformat all devices (without question)"), + ('dump', "Dump the kernel debug log to file before portals is unloaded", + PARAM), + ('minlevel', "Minimum level of services to configure/cleanup", + INTPARAM, 0), + ('maxlevel', """Maximum level of services to configure/cleanup + Levels are aproximatly like: + 10 - network + 20 - device, ldlm + 30 - osd, mdd + 40 - mds, ost + 70 - mountpoint, echo_client, osc, mdc, lov""", + INTPARAM, 100), + ('lustre', """Base directory of lustre sources. This parameter will + cause lconf to load modules from a source tree.""", PARAM), + ('portals', """Portals source directory. If this is a relative path, + then it is assumed to be relative to lustre. 
""", PARAM), + ('timeout', "Set recovery timeout", PARAM), + ('upcall', "Set both portals and lustre upcall script", PARAM), + ('lustre_upcall', "Set lustre upcall script", PARAM), + ('portals_upcall', "Set portals upcall script", PARAM), + ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM), + ('ptldebug', "Set the portals debug level", PARAM), + ('subsystem', "Set the portals debug subsystem", PARAM), + ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()), + ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()), +# Client recovery options + ('recover', "Recover a device"), + ('group', "The group of devices to configure or cleanup", PARAM), + ('tgt_uuid', "The failed target (required for recovery)", PARAM), + ('client_uuid', "The failed client (required for recovery)", PARAM), + ('conn_uuid', "The failed connection (required for recovery)", PARAM), + ] + def main(): - global lctl, MAXTCPBUF + global lctl, config + + # in the upcall this is set to SIG_IGN + signal.signal(signal.SIGCHLD, signal.SIG_DFL) + + cl = Lustre.Options("lconf", "config.xml", lconf_options) + try: + config, args = cl.parse(sys.argv[1:]) + except Lustre.OptionError, e: + print e + sys.exit(1) + + setupModulePath(sys.argv[0]) host = socket.gethostname() @@ -2380,7 +2225,6 @@ def main(): sanitise_path() - args = parse_cmdline(sys.argv[1:]) if len(args) > 0: if not os.access(args[0], os.R_OK): print 'File not found or readable:', args[0] @@ -2390,44 +2234,48 @@ def main(): except Exception: panic("%s does not appear to be a config file." % (args[0])) sys.exit(1) # make sure to die here, even in debug mode. 
- db = LustreDB_XML(dom.documentElement, dom.documentElement) - elif config.ldapurl(): - if not config.config_name(): + db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement) + elif config.ldapurl: + if not config.config: panic("--ldapurl requires --config name") - dn = "config=%s,fs=lustre" % (config.config_name()) - db = LustreDB_LDAP('', {}, base=dn, url = config.ldapurl()) + dn = "config=%s,fs=lustre" % (config.config) + db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl) else: - usage() + cl.usage() + sys.exit(1) + + ver = db.get_version() + if not ver: + panic("No version found in config data, please recreate.") + if ver != Lustre.CONFIG_VERSION: + panic("Config version", ver, "does not match lconf version", + Lustre.CONFIG_VERSION) node_list = [] - if config.node(): - node_list.append(config.node()) + if config.node: + node_list.append(config.node) else: if len(host) > 0: node_list.append(host) node_list.append('localhost') + debug("configuring for host: ", node_list) if len(host) > 0: - config._debug_path = config._debug_path + '-' + host - config._gdb_script = config._gdb_script + '-' + host - - setupModulePath(sys.argv[0]) + config.debug_path = config.debug_path + '-' + host + config.gdb_script = config.gdb_script + '-' + host lctl = LCTLInterface('lctl') - if config.lctl_dump(): - lctl.use_save_file(config.lctl_dump()) - else: - sys_make_devices() - sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF) - sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF) + + if config.lctl_dump: + lctl.use_save_file(config.lctl_dump) doHost(db, node_list) if __name__ == "__main__": try: main() - except LconfError, e: + except Lustre.LconfError, e: print e except CommandError, e: e.dump() @@ -2435,4 +2283,3 @@ if __name__ == "__main__": if first_cleanup_error: sys.exit(first_cleanup_error) - diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c index a143647..382e729 100644 --- a/lustre/utils/lctl.c +++ b/lustre/utils/lctl.c @@ 
-64,15 +64,17 @@ command_t cmdlist[] = { /* Network configuration commands */ {"==== network config ====", jt_noop, 0, "network config"}, {"network", jt_ptl_network, 0, "commands that follow apply to net\n" - "usage: network <tcp/elan/myrinet>"}, + "usage: network <tcp/elan/myrinet/scimac>"}, {"connect", jt_ptl_connect, 0, "connect to a remote nid\n" "usage: connect [[<hostname> <port>] | <elan id>]"}, {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid\n" "usage: disconnect <nid>"}, {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local nid. " "The nid defaults to hostname for tcp networks and is automatically " - "setup for elan/myrinet networks.\n" + "setup for elan/myrinet/scimac networks.\n" "usage: mynid [nid]"}, + {"shownid", jt_ptl_shownid, 0, "print the local NID\n" + "usage: shownid"}, {"add_uuid", jt_obd_add_uuid, 0, "associate a UUID with a nid\n" "usage: add_uuid <uuid> <nid> <net_type>"}, {"close_uuid", jt_obd_close_uuid, 0, "disconnect a UUID\n" @@ -93,24 +95,21 @@ command_t cmdlist[] = { {"send_mem", jt_ptl_txmem, 0, "set socket send buffer size, " "if size is omited the current size is reported.\n" "usage: send_mem [size]"}, - {"nagle", jt_ptl_nagle, 0, "enable/disable nagle, omiting the " + {"nagle", jt_ptl_nagle, 0, "enable/disable nagle, omitting the " "argument will cause the current nagle setting to be reported.\n" "usage: nagle [on/off]"}, - + {"fail", jt_ptl_fail_nid, 0, "fail/restore communications.\n" + "Omitting the count means indefinitely, 0 means restore, " + "otherwise fail 'count' messages.\n" + "usage: fail nid|_all_ [count]"}, + /* Device selection commands */ {"=== device selection ===", jt_noop, 0, "device selection"}, {"newdev", jt_obd_newdev, 0, "create a new device\n" "usage: newdev"}, -#if 0 - {"uuid2dev", jt_obd_uuid2dev, 0, - "find device attached with <uuid> and make it the current device\n" - "usage: uuid2dev <uuid>"}, -#endif - {"name2dev", jt_obd_name2dev, 0, - "find device attached with <name> and 
make it the current device\n" - "usage: name2dev <name>"}, - {"device", jt_obd_device, 0, "set current device to <devno>\n" - "usage: device <devno>"}, + {"device", jt_obd_device, 0, + "set current device to <%uuid|$name|devno>\n" + "usage: device <%uuid|$name|devno>"}, {"device_list", jt_obd_list, 0, "show all devices\n" "usage: device_list"}, {"lustre_build_version", jt_get_version, 0, @@ -126,7 +125,7 @@ command_t cmdlist[] = { "type specific device configuration information\n" "usage: setup <args...>"}, {"cleanup", jt_obd_cleanup, 0, "cleanup previously setup device\n" - "usage: cleanup [force]"}, + "usage: cleanup [force | failover]"}, {"detach", jt_obd_detach, 0, "remove driver (and name and uuid) from current device\n" "usage: detach"}, @@ -156,7 +155,7 @@ command_t cmdlist[] = { "usage: setattr <objid> <mode>"}, {"create", jt_obd_create, 0, "create <num> OST objects (with <mode>)\n" - "usage: create [num [mode [verbose]]]"}, + "usage: create [num [mode [verbose [lsm data]]]]"}, {"destroy", jt_obd_destroy, 0, "destroy OST object <objid> [num [verbose]]\n" "usage: destroy <num> objects, starting at objid <objid>"}, @@ -185,21 +184,24 @@ command_t cmdlist[] = { "stop lock manager stress test (no args)\n"}, {"dump_ldlm", jt_obd_dump_ldlm, 0, "dump all lock manager state (no args)"}, - {"lov_set_osc_active", jt_obd_lov_set_osc_active, 0, - "(de)activate an OSC in a LOV\n" - "usage: lov_set_osc_active <OSC UUID> <1|0 (active|inactive)>"}, - {"newconn", jt_obd_newconn, 0, "newconn <olduuid> [newuuid]"}, - {"failconn", jt_obd_failconn, 0, "failconn <uuid>"}, + {"activate", jt_obd_activate, 0, "activate an import\n"}, + {"deactivate", jt_obd_deactivate, 0, "deactivate an import\n"}, + {"recover", jt_obd_recover, 0, "usage: recover [<connection UUID>]"}, {"lookup", jt_obd_mdc_lookup, 0, "usage: lookup <directory> <file>"}, {"notransno", jt_obd_no_transno, 0, - "disable sending of committed-transno updates\n" - "usage: notransno"}, + "disable sending of 
committed-transno updates\n"}, {"readonly", jt_obd_set_readonly, 0, - "disable writes to the underlying device\n" - "usage: readonly"}, + "disable writes to the underlying device\n"}, + {"abort_recovery", jt_obd_abort_recovery, 0, + "abort recovery on MDS device\n"}, + {"mount_option", jt_obd_mount_option, 0, + "dump mount options to file\n"}, /* Debug commands */ {"======== debug =========", jt_noop, 0, "debug"}, + {"debug_daemon", jt_dbg_debug_daemon, 0, + "debug daemon control and dump to a file" + "usage: debug_daemon [start file <#MB>|stop|pause|continue]"}, {"debug_kernel", jt_dbg_debug_kernel, 0, "get debug buffer and dump to a file" "usage: debug_kernel [file] [raw]"}, @@ -244,10 +246,11 @@ int main(int argc, char **argv) if (dbg_initialize(argc, argv) < 0) exit(3); + Parser_init("lctl > ", cmdlist); + if (argc > 1) { rc = Parser_execarg(argc - 1, argv + 1, cmdlist); } else { - Parser_init("lctl > ", cmdlist); rc = Parser_commands(); } diff --git a/lustre/utils/llparser.pm b/lustre/utils/llparser.pm deleted file mode 100644 index 5cee31f..0000000 --- a/lustre/utils/llparser.pm +++ /dev/null @@ -1,399 +0,0 @@ -#!/usr/bin/perl -# Copyright (C) 2002 Cluster File Systems, Inc. -# Author: Hariharan Thantry <thantry@users.sourceforge.net> - -# This file is part of Lustre, http://www.lustre.org. -# -# Lustre is free software; you can redistribute it and/or -# modify it under the terms of version 2 of the GNU General Public -# License as published by the Free Software Foundation. -# -# Lustre is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Lustre; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
-# - - -package llparser; -require Exporter; -@ISA = qw(Exporter); -@EXPORT = qw(parse_file print_rpcrelations parse_foptions %ll_subsystems - %subsysnum %trace_masks $e_subsys $e_mask $e_processor $e_time - $e_file $e_line $e_function $e_pid $e_stack $e_fmtstr $e_backref - $e_treeparent $e_numchildren $e_youngestchild $e_next $e_pidhead - $e_rpcsndrcv $e_rpcpid $e_rpcxid $e_rpcnid $e_rpcopc $e_rpcnext - $e_curlineref $SEND $RCV); - -($e_subsys, - $e_mask, - $e_processor, - $e_time, - $e_file, - $e_line, - $e_function, - $e_pid, - $e_stack, - $e_fmtstr, - $e_treeparent, - $e_numchildren, - $e_youngestchild, - $e_pidhead, - $e_next, - $e_backref) = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - -($e_rpcpid, - $e_rpcxid, - $e_rpcnid, - $e_rpcopc, - $e_rpcnext, - $e_rpcsndrcv, - $e_curlineref) = (0, 1, 2, 3, 4, 5, 6); - -$SEND = 0; -$RCV = 1; - -$REGEX=qr/^\s*(\w+)\s*:\s*(\d+)\s*:\s*(\d+)\s*:\s*(\d+\.(?:\d+))\s*\(\s*([^:]+)\s*:\s*(\d+)\s*:\s*([^()]+)\s*\(\)\s*(?:(?:\d+)\s*\|\s*)?(\d+)\s*\+\s*(\d+)\s*(?:.*)\):(.*)$/; - -$RPCREGEX = qr/^\s*(?:Sending|Handling)\s*RPC\s*pid:xid:nid:opc\s*(\d+):(?:0x)?(\w+):(?:0x)?(\w+):(\d+)\s*$/; -$FILEOPTIONREGEX = qr/(--server)|(-s)/; -$SENDING = qr/Sending/; - - -# Needs to match definition in portals/include/linux/kp30.h -%ll_subsystems = ("00" => "UNDEFINED", "01" => "MDC", "02" => "MDS", - "03" => "OSC", "04" => "OST", "05" => "CLASS", - "06" => "OBDFS","07" => "LLITE","08" => "RPC", - "09" => "EXT2OBD","0a" => "PORTALS","0b" => "SOCKNAL", - "0c" => "QSWNAL","0d" => "PINGER","0e" => "FILTER", - "0f" => "TRACE","10" => "ECHO","11" => "LDLM", - "12" => "LOV", "13" => "GMNAL","14" => "PTLROUTER" ); - -%subsysnum; -$subsysnum->{UNDEFINED} = 0; -$subsysnum->{MDC} = 1; -$subsysnum->{MDS} = 2; -$subsysnum->{OSC} = 3; -$subsysnum->{OST} = 4; -$subsysnum->{CLASS} = 5; -$subsysnum->{OBDFS} = 6; -$subsysnum->{LLITE} = 7; -$subsysnum->{RPC} = 8; -$subsysnum->{EXT2OBD} = 9; -$subsysnum->{PORTALS} = 10; -$subsysnum->{SOCKNAL} = 11; 
-$subsysnum->{QSWNAL} = 12; -$subsysnum->{PINGER} = 13; -$subsysnum->{FILTER} = 14; -$subsysnum->{TRACE} = 15; # obdtrace, not to be confused with D_TRACE */ -$subsysnum->{ECHO} = 16; -$subsysnum->{LDLM} = 17; -$subsysnum->{LOV} = 18; -$subsysnum->{GMNAL} = 19; -$subsysnum->{PTLROUTER} = 20; - -%tracemasks; -$tracemasks->{TRACE} = 1 << 0; # /* ENTRY/EXIT markers */ -$tracemasks->{INODE} = 1 << 1; # -$tracemasks->{SUPER} = 1 << 2; # -$tracemasks->{EXT2} = 1 << 3; # /* anything from ext2_debug */ -$tracemasks->{MALLOC} = 1 << 4; # /* print malloc, free information */ -$tracemasks->{CACHE} = 1 << 5; # /* cache-related items */ -$tracemasks->{INFO} = 1 << 6; # /* general information */ -$tracemasks->{IOCTL} = 1 << 7; # /* ioctl related information */ -$tracemasks->{BLOCKS} = 1 << 8; # /* ext2 block allocation */ -$tracemasks->{NET} = 1 << 9; # /* network communications */ -$tracemasks->{WARNING} = 1 << 10; # -$tracemasks->{BUFFS} = 1 << 11; # -$tracemasks->{OTHER} = 1 << 12; # -$tracemasks->{DENTRY} = 1 << 13; # -$tracemasks->{PORTALS} = 1 << 14; # /* ENTRY/EXIT markers */ -$tracemasks->{PAGE} = 1 << 15; # /* bulk page handling */ -$tracemasks->{DLMTRACE} = 1 << 16; # -$tracemasks->{ERROR} = 1 << 17; # /* CERROR} = ...) == CDEBUG} = D_ERROR, ...) */ -$tracemasks->{EMERG} = 1 << 18; # /* CEMERG} = ...) == CDEBUG} = D_EMERG, ...) */ -$tracemasks->{HA} = 1 << 19; # /* recovery and failover */ -$tracemasks->{RPCTRACE} = 1 << 19; # /* recovery and failover */ - -# Contains all the file names, the first filename is the -# client. After that are all servers. -my @filearray = (); - - -# Create backlinks between array entries based on the calling sequence -# For each new PID encountered, the first entry will be present in the -# PID hash. 
- -sub create_links { - my $arrayref = shift @_; - my $pidhashref = shift @_; - my $stitchref = shift @_; - my %local_hash; - my $hash_lineref; - my $tmpfmtref; - my $tmpref; - my $firstlineaftermarker = 0; - - foreach $lineref (@$arrayref) { - next if ($lineref->[$e_time] == 0); # Skip the client marker line - my $pidprevious = $pidhashref->{$lineref->[$e_pid]}; - if ($pidprevious->[$e_next] == 0) { - $pidprevious->[$e_next] = $lineref; - if (exists $local_hash{$lineref->[$e_pid]} - && $firstlineaftermarker) { - $hash_lineref=$local_hash{$lineref->[$e_pid]}; - $hash_lineref->[$e_next] =$lineref; - $firstlineaftermarker = 0; - } - } elsif ($local_hash{$lineref->[$e_pid]} == 0) { - # True only for the first line, the marker line. - $local_hash{$lineref->[$e_pid]}=$lineref; - #print "LINE ADDED TO HASH: @$lineref\n"; - $firstlineaftermarker = 1; - } - # Stack grows upward (assumes x86 kernel) - if ($lineref->[$e_stack] < $pidprevious->[$e_stack]) { - # lineref is not a child of pidprevious, find its parent - LINE: while(($lineref->[$e_stack] < $pidprevious->[$e_stack]) && - ($lineref->[$e_function] == $pidprevious->[$e_function]) - ) { - #This second part of the comparision is a HACK - last LINE if ($pidprevious->[$e_backref] == 0); - $pidprevious = $pidprevious->[$e_backref]; - } - } - if ($lineref->[$e_stack] > $pidprevious->[$e_stack]) { - # lineref is child of pidprevious, with the caveat that they must - # belong to different functions. 
This is a HACK - # until CDEBUG is modified - while($lineref->[$e_function] eq $pidprevious->[$e_function]) { - last if ($pidprevious->[$e_backref] == 0); - $pidprevious = $pidprevious->[$e_backref]; - } - - $lineref->[$e_backref] = $pidprevious; - $pidprevious->[$e_numchildren]++; - } else { - # lineref is sibling of pidprevious - $lineref->[$e_numchildren] = 0; - $lineref->[$e_backref] = $pidprevious->[$e_backref]; - ($lineref->[$e_backref])->[$e_numchildren]++; - } - - $pidhashref->{$lineref->[$e_pid]} = $lineref; - $lineref->[$e_youngestchild] = $lineref; - while ($pidprevious->[$e_backref] != 0) { - $pidprevious->[$e_youngestchild] = $lineref; - $pidprevious = $pidprevious->[$e_backref]; - } - $pidprevious->[$e_youngestchild] = $lineref; - $lineref->[$e_pidhead]=$pidprevious; - - # Stitch together rpc's - if($lineref->[$e_fmtstr] =~ $RPCREGEX) { - #print "RPC LINE: @$lineref\n"; - $tmpfmtref = [$1, $2, $3, $4, 0, 0, 0]; - if ($lineref->[$e_fmtstr] =~ $SENDING) { - $tmpfmtref->[$e_rpcsndrcv] = $SEND; - } else { $tmpfmtref->[$e_rpcsndrcv] = $RCV; } - $tmpfmtref->[$e_curlineref] = $lineref; - $stitchref->{$lineref->[$e_time]} = $tmpfmtref; - - } - - } -match_rpcs($stitchref); -return $arrayref; -} - - - - -# Main loop, parses the debug log - -sub parse_file { - my %hasharray; - my $input_files = shift; - - my $stitch_ref = shift; - my $pid = shift; - my $rpctrace = shift; - my $trace = shift; - my $nodlm = shift; - my $noclass = shift; - my $nonet = shift; - - print "$pid, $rpctrace, $nodlm, $noclass, $nonet\n"; - $backref = 0; - $treeparent = 0; - $numchildren = 0; - $youngestchild = 0; - $next = 0; - $pidhead = 0; - $iter = 0; - - foreach $file (@$input_files) { - - open(FILEHANDLE, $file) or die "Can't open file: $file\n"; - while(<FILEHANDLE>) { - if (/$REGEX/) { - @parsed_line=($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, - $treeparent, $numchildren, $youngestchild, - $pidhead, $next, $backref); - next if (($parsed_line[$e_pid] != $pid) && - ($pid) && ($iter == 
0)); - next if (($parsed_line[$e_mask] != $tracemasks->{RPCTRACE}) - && ($rpctrace)); - next if ($trace && $parsed_line[$e_mask] != - $tracemasks->{TRACE}); - next if ($nodlm && hex($parsed_line[$e_subsys]) == - $subsysnum->{LDLM}); - next if ($noclass && hex($parsed_line[$e_subsys]) == - $subsysnum->{CLASS}); - next if ($nonet && (hex($parsed_line[$e_subsys]) == - $subsysnum->{RPC} || - hex($parsed_line[$e_subsys]) == - $subsysnum->{NET} || - hex($parsed_line[$e_subsys]) == - $subsysnum->{PORTALS} || - hex($parsed_line[$e_subsys]) == - $subsysnum->{SOCKNAL} || - hex($parsed_line[$e_subsys]) == - $subsysnum->{QSWNAL} || - hex($parsed_line[$e_subsys]) == - $subsysnum->{GMNAL})); - - - if (!exists($hasharray{$parsed_line[$e_pid]})) { - # Push a marker for the beginning of this PID - my @marker_line; - $marker_line[$e_subsys] = 0; - $marker_line[$e_mask] = 0; - $marker_line[$e_processor] = 0; - $marker_line[$e_time] = $parsed_line[$e_time]; - $marker_line[$e_file] = 0; - $marker_line[$e_line] = 0; - $marker_line[$e_function] = 0; - $marker_line[$e_pid] = $parsed_line[$e_pid]; - # marker lines are everyone's parent, so stack value zero - $marker_line[$e_stack] = 0; - $marker_line[$e_fmtstr] = ""; - $marker_line[$e_treeparent] = 0; - $marker_line[$e_numchildren] = 0; - $marker_line[$e_youngestchild] = 0; - $marker_line[$e_pidhead] = 0; - $marker_line[$e_next]= \@parsed_line; - $marker_line[$e_backref] = 0; - $hasharray{$parsed_line[$e_pid]} = \@marker_line; - push @$array_parsed, [ @marker_line ]; - - } - push @$array_parsed, [ @parsed_line ]; - } - - } - close(FILEHANDLE); - if ($iter == 0) { - # Insert end of client line marker, an all zero pattern; - @marker_line = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); - push @$array_parsed, [ @marker_line ]; - - } - $iter ++; - } - - $array_parsed=create_links($array_parsed, \%hasharray, $stitch_ref); - #print_array($array_parsed); - return $array_parsed; -} - -sub print_array { - - my $arrayref = shift; - foreach 
$lineref(@$arrayref){ - if ($lineref->[$e_backref]==0){ - print "MARKER LINE(addr): $lineref contents: [@$lineref]\n"; - } else { - - print "REGULAR LINE (addr) :$lineref contents:[@$lineref]\n"; - } - } - -} - -sub print_rpcrelations { - - my $rpchashref = shift; - foreach $rpckeys (sort keys %$rpchashref) { - $tmpref = $rpchashref->{$rpckeys}; - #print "Key: $rpckeys, Contents: @$tmpref\n"; - - } - -} -sub match_rpcs { - my $rpchashref = shift; - foreach $rpckeys (sort keys %$rpchashref) { - $tmpref = $rpchashref->{$rpckeys}; - #print "MATCHING: $@tmpref...\n"; - foreach $cmpkeys (sort keys %$rpchashref) { - next if($cmpkeys == $rpckeys); - $cmpref = $rpchashref->{$cmpkeys}; - # print "Line compared: @$cmpref\n"; - next if ($tmpref->[$e_rpcsndrcv] == $cmpref->[$e_rpcsndrcv]); - next if ($tmpref->[$e_rpcpid] != $cmpref->[$e_rpcpid]); - next if ($tmpref->[$e_rpcxid] != $cmpref->[$e_rpcxid]); - if ($tmpref->[$e_rpcsndrcv] == $SEND) { - $tmpref->[$e_rpcnext] = $cmpkeys; - #print "MACTHED: KEY 1: $rpckeys CONTENTS: @$tmpref", - #"KEY2: $cmpkeys CONTENTS: @$cmpref\n" - - } - - } - - } - -} - -sub getnextchild { - my $rootline = shift; - my $lineref = shift; - my $tempref = $lineref->[$e_next]; - if ($tempref == 0) { - return 0; - } - - if (($tempref->[$e_stack] > $rootline->[$e_stack]) || - (($tempref->[$e_stack] <= $rootline->[$e_stack]) && - ($tempref->[$e_function] == $rootline->[$e_function]) - )){ - # Child - return $tempref; - - } - return 0; - - -} - - -sub parse_foptions { - - my $inarg = shift; - my $idx = 0; - foreach $elem(@$inarg) { - next if ($elem =~ /$FILEOPTIONREGEX/); - $filearray[$idx] = $elem; - $idx++; - } - return \@filearray; -} - -1; -#$array_parsed=parse_file(); -#print_array($array_parsed); diff --git a/lustre/utils/llstat.pl b/lustre/utils/llstat.pl new file mode 100755 index 0000000..28eb778 --- /dev/null +++ b/lustre/utils/llstat.pl @@ -0,0 +1,122 @@ +#!/usr/bin/perl + +my $pname = $0; + +sub usage() +{ + print STDERR "Usage: $pname 
<stats_file> [<interval>]\n"; + exit 1; +} + + +my $statspath; +my $interval = 0; + +if (($#ARGV < 0) || ($#ARGV > 1)) { + usage(); +} else { + $statspath = $ARGV[0]; + if ($#ARGV == 1) { + $interval = $ARGV[1]; + } +} + + + +my %namehash; +my $anysum = 0; +my $anysumsquare = 0; +my $mhz = 0; + +sub get_cpumhz() +{ + my $cpu_freq; + my $itc_freq; # On Itanium systems use this + if (open(CPUINFO, "/proc/cpuinfo")==0) { + return; + } + while (<CPUINFO>) { + if (/^cpu MHz\s+:\s*([\d\.]+)/) { $cpu_freq=$1; } + elsif (/^itc MHz\s+:\s*([\d\.]+)/) { $itc_freq=$1; } + } + if (defined($itc_freq)) { $mhz = $itc_freq; } + elsif (defined($cpu_freq)) { $mhz = $cpu_freq; } + else { $mhz = 1; } +} + +get_cpumhz(); +print "Processor counters run at $mhz MHz\n"; + +sub readstat() +{ + open(STATS, $statspath) || die "Cannot open $statspath: $!\n"; + while (<STATS>) { + chop; + ($name, $cumulcount, $samples, $unit, $min, $max, $sum, $sumsquare) + = split(/\s+/, $_); + + $prevcount = %namehash->{$name}; + if (defined($prevcount)) { + $diff = $cumulcount - $prevcount; + if ($name eq "snapshot_time") { + $tdiff = $diff; + # printf "%-25s prev=$prevcount, cumul=$cumulcount diff=$diff, tdiff=$tdiff\n", $name; + printf "$statspath @ $cumulcount\n"; + printf "%-25s %-10s %-10s %-10s", "Name", "Cur.Count", "Cur.Rate", "#Events"; + if ($anysum) { + printf "%-8s %10s %12s %10s", "Unit", "min", "avg", "max"; + } + if ($anysumsquare) { + printf "%10s", "stddev"; + } + printf "\n"; + } + elsif ($cumulcount!=0) { + printf "%-25s %-10Lu %-10Lu %-10Lu", + $name, $diff, ($diff/$tdiff), $cumulcount; + + if (defined($sum)) { + my $sum_orig = $sum; + if (($unit eq "[cycles]") && ($mhz != 1)) { + $unit = "[usecs]"; + $min = $min/$mhz; + $sum = $sum/$mhz; + $max = $max/$mhz; + } + printf "%-8s %10Lu %12.2f %10Lu", $unit, $min, ($sum/$cumulcount), $max; + if (defined($sumsquare)) { + my $s = $sumsquare - (($sum_orig*$sum_orig)/$cumulcount); + if ($s >= 0) { + my $cnt = ($cumulcount >= 2) ? 
$cumulcount : 2 ; + my $stddev = sqrt($s/($cnt - 1)); + if (($unit eq "[usecs]") && ($mhz != 1)) { + $stddev = $stddev/$mhz; + } + printf " %10.2f", $stddev; + } + } + } + printf "\n"; + } + } + else { + if ($cumulcount!=0) { + printf "%-25s $cumulcount\n", $name + } + if (defined($sum)) { + $anysum = 1; + } + if (defined($sumsquare)) { + $anysumsquare = 1; + } + } + %namehash->{$name} = $cumulcount; + } +} + +do { + readstat(); + if ($interval) { + sleep($interval); + } +} while ($interval); diff --git a/lustre/utils/lmc b/lustre/utils/lmc index 76757a7..8ab7278 100755 --- a/lustre/utils/lmc +++ b/lustre/utils/lmc @@ -29,9 +29,22 @@ import sys, os, getopt, string, exceptions import xml.dom.minidom from xml.dom.ext import PrettyPrint +PYMOD_DIR = "/usr/lib/lustre/python" + +def development_mode(): + base = os.path.dirname(sys.argv[0]) + if os.access(base+"/Makefile.am", os.R_OK): + return 1 + return 0 + +if not development_mode(): + sys.path.append(PYMOD_DIR) + +import Lustre + DEFAULT_PORT = 988 -def usage(): +def reference(): print """usage: lmc --add object [object parameters] Object creation command summary: @@ -39,12 +52,15 @@ Object creation command summary: --add node --node node_name --timeout num - --recovery_upcall path + --upcall path + --lustre_upcall path + --portals_upcall path --add net --node node_name --nid nid - --nettype tcp|elan|toe|gm + --cluster_id + --nettype tcp|elan|toe|gm|scimac --hostaddr addr --port port --tcpbuf size @@ -81,7 +97,74 @@ Object creation command summary: --mds mds_name --ost ost_name OR --lov lov_name """ - sys.exit(1) + +PARAM = Lustre.Options.PARAM +lmc_options = [ + # lmc input/output options + ('reference', "Print short reference for commands"), + ('verbose,v', "Print system commands as they are run"), + ('merge,m', "", PARAM), + ('output,o', "", PARAM), + ('input,i', "", PARAM), + ('batch', "", PARAM), + + # commands + ('add', "", PARAM), + + # node options + ('node', "", PARAM), + ('timeout', "", PARAM), + ('upcall', 
"Set both lustre and portals upcall scripts.", PARAM), + ('lustre_upcall', "Set location of lustre upcall script.", PARAM), + ('portals_upcall', "Set location of portals upcall script.", PARAM), + + # network + ('nettype', "", PARAM), + ('nid', "", PARAM), + ('tcpbuf', "", PARAM, 0), + ('port', "", PARAM, DEFAULT_PORT), + ('nid_exchange', "", PARAM, 0), + ('irq_affinity', "", PARAM, 0), + ('hostaddr', "", PARAM, ""), + ('cluster_id', "", PARAM, "0"), + + # routes + ('route', "", PARAM), + ('router', ""), + ('gw', "", PARAM), + ('gw_cluster_id', "", PARAM, "0"), + ('target_cluster_id', "", PARAM, "0"), + ('lo', "", PARAM), + ('hi', "", PARAM, ""), + + # servers: mds and ost + ('mds', "", PARAM), + ('ost', "", PARAM, ""), + ('osdtype', "", PARAM, "obdfilter"), + ('failover', ""), + ('group', "", PARAM), + ('dev', "", PARAM, ""), + ('size', "", PARAM, 0), + ('journal_size', "", PARAM, 0), + ('fstype', "", PARAM, "ext3"), + ('ostuuid', "", PARAM, ""), + ('format', ""), + + # clients: mountpoint and echo + ('echo_client', "", PARAM), + ('path', "", PARAM), + ('filesystem', "Lustre filesystem name", PARAM, ''), + + # lov + ('lov', "", PARAM, ''), + ('stripe_sz', "", PARAM), + ('stripe_cnt', "", PARAM, 0), + ('stripe_pattern', "", PARAM, 0), + + # cobd + ('real_obd', "", PARAM), + ('cache_obd', "", PARAM), + ] def error(*args): msg = string.join(map(str,args)) @@ -118,17 +201,12 @@ def new_uuid(name): ldlm_name = 'ldlm' ldlm_uuid = 'ldlm_UUID' -ptlrpc_name = 'RPCDEV' -ptlrpc_uuid = 'RPCDEV_UUID' - def new_lustre(dom): """Create a new empty lustre document""" # adding ldlm here is a bit of a hack, but one is enough. 
- str = """<lustre> + str = """<lustre version="%s"> <ldlm name="%s" uuid="%s"/> - <ptlrpc name="%s" uuid="%s"/> - </lustre>""" % (ldlm_name, ldlm_uuid, - ptlrpc_name, ptlrpc_uuid) + </lustre>""" % (Lustre.CONFIG_VERSION, ldlm_name, ldlm_uuid) return dom.parseString(str) names = {} @@ -146,9 +224,8 @@ def init_names(doc): init_names(n) def get_format_flag(options): - if options.has_key('format'): - if options['format']: - return 'yes' + if options.format: + return 'yes' return 'no' ############################################################ @@ -187,11 +264,13 @@ class GenConfig: node.appendChild(new) return new - def network(self, name, uuid, nid, net, hostaddr="", port=0, tcpbuf=0, irq_aff=0, nid_xchg=0): + def network(self, name, uuid, nid, cluster_id, net, hostaddr="", + port=0, tcpbuf=0, irq_aff=0, nid_xchg=0): """create <network> node""" network = self.newService("network", name, uuid) network.setAttribute("nettype", net); self.addElement(network, "nid", nid) + self.addElement(network, "clusterid", cluster_id) if hostaddr: self.addElement(network, "hostaddr", hostaddr) if port: @@ -211,11 +290,13 @@ class GenConfig: rtbl = self.newService("routetbl", name, uuid) return rtbl - def route(self, net_type, gw, lo, hi): + def route(self, gw_net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi): """ create one entry for the route table """ ref = self.doc.createElement('route') - ref.setAttribute("type", net_type) + ref.setAttribute("type", gw_net_type) ref.setAttribute("gw", gw) + ref.setAttribute("gwclusterid", gw_cluster_id) + ref.setAttribute("tgtclusterid", tgt_cluster_id) ref.setAttribute("lo", lo) if hi: ref.setAttribute("hi", hi) @@ -237,7 +318,8 @@ class GenConfig: ldlm = self.newService("ldlm", name, uuid) return ldlm - def osd(self, name, uuid, fs, osdtype, devname, format, ost_uuid, node_uuid, dev_size=0): + def osd(self, name, uuid, fs, osdtype, devname, format, ost_uuid, + node_uuid, dev_size=0, journal_size=0): osd = self.newService("osd", name, uuid) 
osd.setAttribute('osdtype', osdtype) osd.appendChild(self.ref("target", ost_uuid)) @@ -249,6 +331,8 @@ class GenConfig: self.addElement(osd, "autoformat", format) if dev_size: self.addElement(osd, "devsize", "%s" % (dev_size)) + if journal_size: + self.addElement(osd, "journalsize", "%s" % (journal_size)) return osd def cobd(self, name, uuid, real_uuid, cache_uuid): @@ -257,9 +341,11 @@ class GenConfig: cobd.appendChild(self.ref("cacheobd",cache_uuid)) return cobd - def ost(self, name, uuid, osd_uuid): + def ost(self, name, uuid, osd_uuid, group=""): ost = self.newService("ost", name, uuid) ost.appendChild(self.ref("active", osd_uuid)) + if group: + self.addElement(ost, "group", group) return ost def oss(self, name, uuid): @@ -279,30 +365,39 @@ class GenConfig: lovconfig.appendChild(self.ref("lov", lov_uuid)) return lovconfig - def mds(self, name, uuid, mdd_uuid): + def mds(self, name, uuid, mdd_uuid, group=""): mds = self.newService("mds", name, uuid) mds.appendChild(self.ref("active",mdd_uuid)) + if group: + self.addElement(mds, "group", group) return mds def mdsdev(self, name, uuid, fs, devname, format, node_uuid, - mds_uuid, dev_size=0 ): + mds_uuid, dev_size=0, journal_size=0): mdd = self.newService("mdsdev", name, uuid) self.addElement(mdd, "fstype", fs) dev = self.addElement(mdd, "devpath", devname) self.addElement(mdd, "autoformat", format) if dev_size: self.addElement(mdd, "devsize", "%s" % (dev_size)) + if journal_size: + self.addElement(mdd, "journalsize", "%s" % (journal_size)) mdd.appendChild(self.ref("node", node_uuid)) mdd.appendChild(self.ref("target", mds_uuid)) return mdd - def mountpoint(self, name, uuid, mds_uuid, osc_uuid, path): + def mountpoint(self, name, uuid, fs_uuid, path): mtpt = self.newService("mountpoint", name, uuid) - mtpt.appendChild(self.ref("mds", mds_uuid)) - mtpt.appendChild(self.ref("obd", osc_uuid)) + mtpt.appendChild(self.ref("filesystem", fs_uuid)) self.addElement(mtpt, "path", path) return mtpt + def filesystem(self, name, 
uuid, mds_uuid, obd_uuid): + fs = self.newService("filesystem", name, uuid) + fs.appendChild(self.ref("mds", mds_uuid)) + fs.appendChild(self.ref("obd", obd_uuid)) + return fs + def echo_client(self, name, uuid, osc_uuid): ec = self.newService("echoclient", name, uuid) ec.appendChild(self.ref("obd", osc_uuid)) @@ -352,6 +447,12 @@ def name2uuid(lustre, name, tag="", fatal=1): return "" return getUUID(ret) +def lookup_filesystem(lustre, mds_uuid, ost_uuid): + for n in lustre.childNodes: + if n.nodeType == n.ELEMENT_NODE and n.nodeName == 'filesystem': + if ref_exists(n, mds_uuid) and ref_exists(n, ost_uuid): + return getUUID(n) + return None # XXX: assumes only one network element per node. will fix this # as soon as support for routers is added @@ -403,6 +504,27 @@ def get_attr(dom_node, attr, default=""): ############################################################ # Top level commands # +def set_node_options(gen, node, options): + if options.router: + node.setAttribute('router', '1') + if options.timeout: + gen.addElement(node, "timeout", get_option(options, 'timeout')) + if options.upcall: + default_upcall = get_option(options, 'upcall') + else: + default_upcall = '' + if default_upcall or options.lustre_upcall: + if options.lustre_upcall: + gen.addElement(node, 'lustreUpcall', options.lustre_upcall) + else: + gen.addElement(node, 'lustreUpcall', default_upcall) + if default_upcall or options.portals_upcall: + if options.portals_upcall: + gen.addElement(node, 'portalsUpcall', options.portals_upcall) + else: + gen.addElement(node, 'portalsUpcall', default_upcall) + return node + def do_add_node(gen, lustre, options, node_name): uuid = new_uuid(node_name) prof_name = new_name("PROFILE_" + node_name) @@ -413,13 +535,7 @@ def do_add_node(gen, lustre, options, node_name): lustre.appendChild(profile) node_add_profile(gen, node, 'ldlm', ldlm_uuid) - node_add_profile(gen, node, 'ptlrpc', ptlrpc_uuid) - if has_option(options, 'router'): - node.setAttribute('router', '1') 
- if has_option(options, 'timeout'): - node.setAttribute('timeout', get_option(options, 'timeout')) - if has_option(options, 'recovery_upcall'): - node.setAttribute('recovery_upcall', get_option(options, 'recovery_upcall')) + set_node_options(gen, node, options) return node @@ -439,15 +555,16 @@ def add_net(gen, lustre, options): node_name = get_option(options, 'node') nid = get_option(options, 'nid') - hostaddr = get_option(options, 'hostaddr', '') + cluster_id = get_option(options, 'cluster_id') + hostaddr = get_option(options, 'hostaddr') net_type = get_option(options, 'nettype') if net_type in ('tcp', 'toe'): - port = get_option_int(options, 'port', DEFAULT_PORT) - tcpbuf = get_option_int(options, 'tcpbuf', 0) - irq_aff = get_option_int(options, 'irq_affinity', 0) - nid_xchg = get_option_int(options, 'nid_exchange', 0) - elif net_type in ('elan', 'gm'): + port = get_option_int(options, 'port') + tcpbuf = get_option_int(options, 'tcpbuf') + irq_aff = get_option_int(options, 'irq_affinity') + nid_xchg = get_option_int(options, 'nid_exchange') + elif net_type in ('elan', 'gm', 'scimac'): port = 0 tcpbuf = 0 irq_aff = 0 @@ -461,9 +578,12 @@ def add_net(gen, lustre, options): node = do_add_node(gen, lustre, options, node_name) else: node = ret + set_node_options(gen, node, options) + net_name = new_name('NET_'+ node_name +'_'+ net_type) net_uuid = new_uuid(net_name) - node.appendChild(gen.network(net_name, net_uuid, nid, net_type, hostaddr, port, tcpbuf, irq_aff, nid_xchg)) + node.appendChild(gen.network(net_name, net_uuid, nid, cluster_id, net_type, + hostaddr, port, tcpbuf, irq_aff, nid_xchg)) node_add_profile(gen, node, "network", net_uuid) @@ -471,10 +591,14 @@ def add_route(gen, lustre, options): """ create a node with a network config """ node_name = get_option(options, 'node') - net_type = get_option(options, 'nettype') + gw_net_type = get_option(options, 'nettype') gw = get_option(options, 'gw') + gw_cluster_id = get_option(options, 'gw_cluster_id') + 
tgt_cluster_id = get_option(options, 'target_cluster_id') lo = get_option(options, 'lo') - hi = get_option(options, 'hi', '') + hi = get_option(options, 'hi') + if not hi: + hi = lo node = findByName(lustre, node_name, "node") if not node: @@ -489,7 +613,8 @@ def add_route(gen, lustre, options): rtbl = gen.routetbl(rtbl_name, rtbl_uuid) node.appendChild(rtbl) node_add_profile(gen, node, "routetbl", rtbl_uuid) - rtbl.appendChild(gen.route(net_type, gw, lo, hi)) + rtbl.appendChild(gen.route(gw_net_type, gw, gw_cluster_id, tgt_cluster_id, + lo, hi)) def add_mds(gen, lustre, options): @@ -501,12 +626,17 @@ def add_mds(gen, lustre, options): mds_uuid = name2uuid(lustre, mds_name, fatal=0) if not mds_uuid: mds_uuid = new_uuid(mds_name) - mds = gen.mds(mds_name, mds_uuid, mdd_uuid) + mds = gen.mds(mds_name, mds_uuid, mdd_uuid, options.group) lustre.appendChild(mds) - + else: + mds = lookup(lustre, mds_uuid) + if options.failover: + mds.setAttribute('failover', "1") + devname = get_option(options, 'dev') - size = get_option(options, 'size', 0) - fstype = get_option(options, 'fstype', 'extN') + size = get_option(options, 'size') + fstype = get_option(options, 'fstype') + journal_size = get_option(options, 'journal_size') node_uuid = name2uuid(lustre, node_name, 'node') @@ -516,15 +646,16 @@ def add_mds(gen, lustre, options): if not net_uuid: error("NODE: ", node_name, "not found") - mdd = gen.mdsdev(mdd_name, mdd_uuid, fstype, devname, get_format_flag(options), - node_uuid, mds_uuid, dev_size=size) + mdd = gen.mdsdev(mdd_name, mdd_uuid, fstype, devname, + get_format_flag(options), node_uuid, mds_uuid, + dev_size=size, journal_size=journal_size) lustre.appendChild(mdd) def add_ost(gen, lustre, options): node_name = get_option(options, 'node') - lovname = get_option(options, 'lov', '') - osdtype = get_option(options, 'osdtype', 'obdfilter', deprecated_tag="obdtype") + lovname = get_option(options, 'lov') + osdtype = get_option(options, 'osdtype') node_uuid = name2uuid(lustre, 
node_name) @@ -533,37 +664,46 @@ def add_ost(gen, lustre, options): devname = '' size = 0 fstype = '' + journal_size = '' else: - devname = get_option(options, 'dev', '') # can be unset for bluearcs - size = get_option(options, 'size', 0) - fstype = get_option(options, 'fstype', 'extN') + devname = get_option(options, 'dev') # can be unset for bluearcs + size = get_option(options, 'size') + fstype = get_option(options, 'fstype') + journal_size = get_option(options, 'journal_size') - ostname = get_option(options, 'ost', '', deprecated_tag='obd') + ostname = get_option(options, 'ost') if not ostname: ostname = new_name('OST_'+ node_name) - osdname = new_name("OSD_" + ostname) + osdname = new_name("OSD_" + ostname + "_" + node_name) osd_uuid = new_uuid(osdname) ost_uuid = name2uuid(lustre, ostname, fatal=0) if not ost_uuid: - ost_uuid = get_option(options, 'ostuuid', '', deprecated_tag = 'obduuid') + ost_uuid = get_option(options, 'ostuuid') if ost_uuid: if lookup(lustre, ost_uuid): error("Duplicate OST UUID:", ost_uuid) else: ost_uuid = new_uuid(ostname) - ost = gen.ost(ostname, ost_uuid, osd_uuid) + ost = gen.ost(ostname, ost_uuid, osd_uuid, options.group) lustre.appendChild(ost) if lovname: lov = findByName(lustre, lovname, "lov") if not lov: error('add_ost:', '"'+lovname+'"', "lov element not found.") lov_add_obd(gen, lov, ost_uuid) + else: + ost = lookup(lustre, ost_uuid) - osd = gen.osd(osdname, osd_uuid, fstype, osdtype, devname, get_format_flag(options), ost_uuid, - node_uuid, size) + if options.failover: + ost.setAttribute('failover', "1") + + + osd = gen.osd(osdname, osd_uuid, fstype, osdtype, devname, + get_format_flag(options), ost_uuid, node_uuid, size, + journal_size) node = findByName(lustre, node_name, "node") @@ -623,8 +763,8 @@ def add_lov(gen, lustre, options): mds_name = get_option(options, 'mds') stripe_sz = get_option_int(options, 'stripe_sz') - stripe_cnt = get_option_int(options, 'stripe_cnt', 0) - pattern = get_option_int(options, 
'stripe_pattern', 0) + stripe_cnt = get_option_int(options, 'stripe_cnt') + pattern = get_option_int(options, 'stripe_pattern') uuid = new_uuid(name) ret = findByName(lustre, name, "lov") @@ -643,50 +783,57 @@ def add_lov(gen, lustre, options): lovconfig = gen.lovconfig(lovconfig_name, lovconfig_uuid, uuid) lustre.appendChild(lovconfig) +def new_filesystem(gen, lustre, mds_uuid, obd_uuid): + fs_name = new_name("FS_fsname") + fs_uuid = new_uuid(fs_name) + mds = lookup(lustre, mds_uuid) + mds.appendChild(gen.ref("filesystem", fs_uuid)) + fs = gen.filesystem(fs_name, fs_uuid, mds_uuid, obd_uuid) + lustre.appendChild(fs) + return fs_uuid +def get_fs_uuid(gen, lustre, mds_name, obd_name): + mds_uuid = name2uuid(lustre, mds_name, tag='mds') + obd_uuid = name2uuid(lustre, obd_name, tag='lov', fatal=0) + if not obd_uuid: + obd_uuid = name2uuid(lustre, obd_name, tag='ost', fatal=1) + fs_uuid = lookup_filesystem(lustre, mds_uuid, obd_uuid) + if not fs_uuid: + fs_uuid = new_filesystem(gen, lustre, mds_uuid, obd_uuid) + return fs_uuid + def add_mtpt(gen, lustre, options): """ create mtpt on a node """ node_name = get_option(options, 'node') path = get_option(options, 'path') - mds_name = get_option(options, 'mds') - lov_name = get_option(options, 'lov', '') - if lov_name == '': - lov_name = get_option(options, 'ost', '', deprecated_tag='obd') + fs_name = get_option(options, 'filesystem') + if fs_name == '': + mds_name = get_option(options, 'mds') + lov_name = get_option(options, 'lov') if lov_name == '': - error("--add mtpt requires either --lov lov_name or --ost ost_name") + lov_name = get_option(options, 'ost') + if lov_name == '': + error("--add mtpt requires either --filesystem or --mds with an --lov lov_name or --ost ost_name") + fs_uuid = get_fs_uuid(gen, lustre, mds_name, lov_name) + else: + fs_uuid = name2uuid(lustre, fs_name, tag='filesystem') name = new_name('MNT_'+ node_name) ret = findByName(lustre, name, "mountpoint") if ret: + # this can't happen, because 
new_name creates unique names error("MOUNTPOINT: ", name, " already exists.") - mds_uuid = name2uuid(lustre, mds_name, tag='mds') - lov_uuid = name2uuid(lustre, lov_name, tag='lov', fatal=0) - if not lov_uuid: - lov_uuid = name2uuid(lustre, lov_name, tag='ost', fatal=1) - uuid = new_uuid(name) - mtpt = gen.mountpoint(name, uuid, mds_uuid, lov_uuid, path) + mtpt = gen.mountpoint(name, uuid, fs_uuid, path) node = findByName(lustre, node_name, "node") if not node: error('node:', node_name, "not found.") node_add_profile(gen, node, "mountpoint", uuid) lustre.appendChild(mtpt) -# obsolete, leaving behind for reference -def add_oscref(gen, lustre, options): - """ create mtpt on a node """ - node_name = get_option(options, 'node') - osc_name = get_option(options, 'osc') - - osc_uuid = name2uuid(lustre, osc_name, tag='osc') - node = findByName(lustre, node_name, "node") - if not node: - error('node:', node_name, "not found") - node_add_profile(gen, node, "osc",osc_uuid) - ############################################################ # Command line processing # @@ -694,161 +841,23 @@ class OptionError (exceptions.Exception): def __init__(self, args): self.args = args -def has_option(options, tag): - """Look for tag in options hash and return the true if set""" - if options.has_key(tag): - return 1 - return 0 - -def get_option(options, tag, default = None, deprecated_tag=None): +def get_option(options, tag): """Look for tag in options hash and return the value if set. 
If not set, then if return default it is set, otherwise exception.""" - if options.has_key(tag): - return options[tag] - elif deprecated_tag and options.has_key(deprecated_tag): - warning('--'+deprecated_tag, " is deprecated, please use:", '--'+tag) - return options[deprecated_tag] - elif default != None: - return default + if options.__getattr__(tag) != None: + return options.__getattr__(tag) else: - raise OptionError("--add %s requires --%s <value>" % (options['add'], tag)) - # this exception should print an error like '--add blah requires --<tag> value' + raise OptionError("--add %s requires --%s <value>" % (options.add, tag)) -def get_option_int(options, tag, default = None): +def get_option_int(options, tag): """Return an integer option. Raise exception if the value is not an int""" - val = get_option(options, tag, default) + val = get_option(options, tag) try: n = int(val) except ValueError: raise OptionError("--%s <num> (value must be integer)" % (tag)) return n -def parse_cmdline(argv): - short_opts = "ho:i:m:" - long_opts = ["add=", "node=", "nettype=", "nid=", "tcpbuf=", "port=", - "echo_client=", "stripe_sz=", "stripe_cnt=", "stripe_pattern=", - "mds=", "route", "router", "merge=", "format", "reformat", "output=", - "dev=", "size=", "obd=", "ost=", "obdtype=", "osdtype=", "obduuid=", "in=", - "ostuuid=", "path=", "help", "batch=", "lov=", "gw=", "lo=", "hi=", - "osc=", "real_obd=", "cache_obd=", "fstype=", - "timeout=", "recovery_upcall=", "nid_exchange=", "irq_affinity=", - "hostaddr=",] - opts = [] - args = [] - options = {} - try: - opts, args = getopt.getopt(argv, short_opts, long_opts) - except getopt.error, e: - panic(string.join(sys.argv), e) - - for o, a in opts: - # Commands to create new devices - if o == "--add": - options['add'] = a - - if o == "--node": - options['node'] = a - - # devices names - if o == "--lov": - options['lov'] = a - if o == "--mds": - options['mds'] = a - if o == "--obd": - options['obd'] = a - if o == "--ost": - 
options['ost'] = a - - # node options - if o == "--timeout": - options['timeout'] = a - if o == "--recovery_upcall": - options['recovery_upcall'] = a - if o == "--router": - options['router'] = 1 - - # network options - if o == "--nid": - options['nid'] = a - if o == "--hostaddr": - options['hostaddr'] = a - if o == "--nettype": - options['nettype'] = a - if o == "--net": - options[''] = a - if o == "--tcpbuf": - options['tcpbuf'] = a - if o == "--port": - options['port'] = a - if o == "--mtpt": - options['mtpt'] = 1 - if o == "--route": - options['route'] = 1 - if o == "--nid_exchange": - options['nid_exchange'] = a - if o == "--irq_affinity": - options['irq_affinity'] = a - - # ost options - if o == "--dev": - options['dev'] = a - if o == "--size": - options['size'] = a - if o == "--path": - options['path'] = a - if o == "--osc": - options['osc'] = a - if o == "--obdtype": - options['obdtype'] = a - if o == "--osdtype": - options['osdtype'] = a - if o == "--fstype": - options['fstype'] = a - if o == "--obduuid": - options['obduuid'] = a - if o == "--ostuuid": - options['ostuuid'] = a - - # lov options - if o == "--stripe_sz": - options['stripe_sz'] = a - if o == "--stripe_cnt": - options['stripe_cnt'] = a - if o == "--stripe_pattern": - options['stripe_pattern'] = a - if o == "--gw": - options['gw'] = a - if o == "--lo": - options['lo'] = a - if o == "--hi": - options['hi'] = a - - # cobd - if o == "--cache_obd": - options['cache_obd'] = a - if o == "--real_obd": - options['real_obd'] = a - - # lmc options - if o in ("-h", "--help"): - usage() - if o in ("-o", "--output"): - options['output'] = a - if o in ("-m", "--merge"): - options['merge'] = a - if o == "--format": - options['format'] = 1 - if o == "--reformat": - warning("the lmc --reformat option is not supported. 
Use lconf --reformat") - options['reformat'] = 1 - if o == "--batch": - options['batch'] = a - if o in ("--in" , "-i"): - options['in'] = a - - return options, args - - # simple class for profiling import time class chrono: @@ -868,8 +877,6 @@ class chrono: str = '%s: %g secs' % (msg, d) print str - - ############################################################ # Main # @@ -877,8 +884,6 @@ class chrono: def add(devtype, gen, lustre, options): if devtype == 'net': add_net(gen, lustre, options) - elif devtype =='osc': - add_osc(gen, lustre, options) elif devtype == 'mtpt': add_mtpt(gen, lustre, options) elif devtype == 'mds': @@ -899,28 +904,40 @@ def add(devtype, gen, lustre, options): error("unknown device type:", devtype) def do_command(gen, lustre, options, args): - if options.has_key('add'): - add(options['add'], gen, lustre, options) + if options.add: + add(options.add, gen, lustre, options) else: error("Missing command") def main(): - options, args = parse_cmdline(sys.argv[1:]) + cl = Lustre.Options("lmc", "", lmc_options) + try: + options, args = cl.parse(sys.argv[1:]) + except Lustre.OptionError, e: + panic("lmc", e) + + if len(args) > 0: + panic(string.join(sys.argv), "Unexpected extra arguments on command line: " + string.join(args)) + + if options.reference: + reference() + sys.exit(0) + outFile = '-' - if options.has_key('merge'): - outFile = options['merge'] + if options.merge: + outFile = options.merge if os.access(outFile, os.R_OK): doc = xml.dom.minidom.parse(outFile) else: doc = new_lustre(xml.dom.minidom) - elif options.has_key('in'): - doc = xml.dom.minidom.parse(options['in']) + elif options.input: + doc = xml.dom.minidom.parse(options.input) else: doc = new_lustre(xml.dom.minidom) - if options.has_key('output'): - outFile = options['output'] + if options.output: + outFile = options.output lustre = doc.documentElement init_names(lustre) @@ -930,21 +947,25 @@ def main(): gen = GenConfig(doc) - if options.has_key('batch'): - fp = 
open(options['batch']) + if options.batch: + fp = open(options.batch) batchCommands = fp.readlines() fp.close() for cmd in batchCommands: - options, args = parse_cmdline(string.split(cmd)) try: + options, args = cl.parse(string.split(cmd)) do_command(gen, lustre, options, args) except OptionError, e: panic(cmd, e) + except Lustre.OptionError, e: + panic(cmd, e) else: try: do_command(gen, lustre, options, args) except OptionError, e: panic(string.join(sys.argv),e) + except Lustre.OptionError, e: + panic("lmc", e) if outFile == '-': PrettyPrint(doc) diff --git a/lustre/utils/load_ldap.sh b/lustre/utils/load_ldap.sh new file mode 100755 index 0000000..531d385 --- /dev/null +++ b/lustre/utils/load_ldap.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# +# Load a lustre config xml into an openldap database. +# See https://projects.clusterfs.com/lustre/LustreLDAP +# for more details. +# +# Usage: load_ldap.sh <xml_file> +set -e + +LDAP_BASE=${LDAP_BASE:-fs=lustre} +LDAP_ROOTDN=${LDAP_ROOTDN:-cn=Manager,fs=lustre} +LDAP_PW=${LDAP_PW:-secret} +LDAP_AUTH="-x -D $LDAP_ROOTDN -w $LDAP_PW" +LUSTRE=${LUSTRE:-`dirname $0`/..} + +[ ! -z $LDAPURL ] && LDAP_AUTH="$LDAP_AUTH -H $LDAPURL" + +XML=${XML:-$1} + +if [ -z "$XML" ] || [ ! -r $XML ]; then + echo "usage: $0 xmlfile" + exit 1 +fi + +NAME=`basename $XML .xml` +LDIF=/tmp/$NAME.ldif + +# add the top level record, if needed +ldapsearch $LDAP_AUTH -b $LDAP_BASE > /dev/null 2>&1 || + ldapadd $LDAP_AUTH -f $LUSTRE/conf/top.ldif + +# If this config already exists, then delete it +ldapsearch $LDAP_AUTH -b config=$NAME,$LDAP_BASE > /dev/null 2>&1 && + ldapdelete $LDAP_AUTH -r config=$NAME,$LDAP_BASE + +4xslt -D config=$NAME $XML $LUSTRE/conf/lustre2ldif.xsl > $LDIF + +echo "Loading config to 'config=$NAME,$LDAP_BASE' ..." 
+ldapadd $LDAP_AUTH -f $LDIF + +rm -f $LDIF diff --git a/lustre/utils/lstripe.c b/lustre/utils/lstripe.c index 39e2bdf..2cdf5d2 100644 --- a/lustre/utils/lstripe.c +++ b/lustre/utils/lstripe.c @@ -46,8 +46,13 @@ int create_file(char *name, long stripe_size, int stripe_offset, name, strerror(errno)); result = -errno; } else if (ioctl(fd, LL_IOC_LOV_SETSTRIPE, &a_striping)) { + char *errmsg = "stripe already set"; + + if (errno != EEXIST && errno != EALREADY) + errmsg = strerror(errno); + fprintf(stderr, "\nError on ioctl for '%s' (%d): %s\n", - name, fd, strerror(errno)); + name, fd, errmsg); result = -errno; } else if (close(fd) < 0) { fprintf(stderr, "\nError on close for '%s' (%d): %s\n", diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c index 95e5445..a89e15d 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -43,14 +43,12 @@ #include <linux/lustre_idl.h> #include <linux/lustre_dlm.h> #include <linux/obd.h> /* for struct lov_stripe_md */ -#include <linux/obd_lov.h> /* for IOC_LOV_SET_OSC_ACTIVE */ #include <linux/lustre_build_version.h> #include <unistd.h> #include <sys/un.h> #include <time.h> #include <sys/time.h> -#include <netinet/in.h> #include <errno.h> #include <string.h> @@ -76,8 +74,7 @@ static long long counter_snapshot[2][MAX_SHMEM_COUNT]; struct timeval prev_time; #endif -uint64_t conn_addr = -1; -uint64_t conn_cookie; +uint64_t conn_cookie = -1; char rawbuf[8192]; char *buf = rawbuf; int max = sizeof(rawbuf); @@ -94,7 +91,6 @@ static char *cmdname(char *func); #define IOC_INIT(data) \ do { \ memset(&data, 0, sizeof(data)); \ - data.ioc_addr = conn_addr; \ data.ioc_cookie = conn_cookie; \ } while (0) @@ -149,11 +145,27 @@ static int do_name2dev(char *func, char *name) IOC_PACK(func, data); rc = l_ioctl(OBD_DEV_ID, OBD_IOC_NAME2DEV, buf); - if (rc < 0) { - fprintf(stderr, "error: %s: %s - %s\n", cmdname(func), - name, strerror(rc = errno)); - return rc; - } + if (rc < 0) + return errno; + IOC_UNPACK(func, data); + + return data.ioc_dev + 
N2D_OFF; +} + +static int do_uuid2dev(char *func, char *uuid) +{ + struct obd_ioctl_data data; + int rc; + + IOC_INIT(data); + + data.ioc_inllen1 = strlen(uuid) + 1; + data.ioc_inlbuf1 = uuid; + + IOC_PACK(func, data); + rc = l_ioctl(OBD_DEV_ID, OBD_IOC_UUID2DEV, buf); + if (rc < 0) + return errno; IOC_UNPACK(func, data); return data.ioc_dev + N2D_OFF; @@ -161,8 +173,7 @@ static int do_name2dev(char *func, char *name) /* * resolve a device name to a device number. - * supports a number or name. - * FIXME: support UUID + * supports a number, $name or %uuid. */ static int parse_devname(char *func, char *name) { @@ -172,16 +183,31 @@ static int parse_devname(char *func, char *name) if (!name) return ret; if (name[0] == '$') { - rc = do_name2dev(func, name + 1); + name++; + rc = do_name2dev(func, name); if (rc >= N2D_OFF) { ret = rc - N2D_OFF; - printf("%s is device %d\n", name, ret); + printf("Name %s is device %d\n", name, ret); } else { - fprintf(stderr, "error: %s: %s: %s\n", cmdname(func), - name, "device not found"); + printf("No device found for name %s: %s\n", + name, strerror(rc)); } - } else + } else if (name[0] == '%') { + name++; + rc = do_uuid2dev(func, name); + if (rc >= N2D_OFF) { + ret = rc - N2D_OFF; + printf("UUID %s is device %d\n", name, ret); + } else { + printf("No device found for UUID %s: %s\n", + name, strerror(rc)); + } + } else { + /* Assume it's a number. This means that bogus strings become + * 0. I might care about that some day. 
*/ ret = strtoul(name, NULL, 0); + printf("Selected device %d\n", ret); + } return ret; } @@ -380,7 +406,7 @@ int do_disconnect(char *func, int verbose) int rc; struct obd_ioctl_data data; - if (conn_addr == -1) + if (conn_cookie == -1) return 0; IOC_INIT(data); @@ -393,8 +419,8 @@ int do_disconnect(char *func, int verbose) } else { if (verbose) printf("%s: disconnected conn "LPX64"\n", cmdname(func), - conn_addr); - conn_addr = -1; + conn_cookie); + conn_cookie = -1; } return rc; @@ -548,10 +574,8 @@ int jt_obd_connect(int argc, char **argv) if (rc < 0) fprintf(stderr, "error: %s: OBD_IOC_CONNECT %s\n", cmdname(argv[0]), strerror(rc = errno)); - else { - conn_addr = data.ioc_addr; + else conn_cookie = data.ioc_cookie; - } return rc; } @@ -560,7 +584,7 @@ int jt_obd_disconnect(int argc, char **argv) if (argc != 1) return CMD_HELP; - if (conn_addr == -1) + if (conn_cookie == -1) return 0; return do_disconnect(argv[0], 0); @@ -705,19 +729,29 @@ int jt_obd_cleanup(int argc, char **argv) { struct obd_ioctl_data data; char force = 'F'; + char failover = 'A'; + char flags[3]; + int flag_cnt = 0, n; int rc; IOC_INIT(data); - if (argc != 1 && argc != 2) + if (argc < 1 || argc > 3) return CMD_HELP; - if (argc == 2) { - if (strcmp(argv[1], "force")) + for (n = 1; n < argc; n++) + if (strcmp(argv[n], "force") == 0) { + flags[flag_cnt++] = force; + } else if (strcmp(argv[n], "failover") == 0) { + flags[flag_cnt++] = failover; + } else { + fprintf(stderr, "unknown option: %s", argv[n]); return CMD_HELP; - data.ioc_inllen1 = 1; - data.ioc_inlbuf1 = &force; - } + } + + data.ioc_inllen1 = flag_cnt; + if (flag_cnt) + data.ioc_inlbuf1 = flags; IOC_PACK(argv[0], data); rc = l_ioctl(OBD_DEV_ID, OBD_IOC_CLEANUP, buf); @@ -766,6 +800,25 @@ int jt_obd_set_readonly(int argc, char **argv) return rc; } +int jt_obd_abort_recovery(int argc, char **argv) +{ + struct obd_ioctl_data data; + int rc; + + IOC_INIT(data); + + if (argc != 1) + return CMD_HELP; + + IOC_PACK(argv[0], data); + rc = 
l_ioctl(OBD_DEV_ID, OBD_IOC_ABORT_RECOVERY, buf); + if (rc < 0) + fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]), + strerror(rc = errno)); + + return rc; +} + int jt_obd_newdev(int argc, char **argv) { int rc; @@ -789,6 +842,29 @@ int jt_obd_newdev(int argc, char **argv) return rc; } +int jt_obd_mount_option(int argc, char **argv) +{ + int rc; + struct obd_ioctl_data data; + + IOC_INIT(data); + + if (argc != 2) + return CMD_HELP; + + data.ioc_inllen1 = strlen(argv[1]) + 1; + data.ioc_inlbuf1 = argv[1]; + + IOC_PACK(argv[0], data); + rc = l_ioctl(OBD_DEV_ID, OBD_IOC_MOUNTOPT, buf); + if (rc < 0) { + fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]), + strerror(rc = errno)); + } + + return rc; +} + int jt_get_version(int argc, char **argv) { int rc; @@ -800,8 +876,7 @@ int jt_get_version(int argc, char **argv) memset(buf, 0, sizeof(buf)); data->ioc_version = OBD_IOCTL_VERSION; - data->ioc_addr = conn_addr; - data->ioc_cookie = conn_addr; + data->ioc_cookie = conn_cookie; data->ioc_inllen1 = sizeof(buf) - size_round(sizeof(*data)); data->ioc_len = obd_ioctl_packlen(data); @@ -828,8 +903,7 @@ int jt_obd_list(int argc, char **argv) memset(buf, 0, sizeof(buf)); data->ioc_version = OBD_IOCTL_VERSION; - data->ioc_addr = conn_addr; - data->ioc_cookie = conn_addr; + data->ioc_cookie = conn_cookie; data->ioc_inllen1 = sizeof(buf) - size_round(sizeof(*data)); data->ioc_len = obd_ioctl_packlen(data); @@ -887,23 +961,6 @@ int jt_obd_attach(int argc, char **argv) return rc; } -int jt_obd_name2dev(int argc, char **argv) -{ - int rc; - - if (argc != 2) - return CMD_HELP; - - rc = do_name2dev(argv[0], argv[1]); - if (rc >= N2D_OFF) { - int dev = rc - N2D_OFF; - rc = do_device(argv[0], dev); - if (rc == 0) - printf("%d\n", dev); - } - return rc; -} - int jt_obd_setup(int argc, char **argv) { struct obd_ioctl_data data; @@ -1047,15 +1104,15 @@ int jt_obd_unset_stripe (int argc, char **argv) if (argc != 2) return CMD_HELP; - id = strtoll (argv[1], &end, 0); - if (*end == 0) { + 
id = strtoull (argv[1], &end, 0); + if (*end != 0) { fprintf (stderr, "error: %s: invalid object id '%s'\n", cmdname (argv[0]), argv[1]); return CMD_HELP; } IOC_INIT (data); - data.ioc_obdo1.o_id = lsm_buffer.lsm.lsm_object_id; + data.ioc_obdo1.o_id = id; data.ioc_obdo1.o_mode = S_IFREG | 0644; data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLMODE; @@ -1070,8 +1127,8 @@ int jt_obd_unset_stripe (int argc, char **argv) return (0); } -/* Create one or more objects, arg[1] may describe stripe meta-data. If - * not, defaults assumed. This echo-client instances stashes the stripe +/* Create one or more objects, arg[4] may describe stripe meta-data. If + * not, defaults assumed. This echo-client instance stashes the stripe * object ids. Use get_stripe on this node to print full lsm and * set_stripe on another node to cut/paste between nodes. */ @@ -1787,23 +1844,20 @@ int jt_obd_ldlm_regress_stop(int argc, char **argv) return rc; } -int jt_obd_lov_set_osc_active(int argc, char **argv) +static int do_activate(int argc, char **argv, int flag) { struct obd_ioctl_data data; int rc; IOC_INIT(data); - if (argc != 3) + if (argc != 1) return CMD_HELP; - data.ioc_inlbuf1 = argv[1]; - data.ioc_inllen1 = strlen(argv[1]) + 1; - /* reuse offset for 'active' */ - data.ioc_offset = atoi(argv[2]); + data.ioc_offset = flag; IOC_PACK(argv[0], data); - rc = l_ioctl(OBD_DEV_ID, IOC_LOV_SET_OSC_ACTIVE, buf); + rc = l_ioctl(OBD_DEV_ID, IOC_OSC_SET_ACTIVE, buf); if (rc) fprintf(stderr, "error: %s: failed: %s\n", cmdname(argv[0]), strerror(rc = errno)); @@ -1811,49 +1865,36 @@ int jt_obd_lov_set_osc_active(int argc, char **argv) return rc; } -int jt_obd_newconn(int argc, char **argv) +int jt_obd_deactivate(int argc, char **argv) { - int rc; - struct obd_ioctl_data data; - - IOC_INIT(data); - if (argc < 2 || argc > 3) - return CMD_HELP; - - data.ioc_inllen1 = strlen(argv[1]) + 1; - data.ioc_inlbuf1 = argv[1]; - - if (argc == 3) { - data.ioc_inllen2 = strlen(argv[2]) + 1; - data.ioc_inlbuf2 = 
argv[2]; - } - - IOC_PACK(argv[0], data); - rc = l_ioctl(OBD_DEV_ID, OBD_IOC_RECOVD_NEWCONN, buf); - if (rc < 0) - fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]), - strerror(rc = errno)); + return do_activate(argc, argv, 0); +} - return rc; +int jt_obd_activate(int argc, char **argv) +{ + return do_activate(argc, argv, 1); } -int jt_obd_failconn(int argc, char **argv) +int jt_obd_recover(int argc, char **argv) { int rc; struct obd_ioctl_data data; IOC_INIT(data); - if (argc < 2) + if (argc > 2) return CMD_HELP; - data.ioc_inllen1 = strlen(argv[1]) + 1; - data.ioc_inlbuf1 = argv[1]; + if (argc == 2) { + data.ioc_inllen1 = strlen(argv[1]) + 1; + data.ioc_inlbuf1 = argv[1]; + } IOC_PACK(argv[0], data); - rc = l_ioctl(OBD_DEV_ID, OBD_IOC_RECOVD_FAILCONN, buf); - if (rc < 0) + rc = l_ioctl(OBD_DEV_ID, OBD_IOC_CLIENT_RECOVER, buf); + if (rc < 0) { fprintf(stderr, "error: %s: %s\n", cmdname(argv[0]), strerror(rc = errno)); + } return rc; } diff --git a/lustre/utils/obdctl.c b/lustre/utils/obdctl.c index 860b908..8fd4f7c 100644 --- a/lustre/utils/obdctl.c +++ b/lustre/utils/obdctl.c @@ -91,10 +91,11 @@ int main(int argc, char **argv) if (obd_initialize(argc, argv) < 0) exit(1); + Parser_init("obdctl > ", cmdlist); + if (argc > 1) { rc = Parser_execarg(argc - 1, argv + 1, cmdlist); } else { - Parser_init("obdctl > ", cmdlist); rc = Parser_commands(); } diff --git a/lustre/utils/obdctl.h b/lustre/utils/obdctl.h index f0e1a97..0203579 100644 --- a/lustre/utils/obdctl.h +++ b/lustre/utils/obdctl.h @@ -40,10 +40,11 @@ int jt_obd_detach(int argc, char **argv); int jt_obd_cleanup(int argc, char **argv); int jt_obd_no_transno(int argc, char **argv); int jt_obd_set_readonly(int argc, char **argv); +int jt_obd_abort_recovery(int argc, char **argv); int jt_obd_newdev(int argc, char **argv); +int jt_obd_mount_option(int argc, char **argv); int jt_obd_list(int argc, char **argv); int jt_obd_attach(int argc, char **argv); -int jt_obd_name2dev(int argc, char **argv); int 
jt_obd_setup(int argc, char **argv); int jt_obd_create(int argc, char **argv); int jt_obd_setattr(int argc, char **argv); @@ -60,9 +61,9 @@ int jt_obd_test_ldlm(int argc, char **argv); int jt_obd_ldlm_regress_start(int argc, char **argv); int jt_obd_ldlm_regress_stop(int argc, char **argv); int jt_obd_dump_ldlm(int argc, char **argv); -int jt_obd_lov_set_osc_active(int argc, char **argv); -int jt_obd_newconn(int argc, char **argv); -int jt_obd_failconn(int argc, char **argv); +int jt_obd_activate(int argc, char **argv); +int jt_obd_deactivate(int argc, char **argv); +int jt_obd_recover(int argc, char **argv); int jt_obd_mdc_lookup(int argc, char **argv); int jt_get_version(int argc, char **argv); int jt_obd_add_uuid(int argc, char **argv); diff --git a/lustre/utils/obdiolib.c b/lustre/utils/obdiolib.c index 8c79c67..c871818 100644 --- a/lustre/utils/obdiolib.c +++ b/lustre/utils/obdiolib.c @@ -38,7 +38,6 @@ obdio_iocinit (struct obdio_conn *conn) { memset (&conn->oc_data, 0, sizeof (conn->oc_data)); conn->oc_data.ioc_version = OBD_IOCTL_VERSION; - conn->oc_data.ioc_addr = conn->oc_conn_addr; conn->oc_data.ioc_cookie = conn->oc_conn_cookie; conn->oc_data.ioc_len = sizeof (conn->oc_data); } @@ -103,12 +102,11 @@ obdio_connect (int device) obdio_iocinit (conn); rc = obdio_ioctl (conn, OBD_IOC_CONNECT); if (rc != 0) { - fprintf (stderr, "obdio_connect: Can't connect to device %d: %s\n", - device, strerror (errno)); + fprintf(stderr, "obdio_connect: Can't connect to device " + "%d: %s\n", device, strerror (errno)); goto failed; } - conn->oc_conn_addr = conn->oc_data.ioc_addr; conn->oc_conn_cookie = conn->oc_data.ioc_cookie; return (conn); diff --git a/lustre/utils/obdiolib.h b/lustre/utils/obdiolib.h index 9b06941..3811b41 100644 --- a/lustre/utils/obdiolib.h +++ b/lustre/utils/obdiolib.h @@ -2,7 +2,7 @@ * vim:expandtab:shiftwidth=8:tabstop=8: * * Copyright (C) 2003 Cluster File Systems, Inc. 
- * Author: Eric Barton <eeb@clusterfs.com> + * Author: Eric Barton <eeb@clusterfs.com> * * This file is part of Lustre, http://www.lustre.org. * @@ -33,8 +33,7 @@ #include <linux/obd_class.h> struct obdio_conn { - int oc_fd; - uint64_t oc_conn_addr; + int oc_fd; uint64_t oc_conn_cookie; struct obd_ioctl_data oc_data; char oc_buffer[8192]; @@ -42,25 +41,25 @@ struct obdio_conn { struct obdio_barrier { uint64_t ob_id; - uint64_t ob_oid; + uint64_t ob_oid; uint64_t ob_npeers; uint64_t ob_ordinal; uint64_t ob_count; }; - + extern struct obdio_conn * obdio_connect (int device); extern void obdio_disconnect (struct obdio_conn *conn); -extern int obdio_open (struct obdio_conn *conn, uint64_t oid, - struct lustre_handle *fh); -extern int obdio_close (struct obdio_conn *conn, uint64_t oid, - struct lustre_handle *fh); -extern int obdio_pread (struct obdio_conn *conn, uint64_t oid, - char *buffer, uint32_t count, uint64_t offset); -extern int obdio_pwrite (struct obdio_conn *conn, uint64_t oid, - char *buffer, uint32_t count, uint64_t offset); +extern int obdio_open (struct obdio_conn *conn, uint64_t oid, + struct lustre_handle *fh); +extern int obdio_close (struct obdio_conn *conn, uint64_t oid, + struct lustre_handle *fh); +extern int obdio_pread (struct obdio_conn *conn, uint64_t oid, + char *buffer, uint32_t count, uint64_t offset); +extern int obdio_pwrite (struct obdio_conn *conn, uint64_t oid, + char *buffer, uint32_t count, uint64_t offset); extern int obdio_enqueue (struct obdio_conn *conn, uint64_t oid, - int mode, uint64_t offset, uint32_t count, - struct lustre_handle *lh); + int mode, uint64_t offset, uint32_t count, + struct lustre_handle *lh); extern int obdio_cancel (struct obdio_conn *conn, struct lustre_handle *lh); extern void *obdio_alloc_aligned_buffer (void **spacep, int size); extern struct obdio_barrier *obdio_new_barrier (uint64_t oid, uint64_t id, int npeers) ; diff --git a/lustre/utils/obdstat.c b/lustre/utils/obdstat.c index 01085b9..8139fb5 
100644 --- a/lustre/utils/obdstat.c +++ b/lustre/utils/obdstat.c @@ -27,13 +27,13 @@ struct one_stat *close_reqs; struct one_stat *punch_reqs; struct one_stat * -init_one_stat (char *basename, char *name) +init_one_stat (char *basename, char *name) { char fname[1024]; struct one_stat *stat = (struct one_stat *)malloc (sizeof (*stat)); - + if (stat == NULL) { - fprintf (stderr, "Can't allocate stat %s: %s\n", + fprintf (stderr, "Can't allocate stat %s: %s\n", name, strerror (errno)); abort (); } @@ -45,7 +45,7 @@ init_one_stat (char *basename, char *name) stat->fd = open (fname, O_RDONLY); if (stat->fd < 0 ) { - fprintf (stderr, "Can't open stat %s: %s\n", + fprintf (stderr, "Can't open stat %s: %s\n", fname, strerror (errno)); abort (); } @@ -54,7 +54,7 @@ init_one_stat (char *basename, char *name) } void -update_one_stat (struct one_stat *stat) +update_one_stat (struct one_stat *stat) { static char buffer[1024]; long long prev = stat->current; @@ -67,7 +67,7 @@ update_one_stat (struct one_stat *stat) stat->name, strerror (errno)); abort (); } - + buffer[nob] = 0; if (sscanf (buffer, "%Ld", &stat->current) != 1) { fprintf (stderr, "Can't parse stat %s: %s\n", @@ -82,7 +82,7 @@ double timenow () { struct timeval tv; - + gettimeofday (&tv, NULL); return (tv.tv_sec + tv.tv_usec / 1000000.0); } @@ -93,7 +93,7 @@ do_stat (void) static double last = 0.0; double now; double t; - + now = timenow(); update_one_stat (read_bytes); @@ -108,7 +108,7 @@ do_stat (void) update_one_stat (destroy_reqs); update_one_stat (statfs_reqs); update_one_stat (punch_reqs); - + if (last == 0.0) { printf ("R %Ld/%Ld W %Ld/%Ld attr %Ld/%Ld open %Ld/%Ld create %Ld/%Ld stat %Ld punch %Ld\n", read_bytes->current, read_reqs->current, @@ -125,32 +125,32 @@ do_stat (void) read_bytes->delta / ((1<<20) * t), write_reqs->delta, (int)(write_reqs->delta / t), write_bytes->delta / ((1<<20) * t)); - + if (getattr_reqs->delta != 0) printf (" ga:%Ld,%d/s", getattr_reqs->delta, (int)(getattr_reqs->delta / t)); 
- + if (setattr_reqs->delta != 0) printf (" sa:%Ld", setattr_reqs->delta); if (open_reqs->delta != 0) printf (" op:%Ld", open_reqs->delta); - + if (close_reqs->delta != 0) printf (" cl:%Ld", close_reqs->delta); if (create_reqs->delta != 0) printf (" cx:%Ld", create_reqs->delta); - + if (destroy_reqs->delta != 0) printf (" dx:%Ld", destroy_reqs->delta); if (statfs_reqs->delta != 0) printf (" st:%Ld", statfs_reqs->delta); - + if (punch_reqs->delta != 0) printf (" pu:%Ld", punch_reqs->delta); - + printf ("\n"); } @@ -167,9 +167,9 @@ int main (int argc, char **argv) fprintf (stderr, "obd type not specified\n"); return (1); } - + snprintf (basedir, sizeof (basedir), "/proc/sys/%s", argv[1]); - + if (argc > 2) interval = atoi (argv[2]); @@ -190,7 +190,7 @@ int main (int argc, char **argv) if (interval == 0) return (0); - + for (;;) { sleep (interval); do_stat (); diff --git a/lustre/utils/parser.c b/lustre/utils/parser.c index 0e5a9f0..fef987b 100644 --- a/lustre/utils/parser.c +++ b/lustre/utils/parser.c @@ -32,7 +32,10 @@ #define READLINE_LIBRARY #include <readline/readline.h> -//extern char **completion_matches __P((char *, rl_compentry_func_t *)); +/* completion_matches() is #if 0-ed out in modern glibc */ +#ifndef completion_matches +#define completion_matches rl_completion_matches +#endif extern void using_history(void); extern void stifle_history(int); extern void add_history(char *); diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c new file mode 100644 index 0000000..5b6a589 --- /dev/null +++ b/lustre/utils/wirecheck.c @@ -0,0 +1,588 @@ +#include <stdio.h> +#include <liblustre.h> +#include <linux/lustre_lib.h> +#include <linux/lustre_idl.h> + +#define BLANK_LINE() \ +do { \ + printf ("\n"); \ +} while (0) + +#define COMMENT(c) \ +do { \ + printf (" /* "c" */\n"); \ +} while (0) + +#define STRINGIFY(a) #a + +#define CHECK_DEFINE(a) \ +do { \ + printf(" LASSERT ("#a" == "STRINGIFY(a)");\n"); \ +} while (0) + +#define CHECK_VALUE(a) \ +do { \ + 
printf(" LASSERT ("#a" == %d);\n", a); \ +} while (0) + +#define CHECK_MEMBER_OFFSET(s,m) \ +do { \ + CHECK_VALUE(offsetof (struct s, m)); \ +} while (0) + +#define CHECK_MEMBER_SIZEOF(s,m) \ +do { \ + CHECK_VALUE((int)sizeof(((struct s *)0)->m)); \ +} while (0) + +#define CHECK_MEMBER(s,m) \ +do { \ + CHECK_MEMBER_OFFSET(s, m); \ + CHECK_MEMBER_SIZEOF(s, m); \ +} while (0) + +#define CHECK_STRUCT(s) \ +do { \ + COMMENT("Checks for struct "#s); \ + CHECK_VALUE((int)sizeof(struct s)); \ +} while (0) + + + +void check1 (void) +{ +#define VALUE 1234567 + + CHECK_VALUE (VALUE); + CHECK_DEFINE (VALUE); +} + +void +check_lustre_handle (void) +{ + BLANK_LINE (); + CHECK_STRUCT (lustre_handle); + CHECK_MEMBER (lustre_handle, cookie); +} + +void +check_lustre_msg (void) +{ + BLANK_LINE (); + CHECK_STRUCT (lustre_msg); + CHECK_MEMBER (lustre_msg, handle); + CHECK_MEMBER (lustre_msg, magic); + CHECK_MEMBER (lustre_msg, type); + CHECK_MEMBER (lustre_msg, version); + CHECK_MEMBER (lustre_msg, opc); + CHECK_MEMBER (lustre_msg, last_xid); + CHECK_MEMBER (lustre_msg, last_committed); + CHECK_MEMBER (lustre_msg, transno); + CHECK_MEMBER (lustre_msg, status); + CHECK_MEMBER (lustre_msg, flags); + CHECK_MEMBER (lustre_msg, bufcount); + CHECK_MEMBER (lustre_msg, buflens[7]); +} + +void +check_obdo (void) +{ + BLANK_LINE (); + CHECK_STRUCT (obdo); + CHECK_MEMBER (obdo, o_id); + CHECK_MEMBER (obdo, o_gr); + CHECK_MEMBER (obdo, o_atime); + CHECK_MEMBER (obdo, o_mtime); + CHECK_MEMBER (obdo, o_ctime); + CHECK_MEMBER (obdo, o_size); + CHECK_MEMBER (obdo, o_blocks); + CHECK_MEMBER (obdo, o_rdev); + CHECK_MEMBER (obdo, o_blksize); + CHECK_MEMBER (obdo, o_mode); + CHECK_MEMBER (obdo, o_uid); + CHECK_MEMBER (obdo, o_gid); + CHECK_MEMBER (obdo, o_flags); + CHECK_MEMBER (obdo, o_nlink); + CHECK_MEMBER (obdo, o_generation); + CHECK_MEMBER (obdo, o_valid); + CHECK_MEMBER (obdo, o_obdflags); + CHECK_MEMBER (obdo, o_easize); + CHECK_MEMBER (obdo, o_inline); +} + +void +check_obd_statfs (void) +{ + 
BLANK_LINE (); + CHECK_STRUCT (obd_statfs); + CHECK_MEMBER (obd_statfs, os_type); + CHECK_MEMBER (obd_statfs, os_blocks); + CHECK_MEMBER (obd_statfs, os_bfree); + CHECK_MEMBER (obd_statfs, os_bavail); + CHECK_MEMBER (obd_statfs, os_ffree); + CHECK_MEMBER (obd_statfs, os_fsid); + CHECK_MEMBER (obd_statfs, os_bsize); + CHECK_MEMBER (obd_statfs, os_namelen); +} + +void +check_obd_ioobj (void) +{ + BLANK_LINE (); + CHECK_STRUCT (obd_ioobj); + CHECK_MEMBER (obd_ioobj, ioo_id); + CHECK_MEMBER (obd_ioobj, ioo_gr); + CHECK_MEMBER (obd_ioobj, ioo_type); + CHECK_MEMBER (obd_ioobj, ioo_bufcnt); +} + +void +check_niobuf_remote (void) +{ + BLANK_LINE (); + CHECK_STRUCT (niobuf_remote); + CHECK_MEMBER (niobuf_remote, offset); + CHECK_MEMBER (niobuf_remote, len); + CHECK_MEMBER (niobuf_remote, flags); +} + +void +check_ost_body (void) +{ + BLANK_LINE (); + CHECK_STRUCT (ost_body); + CHECK_MEMBER (ost_body, oa); +} + +void +check_ll_fid (void) +{ + BLANK_LINE (); + CHECK_STRUCT (ll_fid); + CHECK_MEMBER (ll_fid, id); + CHECK_MEMBER (ll_fid, generation); + CHECK_MEMBER (ll_fid, f_type); +} + +void +check_mds_status_req (void) +{ + BLANK_LINE (); + CHECK_STRUCT (mds_status_req); + CHECK_MEMBER (mds_status_req, flags); + CHECK_MEMBER (mds_status_req, repbuf); +} + +void +check_mds_fileh_body (void) +{ + BLANK_LINE (); + CHECK_STRUCT (mds_fileh_body); + CHECK_MEMBER (mds_fileh_body, f_fid); +} + +void +check_mds_body (void) +{ + BLANK_LINE (); + CHECK_STRUCT (mds_body); + CHECK_MEMBER (mds_body, fid1); + CHECK_MEMBER (mds_body, fid2); + CHECK_MEMBER (mds_body, handle); + CHECK_MEMBER (mds_body, size); + CHECK_MEMBER (mds_body, blocks); + CHECK_MEMBER (mds_body, ino); + CHECK_MEMBER (mds_body, valid); + CHECK_MEMBER (mds_body, fsuid); + CHECK_MEMBER (mds_body, fsgid); + CHECK_MEMBER (mds_body, capability); + CHECK_MEMBER (mds_body, mode); + CHECK_MEMBER (mds_body, uid); + CHECK_MEMBER (mds_body, gid); + CHECK_MEMBER (mds_body, mtime); + CHECK_MEMBER (mds_body, ctime); + CHECK_MEMBER 
(mds_body, atime); + CHECK_MEMBER (mds_body, flags); + CHECK_MEMBER (mds_body, rdev); + CHECK_MEMBER (mds_body, nlink); + CHECK_MEMBER (mds_body, generation); + CHECK_MEMBER (mds_body, suppgid); +} + +void +check_mds_rec_setattr (void) +{ + BLANK_LINE (); + CHECK_STRUCT (mds_rec_setattr); + CHECK_MEMBER (mds_rec_setattr, sa_opcode); + CHECK_MEMBER (mds_rec_setattr, sa_fsuid); + CHECK_MEMBER (mds_rec_setattr, sa_fsgid); + CHECK_MEMBER (mds_rec_setattr, sa_cap); + CHECK_MEMBER (mds_rec_setattr, sa_reserved); + CHECK_MEMBER (mds_rec_setattr, sa_valid); + CHECK_MEMBER (mds_rec_setattr, sa_fid); + CHECK_MEMBER (mds_rec_setattr, sa_mode); + CHECK_MEMBER (mds_rec_setattr, sa_uid); + CHECK_MEMBER (mds_rec_setattr, sa_gid); + CHECK_MEMBER (mds_rec_setattr, sa_attr_flags); + CHECK_MEMBER (mds_rec_setattr, sa_size); + CHECK_MEMBER (mds_rec_setattr, sa_atime); + CHECK_MEMBER (mds_rec_setattr, sa_mtime); + CHECK_MEMBER (mds_rec_setattr, sa_ctime); + CHECK_MEMBER (mds_rec_setattr, sa_suppgid); +} + +void +check_mds_rec_create (void) +{ + BLANK_LINE (); + CHECK_STRUCT (mds_rec_create); + CHECK_MEMBER (mds_rec_create, cr_opcode); + CHECK_MEMBER (mds_rec_create, cr_fsuid); + CHECK_MEMBER (mds_rec_create, cr_fsgid); + CHECK_MEMBER (mds_rec_create, cr_cap); + CHECK_MEMBER (mds_rec_create, cr_flags); + CHECK_MEMBER (mds_rec_create, cr_mode); + CHECK_MEMBER (mds_rec_create, cr_fid); + CHECK_MEMBER (mds_rec_create, cr_replayfid); + CHECK_MEMBER (mds_rec_create, cr_uid); + CHECK_MEMBER (mds_rec_create, cr_gid); + CHECK_MEMBER (mds_rec_create, cr_time); + CHECK_MEMBER (mds_rec_create, cr_rdev); + CHECK_MEMBER (mds_rec_create, cr_suppgid); +} + +void +check_mds_rec_link (void) +{ + BLANK_LINE (); + CHECK_STRUCT (mds_rec_link); + CHECK_MEMBER (mds_rec_link, lk_opcode); + CHECK_MEMBER (mds_rec_link, lk_fsuid); + CHECK_MEMBER (mds_rec_link, lk_fsgid); + CHECK_MEMBER (mds_rec_link, lk_cap); + CHECK_MEMBER (mds_rec_link, lk_suppgid1); + CHECK_MEMBER (mds_rec_link, lk_suppgid2); + CHECK_MEMBER 
(mds_rec_link, lk_fid1); + CHECK_MEMBER (mds_rec_link, lk_fid2); +} + +void +check_mds_rec_unlink (void) +{ + BLANK_LINE (); + CHECK_STRUCT (mds_rec_unlink); + CHECK_MEMBER (mds_rec_unlink, ul_opcode); + CHECK_MEMBER (mds_rec_unlink, ul_fsuid); + CHECK_MEMBER (mds_rec_unlink, ul_fsgid); + CHECK_MEMBER (mds_rec_unlink, ul_cap); + CHECK_MEMBER (mds_rec_unlink, ul_reserved); + CHECK_MEMBER (mds_rec_unlink, ul_mode); + CHECK_MEMBER (mds_rec_unlink, ul_suppgid); + CHECK_MEMBER (mds_rec_unlink, ul_fid1); + CHECK_MEMBER (mds_rec_unlink, ul_fid2); +} + +void +check_mds_rec_rename (void) +{ + BLANK_LINE (); + CHECK_STRUCT (mds_rec_rename); + CHECK_MEMBER (mds_rec_rename, rn_opcode); + CHECK_MEMBER (mds_rec_rename, rn_fsuid); + CHECK_MEMBER (mds_rec_rename, rn_fsgid); + CHECK_MEMBER (mds_rec_rename, rn_cap); + CHECK_MEMBER (mds_rec_rename, rn_suppgid1); + CHECK_MEMBER (mds_rec_rename, rn_suppgid2); + CHECK_MEMBER (mds_rec_rename, rn_fid1); + CHECK_MEMBER (mds_rec_rename, rn_fid2); +} + +void +check_lov_desc (void) +{ + BLANK_LINE (); + CHECK_STRUCT (lov_desc); + CHECK_MEMBER (lov_desc, ld_tgt_count); + CHECK_MEMBER (lov_desc, ld_active_tgt_count); + CHECK_MEMBER (lov_desc, ld_default_stripe_count); + CHECK_MEMBER (lov_desc, ld_default_stripe_size); + CHECK_MEMBER (lov_desc, ld_default_stripe_offset); + CHECK_MEMBER (lov_desc, ld_pattern); + CHECK_MEMBER (lov_desc, ld_uuid); +} + +void +check_ldlm_res_id (void) +{ + BLANK_LINE (); + CHECK_STRUCT (ldlm_res_id); + CHECK_MEMBER (ldlm_res_id, name[RES_NAME_SIZE]); +} + +void +check_ldlm_extent (void) +{ + BLANK_LINE (); + CHECK_STRUCT (ldlm_extent); + CHECK_MEMBER (ldlm_extent, start); + CHECK_MEMBER (ldlm_extent, end); +} + +void +check_ldlm_intent (void) +{ + BLANK_LINE (); + CHECK_STRUCT (ldlm_intent); + CHECK_MEMBER (ldlm_intent, opc); +} + +void +check_ldlm_resource_desc (void) +{ + BLANK_LINE (); + CHECK_STRUCT (ldlm_resource_desc); + CHECK_MEMBER (ldlm_resource_desc, lr_type); + CHECK_MEMBER (ldlm_resource_desc, lr_name); 
+        CHECK_MEMBER (ldlm_resource_desc, lr_version[RES_VERSION_SIZE]);
+}
+/* ldlm_lock_desc: one CHECK_STRUCT, then one CHECK_MEMBER per listed member (macros are defined outside this chunk). */
+void
+check_ldlm_lock_desc (void)
+{
+        BLANK_LINE ();
+        CHECK_STRUCT (ldlm_lock_desc);
+        CHECK_MEMBER (ldlm_lock_desc, l_resource);
+        CHECK_MEMBER (ldlm_lock_desc, l_req_mode);
+        CHECK_MEMBER (ldlm_lock_desc, l_granted_mode);
+        CHECK_MEMBER (ldlm_lock_desc, l_extent);
+        CHECK_MEMBER (ldlm_lock_desc, l_version[RES_VERSION_SIZE]);
+}
+/* ldlm_request: one CHECK_STRUCT, then CHECK_MEMBER for lock_flags, lock_desc and both lock handles. */
+void
+check_ldlm_request (void)
+{
+        BLANK_LINE ();
+        CHECK_STRUCT (ldlm_request);
+        CHECK_MEMBER (ldlm_request, lock_flags);
+        CHECK_MEMBER (ldlm_request, lock_desc);
+        CHECK_MEMBER (ldlm_request, lock_handle1);
+        CHECK_MEMBER (ldlm_request, lock_handle2);
+}
+/* ldlm_reply: one CHECK_STRUCT, then one CHECK_MEMBER per listed member. */
+void
+check_ldlm_reply (void)
+{
+        BLANK_LINE ();
+        CHECK_STRUCT (ldlm_reply);
+        CHECK_MEMBER (ldlm_reply, lock_flags);
+        CHECK_MEMBER (ldlm_reply, lock_mode);
+        CHECK_MEMBER (ldlm_reply, lock_resource_name);
+        CHECK_MEMBER (ldlm_reply, lock_handle);
+        CHECK_MEMBER (ldlm_reply, lock_extent);
+        CHECK_MEMBER (ldlm_reply, lock_policy_res1);
+        CHECK_MEMBER (ldlm_reply, lock_policy_res2);
+}
+/* ptlbd_op: one CHECK_STRUCT, then one CHECK_MEMBER per listed member; the op__padding member is checked like any other. */
+void
+check_ptlbd_op (void)
+{
+        BLANK_LINE ();
+        CHECK_STRUCT (ptlbd_op);
+        CHECK_MEMBER (ptlbd_op, op_cmd);
+        CHECK_MEMBER (ptlbd_op, op_lun);
+        CHECK_MEMBER (ptlbd_op, op_niob_cnt);
+        CHECK_MEMBER (ptlbd_op, op__padding);
+        CHECK_MEMBER (ptlbd_op, op_block_cnt);
+}
+/* ptlbd_niob: one CHECK_STRUCT, then one CHECK_MEMBER per listed member. */
+void
+check_ptlbd_niob (void)
+{
+        BLANK_LINE ();
+        CHECK_STRUCT (ptlbd_niob);
+        CHECK_MEMBER (ptlbd_niob, n_xid);
+        CHECK_MEMBER (ptlbd_niob, n_block_nr);
+        CHECK_MEMBER (ptlbd_niob, n_offset);
+        CHECK_MEMBER (ptlbd_niob, n_length);
+}
+/* ptlbd_rsp: one CHECK_STRUCT, then CHECK_MEMBER for r_status and r_error_cnt. */
+void
+check_ptlbd_rsp (void)
+{
+        BLANK_LINE ();
+        CHECK_STRUCT (ptlbd_rsp);
+        CHECK_MEMBER (ptlbd_rsp, r_status);
+        CHECK_MEMBER (ptlbd_rsp, r_error_cnt);
+}
+/* Entry point: prints the opening of lustre_assert_wire_constants(), runs the constant checks and every check_*() helper in order, prints the closing brace and returns 0. argc and argv are not read. */
+int
+main (int argc, char **argv)
+{
+        printf ("void lustre_assert_wire_constants (void)\n"
+                "{\n");
+/* Banner text first. NOTE(review): COMMENT and BLANK_LINE are defined outside this chunk; presumably they write into the generated function body -- confirm. */
+        COMMENT ("Wire protocol assertions generated by 'wirecheck'");
+        BLANK_LINE ();
+/* Constant checks: CHECK_DEFINE is used for PTLRPC_MSG_MAGIC, PTLRPC_MSG_VERSION and OBD_OBJECT_EOF; every other name goes through CHECK_VALUE. */
+        COMMENT ("Constants...");
+        CHECK_DEFINE (PTLRPC_MSG_MAGIC);
+        CHECK_DEFINE (PTLRPC_MSG_VERSION);
+
+        CHECK_VALUE (PTL_RPC_MSG_REQUEST);
+        CHECK_VALUE (PTL_RPC_MSG_ERR);
+        CHECK_VALUE (PTL_RPC_MSG_REPLY);
+
+        CHECK_VALUE (MSG_LAST_REPLAY);
+        CHECK_VALUE (MSG_RESENT);
+
+        CHECK_VALUE (MSG_CONNECT_RECOVERING);
+        CHECK_VALUE (MSG_CONNECT_RECONNECT);
+        CHECK_VALUE (MSG_CONNECT_REPLAYABLE);
+/* OST_* names, ending with the OST_LAST_OPC / OST_FIRST_OPC pair. */
+        CHECK_VALUE (OST_REPLY);
+        CHECK_VALUE (OST_GETATTR);
+        CHECK_VALUE (OST_SETATTR);
+        CHECK_VALUE (OST_READ);
+        CHECK_VALUE (OST_WRITE);
+        CHECK_VALUE (OST_CREATE);
+        CHECK_VALUE (OST_DESTROY);
+        CHECK_VALUE (OST_GET_INFO);
+        CHECK_VALUE (OST_CONNECT);
+        CHECK_VALUE (OST_DISCONNECT);
+        CHECK_VALUE (OST_PUNCH);
+        CHECK_VALUE (OST_OPEN);
+        CHECK_VALUE (OST_CLOSE);
+        CHECK_VALUE (OST_STATFS);
+        CHECK_VALUE (OST_SAN_READ);
+        CHECK_VALUE (OST_SAN_WRITE);
+        CHECK_VALUE (OST_SYNCFS);
+        CHECK_VALUE (OST_LAST_OPC);
+        CHECK_VALUE (OST_FIRST_OPC);
+
+        CHECK_VALUE (OBD_FL_INLINEDATA);
+        CHECK_VALUE (OBD_FL_OBDMDEXISTS);
+
+        CHECK_VALUE (LOV_MAGIC);
+/* OBD_MD_* names; OBD_MD_LINKNAME is the one entry in this group without the FL infix. */
+        CHECK_VALUE (OBD_MD_FLALL);
+        CHECK_VALUE (OBD_MD_FLID);
+        CHECK_VALUE (OBD_MD_FLATIME);
+        CHECK_VALUE (OBD_MD_FLMTIME);
+        CHECK_VALUE (OBD_MD_FLCTIME);
+        CHECK_VALUE (OBD_MD_FLSIZE);
+        CHECK_VALUE (OBD_MD_FLBLOCKS);
+        CHECK_VALUE (OBD_MD_FLBLKSZ);
+        CHECK_VALUE (OBD_MD_FLMODE);
+        CHECK_VALUE (OBD_MD_FLTYPE);
+        CHECK_VALUE (OBD_MD_FLUID);
+        CHECK_VALUE (OBD_MD_FLGID);
+        CHECK_VALUE (OBD_MD_FLFLAGS);
+        CHECK_VALUE (OBD_MD_FLOBDFLG);
+        CHECK_VALUE (OBD_MD_FLNLINK);
+        CHECK_VALUE (OBD_MD_FLGENER);
+        CHECK_VALUE (OBD_MD_FLINLINE);
+        CHECK_VALUE (OBD_MD_FLRDEV);
+        CHECK_VALUE (OBD_MD_FLEASIZE);
+        CHECK_VALUE (OBD_MD_LINKNAME);
+        CHECK_VALUE (OBD_MD_FLHANDLE);
+        CHECK_VALUE (OBD_MD_FLCKSUM);
+
+        CHECK_VALUE (OBD_BRW_READ);
+        CHECK_VALUE (OBD_BRW_WRITE);
+        CHECK_VALUE (OBD_BRW_CREATE);
+        CHECK_VALUE (OBD_BRW_SYNC);
+
+        CHECK_DEFINE (OBD_OBJECT_EOF);
+
+        CHECK_VALUE (OST_REQ_HAS_OA1);
+/* MDS_* names, ending with the MDS_LAST_OPC / MDS_FIRST_OPC pair. */
+        CHECK_VALUE (MDS_GETATTR);
+        CHECK_VALUE (MDS_GETATTR_NAME);
+        CHECK_VALUE (MDS_CLOSE);
+        CHECK_VALUE (MDS_REINT);
+        CHECK_VALUE (MDS_READPAGE);
+        CHECK_VALUE (MDS_CONNECT);
+        CHECK_VALUE (MDS_DISCONNECT);
+        CHECK_VALUE (MDS_GETSTATUS);
+        CHECK_VALUE (MDS_STATFS);
+        CHECK_VALUE (MDS_GETLOVINFO);
+        CHECK_VALUE (MDS_LAST_OPC);
+        CHECK_VALUE (MDS_FIRST_OPC);
+
+        CHECK_VALUE (REINT_SETATTR);
+        CHECK_VALUE (REINT_CREATE);
+        CHECK_VALUE (REINT_LINK);
+        CHECK_VALUE (REINT_UNLINK);
+        CHECK_VALUE (REINT_RENAME);
+        CHECK_VALUE (REINT_OPEN);
+        CHECK_VALUE (REINT_MAX);
+
+        CHECK_VALUE (IT_INTENT_EXEC);
+        CHECK_VALUE (IT_OPEN_LOOKUP);
+        CHECK_VALUE (IT_OPEN_NEG);
+        CHECK_VALUE (IT_OPEN_POS);
+        CHECK_VALUE (IT_OPEN_CREATE);
+        CHECK_VALUE (IT_OPEN_OPEN);
+
+        CHECK_VALUE (MDS_STATUS_CONN);
+        CHECK_VALUE (MDS_STATUS_LOV);
+
+        CHECK_VALUE (MDS_OPEN_HAS_EA);
+
+        CHECK_VALUE (LOV_RAID0);
+        CHECK_VALUE (LOV_RAIDRR);
+/* LDLM_* names, ending with the LDLM_LAST_OPC / LDLM_FIRST_OPC pair. */
+        CHECK_VALUE (LDLM_ENQUEUE);
+        CHECK_VALUE (LDLM_CONVERT);
+        CHECK_VALUE (LDLM_CANCEL);
+        CHECK_VALUE (LDLM_BL_CALLBACK);
+        CHECK_VALUE (LDLM_CP_CALLBACK);
+        CHECK_VALUE (LDLM_LAST_OPC);
+        CHECK_VALUE (LDLM_FIRST_OPC);
+/* PTLBD_* names, ending with the PTLBD_LAST_OPC / PTLBD_FIRST_OPC pair. */
+        CHECK_VALUE (PTLBD_QUERY);
+        CHECK_VALUE (PTLBD_READ);
+        CHECK_VALUE (PTLBD_WRITE);
+        CHECK_VALUE (PTLBD_FLUSH);
+        CHECK_VALUE (PTLBD_CONNECT);
+        CHECK_VALUE (PTLBD_DISCONNECT);
+        CHECK_VALUE (PTLBD_LAST_OPC);
+        CHECK_VALUE (PTLBD_FIRST_OPC);
+
+        CHECK_VALUE (OBD_PING);
+/* Per-structure helpers, one call each. NOTE(review): if the CHECK_* macros print (not visible here), this call order is the order of the generated output -- do not reorder without confirming. */
+        COMMENT ("Sizes and Offsets");
+        BLANK_LINE ();
+        check_lustre_handle ();
+        check_lustre_msg ();
+        check_obdo ();
+        check_obd_statfs ();
+        check_obd_ioobj ();
+        check_niobuf_remote ();
+        check_ost_body ();
+        check_ll_fid ();
+        check_mds_status_req ();
+        check_mds_fileh_body ();
+        check_mds_body ();
+        check_mds_rec_setattr ();
+        check_mds_rec_create ();
+        check_mds_rec_link ();
+        check_mds_rec_unlink ();
+        check_mds_rec_rename ();
+        check_lov_desc ();
+        check_ldlm_res_id ();
+        check_ldlm_extent ();
+        check_ldlm_intent ();
+        check_ldlm_resource_desc ();
+        check_ldlm_lock_desc ();
+        check_ldlm_request ();
+        check_ldlm_reply ();
+        check_ptlbd_op ();
+        check_ptlbd_niob ();
+        check_ptlbd_rsp ();
+/* Close the function body opened by the first printf and leave one empty line after it. */
+        printf ("}\n\n");
+
+        return (0);
+}
--
1.8.3.1